diff -uprN linux-2.6.32/arch/alpha/include/asm/errno.h linux-2.6.32.ovz/arch/alpha/include/asm/errno.h
--- linux-2.6.32/arch/alpha/include/asm/errno.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/alpha/include/asm/errno.h 2015-09-10 10:07:40.000000000 -0700
@@ -43,7 +43,7 @@
 #define EUSERS 68 /* Too many users */
 #define EDQUOT 69 /* Quota exceeded */
-#define ESTALE 70 /* Stale NFS file handle */
+#define ESTALE 70 /* Stale file handle */
 #define EREMOTE 71 /* Object is remote */
 #define ENOLCK 77 /* No record locks available */
@@ -122,4 +122,6 @@
 #define ERFKILL 138 /* Operation not possible due to RF-kill */
+#define EHWPOISON 139 /* Memory page has hardware error */
+
 #endif
diff -uprN linux-2.6.32/arch/alpha/include/asm/mman.h linux-2.6.32.ovz/arch/alpha/include/asm/mman.h
--- linux-2.6.32/arch/alpha/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/alpha/include/asm/mman.h 2015-10-24 08:14:47.352501187 -0700
@@ -53,6 +53,13 @@
 #define MADV_MERGEABLE 12 /* KSM may merge identical pages */
 #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */
+#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */
+#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */
+
+#define MADV_DONTDUMP 16 /* Explicitly exclude from the core dump,
+			    overrides the coredump filter bits */
+#define MADV_DODUMP 17 /* Clear the MADV_DONTDUMP flag */
+
 /* compatibility flags */
 #define MAP_FILE 0
diff -uprN linux-2.6.32/arch/alpha/include/asm/mmzone.h linux-2.6.32.ovz/arch/alpha/include/asm/mmzone.h
--- linux-2.6.32/arch/alpha/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/alpha/include/asm/mmzone.h 2015-09-10 10:06:40.000000000 -0700
@@ -56,7 +56,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p,
  * Given a kernel address, find the home node of the underlying memory.
  */
 #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr))
-#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
 /*
  * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory
diff -uprN linux-2.6.32/arch/alpha/include/asm/module.h linux-2.6.32.ovz/arch/alpha/include/asm/module.h
--- linux-2.6.32/arch/alpha/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/alpha/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700
@@ -6,6 +6,7 @@ struct mod_arch_specific
 	unsigned int gotsecindex;
 };
+#define MODULES_ARE_ELF64
 #define Elf_Sym Elf64_Sym
 #define Elf_Shdr Elf64_Shdr
 #define Elf_Ehdr Elf64_Ehdr
@@ -13,6 +14,8 @@ struct mod_arch_specific
 #define Elf_Dyn Elf64_Dyn
 #define Elf_Rel Elf64_Rel
 #define Elf_Rela Elf64_Rela
+#define ELF_R_TYPE(X) ELF64_R_TYPE(X)
+#define ELF_R_SYM(X) ELF64_R_SYM(X)
 #define ARCH_SHF_SMALL SHF_ALPHA_GPREL
diff -uprN linux-2.6.32/arch/alpha/include/asm/socket.h linux-2.6.32.ovz/arch/alpha/include/asm/socket.h
--- linux-2.6.32/arch/alpha/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/alpha/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700
@@ -67,6 +67,8 @@
 #define SO_TIMESTAMPING 37
 #define SCM_TIMESTAMPING SO_TIMESTAMPING
+#define SO_RXQ_OVFL 40
+
 /* O_NONBLOCK clashes with the bits used for socket types.  Therefore we
  * have to define SOCK_NONBLOCK to a different value here.
*/ diff -uprN linux-2.6.32/arch/alpha/include/asm/unistd.h linux-2.6.32.ovz/arch/alpha/include/asm/unistd.h --- linux-2.6.32/arch/alpha/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/include/asm/unistd.h 2015-09-10 10:05:55.000000000 -0700 @@ -433,10 +433,11 @@ #define __NR_signalfd 476 #define __NR_timerfd 477 #define __NR_eventfd 478 +#define __NR_recvmmsg 479 #ifdef __KERNEL__ -#define NR_SYSCALLS 479 +#define NR_SYSCALLS 480 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff -uprN linux-2.6.32/arch/alpha/kernel/osf_sys.c linux-2.6.32.ovz/arch/alpha/kernel/osf_sys.c --- linux-2.6.32/arch/alpha/kernel/osf_sys.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/kernel/osf_sys.c 2015-09-10 10:07:24.000000000 -0700 @@ -178,25 +178,18 @@ SYSCALL_DEFINE6(osf_mmap, unsigned long, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - struct file *file = NULL; - unsigned long ret = -EBADF; + unsigned long ret = -EINVAL; #if 0 if (flags & (_MAP_HASSEMAPHORE | _MAP_INHERIT | _MAP_UNALIGNED)) printk("%s: unimplemented OSF mmap flags %04lx\n", current->comm, flags); #endif - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flags, off); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + if ((off + PAGE_ALIGN(len)) < off) + goto out; + if (off & ~PAGE_MASK) + goto out; + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return ret; } @@ -241,11 +234,11 @@ linux_to_osf_statfs(struct kstatfs *linu } static int -do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer, +do_osf_statfs(struct path *path, struct osf_statfs __user *buffer, unsigned long bufsiz) { struct kstatfs linux_stat; - int error = vfs_statfs(dentry, &linux_stat); + int error = vfs_statfs(path, &linux_stat); if (!error) error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); return error; @@ -259,7 +252,7 @@ SYSCALL_DEFINE3(osf_statfs, char __user retval = user_path(pathname, &path); if (!retval) { - retval = do_osf_statfs(path.dentry, buffer, bufsiz); + retval = do_osf_statfs(&path, buffer, bufsiz); path_put(&path); } return retval; @@ -274,7 +267,7 @@ SYSCALL_DEFINE3(osf_fstatfs, unsigned lo retval = -EBADF; file = fget(fd); if (file) { - retval = do_osf_statfs(file->f_path.dentry, buffer, bufsiz); + retval = do_osf_statfs(&file->f_path, buffer, bufsiz); fput(file); } return retval; diff -uprN linux-2.6.32/arch/alpha/kernel/pci-sysfs.c linux-2.6.32.ovz/arch/alpha/kernel/pci-sysfs.c --- linux-2.6.32/arch/alpha/kernel/pci-sysfs.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/kernel/pci-sysfs.c 2015-09-10 10:06:06.000000000 -0700 @@ -52,6 +52,7 @@ static int __pci_mmap_fits(struct pci_de /** * pci_mmap_resource - map a PCI resource into user memory space + * @filp: open sysfs file * @kobj: kobject for mapping * @attr: struct bin_attribute for the file being mapped * @vma: struct vm_area_struct passed into the mmap @@ -59,7 +60,8 @@ static int __pci_mmap_fits(struct pci_de * * Use the bus mapping routines to map a PCI resource into userspace. 
 */
-static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr,
+static int pci_mmap_resource(struct file *filp, struct kobject *kobj,
+			     struct bin_attribute *attr,
 			     struct vm_area_struct *vma, int sparse)
 {
 	struct pci_dev *pdev = to_pci_dev(container_of(kobj,
@@ -88,14 +90,14 @@ static int pci_mmap_resource(struct kobj
 	return hose_mmap_page_range(pdev->sysdata, vma, mmap_type, sparse);
 }
-static int pci_mmap_resource_sparse(struct kobject *kobj,
+static int pci_mmap_resource_sparse(struct file *filp, struct kobject *kobj,
 				    struct bin_attribute *attr,
 				    struct vm_area_struct *vma)
 {
-	return pci_mmap_resource(kobj, attr, vma, 1);
+	return pci_mmap_resource(filp, kobj, attr, vma, 1);
 }
-static int pci_mmap_resource_dense(struct kobject *kobj,
+static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj,
 				    struct bin_attribute *attr,
 				    struct vm_area_struct *vma)
 {
diff -uprN linux-2.6.32/arch/alpha/kernel/systbls.S linux-2.6.32.ovz/arch/alpha/kernel/systbls.S
--- linux-2.6.32/arch/alpha/kernel/systbls.S 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/alpha/kernel/systbls.S 2015-09-10 10:05:55.000000000 -0700
@@ -497,6 +497,7 @@ sys_call_table:
 	.quad sys_signalfd
 	.quad sys_ni_syscall
 	.quad sys_eventfd
+	.quad sys_recvmmsg
 	.size sys_call_table, . - sys_call_table
 	.type sys_call_table, @object
diff -uprN linux-2.6.32/arch/arm/include/asm/mman.h linux-2.6.32.ovz/arch/arm/include/asm/mman.h
--- linux-2.6.32/arch/arm/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/arm/include/asm/mman.h 2015-09-10 10:05:47.000000000 -0700
@@ -1 +1,4 @@
 #include <asm-generic/mman.h>
+
+#define arch_mmap_check(addr, len, flags) \
+	(((flags) & MAP_FIXED && (addr) < FIRST_USER_ADDRESS) ? -EINVAL : 0)
diff -uprN linux-2.6.32/arch/arm/include/asm/module.h linux-2.6.32.ovz/arch/arm/include/asm/module.h
--- linux-2.6.32/arch/arm/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/arm/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700
@@ -1,9 +1,14 @@
 #ifndef _ASM_ARM_MODULE_H
 #define _ASM_ARM_MODULE_H
+#define MODULES_ARE_ELF32
 #define Elf_Shdr Elf32_Shdr
 #define Elf_Sym Elf32_Sym
 #define Elf_Ehdr Elf32_Ehdr
+#define Elf_Rel Elf32_Rel
+#define Elf_Rela Elf32_Rela
+#define ELF_R_TYPE(X) ELF32_R_TYPE(X)
+#define ELF_R_SYM(X) ELF32_R_SYM(X)
 struct unwind_table;
diff -uprN linux-2.6.32/arch/arm/include/asm/socket.h linux-2.6.32.ovz/arch/arm/include/asm/socket.h
--- linux-2.6.32/arch/arm/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/arm/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700
@@ -60,4 +60,5 @@
 #define SO_PROTOCOL 38
 #define SO_DOMAIN 39
+#define SO_RXQ_OVFL 40
 #endif /* _ASM_SOCKET_H */
diff -uprN linux-2.6.32/arch/arm/include/asm/unistd.h linux-2.6.32.ovz/arch/arm/include/asm/unistd.h
--- linux-2.6.32/arch/arm/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/arm/include/asm/unistd.h 2015-09-10 10:05:55.000000000 -0700
@@ -391,6 +391,7 @@
 #define __NR_pwritev (__NR_SYSCALL_BASE+362)
 #define __NR_rt_tgsigqueueinfo (__NR_SYSCALL_BASE+363)
 #define __NR_perf_event_open (__NR_SYSCALL_BASE+364)
+#define __NR_recvmmsg (__NR_SYSCALL_BASE+365)
 /*
  * The following SWIs are ARM private.
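Every architecture in this patch gets the same mmap cleanup: the per-arch do_mmap2()/sys_mmap2() wrappers, each repeating the fget()/do_mmap_pgoff()/fput() sequence under mmap_sem, are deleted in favor of the generic sys_mmap_pgoff(), and only arch-specific validation stays behind. As a reading aid, here is a minimal sketch (not part of the patch) of the shape a converted byte-offset entry point takes; the name example_arch_mmap is invented for illustration, while sys_mmap_pgoff, PAGE_ALIGN, PAGE_MASK and PAGE_SHIFT are the kernel symbols the hunks themselves use:

	/* sketch only: byte-offset wrapper over the shared pgoff syscall */
	asmlinkage unsigned long example_arch_mmap(unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long fd,
			unsigned long off)
	{
		/* page-aligning len must not wrap the byte offset around */
		if ((off + PAGE_ALIGN(len)) < off)
			return -EINVAL;
		/* the byte offset itself must sit on a page boundary */
		if (off & ~PAGE_MASK)
			return -EINVAL;
		/* file lookup, mmap_sem and flag filtering now happen once,
		   inside the generic sys_mmap_pgoff() */
		return sys_mmap_pgoff(addr, len, prot, flags, fd,
				      off >> PAGE_SHIFT);
	}
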
diff -uprN linux-2.6.32/arch/arm/kernel/calls.S linux-2.6.32.ovz/arch/arm/kernel/calls.S --- linux-2.6.32/arch/arm/kernel/calls.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/calls.S 2015-09-10 10:05:55.000000000 -0700 @@ -172,7 +172,7 @@ /* 160 */ CALL(sys_sched_get_priority_min) CALL(sys_sched_rr_get_interval) CALL(sys_nanosleep) - CALL(sys_arm_mremap) + CALL(sys_mremap) CALL(sys_setresuid16) /* 165 */ CALL(sys_getresuid16) CALL(sys_ni_syscall) /* vm86 */ @@ -374,6 +374,7 @@ CALL(sys_pwritev) CALL(sys_rt_tgsigqueueinfo) CALL(sys_perf_event_open) +/* 365 */ CALL(sys_recvmmsg) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff -uprN linux-2.6.32/arch/arm/kernel/entry-common.S linux-2.6.32.ovz/arch/arm/kernel/entry-common.S --- linux-2.6.32/arch/arm/kernel/entry-common.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/entry-common.S 2015-09-10 10:05:47.000000000 -0700 @@ -416,12 +416,12 @@ sys_mmap2: tst r5, #PGOFF_MASK moveq r5, r5, lsr #PAGE_SHIFT - 12 streq r5, [sp, #4] - beq do_mmap2 + beq sys_mmap_pgoff mov r0, #-EINVAL mov pc, lr #else str r5, [sp, #4] - b do_mmap2 + b sys_mmap_pgoff #endif ENDPROC(sys_mmap2) diff -uprN linux-2.6.32/arch/arm/kernel/smp.c linux-2.6.32.ovz/arch/arm/kernel/smp.c --- linux-2.6.32/arch/arm/kernel/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/smp.c 2015-09-10 10:07:26.000000000 -0700 @@ -563,19 +563,6 @@ int setup_profiling_timer(unsigned int m return -EINVAL; } -static void -on_each_cpu_mask(void (*func)(void *), void *info, int wait, - const struct cpumask *mask) -{ - preempt_disable(); - - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(smp_processor_id(), mask)) - func(info); - - preempt_enable(); -} - /**********************************************************************/ /* @@ -638,7 +625,7 @@ void flush_tlb_all(void) void flush_tlb_mm(struct mm_struct *mm) { if (tlb_ops_need_broadcast()) - on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm)); + on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1); else local_flush_tlb_mm(mm); } @@ -649,7 +636,8 @@ void flush_tlb_page(struct vm_area_struc struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; - on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, + &ta, 1); } else local_flush_tlb_page(vma, uaddr); } @@ -672,7 +660,8 @@ void flush_tlb_range(struct vm_area_stru ta.ta_vma = vma; ta.ta_start = start; ta.ta_end = end; - on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, + &ta, 1); } else local_flush_tlb_range(vma, start, end); } diff -uprN linux-2.6.32/arch/arm/kernel/sys_arm.c linux-2.6.32.ovz/arch/arm/kernel/sys_arm.c --- linux-2.6.32/arch/arm/kernel/sys_arm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/sys_arm.c 2015-09-10 10:07:24.000000000 -0700 @@ -28,41 +28,6 @@ #include #include -extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, - unsigned long new_len, unsigned long flags, - unsigned long new_addr); - -/* common code for old and new mmaps */ -inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EINVAL; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - if 
(flags & MAP_FIXED && addr < FIRST_USER_ADDRESS) - goto out; - - error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - struct mmap_arg_struct { unsigned long addr; unsigned long len; @@ -84,29 +49,11 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return error; } -asmlinkage unsigned long -sys_arm_mremap(unsigned long addr, unsigned long old_len, - unsigned long new_len, unsigned long flags, - unsigned long new_addr) -{ - unsigned long ret = -EINVAL; - - if (flags & MREMAP_FIXED && new_addr < FIRST_USER_ADDRESS) - goto out; - - down_write(¤t->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(¤t->mm->mmap_sem); - -out: - return ret; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. diff -uprN linux-2.6.32/arch/arm/mach-kirkwood/cpuidle.c linux-2.6.32.ovz/arch/arm/mach-kirkwood/cpuidle.c --- linux-2.6.32/arch/arm/mach-kirkwood/cpuidle.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/mach-kirkwood/cpuidle.c 2015-09-10 10:07:50.000000000 -0700 @@ -32,17 +32,17 @@ static DEFINE_PER_CPU(struct cpuidle_dev /* Actual code that puts the SoC in different idle states */ static int kirkwood_enter_idle(struct cpuidle_device *dev, - struct cpuidle_state *state) + int index) { struct timeval before, after; int idle_time; local_irq_disable(); do_gettimeofday(&before); - if (state == &dev->states[0]) + if (index == 0) /* Wait for interrupt state */ cpu_do_idle(); - else if (state == &dev->states[1]) { + else if (index == 1) { /* * Following write will put DDR in self refresh. 
* Note that we have 256 cycles before DDR puts it @@ -57,7 +57,11 @@ static int kirkwood_enter_idle(struct cp local_irq_enable(); idle_time = (after.tv_sec - before.tv_sec) * USEC_PER_SEC + (after.tv_usec - before.tv_usec); - return idle_time; + + /* Update last residency */ + dev->last_residency = idle_time; + + return index; } /* Initialize CPU idle by registering the idle states */ diff -uprN linux-2.6.32/arch/arm/mach-pxa/em-x270.c linux-2.6.32.ovz/arch/arm/mach-pxa/em-x270.c --- linux-2.6.32/arch/arm/mach-pxa/em-x270.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/mach-pxa/em-x270.c 2015-09-10 10:05:42.000000000 -0700 @@ -497,16 +497,15 @@ static int em_x270_usb_hub_init(void) goto err_free_vbus_gpio; /* USB Hub power-on and reset */ - gpio_direction_output(usb_hub_reset, 0); + gpio_direction_output(usb_hub_reset, 1); + gpio_direction_output(GPIO9_USB_VBUS_EN, 0); regulator_enable(em_x270_usb_ldo); - gpio_set_value(usb_hub_reset, 1); gpio_set_value(usb_hub_reset, 0); + gpio_set_value(usb_hub_reset, 1); regulator_disable(em_x270_usb_ldo); regulator_enable(em_x270_usb_ldo); - gpio_set_value(usb_hub_reset, 1); - - /* enable VBUS */ - gpio_direction_output(GPIO9_USB_VBUS_EN, 1); + gpio_set_value(usb_hub_reset, 0); + gpio_set_value(GPIO9_USB_VBUS_EN, 1); return 0; diff -uprN linux-2.6.32/arch/arm/mm/init.c linux-2.6.32.ovz/arch/arm/mm/init.c --- linux-2.6.32/arch/arm/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/mm/init.c 2015-09-10 10:06:38.000000000 -0700 @@ -73,7 +73,7 @@ __tagtable(ATAG_INITRD2, parse_tag_initr */ struct meminfo meminfo; -void show_mem(void) +void show_mem(unsigned int filter) { int free = 0, total = 0, reserved = 0; int shared = 0, cached = 0, slab = 0, node, i; diff -uprN linux-2.6.32/arch/arm/mm/mmap.c linux-2.6.32.ovz/arch/arm/mm/mmap.c --- linux-2.6.32/arch/arm/mm/mmap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/mm/mmap.c 2015-09-10 10:05:47.000000000 -0700 @@ -54,7 +54,8 @@ arch_get_unmapped_area(struct file *filp * We enforce the MAP_FIXED case. 
*/ if (flags & MAP_FIXED) { - if (aliasing && flags & MAP_SHARED && addr & (SHMLBA - 1)) + if (aliasing && flags & MAP_SHARED && + (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) return -EINVAL; return addr; } diff -uprN linux-2.6.32/arch/avr32/include/asm/socket.h linux-2.6.32.ovz/arch/avr32/include/asm/socket.h --- linux-2.6.32/arch/avr32/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* __ASM_AVR32_SOCKET_H */ diff -uprN linux-2.6.32/arch/avr32/include/asm/syscalls.h linux-2.6.32.ovz/arch/avr32/include/asm/syscalls.h --- linux-2.6.32/arch/avr32/include/asm/syscalls.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/include/asm/syscalls.h 2015-09-10 10:05:47.000000000 -0700 @@ -29,10 +29,6 @@ asmlinkage int sys_sigaltstack(const sta struct pt_regs *); asmlinkage int sys_rt_sigreturn(struct pt_regs *); -/* kernel/sys_avr32.c */ -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, off_t); - /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); diff -uprN linux-2.6.32/arch/avr32/kernel/sys_avr32.c linux-2.6.32.ovz/arch/avr32/kernel/sys_avr32.c --- linux-2.6.32/arch/avr32/kernel/sys_avr32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/kernel/sys_avr32.c 2015-09-10 10:05:47.000000000 -0700 @@ -5,39 +5,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include #include -#include -#include -#include - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return error; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, offset); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - return error; -} - int kernel_execve(const char *file, char **argv, char **envp) { register long scno asm("r8") = __NR_execve; diff -uprN linux-2.6.32/arch/avr32/kernel/syscall-stubs.S linux-2.6.32.ovz/arch/avr32/kernel/syscall-stubs.S --- linux-2.6.32/arch/avr32/kernel/syscall-stubs.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/kernel/syscall-stubs.S 2015-09-10 10:05:47.000000000 -0700 @@ -61,7 +61,7 @@ __sys_execve: __sys_mmap2: pushm lr st.w --sp, ARG6 - call sys_mmap2 + call sys_mmap_pgoff sub sp, -4 popm pc diff -uprN linux-2.6.32/arch/avr32/kernel/syscall_table.S linux-2.6.32.ovz/arch/avr32/kernel/syscall_table.S --- linux-2.6.32/arch/avr32/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/kernel/syscall_table.S 2015-09-10 10:05:55.000000000 -0700 @@ -295,4 +295,5 @@ sys_call_table: .long sys_signalfd .long sys_ni_syscall /* 280, was sys_timerfd */ .long sys_eventfd + .long sys_recvmmsg .long sys_ni_syscall /* r8 is saturated at nr_syscalls */ diff -uprN linux-2.6.32/arch/blackfin/include/asm/unistd.h linux-2.6.32.ovz/arch/blackfin/include/asm/unistd.h --- linux-2.6.32/arch/blackfin/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/blackfin/include/asm/unistd.h 2015-09-10 10:05:55.000000000 -0700 @@ -388,8 +388,9 @@ 
#define __NR_pwritev 367 #define __NR_rt_tgsigqueueinfo 368 #define __NR_perf_event_open 369 +#define __NR_recvmmsg 370 -#define __NR_syscall 370 +#define __NR_syscall 371 #define NR_syscalls __NR_syscall /* Old optional stuff no one actually uses */ diff -uprN linux-2.6.32/arch/blackfin/kernel/sys_bfin.c linux-2.6.32.ovz/arch/blackfin/kernel/sys_bfin.c --- linux-2.6.32/arch/blackfin/kernel/sys_bfin.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/blackfin/kernel/sys_bfin.c 2015-09-10 10:05:47.000000000 -0700 @@ -22,39 +22,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long -do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - asmlinkage void *sys_sram_alloc(size_t size, unsigned long flags) { return sram_alloc_with_lsl(size, flags); diff -uprN linux-2.6.32/arch/blackfin/mach-common/entry.S linux-2.6.32.ovz/arch/blackfin/mach-common/entry.S --- linux-2.6.32/arch/blackfin/mach-common/entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/blackfin/mach-common/entry.S 2015-09-10 10:05:55.000000000 -0700 @@ -1422,7 +1422,7 @@ ENTRY(_sys_call_table) .long _sys_ni_syscall /* streams2 */ .long _sys_vfork /* 190 */ .long _sys_getrlimit - .long _sys_mmap2 + .long _sys_mmap_pgoff .long _sys_truncate64 .long _sys_ftruncate64 .long _sys_stat64 /* 195 */ @@ -1600,6 +1600,7 @@ ENTRY(_sys_call_table) .long _sys_pwritev .long _sys_rt_tgsigqueueinfo .long _sys_perf_event_open + .long _sys_recvmmsg /* 370 */ .rept NR_syscalls-(.-_sys_call_table)/4 .long _sys_ni_syscall diff -uprN linux-2.6.32/arch/cris/include/asm/module.h linux-2.6.32.ovz/arch/cris/include/asm/module.h --- linux-2.6.32/arch/cris/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/cris/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -3,7 +3,12 @@ /* cris is simple */ struct mod_arch_specific { }; +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif /* _ASM_CRIS_MODULE_H */ diff -uprN linux-2.6.32/arch/cris/include/asm/socket.h linux-2.6.32.ovz/arch/cris/include/asm/socket.h --- linux-2.6.32/arch/cris/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/cris/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -62,6 +62,7 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/cris/kernel/sys_cris.c linux-2.6.32.ovz/arch/cris/kernel/sys_cris.c --- linux-2.6.32/arch/cris/kernel/sys_cris.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/cris/kernel/sys_cris.c 2015-09-10 10:05:47.000000000 -0700 @@ -26,31 +26,6 @@ #include #include -/* common code for old and new mmaps */ -static inline 
long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage unsigned long old_mmap(unsigned long __user *args) { unsigned long buffer[6]; @@ -63,7 +38,7 @@ asmlinkage unsigned long old_mmap(unsign if (buffer[5] & ~PAGE_MASK) /* verify that offset is on page boundary */ goto out; - err = do_mmap2(buffer[0], buffer[1], buffer[2], buffer[3], + err = sys_mmap_pgoff(buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], buffer[5] >> PAGE_SHIFT); out: return err; @@ -73,7 +48,8 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + /* bug(?): 8Kb pages here */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* diff -uprN linux-2.6.32/arch/frv/include/asm/socket.h linux-2.6.32.ovz/arch/frv/include/asm/socket.h --- linux-2.6.32/arch/frv/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/frv/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -60,5 +60,6 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/frv/kernel/sys_frv.c linux-2.6.32.ovz/arch/frv/kernel/sys_frv.c --- linux-2.6.32/arch/frv/kernel/sys_frv.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/frv/kernel/sys_frv.c 2015-09-10 10:05:47.000000000 -0700 @@ -31,9 +31,6 @@ asmlinkage long sys_mmap2(unsigned long unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - int error = -EBADF; - struct file * file = NULL; - /* As with sparc32, make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have.... 
*/ @@ -41,69 +38,10 @@ asmlinkage long sys_mmap2(unsigned long trying to map something we can't */ if (pgoff & ((1 << (PAGE_SHIFT - 12)) - 1)) return -EINVAL; - pgoff >>= PAGE_SHIFT - 12; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -#if 0 /* DAVIDM - do we want this */ -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return error; + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } -#endif /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. diff -uprN linux-2.6.32/arch/h8300/include/asm/module.h linux-2.6.32.ovz/arch/h8300/include/asm/module.h --- linux-2.6.32/arch/h8300/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -4,9 +4,14 @@ * This file contains the H8/300 architecture specific module code. 
*/ struct mod_arch_specific { }; +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #define MODULE_SYMBOL_PREFIX "_" diff -uprN linux-2.6.32/arch/h8300/include/asm/socket.h linux-2.6.32.ovz/arch/h8300/include/asm/socket.h --- linux-2.6.32/arch/h8300/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/h8300/kernel/syscalls.S linux-2.6.32.ovz/arch/h8300/kernel/syscalls.S --- linux-2.6.32/arch/h8300/kernel/syscalls.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/kernel/syscalls.S 2015-09-10 10:05:47.000000000 -0700 @@ -206,7 +206,7 @@ SYMBOL_NAME_LABEL(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* streams2 */ .long SYMBOL_NAME(sys_vfork) /* 190 */ .long SYMBOL_NAME(sys_getrlimit) - .long SYMBOL_NAME(sys_mmap2) + .long SYMBOL_NAME(sys_mmap_pgoff) .long SYMBOL_NAME(sys_truncate64) .long SYMBOL_NAME(sys_ftruncate64) .long SYMBOL_NAME(sys_stat64) /* 195 */ diff -uprN linux-2.6.32/arch/h8300/kernel/sys_h8300.c linux-2.6.32.ovz/arch/h8300/kernel/sys_h8300.c --- linux-2.6.32/arch/h8300/kernel/sys_h8300.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/kernel/sys_h8300.c 2015-09-10 10:05:47.000000000 -0700 @@ -26,39 +26,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. 
Linux/m68k cloned Linux/i386, which didn't use to be able to @@ -87,58 +54,12 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } -#if 0 /* DAVIDM - do we want this */ -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return error; -} -#endif - struct sel_arg_struct { unsigned long n; fd_set *inp, *outp, *exp; diff -uprN linux-2.6.32/arch/ia64/ia32/binfmt_elf32.c linux-2.6.32.ovz/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.32/arch/ia64/ia32/binfmt_elf32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/ia32/binfmt_elf32.c 2015-09-10 10:05:52.000000000 -0700 @@ -89,6 +89,7 @@ ia64_elf32_init (struct pt_regs *regs) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = IA32_GDT_OFFSET; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -114,6 +115,7 @@ ia64_elf32_init (struct pt_regs *regs) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = IA32_GATE_OFFSET; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -138,6 +140,7 @@ ia64_elf32_init (struct pt_regs *regs) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = IA32_LDT_OFFSET; vma->vm_end = vma->vm_start + PAGE_ALIGN(IA32_LDT_ENTRIES*IA32_LDT_ENTRY_SIZE); diff -uprN linux-2.6.32/arch/ia64/ia32/sys_ia32.c linux-2.6.32.ovz/arch/ia64/ia32/sys_ia32.c --- linux-2.6.32/arch/ia64/ia32/sys_ia32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/ia32/sys_ia32.c 2015-09-10 10:07:24.000000000 -0700 @@ -86,7 +86,7 @@ sys32_execve (char __user *name, compat_ struct pt_regs *regs) { long error; - char *filename; + struct filename *filename; unsigned long old_map_base, old_task_size, tssd; filename = getname(name); @@ -104,7 +104,7 @@ sys32_execve (char __user *name, compat_ ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); - error = compat_do_execve(filename, argv, envp, regs); + error = compat_do_execve(filename->name, argv, envp, regs); putname(filename); if (error < 0) { @@ -858,6 +858,9 @@ ia32_do_mmap (struct file *file, unsigne prot = get_prot32(prot); + if (flags & MAP_HUGETLB) + return -ENOMEM; + #if PAGE_SHIFT > IA32_PAGE_SHIFT mutex_lock(&ia32_mmap_mutex); { diff -uprN linux-2.6.32/arch/ia64/include/asm/acpi.h 
linux-2.6.32.ovz/arch/ia64/include/asm/acpi.h --- linux-2.6.32/arch/ia64/include/asm/acpi.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/acpi.h 2015-09-10 10:06:01.000000000 -0700 @@ -94,6 +94,7 @@ ia64_acpi_release_global_lock (unsigned #define acpi_noirq 0 /* ACPI always enabled on IA64 */ #define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */ #define acpi_strict 1 /* no ACPI spec workarounds on IA64 */ +#define acpi_ht 0 /* no HT-only mode on IA64 */ #endif #define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */ static inline void disable_acpi(void) { } diff -uprN linux-2.6.32/arch/ia64/include/asm/cputime.h linux-2.6.32.ovz/arch/ia64/include/asm/cputime.h --- linux-2.6.32/arch/ia64/include/asm/cputime.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/cputime.h 2015-09-10 10:06:35.000000000 -0700 @@ -62,6 +62,12 @@ typedef u64 cputime64_t; #define msecs_to_cputime(__msecs) ((__msecs) * NSEC_PER_MSEC) /* + * Convert cputime <-> microseconds + */ +#define cputime_to_usecs(__ct) ((__ct) / NSEC_PER_USEC) +#define usecs_to_cputime(__usecs) ((__usecs) * NSEC_PER_USEC) + +/* * Convert cputime <-> seconds */ #define cputime_to_secs(__ct) ((__ct) / NSEC_PER_SEC) diff -uprN linux-2.6.32/arch/ia64/include/asm/crash.h linux-2.6.32.ovz/arch/ia64/include/asm/crash.h --- linux-2.6.32/arch/ia64/include/asm/crash.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/crash.h 2015-09-10 10:05:36.000000000 -0700 @@ -0,0 +1,90 @@ +#ifndef _ASM_IA64_CRASH_H +#define _ASM_IA64_CRASH_H + +/* + * linux/include/asm-ia64/crash.h + * + * Copyright (c) 2004 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#ifdef __KERNEL__ + +#include +#include +#include + +static inline void * +map_virtual(u64 offset, struct page **pp) +{ + struct page *page; + unsigned long pfn; + u32 type; + + if (REGION_NUMBER(offset) == 5) { + char byte; + + if (__get_user(byte, (char *)offset) == 0) + return (void *)offset; + else + return NULL; + } + + switch (type = efi_mem_type(offset)) + { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + break; + + default: + printk(KERN_INFO + "crash memory driver: invalid memory type for %lx: %d\n", + offset, type); + return NULL; + } + + pfn = offset >> PAGE_SHIFT; + + if (!pfn_valid(pfn)) { + printk(KERN_INFO + "crash memory driver: invalid pfn: %lx )\n", pfn); + return NULL; + } + + page = pfn_to_page(pfn); + + if (!page->virtual) { + printk(KERN_INFO + "crash memory driver: offset: %lx page: %lx page->virtual: NULL\n", + offset, (unsigned long)page); + return NULL; + } + + return (page->virtual + (offset & (PAGE_SIZE-1))); +} + +static inline void unmap_virtual(struct page *page) +{ + return; +} + +#endif /* __KERNEL__ */ + +#endif /* _ASM_IA64_CRASH_H */ diff -uprN linux-2.6.32/arch/ia64/include/asm/elf.h linux-2.6.32.ovz/arch/ia64/include/asm/elf.h --- linux-2.6.32/arch/ia64/include/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/elf.h 2015-09-10 10:06:00.000000000 -0700 @@ -218,54 +218,6 @@ do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, (unsigned long) GATE_EHDR); \ } while (0) - -/* - * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out - * extra segments containing the gate DSO contents. Dumping its - * contents makes post-mortem fully interpretable later without matching up - * the same kernel and hardware config to see what PC values meant. - * Dumping its extra ELF program headers includes all the other information - * a debugger needs to easily find how the gate DSO was being used. 
- */ -#define ELF_CORE_EXTRA_PHDRS (GATE_EHDR->e_phnum) -#define ELF_CORE_WRITE_EXTRA_PHDRS \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - Elf64_Off ofs = 0; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - struct elf_phdr phdr = gate_phdrs[i]; \ - if (phdr.p_type == PT_LOAD) { \ - phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ - phdr.p_filesz = phdr.p_memsz; \ - if (ofs == 0) { \ - ofs = phdr.p_offset = offset; \ - offset += phdr.p_filesz; \ - } \ - else \ - phdr.p_offset = ofs; \ - } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ - } \ -} while (0) -#define ELF_CORE_WRITE_EXTRA_DATA \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - if (gate_phdrs[i].p_type == PT_LOAD) { \ - DUMP_WRITE((void *) gate_phdrs[i].p_vaddr, \ - PAGE_ALIGN(gate_phdrs[i].p_memsz)); \ - break; \ - } \ - } \ -} while (0) - /* * format for entries in the Global Offset Table */ diff -uprN linux-2.6.32/arch/ia64/include/asm/io.h linux-2.6.32.ovz/arch/ia64/include/asm/io.h --- linux-2.6.32/arch/ia64/include/asm/io.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/io.h 2015-09-10 10:06:38.000000000 -0700 @@ -424,6 +424,13 @@ __writeq (unsigned long val, volatile vo extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap (volatile void __iomem *addr); +extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); +extern void early_iounmap (volatile void __iomem *addr, unsigned long size); +static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) +{ + return ioremap(phys_addr, size); +} + /* * String version of IO memory access ops: diff -uprN linux-2.6.32/arch/ia64/include/asm/kexec.h linux-2.6.32.ovz/arch/ia64/include/asm/kexec.h --- linux-2.6.32/arch/ia64/include/asm/kexec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/kexec.h 2015-09-10 10:06:34.000000000 -0700 @@ -1,6 +1,7 @@ #ifndef _ASM_IA64_KEXEC_H #define _ASM_IA64_KEXEC_H +#include /* Maximum physical address we can use pages from */ #define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) @@ -19,6 +20,12 @@ flush_icache_range(page_addr, page_addr + PAGE_SIZE); \ } while(0) +#ifdef CONFIG_KEXEC_AUTO_RESERVE +extern +unsigned long long __init arch_default_crash_size(unsigned long long); +#define arch_default_crash_size arch_default_crash_size +#endif + extern struct kimage *ia64_kimage; extern const unsigned int relocate_new_kernel_size; extern void relocate_new_kernel(unsigned long, unsigned long, diff -uprN linux-2.6.32/arch/ia64/include/asm/kvm.h linux-2.6.32.ovz/arch/ia64/include/asm/kvm.h --- linux-2.6.32/arch/ia64/include/asm/kvm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/kvm.h 2015-09-10 10:05:44.000000000 -0700 @@ -60,6 +60,7 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_PIC_MASTER 0 #define KVM_IRQCHIP_PIC_SLAVE 1 #define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 #define KVM_CONTEXT_SIZE 8*1024 diff -uprN linux-2.6.32/arch/ia64/include/asm/kvm_host.h linux-2.6.32.ovz/arch/ia64/include/asm/kvm_host.h --- linux-2.6.32/arch/ia64/include/asm/kvm_host.h 2009-12-02 19:51:21.000000000 -0800 +++ 
linux-2.6.32.ovz/arch/ia64/include/asm/kvm_host.h 2015-09-10 10:05:44.000000000 -0700 @@ -475,7 +475,6 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; int iommu_flags; - struct hlist_head irq_ack_notifier_list; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; diff -uprN linux-2.6.32/arch/ia64/include/asm/module.h linux-2.6.32.ovz/arch/ia64/include/asm/module.h --- linux-2.6.32/arch/ia64/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -29,9 +29,14 @@ struct mod_arch_specific { unsigned int next_got_entry; /* index of next available got entry */ }; +#define MODULES_ARE_ELF64 #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Ehdr Elf64_Ehdr +#define Elf_Rel Elf64_Rel +#define Elf_Rela Elf64_Rela +#define ELF_R_TYPE(X) ELF64_R_TYPE(X) +#define ELF_R_SYM(X) ELF64_R_SYM(X) #define MODULE_PROC_FAMILY "ia64" #define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY \ diff -uprN linux-2.6.32/arch/ia64/include/asm/numa.h linux-2.6.32.ovz/arch/ia64/include/asm/numa.h --- linux-2.6.32/arch/ia64/include/asm/numa.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/numa.h 2015-09-10 10:06:22.000000000 -0700 @@ -22,8 +22,6 @@ #include -#define NUMA_NO_NODE -1 - extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned; extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; extern pg_data_t *pgdat_list[MAX_NUMNODES]; diff -uprN linux-2.6.32/arch/ia64/include/asm/processor.h linux-2.6.32.ovz/arch/ia64/include/asm/processor.h --- linux-2.6.32/arch/ia64/include/asm/processor.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/processor.h 2015-09-10 10:07:39.000000000 -0700 @@ -361,7 +361,7 @@ struct thread_struct { regs->loadrs = 0; \ regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!get_dumpable(current->mm))) { \ + if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. 
\
diff -uprN linux-2.6.32/arch/ia64/include/asm/socket.h linux-2.6.32.ovz/arch/ia64/include/asm/socket.h
--- linux-2.6.32/arch/ia64/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/include/asm/socket.h 2015-09-10 10:07:41.000000000 -0700
@@ -69,4 +69,8 @@
 #define SO_PROTOCOL 38
 #define SO_DOMAIN 39
+#define SO_RXQ_OVFL 40
+
+#define SO_BUSY_POLL 46
+
 #endif /* _ASM_IA64_SOCKET_H */
diff -uprN linux-2.6.32/arch/ia64/include/asm/swiotlb.h linux-2.6.32.ovz/arch/ia64/include/asm/swiotlb.h
--- linux-2.6.32/arch/ia64/include/asm/swiotlb.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/include/asm/swiotlb.h 2015-09-10 10:08:00.000000000 -0700
@@ -4,8 +4,6 @@
 #include <linux/dma-mapping.h>
 #include <linux/swiotlb.h>
-extern int swiotlb_force;
-
 #ifdef CONFIG_SWIOTLB
 extern int swiotlb;
 extern void pci_swiotlb_init(void);
diff -uprN linux-2.6.32/arch/ia64/include/asm/unistd.h linux-2.6.32.ovz/arch/ia64/include/asm/unistd.h
--- linux-2.6.32/arch/ia64/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/include/asm/unistd.h 2015-09-10 10:05:55.000000000 -0700
@@ -311,11 +311,12 @@
 #define __NR_preadv 1319
 #define __NR_pwritev 1320
 #define __NR_rt_tgsigqueueinfo 1321
+#define __NR_recvmmsg 1322
 #ifdef __KERNEL__
-#define NR_syscalls 298 /* length of syscall table */
+#define NR_syscalls 299 /* length of syscall table */
 /*
  * The following defines stop scripts/checksyscalls.sh from complaining about
diff -uprN linux-2.6.32/arch/ia64/Kconfig linux-2.6.32.ovz/arch/ia64/Kconfig
--- linux-2.6.32/arch/ia64/Kconfig 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/Kconfig 2015-09-10 10:06:34.000000000 -0700
@@ -591,6 +591,20 @@ config KEXEC
 	  support.  As of this writing the exact hardware interface is
 	  strongly in flux, so no good recommendation can be made.
+config KEXEC_AUTO_RESERVE
+	bool "automatically reserve memory for kexec kernel"
+	depends on KEXEC
+	default y
+	---help---
+	  Automatically reserve memory for a kexec kernel, so that you don't
+	  need to specify numbers for the "crashkernel=X@Y" boot option;
+	  instead you can use "crashkernel=auto". To make this work, you need
+	  to have more than 4G of memory.
+
+	  The reserved memory size depends on how much memory you actually
+	  have. Please check Documentation/kdump/kdump.txt.
+	  If unsure, say N.
+
 config CRASH_DUMP
 	bool "kernel crash dumps"
 	depends on IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU)
diff -uprN linux-2.6.32/arch/ia64/kernel/acpi.c linux-2.6.32.ovz/arch/ia64/kernel/acpi.c
--- linux-2.6.32/arch/ia64/kernel/acpi.c 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/kernel/acpi.c 2015-09-10 10:06:37.000000000 -0700
@@ -884,8 +884,8 @@ __init void prefill_possible_map(void)
 	possible = available_cpus + additional_cpus;
-	if (possible > NR_CPUS)
-		possible = NR_CPUS;
+	if (possible > nr_cpu_ids)
+		possible = nr_cpu_ids;
 	printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
 		possible, max((possible - available_cpus), 0));
diff -uprN linux-2.6.32/arch/ia64/kernel/crash_dump.c linux-2.6.32.ovz/arch/ia64/kernel/crash_dump.c
--- linux-2.6.32/arch/ia64/kernel/crash_dump.c 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/kernel/crash_dump.c 2015-09-10 10:08:00.000000000 -0700
@@ -15,6 +15,7 @@
 /* Stores the physical address of elf header of crash image.
*/ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +EXPORT_SYMBOL(elfcorehdr_addr); /** * copy_oldmem_page - copy one page from "oldmem" diff -uprN linux-2.6.32/arch/ia64/kernel/elfcore.c linux-2.6.32.ovz/arch/ia64/kernel/elfcore.c --- linux-2.6.32/arch/ia64/kernel/elfcore.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/elfcore.c 2015-09-10 10:06:00.000000000 -0700 @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +#include + + +Elf64_Half elf_core_extra_phdrs(void) +{ + return GATE_EHDR->e_phnum; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + Elf64_Off ofs = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + struct elf_phdr phdr = gate_phdrs[i]; + + if (phdr.p_type == PT_LOAD) { + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); + phdr.p_filesz = phdr.p_memsz; + if (ofs == 0) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset = ofs; + } + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + void *addr = (void *)gate_phdrs[i].p_vaddr; + size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz); + + *size += memsz; + if (*size > limit || !dump_write(file, addr, memsz)) + return 0; + break; + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + size_t size = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + size += PAGE_ALIGN(gate_phdrs[i].p_memsz); + break; + } + } + return size; +} diff -uprN linux-2.6.32/arch/ia64/kernel/entry.S linux-2.6.32.ovz/arch/ia64/kernel/entry.S --- linux-2.6.32/arch/ia64/kernel/entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/entry.S 2015-09-10 10:05:55.000000000 -0700 @@ -1806,6 +1806,7 @@ sys_call_table: data8 sys_preadv data8 sys_pwritev // 1320 data8 sys_rt_tgsigqueueinfo + data8 sys_recvmmsg .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ diff -uprN linux-2.6.32/arch/ia64/kernel/ia64_ksyms.c linux-2.6.32.ovz/arch/ia64/kernel/ia64_ksyms.c --- linux-2.6.32/arch/ia64/kernel/ia64_ksyms.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/ia64_ksyms.c 2015-09-10 10:05:36.000000000 -0700 @@ -84,6 +84,9 @@ EXPORT_SYMBOL(ia64_save_scratch_fpregs); #include EXPORT_SYMBOL(unw_init_running); +#include +EXPORT_SYMBOL_GPL(efi_mem_type); + #if defined(CONFIG_IA64_ESI) || defined(CONFIG_IA64_ESI_MODULE) extern void esi_call_phys (void); EXPORT_SYMBOL_GPL(esi_call_phys); diff -uprN linux-2.6.32/arch/ia64/kernel/kprobes.c linux-2.6.32.ovz/arch/ia64/kernel/kprobes.c --- linux-2.6.32/arch/ia64/kernel/kprobes.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/kprobes.c 2015-09-10 10:06:28.000000000 -0700 @@ -870,7 +870,7 @@ static int __kprobes pre_kprobes_handler 
 	return 1;
 ss_probe:
-#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER)
+#if !defined(CONFIG_PREEMPT)
 	if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) {
 		/* Boost up -- we can execute copied instructions directly */
 		ia64_psr(regs)->ri = p->ainsn.slot;
diff -uprN linux-2.6.32/arch/ia64/kernel/machine_kexec.c linux-2.6.32.ovz/arch/ia64/kernel/machine_kexec.c
--- linux-2.6.32/arch/ia64/kernel/machine_kexec.c 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/kernel/machine_kexec.c 2015-09-10 10:06:34.000000000 -0700
@@ -162,6 +162,44 @@ void arch_crash_save_vmcoreinfo(void)
 #endif
 }
+#ifdef CONFIG_KEXEC_AUTO_RESERVE
+#define MBYTES(n) ((n)*1024*1024ULL)
+#define GBYTES(n) ((n)*1024*1024*1024ULL)
+/*
+	Memory size	Reserved memory
+	===========	===============
+	[4G, 12G)	256M
+	[12G, 128G)	512M
+	[128G, 256G)	768M
+	[256G, 378G)	1024M
+	[378G, 512G)	1536M
+	[512G, 768G)	2048M
+	[768G, )	3072M
+ */
+unsigned long long __init arch_default_crash_size(unsigned long long total_size)
+{
+	unsigned long long ret;
+
+	if (total_size >= GBYTES(4) && total_size < GBYTES(12))
+		ret = MBYTES(256);
+	else if (total_size >= GBYTES(12) && total_size < GBYTES(128))
+		ret = MBYTES(512);
+	else if (total_size >= GBYTES(128) && total_size < GBYTES(256))
+		ret = MBYTES(768);
+	else if (total_size >= GBYTES(256) && total_size < GBYTES(378))
+		ret = MBYTES(1024);
+	else if (total_size >= GBYTES(378) && total_size < GBYTES(512))
+		ret = MBYTES(1536);
+	else if (total_size >= GBYTES(512) && total_size < GBYTES(768))
+		ret = MBYTES(2048);
+	else
+		ret = MBYTES(3072);
+	return ret;
+}
+#undef GBYTES
+#undef MBYTES
+#endif
+
 unsigned long paddr_vmcoreinfo_note(void)
 {
 	return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note);
diff -uprN linux-2.6.32/arch/ia64/kernel/Makefile linux-2.6.32.ovz/arch/ia64/kernel/Makefile
--- linux-2.6.32/arch/ia64/kernel/Makefile 2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/ia64/kernel/Makefile 2015-09-10 10:06:00.000000000 -0700
@@ -51,6 +51,8 @@ endif
 obj-$(CONFIG_DMAR) += pci-dma.o
 obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+obj-$(CONFIG_BINFMT_ELF) += elfcore.o
+
 # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 diff -uprN linux-2.6.32/arch/ia64/kernel/pci-swiotlb.c linux-2.6.32.ovz/arch/ia64/kernel/pci-swiotlb.c --- linux-2.6.32/arch/ia64/kernel/pci-swiotlb.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/pci-swiotlb.c 2015-09-10 10:08:00.000000000 -0700 @@ -41,7 +41,7 @@ struct dma_map_ops swiotlb_dma_ops = { void __init swiotlb_dma_init(void) { dma_ops = &swiotlb_dma_ops; - swiotlb_init(); + swiotlb_init(1); } void __init pci_swiotlb_init(void) @@ -51,7 +51,7 @@ void __init pci_swiotlb_init(void) swiotlb = 1; printk(KERN_INFO "PCI-DMA: Re-initialize machine vector.\n"); machvec_init("dig"); - swiotlb_init(); + swiotlb_init(1); dma_ops = &swiotlb_dma_ops; #else panic("Unable to find Intel IOMMU"); diff -uprN linux-2.6.32/arch/ia64/kernel/perfmon.c linux-2.6.32.ovz/arch/ia64/kernel/perfmon.c --- linux-2.6.32/arch/ia64/kernel/perfmon.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/perfmon.c 2015-09-10 10:06:09.000000000 -0700 @@ -2206,7 +2206,7 @@ pfm_alloc_file(pfm_context_t *ctx) { struct file *file; struct inode *inode; - struct dentry *dentry; + struct path path; char name[32]; struct qstr this; @@ -2231,18 +2231,19 @@ pfm_alloc_file(pfm_context_t *ctx) /* * allocate a new dcache entry */ - dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); - if (!dentry) { + path.dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!path.dentry) { iput(inode); return ERR_PTR(-ENOMEM); } + path.mnt = mntget(pfmfs_mnt); - dentry->d_op = &pfmfs_dentry_operations; - d_add(dentry, inode); + path.dentry->d_op = &pfmfs_dentry_operations; + d_add(path.dentry, inode); - file = alloc_file(pfmfs_mnt, dentry, FMODE_READ, &pfm_file_ops); + file = alloc_file(&path, FMODE_READ, &pfm_file_ops); if (!file) { - dput(dentry); + path_put(&path); return ERR_PTR(-ENFILE); } @@ -2320,6 +2321,7 @@ pfm_smpl_buffer_alloc(struct task_struct DPRINT(("Cannot allocate vma\n")); goto error_kmem; } + INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer diff -uprN linux-2.6.32/arch/ia64/kernel/process.c linux-2.6.32.ovz/arch/ia64/kernel/process.c --- linux-2.6.32/arch/ia64/kernel/process.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/process.c 2015-09-10 10:07:24.000000000 -0700 @@ -662,14 +662,14 @@ long sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { - char *fname; + struct filename *fname; int error; fname = getname(filename); error = PTR_ERR(fname); if (IS_ERR(fname)) goto out; - error = do_execve(fname, argv, envp, regs); + error = do_execve(fname->name, argv, envp, regs); putname(fname); out: return error; diff -uprN linux-2.6.32/arch/ia64/kernel/setup.c linux-2.6.32.ovz/arch/ia64/kernel/setup.c --- linux-2.6.32/arch/ia64/kernel/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/setup.c 2015-10-24 08:14:47.352501187 -0700 @@ -273,7 +273,7 @@ static void __init setup_crashkernel(uns int ret; ret = parse_crashkernel(boot_command_line, total, - &size, &base); + &size, &base, NULL); if (ret == 0 && size > 0) { if (!base) { sort_regions(rsvd_region, *n); diff -uprN linux-2.6.32/arch/ia64/kernel/sys_ia64.c linux-2.6.32.ovz/arch/ia64/kernel/sys_ia64.c --- linux-2.6.32/arch/ia64/kernel/sys_ia64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/sys_ia64.c 2015-09-10 10:05:47.000000000 -0700 @@ -100,51 +100,7 @@ sys_getpagesize (void) asmlinkage 
unsigned long ia64_brk (unsigned long brk) { - unsigned long rlim, retval, newbrk, oldbrk; - struct mm_struct *mm = current->mm; - - /* - * Most of this replicates the code in sys_brk() except for an additional safety - * check and the clearing of r8. However, we can't call sys_brk() because we need - * to acquire the mmap_sem before we can do the test... - */ - down_write(&mm->mmap_sem); - - if (brk < mm->end_code) - goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; - - /* Always allow shrinking brk. */ - if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) - goto set_brk; - goto out; - } - - /* Check against unimplemented/unmapped addresses: */ - if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT) - goto out; - - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - - /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) - goto out; - - /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) - goto out; -set_brk: - mm->brk = brk; -out: - retval = mm->brk; - up_write(&mm->mmap_sem); + unsigned long retval = sys_brk(brk); force_successful_syscall_return(); return retval; } @@ -185,39 +141,6 @@ int ia64_mmap_check(unsigned long addr, return 0; } -static inline unsigned long -do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) -{ - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - - if (!file->f_op || !file->f_op->mmap) { - addr = -ENODEV; - goto out; - } - } - - /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) { - addr = -EINVAL; - goto out; - } - - down_write(¤t->mm->mmap_sem); - addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - -out: if (file) - fput(file); - return addr; -} - /* * mmap2() is like mmap() except that the offset is expressed in units * of PAGE_SIZE (instead of bytes). 
This allows to mmap2() (pieces @@ -226,7 +149,7 @@ out: if (file) asmlinkage unsigned long sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff) { - addr = do_mmap2(addr, len, prot, flags, fd, pgoff); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -238,7 +161,7 @@ sys_mmap (unsigned long addr, unsigned l if (offset_in_page(off) != 0) return -EINVAL; - addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; diff -uprN linux-2.6.32/arch/ia64/kernel/topology.c linux-2.6.32.ovz/arch/ia64/kernel/topology.c --- linux-2.6.32/arch/ia64/kernel/topology.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/topology.c 2015-09-10 10:06:07.000000000 -0700 @@ -282,7 +282,7 @@ static ssize_t cache_show(struct kobject return ret; } -static struct sysfs_ops cache_sysfs_ops = { +static const struct sysfs_ops cache_sysfs_ops = { .show = cache_show }; diff -uprN linux-2.6.32/arch/ia64/kvm/Kconfig linux-2.6.32.ovz/arch/ia64/kvm/Kconfig --- linux-2.6.32/arch/ia64/kvm/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kvm/Kconfig 2015-09-10 10:05:38.000000000 -0700 @@ -47,6 +47,7 @@ config KVM_INTEL Provides support for KVM on Itanium 2 processors equipped with the VT extensions. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff -uprN linux-2.6.32/arch/ia64/kvm/kvm-ia64.c linux-2.6.32.ovz/arch/ia64/kvm/kvm-ia64.c --- linux-2.6.32/arch/ia64/kvm/kvm-ia64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kvm/kvm-ia64.c 2015-09-10 10:06:53.000000000 -0700 @@ -124,7 +124,7 @@ long ia64_pal_vp_create(u64 *vpd, u64 *h static DEFINE_SPINLOCK(vp_lock); -void kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void *garbage) { long status; long tmp_base; @@ -137,7 +137,7 @@ void kvm_arch_hardware_enable(void *garb slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT); local_irq_restore(saved_psr); if (slot < 0) - return; + return -EINVAL; spin_lock(&vp_lock); status = ia64_pal_vp_init_env(kvm_vsa_base ? @@ -145,7 +145,7 @@ void kvm_arch_hardware_enable(void *garb __pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base); if (status != 0) { printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n"); - return ; + return -EINVAL; } if (!kvm_vsa_base) { @@ -154,6 +154,8 @@ void kvm_arch_hardware_enable(void *garb } spin_unlock(&vp_lock); ia64_ptr_entry(0x3, slot); + + return 0; } void kvm_arch_hardware_disable(void *garbage) @@ -239,10 +241,10 @@ static int handle_mmio(struct kvm_vcpu * return 0; mmio: if (p->dir) - r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); else - r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); if (r) printk(KERN_ERR"kvm: No iodevice found! 
addr:%lx\n", p->addr); @@ -634,12 +636,9 @@ static void kvm_vcpu_post_transition(str static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { union context *host_ctx, *guest_ctx; - int r; + int r, idx; - /* - * down_read() may sleep and return with interrupts enabled - */ - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); again: if (signal_pending(current)) { @@ -661,7 +660,7 @@ again: if (r < 0) goto vcpu_run_fail; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); kvm_guest_enter(); /* @@ -685,7 +684,7 @@ again: kvm_guest_exit(); preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_handle_exit(kvm_run, vcpu); @@ -695,10 +694,10 @@ again: } out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (r > 0) { kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); goto again; } @@ -851,8 +850,7 @@ static int kvm_vm_ioctl_get_irqchip(stru r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_IOAPIC: - memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -868,9 +866,7 @@ static int kvm_vm_ioctl_set_irqchip(stru r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_IOAPIC: - memcpy(ioapic_irqchip(kvm), - &chip->chip.ioapic, - sizeof(struct kvm_ioapic_state)); + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -985,10 +981,8 @@ long kvm_arch_vm_ioctl(struct file *filp goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, @@ -1185,6 +1179,11 @@ out: #define PALE_RESET_ENTRY 0x80000000ffffffb0UL +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu *v; @@ -1380,12 +1379,14 @@ static void free_kvm(struct kvm *kvm) static void kvm_release_vm_pages(struct kvm *kvm) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, j; unsigned long base_gfn; - for (i = 0; i < kvm->nmemslots; i++) { - memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) { + memslot = &slots->memslots[i]; base_gfn = memslot->base_gfn; for (j = 0; j < memslot->npages; j++) { @@ -1408,6 +1409,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); free_kvm(kvm); } @@ -1579,15 +1581,15 @@ out: return r; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { unsigned long i; unsigned long pfn; - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; unsigned long base_gfn = memslot->base_gfn; if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT)) @@ -1611,6 +1613,14 @@ int kvm_arch_set_memory_region(struct kv return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, 
+ struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); @@ -1797,7 +1807,8 @@ static int kvm_ia64_sync_dirty_log(struc { struct kvm_memory_slot *memslot; int r, i; - long n, base; + long base; + unsigned long n; unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base + offsetof(struct kvm_vm_data, kvm_mem_dirty_log)); @@ -1805,12 +1816,12 @@ static int kvm_ia64_sync_dirty_log(struc if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); base = memslot->base_gfn / BITS_PER_LONG; for (i = 0; i < n/sizeof(long); ++i) { @@ -1826,10 +1837,11 @@ int kvm_vm_ioctl_get_dirty_log(struct kv struct kvm_dirty_log *log) { int r; - int n; + unsigned long n; struct kvm_memory_slot *memslot; int is_dirty = 0; + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->arch.dirty_log_lock); r = kvm_ia64_sync_dirty_log(kvm, log); @@ -1843,12 +1855,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kv /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + memslot = &kvm->memslots->memslots[log->slot]; + n = kvm_dirty_bitmap_bytes(memslot); memset(memslot->dirty_bitmap, 0, n); } r = 0; out: + mutex_unlock(&kvm->slots_lock); spin_unlock(&kvm->arch.dirty_log_lock); return r; } diff -uprN linux-2.6.32/arch/ia64/mm/contig.c linux-2.6.32.ovz/arch/ia64/mm/contig.c --- linux-2.6.32/arch/ia64/mm/contig.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/contig.c 2015-09-10 10:06:38.000000000 -0700 @@ -36,7 +36,7 @@ static unsigned long max_gap; * Shows a simple page count of reserved and used pages in the system. * For discontig machines, it does this on a per-pgdat basis. */ -void show_mem(void) +void show_mem(unsigned int filter) { int i, total_reserved = 0; int total_shared = 0, total_cached = 0; diff -uprN linux-2.6.32/arch/ia64/mm/discontig.c linux-2.6.32.ovz/arch/ia64/mm/discontig.c --- linux-2.6.32/arch/ia64/mm/discontig.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/discontig.c 2015-09-10 10:06:38.000000000 -0700 @@ -514,7 +514,7 @@ void __cpuinit *per_cpu_init(void) * Shows a simple page count of reserved and used pages in the system. * For discontig machines, it does this on a per-pgdat basis. 
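In the dirty-log hunks above, the open-coded ALIGN(memslot->npages, BITS_PER_LONG) / 8 is replaced by kvm_dirty_bitmap_bytes(). Both express one dirty bit per guest page, padded to a whole number of longs; the exact kernel helper lives in common KVM code, so the user-space sketch below only demonstrates the arithmetic.

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(long))
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

/* One dirty bit per page, padded to a whole number of longs. */
static unsigned long dirty_bitmap_bytes(unsigned long npages)
{
	return ALIGN(npages, BITS_PER_LONG) / 8;
}

int main(void)
{
	/* A 17-page memslot still needs one whole 8-byte word on 64-bit. */
	printf("%lu bytes\n", dirty_bitmap_bytes(17));
	return 0;
}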
*/ -void show_mem(void) +void show_mem(unsigned int filter) { int i, total_reserved = 0; int total_shared = 0, total_cached = 0; diff -uprN linux-2.6.32/arch/ia64/mm/hugetlbpage.c linux-2.6.32.ovz/arch/ia64/mm/hugetlbpage.c --- linux-2.6.32/arch/ia64/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/hugetlbpage.c 2015-09-10 10:07:03.000000000 -0700 @@ -26,7 +26,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_D EXPORT_SYMBOL(hpage_shift); pte_t * -huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz, + bool *shared) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; diff -uprN linux-2.6.32/arch/ia64/mm/init.c linux-2.6.32.ovz/arch/ia64/mm/init.c --- linux-2.6.32/arch/ia64/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/init.c 2015-09-10 10:05:52.000000000 -0700 @@ -118,6 +118,7 @@ ia64_init_addr_space (void) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -136,6 +137,7 @@ ia64_init_addr_space (void) if (!(current->personality & MMAP_PAGE_ZERO)) { vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); diff -uprN linux-2.6.32/arch/ia64/mm/ioremap.c linux-2.6.32.ovz/arch/ia64/mm/ioremap.c --- linux-2.6.32/arch/ia64/mm/ioremap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/ioremap.c 2015-09-10 10:05:43.000000000 -0700 @@ -22,6 +22,12 @@ __ioremap (unsigned long phys_addr) } void __iomem * +early_ioremap (unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr); +} + +void __iomem * ioremap (unsigned long phys_addr, unsigned long size) { void __iomem *addr; @@ -102,6 +108,11 @@ ioremap_nocache (unsigned long phys_addr EXPORT_SYMBOL(ioremap_nocache); void +early_iounmap (volatile void __iomem *addr, unsigned long size) +{ +} + +void iounmap (volatile void __iomem *addr) { if (REGION_NUMBER(addr) == RGN_GATE) diff -uprN linux-2.6.32/arch/ia64/pci/pci.c linux-2.6.32.ovz/arch/ia64/pci/pci.c --- linux-2.6.32/arch/ia64/pci/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/pci/pci.c 2015-09-10 10:06:44.000000000 -0700 @@ -438,13 +438,12 @@ EXPORT_SYMBOL(pcibios_bus_to_resource); static int __devinit is_valid_resource(struct pci_dev *dev, int idx) { unsigned int i, type_mask = IORESOURCE_IO | IORESOURCE_MEM; - struct resource *devr = &dev->resource[idx]; + struct resource *devr = &dev->resource[idx], *busr; if (!dev->bus) return 0; - for (i=0; ibus->resource[i]; + pci_bus_for_each_resource(dev->bus, busr, i) { if (!busr || ((busr->flags ^ devr->flags) & type_mask)) continue; if ((devr->start) && (devr->start >= busr->start) && diff -uprN linux-2.6.32/arch/ia64/scripts/unwcheck.py linux-2.6.32.ovz/arch/ia64/scripts/unwcheck.py --- linux-2.6.32/arch/ia64/scripts/unwcheck.py 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/scripts/unwcheck.py 2015-09-10 10:06:06.000000000 -0700 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # Usage: unwcheck.py FILE # diff -uprN linux-2.6.32/arch/ia64/xen/irq_xen.c linux-2.6.32.ovz/arch/ia64/xen/irq_xen.c --- linux-2.6.32/arch/ia64/xen/irq_xen.c 2009-12-02 
19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/xen/irq_xen.c 2015-09-10 10:07:48.000000000 -0700 @@ -63,19 +63,19 @@ xen_free_irq_vector(int vector) } -static DEFINE_PER_CPU(int, timer_irq) = -1; -static DEFINE_PER_CPU(int, ipi_irq) = -1; -static DEFINE_PER_CPU(int, resched_irq) = -1; -static DEFINE_PER_CPU(int, cmc_irq) = -1; -static DEFINE_PER_CPU(int, cmcp_irq) = -1; -static DEFINE_PER_CPU(int, cpep_irq) = -1; +static DEFINE_PER_CPU(int, xen_timer_irq) = -1; +static DEFINE_PER_CPU(int, xen_ipi_irq) = -1; +static DEFINE_PER_CPU(int, xen_resched_irq) = -1; +static DEFINE_PER_CPU(int, xen_cmc_irq) = -1; +static DEFINE_PER_CPU(int, xen_cmcp_irq) = -1; +static DEFINE_PER_CPU(int, xen_cpep_irq) = -1; #define NAME_SIZE 15 -static DEFINE_PER_CPU(char[NAME_SIZE], timer_name); -static DEFINE_PER_CPU(char[NAME_SIZE], ipi_name); -static DEFINE_PER_CPU(char[NAME_SIZE], resched_name); -static DEFINE_PER_CPU(char[NAME_SIZE], cmc_name); -static DEFINE_PER_CPU(char[NAME_SIZE], cmcp_name); -static DEFINE_PER_CPU(char[NAME_SIZE], cpep_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_timer_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_ipi_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_resched_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_cmc_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_cmcp_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_cpep_name); #undef NAME_SIZE struct saved_irq { @@ -144,64 +144,64 @@ __xen_register_percpu_irq(unsigned int c if (xen_slab_ready) { switch (vec) { case IA64_TIMER_VECTOR: - snprintf(per_cpu(timer_name, cpu), - sizeof(per_cpu(timer_name, cpu)), + snprintf(per_cpu(xen_timer_name, cpu), + sizeof(per_cpu(xen_timer_name, cpu)), "%s%d", action->name, cpu); irq = bind_virq_to_irqhandler(VIRQ_ITC, cpu, action->handler, action->flags, - per_cpu(timer_name, cpu), action->dev_id); - per_cpu(timer_irq, cpu) = irq; + per_cpu(xen_timer_name, cpu), action->dev_id); + per_cpu(xen_timer_irq, cpu) = irq; break; case IA64_IPI_RESCHEDULE: - snprintf(per_cpu(resched_name, cpu), - sizeof(per_cpu(resched_name, cpu)), + snprintf(per_cpu(xen_resched_name, cpu), + sizeof(per_cpu(xen_resched_name, cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, action->handler, action->flags, - per_cpu(resched_name, cpu), action->dev_id); - per_cpu(resched_irq, cpu) = irq; + per_cpu(xen_resched_name, cpu), action->dev_id); + per_cpu(xen_resched_irq, cpu) = irq; break; case IA64_IPI_VECTOR: - snprintf(per_cpu(ipi_name, cpu), - sizeof(per_cpu(ipi_name, cpu)), + snprintf(per_cpu(xen_ipi_name, cpu), + sizeof(per_cpu(xen_ipi_name, cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_IPI_VECTOR, cpu, action->handler, action->flags, - per_cpu(ipi_name, cpu), action->dev_id); - per_cpu(ipi_irq, cpu) = irq; + per_cpu(xen_ipi_name, cpu), action->dev_id); + per_cpu(xen_ipi_irq, cpu) = irq; break; case IA64_CMC_VECTOR: - snprintf(per_cpu(cmc_name, cpu), - sizeof(per_cpu(cmc_name, cpu)), + snprintf(per_cpu(xen_cmc_name, cpu), + sizeof(per_cpu(xen_cmc_name, cpu)), "%s%d", action->name, cpu); irq = bind_virq_to_irqhandler(VIRQ_MCA_CMC, cpu, - action->handler, - action->flags, - per_cpu(cmc_name, cpu), - action->dev_id); - per_cpu(cmc_irq, cpu) = irq; + action->handler, + action->flags, + per_cpu(xen_cmc_name, cpu), + action->dev_id); + per_cpu(xen_cmc_irq, cpu) = irq; break; case IA64_CMCP_VECTOR: - snprintf(per_cpu(cmcp_name, cpu), - sizeof(per_cpu(cmcp_name, cpu)), + snprintf(per_cpu(xen_cmcp_name, cpu), + sizeof(per_cpu(xen_cmcp_name, 
cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_CMCP_VECTOR, cpu, - action->handler, - action->flags, - per_cpu(cmcp_name, cpu), - action->dev_id); - per_cpu(cmcp_irq, cpu) = irq; + action->handler, + action->flags, + per_cpu(xen_cmcp_name, cpu), + action->dev_id); + per_cpu(xen_cmcp_irq, cpu) = irq; break; case IA64_CPEP_VECTOR: - snprintf(per_cpu(cpep_name, cpu), - sizeof(per_cpu(cpep_name, cpu)), + snprintf(per_cpu(xen_cpep_name, cpu), + sizeof(per_cpu(xen_cpep_name, cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_CPEP_VECTOR, cpu, - action->handler, - action->flags, - per_cpu(cpep_name, cpu), - action->dev_id); - per_cpu(cpep_irq, cpu) = irq; + action->handler, + action->flags, + per_cpu(xen_cpep_name, cpu), + action->dev_id); + per_cpu(xen_cpep_irq, cpu) = irq; break; case IA64_CPE_VECTOR: case IA64_MCA_RENDEZ_VECTOR: @@ -275,30 +275,33 @@ unbind_evtchn_callback(struct notifier_b if (action == CPU_DEAD) { /* Unregister evtchn. */ - if (per_cpu(cpep_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(cpep_irq, cpu), NULL); - per_cpu(cpep_irq, cpu) = -1; + if (per_cpu(xen_cpep_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_cpep_irq, cpu), + NULL); + per_cpu(xen_cpep_irq, cpu) = -1; } - if (per_cpu(cmcp_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(cmcp_irq, cpu), NULL); - per_cpu(cmcp_irq, cpu) = -1; + if (per_cpu(xen_cmcp_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_cmcp_irq, cpu), + NULL); + per_cpu(xen_cmcp_irq, cpu) = -1; } - if (per_cpu(cmc_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(cmc_irq, cpu), NULL); - per_cpu(cmc_irq, cpu) = -1; + if (per_cpu(xen_cmc_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_cmc_irq, cpu), NULL); + per_cpu(xen_cmc_irq, cpu) = -1; } - if (per_cpu(ipi_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(ipi_irq, cpu), NULL); - per_cpu(ipi_irq, cpu) = -1; + if (per_cpu(xen_ipi_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_ipi_irq, cpu), NULL); + per_cpu(xen_ipi_irq, cpu) = -1; } - if (per_cpu(resched_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(resched_irq, cpu), - NULL); - per_cpu(resched_irq, cpu) = -1; + if (per_cpu(xen_resched_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), + NULL); + per_cpu(xen_resched_irq, cpu) = -1; } - if (per_cpu(timer_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL); - per_cpu(timer_irq, cpu) = -1; + if (per_cpu(xen_timer_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_timer_irq, cpu), + NULL); + per_cpu(xen_timer_irq, cpu) = -1; } } return NOTIFY_OK; diff -uprN linux-2.6.32/arch/ia64/xen/time.c linux-2.6.32.ovz/arch/ia64/xen/time.c --- linux-2.6.32/arch/ia64/xen/time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/xen/time.c 2015-09-10 10:07:48.000000000 -0700 @@ -34,15 +34,15 @@ #include "../kernel/fsyscall_gtod_data.h" -DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); -DEFINE_PER_CPU(unsigned long, processed_stolen_time); -DEFINE_PER_CPU(unsigned long, processed_blocked_time); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); +static DEFINE_PER_CPU(unsigned long, xen_stolen_time); +static DEFINE_PER_CPU(unsigned long, xen_blocked_time); /* taken from i386/kernel/time-xen.c */ static void xen_init_missing_ticks_accounting(int cpu) { struct vcpu_register_runstate_memory_area area; - struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); + struct vcpu_runstate_info *runstate = &per_cpu(xen_runstate, cpu); int rc; memset(runstate, 0, 
sizeof(*runstate)); @@ -52,8 +52,8 @@ static void xen_init_missing_ticks_accou &area); WARN_ON(rc && rc != -ENOSYS); - per_cpu(processed_blocked_time, cpu) = runstate->time[RUNSTATE_blocked]; - per_cpu(processed_stolen_time, cpu) = runstate->time[RUNSTATE_runnable] + per_cpu(xen_blocked_time, cpu) = runstate->time[RUNSTATE_blocked]; + per_cpu(xen_stolen_time, cpu) = runstate->time[RUNSTATE_runnable] + runstate->time[RUNSTATE_offline]; } @@ -68,7 +68,7 @@ static void get_runstate_snapshot(struct BUG_ON(preemptible()); - state = &__get_cpu_var(runstate); + state = &__get_cpu_var(xen_runstate); /* * The runstate info is always updated by the hypervisor on @@ -103,12 +103,12 @@ consider_steal_time(unsigned long new_it * This function just checks and reject this effect. */ if (!time_after_eq(runstate.time[RUNSTATE_blocked], - per_cpu(processed_blocked_time, cpu))) + per_cpu(xen_blocked_time, cpu))) blocked = 0; if (!time_after_eq(runstate.time[RUNSTATE_runnable] + runstate.time[RUNSTATE_offline], - per_cpu(processed_stolen_time, cpu))) + per_cpu(xen_stolen_time, cpu))) stolen = 0; if (!time_after(delta_itm + new_itm, ia64_get_itc())) @@ -147,8 +147,8 @@ consider_steal_time(unsigned long new_it } else { local_cpu_data->itm_next = delta_itm + new_itm; } - per_cpu(processed_stolen_time, cpu) += NS_PER_TICK * stolen; - per_cpu(processed_blocked_time, cpu) += NS_PER_TICK * blocked; + per_cpu(xen_stolen_time, cpu) += NS_PER_TICK * stolen; + per_cpu(xen_blocked_time, cpu) += NS_PER_TICK * blocked; } return delta_itm; } diff -uprN linux-2.6.32/arch/Kconfig linux-2.6.32.ovz/arch/Kconfig --- linux-2.6.32/arch/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/Kconfig 2015-09-10 10:07:41.000000000 -0700 @@ -6,8 +6,6 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE - depends on TRACING_SUPPORT - select TRACING select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help @@ -17,20 +15,6 @@ config OPROFILE If unsure, say N. -config OPROFILE_IBS - bool "OProfile AMD IBS support (EXPERIMENTAL)" - default n - depends on OPROFILE && SMP && X86 - help - Instruction-Based Sampling (IBS) is a new profiling - technique that provides rich, precise program performance - information. IBS is introduced by AMD Family10h processors - (AMD Opteron Quad-Core processor "Barcelona") to overcome - the limitations of conventional performance counter - sampling. - - If unsure, say N. - config OPROFILE_EVENT_MULTIPLEX bool "OProfile multiplexing support (EXPERIMENTAL)" default n @@ -57,6 +41,17 @@ config KPROBES for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". +config OPTPROBES + bool "Kprobes jump optimization support (EXPERIMENTAL)" + default y + depends on KPROBES + depends on !PREEMPT + depends on HAVE_OPTPROBES + select KALLSYMS_ALL + help + This option will allow kprobes to optimize breakpoint to + a jump for reducing its overhead. + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help @@ -83,6 +78,13 @@ config KRETPROBES def_bool y depends on KPROBES && HAVE_KRETPROBES +config USER_RETURN_NOTIFIER + bool + depends on HAVE_USER_RETURN_NOTIFIER + help + Provide a kernel-internal notification when a cpu is about to + switch to user mode. 
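The USER_RETURN_NOTIFIER option added above is plumbing for a small callback API (struct user_return_notifier plus register/unregister helpers, introduced elsewhere in this patch series). The following is a hedged sketch of typical use, modeled loosely on how KVM restores per-cpu MSR state on x86; the vendor_* names are invented for illustration.

#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/user-return-notifier.h>

struct vendor_state {
	struct user_return_notifier urn;
	bool registered;
};

static DEFINE_PER_CPU(struct vendor_state, vendor_state);

/* Runs on the return-to-user path of the CPU that armed it. */
static void vendor_on_user_return(struct user_return_notifier *urn)
{
	struct vendor_state *s = container_of(urn, struct vendor_state, urn);

	/* ... undo any temporary per-cpu hardware state here ... */
	s->registered = false;
	user_return_notifier_unregister(urn);
}

/* Caller must keep preemption off so the CPU cannot change under us. */
static void vendor_arm_notifier(void)
{
	struct vendor_state *s = &__get_cpu_var(vendor_state);

	if (!s->registered) {
		s->urn.on_user_return = vendor_on_user_return;
		user_return_notifier_register(&s->urn);
		s->registered = true;
	}
}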
+ config HAVE_IOREMAP_PROT bool @@ -92,6 +94,8 @@ config HAVE_KPROBES config HAVE_KRETPROBES bool +config HAVE_OPTPROBES + bool # # An arch should select this if it provides all these things: # @@ -126,4 +130,33 @@ config HAVE_DMA_API_DEBUG config HAVE_DEFAULT_NO_SPIN_MUTEXES bool +config HAVE_USER_RETURN_NOTIFIER + bool + +config HAVE_PERF_EVENTS_NMI + bool + help + System hardware can generate an NMI using the perf event + subsystem. Also has support for calculating CPU cycle events + to determine how many clock cycles in a given period. + +config HAVE_PERF_REGS + bool + help + Support selective register dumps for perf events. This includes + bit-mapping of each registers and a unique architecture id. + +config HAVE_PERF_USER_STACK_DUMP + bool + help + Support user stack dumps for perf event samples. This needs + access to the user stack pointer which is not unified across + architectures. + +config HAVE_ARCH_MUTEX_CPU_RELAX + bool + +config ARCH_HAVE_NMI_SAFE_CMPXCHG + bool + source "kernel/gcov/Kconfig" diff -uprN linux-2.6.32/arch/m32r/include/asm/mmzone.h linux-2.6.32.ovz/arch/m32r/include/asm/mmzone.h --- linux-2.6.32/arch/m32r/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/include/asm/mmzone.h 2015-09-10 10:06:40.000000000 -0700 @@ -14,12 +14,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) \ -({ \ - pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ -}) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) /* @@ -44,7 +38,7 @@ static __inline__ int pfn_to_nid(unsigne int node; for (node = 0 ; node < MAX_NUMNODES ; node++) - if (pfn >= node_start_pfn(node) && pfn <= node_end_pfn(node)) + if (pfn >= node_start_pfn(node) && pfn < node_end_pfn(node)) break; return node; diff -uprN linux-2.6.32/arch/m32r/include/asm/module.h linux-2.6.32.ovz/arch/m32r/include/asm/module.h --- linux-2.6.32/arch/m32r/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -3,8 +3,13 @@ struct mod_arch_specific { }; +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif /* _ASM_M32R_MODULE_H */ diff -uprN linux-2.6.32/arch/m32r/include/asm/socket.h linux-2.6.32.ovz/arch/m32r/include/asm/socket.h --- linux-2.6.32/arch/m32r/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_M32R_SOCKET_H */ diff -uprN linux-2.6.32/arch/m32r/kernel/syscall_table.S linux-2.6.32.ovz/arch/m32r/kernel/syscall_table.S --- linux-2.6.32/arch/m32r/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/kernel/syscall_table.S 2015-09-10 10:05:47.000000000 -0700 @@ -191,7 +191,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff -uprN 
linux-2.6.32/arch/m32r/kernel/sys_m32r.c linux-2.6.32.ovz/arch/m32r/kernel/sys_m32r.c --- linux-2.6.32/arch/m32r/kernel/sys_m32r.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/kernel/sys_m32r.c 2015-09-10 10:05:47.000000000 -0700 @@ -76,30 +76,6 @@ asmlinkage int sys_tas(int __user *addr) return oldval; } -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. * diff -uprN linux-2.6.32/arch/m68k/include/asm/module.h linux-2.6.32.ovz/arch/m68k/include/asm/module.h --- linux-2.6.32/arch/m68k/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68k/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -41,8 +41,13 @@ struct mod_arch_specific { #endif /* CONFIG_MMU */ +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif /* _ASM_M68K_MODULE_H */ diff -uprN linux-2.6.32/arch/m68k/include/asm/socket.h linux-2.6.32.ovz/arch/m68k/include/asm/socket.h --- linux-2.6.32/arch/m68k/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68k/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/m68k/kernel/sys_m68k.c linux-2.6.32.ovz/arch/m68k/kernel/sys_m68k.c --- linux-2.6.32/arch/m68k/kernel/sys_m68k.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68k/kernel/sys_m68k.c 2015-09-10 10:05:47.000000000 -0700 @@ -29,37 +29,16 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + /* + * This is wrong for sun3 - there PAGE_SIZE is 8Kb, + * so we need to shift the argument down by 1; m68k mmap64(3) + * (in libc) expects the last argument of mmap2 in 4Kb units. 
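The unit mismatch called out in the comment above is pure arithmetic: userspace hands mmap2 an offset in fixed 4KiB units, while the kernel's sys_mmap_pgoff wants native-page units, hence the pgoff >> (PAGE_SHIFT - 12) idiom used by several architectures in this patch. A standalone illustration:

#include <stdio.h>

/* mmap2 offsets arrive in fixed 4KiB units (1 << 12). */
static unsigned long pgoff_from_mmap2(unsigned long units4k,
				      unsigned int page_shift)
{
	return units4k >> (page_shift - 12);
}

int main(void)
{
	/* Byte offset 0x10000 is 16 4KiB units... */
	printf("4K pages: pgoff %lu\n", pgoff_from_mmap2(16, 12)); /* 16 */
	/* ...but only 8 native pages on a sun3-style 8KiB-page box. */
	printf("8K pages: pgoff %lu\n", pgoff_from_mmap2(16, 13)); /* 8 */
	return 0;
}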
+ */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* @@ -90,57 +69,11 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); -out: - return error; -} - -#if 0 -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } -#endif struct sel_arg_struct { unsigned long n; diff -uprN linux-2.6.32/arch/m68knommu/kernel/syscalltable.S linux-2.6.32.ovz/arch/m68knommu/kernel/syscalltable.S --- linux-2.6.32/arch/m68knommu/kernel/syscalltable.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68knommu/kernel/syscalltable.S 2015-09-10 10:05:47.000000000 -0700 @@ -210,7 +210,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff -uprN linux-2.6.32/arch/m68knommu/kernel/sys_m68k.c linux-2.6.32.ovz/arch/m68knommu/kernel/sys_m68k.c --- linux-2.6.32/arch/m68knommu/kernel/sys_m68k.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68knommu/kernel/sys_m68k.c 2015-09-10 10:05:47.000000000 -0700 @@ -27,39 +27,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. 
Linux/m68k cloned Linux/i386, which didn't use to be able to @@ -88,9 +55,8 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } diff -uprN linux-2.6.32/arch/microblaze/include/asm/unistd.h linux-2.6.32.ovz/arch/microblaze/include/asm/unistd.h --- linux-2.6.32/arch/microblaze/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/microblaze/include/asm/unistd.h 2015-09-10 10:05:55.000000000 -0700 @@ -382,8 +382,9 @@ #define __NR_pwritev 364 /* new */ #define __NR_rt_tgsigqueueinfo 365 /* new */ #define __NR_perf_event_open 366 /* new */ +#define __NR_recvmmsg 367 /* new */ -#define __NR_syscalls 367 +#define __NR_syscalls 368 #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff -uprN linux-2.6.32/arch/microblaze/kernel/syscall_table.S linux-2.6.32.ovz/arch/microblaze/kernel/syscall_table.S --- linux-2.6.32/arch/microblaze/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/microblaze/kernel/syscall_table.S 2015-09-10 10:05:55.000000000 -0700 @@ -196,7 +196,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 /* mmap2 */ + .long sys_mmap_pgoff /* mmap2 */ .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ @@ -371,3 +371,4 @@ ENTRY(sys_call_table) .long sys_ni_syscall .long sys_rt_tgsigqueueinfo /* 365 */ .long sys_perf_event_open + .long sys_recvmmsg diff -uprN linux-2.6.32/arch/microblaze/kernel/sys_microblaze.c linux-2.6.32.ovz/arch/microblaze/kernel/sys_microblaze.c --- linux-2.6.32/arch/microblaze/kernel/sys_microblaze.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/microblaze/kernel/sys_microblaze.c 2015-09-10 10:07:24.000000000 -0700 @@ -62,46 +62,14 @@ out: return error; } -asmlinkage long -sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file *file = NULL; - int ret = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) { - printk(KERN_INFO "no fd in mmap\r\n"); - goto out; - } - } - - down_write(¤t->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return ret; -} - asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, off_t pgoff) { - int err = -EINVAL; - - if (pgoff & ~PAGE_MASK) { - printk(KERN_INFO "no pagemask in mmap\r\n"); - goto out; - } + if (pgoff & ~PAGE_MASK) + return -EINVAL; - err = sys_mmap2(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); -out: - return err; + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); } /* diff -uprN linux-2.6.32/arch/mips/include/asm/errno.h linux-2.6.32.ovz/arch/mips/include/asm/errno.h --- linux-2.6.32/arch/mips/include/asm/errno.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/errno.h 2015-09-10 10:07:40.000000000 -0700 @@ -102,7 +102,7 @@ #define EWOULDBLOCK EAGAIN /* Operation would block */ #define EALREADY 149 /* Operation already in progress */ #define EINPROGRESS 150 /* Operation now in progress */ -#define ESTALE 151 /* 
Stale NFS file handle */ +#define ESTALE 151 /* Stale file handle */ #define ECANCELED 158 /* AIO operation canceled */ /* @@ -121,6 +121,8 @@ #define ERFKILL 167 /* Operation not possible due to RF-kill */ +#define EHWPOISON 168 /* Memory page has hardware error */ + #define EDQUOT 1133 /* Quota exceeded */ #ifdef __KERNEL__ diff -uprN linux-2.6.32/arch/mips/include/asm/mman.h linux-2.6.32.ovz/arch/mips/include/asm/mman.h --- linux-2.6.32/arch/mips/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/mman.h 2015-10-24 08:14:47.354501187 -0700 @@ -77,6 +77,13 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + +#define MADV_DONTDUMP 16 /* Explicity exclude from the core dump, + overrides the coredump filter bits */ +#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ + /* compatibility flags */ #define MAP_FILE 0 diff -uprN linux-2.6.32/arch/mips/include/asm/module.h linux-2.6.32.ovz/arch/mips/include/asm/module.h --- linux-2.6.32/arch/mips/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -33,11 +33,15 @@ typedef struct { } Elf64_Mips_Rela; #ifdef CONFIG_32BIT - +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr #define Elf_Addr Elf32_Addr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #define Elf_Mips_Rel Elf32_Rel #define Elf_Mips_Rela Elf32_Rela @@ -48,11 +52,15 @@ typedef struct { #endif #ifdef CONFIG_64BIT - +#define MODULES_ARE_ELF64 #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Ehdr Elf64_Ehdr #define Elf_Addr Elf64_Addr +#define Elf_Rel Elf64_Rel +#define Elf_Rela Elf64_Rela +#define ELF_R_TYPE(X) ELF64_R_TYPE(X) +#define ELF_R_SYM(X) ELF64_R_SYM(X) #define Elf_Mips_Rel Elf64_Mips_Rel #define Elf_Mips_Rela Elf64_Mips_Rela diff -uprN linux-2.6.32/arch/mips/include/asm/socket.h linux-2.6.32.ovz/arch/mips/include/asm/socket.h --- linux-2.6.32/arch/mips/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -80,6 +80,8 @@ To add: #define SO_REUSEPORT 0x0200 /* A #define SO_TIMESTAMPING 37 #define SCM_TIMESTAMPING SO_TIMESTAMPING +#define SO_RXQ_OVFL 40 + #ifdef __KERNEL__ /** sock_type - Socket types diff -uprN linux-2.6.32/arch/mips/include/asm/statfs.h linux-2.6.32.ovz/arch/mips/include/asm/statfs.h --- linux-2.6.32/arch/mips/include/asm/statfs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/statfs.h 2015-09-10 10:06:38.000000000 -0700 @@ -33,7 +33,8 @@ struct statfs { /* Linux specials */ __kernel_fsid_t f_fsid; long f_namelen; - long f_spare[6]; + long f_flags; + long f_spare[5]; }; #if (_MIPS_SIM == _MIPS_SIM_ABI32) || (_MIPS_SIM == _MIPS_SIM_NABI32) @@ -53,7 +54,8 @@ struct statfs64 { __u64 f_bavail; __kernel_fsid_t f_fsid; __u32 f_namelen; - __u32 f_spare[6]; + __u32 f_flags; + __u32 f_spare[5]; }; #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ @@ -73,7 +75,8 @@ struct statfs64 { /* Same as struct st /* Linux specials */ __kernel_fsid_t f_fsid; long f_namelen; - long f_spare[6]; + long f_flags; + long f_spare[5]; }; struct compat_statfs64 { @@ -88,7 
+91,8 @@ struct compat_statfs64 { __u64 f_bavail; __kernel_fsid_t f_fsid; __u32 f_namelen; - __u32 f_spare[6]; + __u32 f_flags; + __u32 f_spare[5]; }; #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ diff -uprN linux-2.6.32/arch/mips/include/asm/unistd.h linux-2.6.32.ovz/arch/mips/include/asm/unistd.h --- linux-2.6.32/arch/mips/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/unistd.h 2015-09-10 10:05:55.000000000 -0700 @@ -355,16 +355,17 @@ #define __NR_rt_tgsigqueueinfo (__NR_Linux + 332) #define __NR_perf_event_open (__NR_Linux + 333) #define __NR_accept4 (__NR_Linux + 334) +#define __NR_recvmmsg (__NR_Linux + 335) /* * Offset of the last Linux o32 flavoured syscall */ -#define __NR_Linux_syscalls 334 +#define __NR_Linux_syscalls 335 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ #define __NR_O32_Linux 4000 -#define __NR_O32_Linux_syscalls 334 +#define __NR_O32_Linux_syscalls 335 #if _MIPS_SIM == _MIPS_SIM_ABI64 @@ -666,16 +667,17 @@ #define __NR_rt_tgsigqueueinfo (__NR_Linux + 291) #define __NR_perf_event_open (__NR_Linux + 292) #define __NR_accept4 (__NR_Linux + 293) +#define __NR_recvmmsg (__NR_Linux + 294) /* * Offset of the last Linux 64-bit flavoured syscall */ -#define __NR_Linux_syscalls 293 +#define __NR_Linux_syscalls 294 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ #define __NR_64_Linux 5000 -#define __NR_64_Linux_syscalls 293 +#define __NR_64_Linux_syscalls 294 #if _MIPS_SIM == _MIPS_SIM_NABI32 @@ -981,16 +983,17 @@ #define __NR_rt_tgsigqueueinfo (__NR_Linux + 295) #define __NR_perf_event_open (__NR_Linux + 296) #define __NR_accept4 (__NR_Linux + 297) +#define __NR_recvmmsg (__NR_Linux + 298) /* * Offset of the last N32 flavoured syscall */ -#define __NR_Linux_syscalls 297 +#define __NR_Linux_syscalls 298 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ #define __NR_N32_Linux 6000 -#define __NR_N32_Linux_syscalls 297 +#define __NR_N32_Linux_syscalls 298 #ifdef __KERNEL__ diff -uprN linux-2.6.32/arch/mips/kernel/linux32.c linux-2.6.32.ovz/arch/mips/kernel/linux32.c --- linux-2.6.32/arch/mips/kernel/linux32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/linux32.c 2015-09-10 10:07:24.000000000 -0700 @@ -67,28 +67,13 @@ SYSCALL_DEFINE6(32_mmap2, unsigned long, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { - struct file * file = NULL; unsigned long error; error = -EINVAL; if (pgoff & (~PAGE_MASK >> 12)) goto out; - pgoff >>= PAGE_SHIFT-12; - - if (!(flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); - + error = sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT-12)); out: return error; } diff -uprN linux-2.6.32/arch/mips/kernel/scall32-o32.S linux-2.6.32.ovz/arch/mips/kernel/scall32-o32.S --- linux-2.6.32/arch/mips/kernel/scall32-o32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall32-o32.S 2015-09-10 10:05:55.000000000 -0700 @@ -583,6 +583,7 @@ einval: li v0, -ENOSYS sys sys_rt_tgsigqueueinfo 4 sys sys_perf_event_open 5 sys sys_accept4 4 + sys sys_recvmmsg 5 .endm /* We pre-compute the number of _instruction_ bytes needed to diff -uprN linux-2.6.32/arch/mips/kernel/scall64-64.S linux-2.6.32.ovz/arch/mips/kernel/scall64-64.S --- linux-2.6.32/arch/mips/kernel/scall64-64.S 2009-12-02 19:51:21.000000000 
-0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall64-64.S 2015-09-10 10:05:55.000000000 -0700 @@ -420,4 +420,5 @@ sys_call_table: PTR sys_rt_tgsigqueueinfo PTR sys_perf_event_open PTR sys_accept4 + PTR sys_recvmmsg .size sys_call_table,.-sys_call_table diff -uprN linux-2.6.32/arch/mips/kernel/scall64-n32.S linux-2.6.32.ovz/arch/mips/kernel/scall64-n32.S --- linux-2.6.32/arch/mips/kernel/scall64-n32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall64-n32.S 2015-09-10 10:05:55.000000000 -0700 @@ -418,4 +418,5 @@ EXPORT(sysn32_call_table) PTR compat_sys_rt_tgsigqueueinfo /* 5295 */ PTR sys_perf_event_open PTR sys_accept4 + PTR compat_sys_recvmmsg .size sysn32_call_table,.-sysn32_call_table diff -uprN linux-2.6.32/arch/mips/kernel/scall64-o32.S linux-2.6.32.ovz/arch/mips/kernel/scall64-o32.S --- linux-2.6.32/arch/mips/kernel/scall64-o32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall64-o32.S 2015-09-10 10:05:55.000000000 -0700 @@ -538,4 +538,5 @@ sys_call_table: PTR compat_sys_rt_tgsigqueueinfo PTR sys_perf_event_open PTR sys_accept4 + PTR compat_sys_recvmmsg .size sys_call_table,.-sys_call_table diff -uprN linux-2.6.32/arch/mips/kernel/syscall.c linux-2.6.32.ovz/arch/mips/kernel/syscall.c --- linux-2.6.32/arch/mips/kernel/syscall.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/syscall.c 2015-09-10 10:07:24.000000000 -0700 @@ -93,7 +93,8 @@ unsigned long arch_get_unmapped_area(str * We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && (addr & shm_align_mask)) + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) return -EINVAL; return addr; } @@ -129,31 +130,6 @@ unsigned long arch_get_unmapped_area(str } } -/* common code for old and new mmaps */ -static inline unsigned long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long pgoff) -{ - unsigned long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - SYSCALL_DEFINE6(mips_mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, off_t, offset) @@ -164,7 +140,7 @@ SYSCALL_DEFINE6(mips_mmap, unsigned long if (offset & ~PAGE_MASK) goto out; - result = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + result = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return result; @@ -177,7 +153,7 @@ SYSCALL_DEFINE6(mips_mmap2, unsigned lon if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); } save_static_function(sys_fork); diff -uprN linux-2.6.32/arch/mips/mm/hugetlbpage.c linux-2.6.32.ovz/arch/mips/mm/hugetlbpage.c --- linux-2.6.32/arch/mips/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/mm/hugetlbpage.c 2015-09-10 10:07:03.000000000 -0700 @@ -24,7 +24,7 @@ #include pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, - unsigned long sz) + unsigned long sz, bool *shared) { pgd_t *pgd; pud_t *pud; diff -uprN 
linux-2.6.32/arch/mips/mm/init.c linux-2.6.32.ovz/arch/mips/mm/init.c --- linux-2.6.32/arch/mips/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/mm/init.c 2015-09-10 10:05:55.000000000 -0700 @@ -298,7 +298,7 @@ void __init fixrange_init(unsigned long } #ifndef CONFIG_NEED_MULTIPLE_NODES -static int __init page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; diff -uprN linux-2.6.32/arch/mips/txx9/generic/setup.c linux-2.6.32.ovz/arch/mips/txx9/generic/setup.c --- linux-2.6.32/arch/mips/txx9/generic/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/txx9/generic/setup.c 2015-09-10 10:06:06.000000000 -0700 @@ -930,7 +930,7 @@ struct txx9_sramc_sysdev { void __iomem *base; }; -static ssize_t txx9_sram_read(struct kobject *kobj, +static ssize_t txx9_sram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { @@ -945,7 +945,7 @@ static ssize_t txx9_sram_read(struct kob return size; } -static ssize_t txx9_sram_write(struct kobject *kobj, +static ssize_t txx9_sram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { diff -uprN linux-2.6.32/arch/mn10300/include/asm/mman.h linux-2.6.32.ovz/arch/mn10300/include/asm/mman.h --- linux-2.6.32/arch/mn10300/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/include/asm/mman.h 2015-09-10 10:05:47.000000000 -0700 @@ -1 +1,6 @@ #include + +#define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */ + +#define arch_mmap_check(addr, len, flags) \ + (((flags) & MAP_FIXED && (addr) < MIN_MAP_ADDR) ? -EINVAL : 0) diff -uprN linux-2.6.32/arch/mn10300/include/asm/socket.h linux-2.6.32.ovz/arch/mn10300/include/asm/socket.h --- linux-2.6.32/arch/mn10300/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/mn10300/kernel/entry.S linux-2.6.32.ovz/arch/mn10300/kernel/entry.S --- linux-2.6.32/arch/mn10300/kernel/entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/kernel/entry.S 2015-09-10 10:05:47.000000000 -0700 @@ -578,7 +578,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff -uprN linux-2.6.32/arch/mn10300/kernel/sys_mn10300.c linux-2.6.32.ovz/arch/mn10300/kernel/sys_mn10300.c --- linux-2.6.32/arch/mn10300/kernel/sys_mn10300.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/kernel/sys_mn10300.c 2015-09-10 10:05:47.000000000 -0700 @@ -23,47 +23,13 @@ #include -#define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */ - -/* - * memory mapping syscall - */ -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file *file = NULL; - long error = -EINVAL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - if (flags & MAP_FIXED && addr < MIN_MAP_ADDR) - goto out; - - error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); 
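The mn10300 hunks above move the MIN_MAP_ADDR policy out of the deleted sys_mmap2() wrapper and into arch_mmap_check(), which the common mmap path invokes before mapping. A standalone check of the macro's behavior; the MAP_FIXED value is an assumption (0x10, as on most architectures):

#include <stdio.h>

#define EINVAL 22
#define MAP_FIXED 0x10		/* assumed: the common asm-generic value */
#define PAGE_SIZE 4096UL
#define MIN_MAP_ADDR PAGE_SIZE	/* minimum fixed mmap address */

#define arch_mmap_check(addr, len, flags) \
	(((flags) & MAP_FIXED && (addr) < MIN_MAP_ADDR) ? -EINVAL : 0)

int main(void)
{
	/* A MAP_FIXED request at page 0 is refused; anywhere else passes. */
	printf("%d\n", arch_mmap_check(0UL, 4096, MAP_FIXED));		/* -22 */
	printf("%d\n", arch_mmap_check(0UL, 4096, 0));			/* 0 */
	printf("%d\n", arch_mmap_check(PAGE_SIZE, 4096, MAP_FIXED));	/* 0 */
	return 0;
}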
- up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) { if (offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); } struct sel_arg_struct { diff -uprN linux-2.6.32/arch/parisc/hpux/sys_hpux.c linux-2.6.32.ovz/arch/parisc/hpux/sys_hpux.c --- linux-2.6.32/arch/parisc/hpux/sys_hpux.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/hpux/sys_hpux.c 2015-09-10 10:07:24.000000000 -0700 @@ -145,7 +145,7 @@ static int hpux_ustat(dev_t dev, struct s = user_get_super(dev); if (s == NULL) goto out; - err = vfs_statfs(s->s_root, &sbuf); + err = statfs_by_dentry(s->s_root, &sbuf); drop_super(s); if (err) goto out; @@ -186,12 +186,12 @@ struct hpux_statfs { int16_t f_pad; }; -static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf) +static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(dentry, &st); + retval = vfs_statfs(path, &st); if (retval) return retval; @@ -219,7 +219,7 @@ asmlinkage long hpux_statfs(const char _ error = user_path(pathname, &path); if (!error) { struct hpux_statfs tmp; - error = vfs_statfs_hpux(path.dentry, &tmp); + error = do_statfs_hpux(&path, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_put(&path); @@ -237,7 +237,7 @@ asmlinkage long hpux_fstatfs(unsigned in file = fget(fd); if (!file) goto out; - error = vfs_statfs_hpux(file->f_path.dentry, &tmp); + error = do_statfs_hpux(&file->f_path, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); diff -uprN linux-2.6.32/arch/parisc/include/asm/errno.h linux-2.6.32.ovz/arch/parisc/include/asm/errno.h --- linux-2.6.32/arch/parisc/include/asm/errno.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/errno.h 2015-09-10 10:07:40.000000000 -0700 @@ -37,7 +37,7 @@ #define EBADMSG 67 /* Not a data message */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ -#define ESTALE 70 /* Stale NFS file handle */ +#define ESTALE 70 /* Stale file handle */ #define EREMOTE 71 /* Object is remote */ #define EOVERFLOW 72 /* Value too large for defined data type */ @@ -122,4 +122,6 @@ #define ERFKILL 256 /* Operation not possible due to RF-kill */ +#define EHWPOISON 257 /* Memory page has hardware error */ + #endif diff -uprN linux-2.6.32/arch/parisc/include/asm/mman.h linux-2.6.32.ovz/arch/parisc/include/asm/mman.h --- linux-2.6.32/arch/parisc/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/mman.h 2015-10-24 08:14:47.354501187 -0700 @@ -59,6 +59,13 @@ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */ + +#define MADV_DONTDUMP 69 /* Explicity exclude from the core dump, + overrides the coredump filter bits */ +#define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 diff -uprN linux-2.6.32/arch/parisc/include/asm/mmzone.h linux-2.6.32.ovz/arch/parisc/include/asm/mmzone.h --- 
linux-2.6.32/arch/parisc/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/mmzone.h 2015-09-10 10:06:40.000000000 -0700 @@ -14,13 +14,6 @@ extern struct node_map_data node_data[]; #define NODE_DATA(nid) (&node_data[nid].pg_data) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) \ -({ \ - pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ -}) - /* We have these possible memory map layouts: * Astro: 0-3.75, 67.75-68, 4-64 * zx1: 0-1, 257-260, 4-256 diff -uprN linux-2.6.32/arch/parisc/include/asm/module.h linux-2.6.32.ovz/arch/parisc/include/asm/module.h --- linux-2.6.32/arch/parisc/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -4,17 +4,25 @@ * This file contains the parisc architecture specific module code. */ #ifdef CONFIG_64BIT +#define MODULES_ARE_ELF64 #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Ehdr Elf64_Ehdr #define Elf_Addr Elf64_Addr +#define Elf_Rel Elf64_Rel #define Elf_Rela Elf64_Rela +#define ELF_R_TYPE(X) ELF64_R_TYPE(X) +#define ELF_R_SYM(X) ELF64_R_SYM(X) #else +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr #define Elf_Addr Elf32_Addr +#define Elf_Rel Elf32_Rel #define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif struct unwind_table; diff -uprN linux-2.6.32/arch/parisc/include/asm/socket.h linux-2.6.32.ovz/arch/parisc/include/asm/socket.h --- linux-2.6.32/arch/parisc/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/socket.h 2015-09-10 10:05:37.000000000 -0700 @@ -59,6 +59,8 @@ #define SO_TIMESTAMPING 0x4020 #define SCM_TIMESTAMPING SO_TIMESTAMPING +#define SO_RXQ_OVFL 0x4021 + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we * have to define SOCK_NONBLOCK to a different value here. 
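The module.h hunk above (and the matching alpha, m32r, m68k, and mips hunks earlier in the patch) adds width-neutral ELF_R_SYM/ELF_R_TYPE wrappers so common module-relocation code can decode r_info without knowing the ELF class. The underlying packing, shown with the standard elf.h macros:

#include <stdio.h>
#include <stdint.h>

/* Standard ELF r_info packing (see elf.h). */
#define ELF32_R_SYM(i)	((i) >> 8)
#define ELF32_R_TYPE(i)	((unsigned char)(i))
#define ELF64_R_SYM(i)	((i) >> 32)
#define ELF64_R_TYPE(i)	((i) & 0xffffffff)

int main(void)
{
	uint32_t info32 = (7u << 8) | 4;	/* symbol index 7, reloc type 4 */
	uint64_t info64 = (7ull << 32) | 4;	/* same pair, ELF64 packing */

	printf("elf32: sym %u type %u\n",
	       (unsigned)ELF32_R_SYM(info32), (unsigned)ELF32_R_TYPE(info32));
	printf("elf64: sym %llu type %llu\n",
	       (unsigned long long)ELF64_R_SYM(info64),
	       (unsigned long long)ELF64_R_TYPE(info64));
	return 0;
}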
*/ diff -uprN linux-2.6.32/arch/parisc/include/asm/unistd.h linux-2.6.32.ovz/arch/parisc/include/asm/unistd.h --- linux-2.6.32/arch/parisc/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/unistd.h 2015-09-10 10:05:55.000000000 -0700 @@ -811,8 +811,9 @@ #define __NR_pwritev (__NR_Linux + 316) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 317) #define __NR_perf_event_open (__NR_Linux + 318) +#define __NR_recvmmsg (__NR_Linux + 319) -#define __NR_Linux_syscalls (__NR_perf_event_open + 1) +#define __NR_Linux_syscalls (__NR_recvmmsg + 1) #define __IGNORE_select /* newselect */ diff -uprN linux-2.6.32/arch/parisc/kernel/syscall_table.S linux-2.6.32.ovz/arch/parisc/kernel/syscall_table.S --- linux-2.6.32/arch/parisc/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/kernel/syscall_table.S 2015-09-10 10:05:55.000000000 -0700 @@ -417,6 +417,7 @@ ENTRY_COMP(pwritev) ENTRY_COMP(rt_tgsigqueueinfo) ENTRY_SAME(perf_event_open) + ENTRY_COMP(recvmmsg) /* Nothing yet */ diff -uprN linux-2.6.32/arch/parisc/kernel/sys_parisc32.c linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc32.c --- linux-2.6.32/arch/parisc/kernel/sys_parisc32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc32.c 2015-09-10 10:07:24.000000000 -0700 @@ -26,13 +26,7 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include #include #include #include diff -uprN linux-2.6.32/arch/parisc/kernel/sys_parisc.c linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc.c --- linux-2.6.32/arch/parisc/kernel/sys_parisc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc.c 2015-09-10 10:05:47.000000000 -0700 @@ -110,37 +110,14 @@ unsigned long arch_get_unmapped_area(str return addr; } -static unsigned long do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long fd, - unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file != NULL) - fput(file); -out: - return error; -} - asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. 
*/ - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT - 12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, @@ -148,7 +125,8 @@ asmlinkage unsigned long sys_mmap(unsign unsigned long offset) { if (!(offset & ~PAGE_MASK)) { - return do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + offset >> PAGE_SHIFT); } else { return -EINVAL; } diff -uprN linux-2.6.32/arch/parisc/mm/init.c linux-2.6.32.ovz/arch/parisc/mm/init.c --- linux-2.6.32/arch/parisc/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/mm/init.c 2015-09-10 10:06:38.000000000 -0700 @@ -543,7 +543,7 @@ void __init mem_init(void) unsigned long *empty_zero_page __read_mostly; EXPORT_SYMBOL(empty_zero_page); -void show_mem(void) +void show_mem(unsigned int filter) { int i,free = 0,total = 0,reserved = 0; int shared = 0, cached = 0; diff -uprN linux-2.6.32/arch/powerpc/include/asm/asm-compat.h linux-2.6.32.ovz/arch/powerpc/include/asm/asm-compat.h --- linux-2.6.32/arch/powerpc/include/asm/asm-compat.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/asm-compat.h 2015-09-10 10:06:05.000000000 -0700 @@ -2,6 +2,7 @@ #define _ASM_POWERPC_ASM_COMPAT_H #include +#include #ifdef __ASSEMBLY__ # define stringify_in_c(...) __VA_ARGS__ @@ -24,7 +25,7 @@ #define PPC_LONG stringify_in_c(.llong) #define PPC_LONG_ALIGN stringify_in_c(.balign 8) #define PPC_TLNEI stringify_in_c(tdnei) -#define PPC_LLARX stringify_in_c(ldarx) +#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) #define PPC_STLCX stringify_in_c(stdcx.) #define PPC_CNTLZL stringify_in_c(cntlzd) @@ -46,7 +47,7 @@ #define PPC_LONG stringify_in_c(.long) #define PPC_LONG_ALIGN stringify_in_c(.balign 4) #define PPC_TLNEI stringify_in_c(twnei) -#define PPC_LLARX stringify_in_c(lwarx) +#define PPC_LLARX(t, a, b, eh) PPC_LWARX(t, a, b, eh) #define PPC_STLCX stringify_in_c(stwcx.) #define PPC_CNTLZL stringify_in_c(cntlzw) #define PPC_MTOCRF stringify_in_c(mtcrf) diff -uprN linux-2.6.32/arch/powerpc/include/asm/bitops.h linux-2.6.32.ovz/arch/powerpc/include/asm/bitops.h --- linux-2.6.32/arch/powerpc/include/asm/bitops.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/bitops.h 2015-09-10 10:07:34.000000000 -0700 @@ -65,7 +65,7 @@ static __inline__ void fn(unsigned long unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX "%0,0,%3\n" \ +"1:" PPC_LLARX(%0,0,%3,0) "\n" \ stringify_in_c(op) "%0,%0,%2\n" \ PPC405_ERR77(0,%3) \ PPC_STLCX "%0,0,%3\n" \ @@ -103,31 +103,31 @@ static __inline__ void change_bit(int nr /* Like DEFINE_BITOP(), with changes to the arguments to 'op' and the output * operands. 
*/ -#define DEFINE_TESTOP(fn, op, prefix, postfix) \ -static __inline__ unsigned long fn( \ - unsigned long mask, \ - volatile unsigned long *_p) \ -{ \ - unsigned long old, t; \ - unsigned long *p = (unsigned long *)_p; \ - __asm__ __volatile__ ( \ - prefix \ -"1:" PPC_LLARX "%0,0,%3\n" \ - stringify_in_c(op) "%1,%0,%2\n" \ - PPC405_ERR77(0,%3) \ - PPC_STLCX "%1,0,%3\n" \ - "bne- 1b\n" \ - postfix \ - : "=&r" (old), "=&r" (t) \ - : "r" (mask), "r" (p) \ - : "cc", "memory"); \ - return (old & mask); \ -} - -DEFINE_TESTOP(test_and_set_bits, or, LWSYNC_ON_SMP, ISYNC_ON_SMP) -DEFINE_TESTOP(test_and_set_bits_lock, or, "", ISYNC_ON_SMP) -DEFINE_TESTOP(test_and_clear_bits, andc, LWSYNC_ON_SMP, ISYNC_ON_SMP) -DEFINE_TESTOP(test_and_change_bits, xor, LWSYNC_ON_SMP, ISYNC_ON_SMP) +#define DEFINE_TESTOP(fn, op, prefix, postfix, eh) \ +static __inline__ unsigned long fn( \ + unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old, t; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ + prefix \ +"1:" PPC_LLARX(%0,0,%3,eh) "\n" \ + stringify_in_c(op) "%1,%0,%2\n" \ + PPC405_ERR77(0,%3) \ + PPC_STLCX "%1,0,%3\n" \ + "bne- 1b\n" \ + postfix \ + : "=&r" (old), "=&r" (t) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ + return (old & mask); \ +} + +DEFINE_TESTOP(test_and_set_bits, or, LWSYNC_ON_SMP, ISYNC_ON_SMP, 0) +DEFINE_TESTOP(test_and_set_bits_lock, or, "", ISYNC_ON_SMP, 1) +DEFINE_TESTOP(test_and_clear_bits, andc, LWSYNC_ON_SMP, ISYNC_ON_SMP, 0) +DEFINE_TESTOP(test_and_change_bits, xor, LWSYNC_ON_SMP, ISYNC_ON_SMP, 0) static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) @@ -333,6 +333,33 @@ unsigned long generic_find_next_le_bit(c #include +/* Compat macros for RHEL6 */ +#define find_next_zero_bit_le(addr, size, offset) \ + generic_find_next_zero_le_bit(addr, size, offset) +#define find_next_bit_le(addr, size, offset) \ + generic_find_next_le_bit(addr, size, offset) +#define __set_bit_le(nr,addr) \ + __set_le_bit(nr, addr) +#define __clear_bit_le(nr, addr) \ + __clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + +#define test_bit_le(nr, addr) \ + test_le_bit((nr), (addr)) +#define set_bit_le(nr, addr) \ + set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) +#define clear_bit_le(nr, addr) \ + clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + +#define test_and_set_bit_le(nr, addr) \ + test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) +#define test_and_clear_bit_le(nr, addr) \ + test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + +#define __test_and_set_bit_le(nr, addr) \ + __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) +#define __test_and_clear_bit_le(nr, addr) \ + __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BITOPS_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/bug.h linux-2.6.32.ovz/arch/powerpc/include/asm/bug.h --- linux-2.6.32/arch/powerpc/include/asm/bug.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/bug.h 2015-09-10 10:06:13.000000000 -0700 @@ -85,12 +85,12 @@ } \ } while (0) -#define __WARN() do { \ +#define __WARN_TAINT(taint) do { \ __asm__ __volatile__( \ "1: twi 31,0,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(taint)), \ "i" (sizeof(struct bug_entry))); \ } while (0) @@ -104,7 +104,7 @@ "1: "PPC_TLNEI" %4,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(TAINT_WARN)), \ "i" (sizeof(struct 
bug_entry)), \ "r" (__ret_warn_on)); \ } \ diff -uprN linux-2.6.32/arch/powerpc/include/asm/checksum.h linux-2.6.32.ovz/arch/powerpc/include/asm/checksum.h --- linux-2.6.32/arch/powerpc/include/asm/checksum.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/checksum.h 2015-09-10 10:07:25.000000000 -0700 @@ -52,12 +52,22 @@ extern __wsum csum_partial(const void *b extern __wsum csum_partial_copy_generic(const void *src, void *dst, int len, __wsum sum, int *src_err, int *dst_err); + +#ifdef __powerpc64__ +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER +extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr); +#define HAVE_CSUM_COPY_USER +extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, + int len, __wsum sum, int *err_ptr); +#else /* * the same as csum_partial, but copies from src to dst while it * checksums. */ #define csum_partial_copy_from_user(src, dst, len, sum, errp) \ csum_partial_copy_generic((__force const void *)(src), (dst), (len), (sum), (errp), NULL) +#endif #define csum_partial_copy_nocheck(src, dst, len, sum) \ csum_partial_copy_generic((src), (dst), (len), (sum), NULL, NULL) diff -uprN linux-2.6.32/arch/powerpc/include/asm/compat.h linux-2.6.32.ovz/arch/powerpc/include/asm/compat.h --- linux-2.6.32/arch/powerpc/include/asm/compat.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/compat.h 2015-09-10 10:06:15.000000000 -0700 @@ -133,7 +133,7 @@ static inline compat_uptr_t ptr_to_compa return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = current->thread.regs; unsigned long usp = regs->gpr[1]; diff -uprN linux-2.6.32/arch/powerpc/include/asm/cputable.h linux-2.6.32.ovz/arch/powerpc/include/asm/cputable.h --- linux-2.6.32/arch/powerpc/include/asm/cputable.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/cputable.h 2015-09-10 10:06:45.000000000 -0700 @@ -195,6 +195,8 @@ extern const char *powerpc_base_platform #define CPU_FTR_SAO LONG_ASM_CONST(0x0020000000000000) #define CPU_FTR_CP_USE_DCBTZ LONG_ASM_CONST(0x0040000000000000) #define CPU_FTR_UNALIGNED_LD_STD LONG_ASM_CONST(0x0080000000000000) +#define CPU_FTR_ASYM_SMT LONG_ASM_CONST(0x0100000000000000) +#define CPU_FTR_VMX_COPY LONG_ASM_CONST(0x2000000000000000) #ifndef __ASSEMBLY__ @@ -409,7 +411,7 @@ extern const char *powerpc_base_platform CPU_FTR_MMCRA | CPU_FTR_SMT | \ CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ - CPU_FTR_DSCR | CPU_FTR_SAO) + CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | CPU_FTR_VMX_COPY) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ diff -uprN linux-2.6.32/arch/powerpc/include/asm/cputime.h linux-2.6.32.ovz/arch/powerpc/include/asm/cputime.h --- linux-2.6.32/arch/powerpc/include/asm/cputime.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/cputime.h 2015-09-10 10:06:56.000000000 -0700 @@ -124,10 +124,11 @@ static inline u64 cputime64_to_jiffies64 return mulhdu(ct, __cputime_jiffies_factor); } +extern u64 __cputime_msec_factor; + /* * Convert cputime <-> milliseconds */ -extern u64 __cputime_msec_factor; static inline unsigned long cputime_to_msecs(const cputime_t ct) { @@ -152,6 +153,36 @@ static inline 
cputime_t msecs_to_cputime } /* + * Convert cputime <-> microseconds + */ + +extern u64 __cputime_usec_factor; + +static inline unsigned long cputime_to_usecs(const cputime_t ct) +{ + return mulhdu(ct, __cputime_usec_factor); +} + +static inline cputime_t usecs_to_cputime(const unsigned long us) +{ + cputime_t ct; + unsigned long sec; + + /* have to be a little careful about overflow */ + ct = us % 1000000; + sec = us / 1000000; + if (ct) { + ct *= tb_ticks_per_sec; + do_div(ct, 1000000); + } + if (sec) + ct += (cputime_t) sec * tb_ticks_per_sec; + return ct; +} + +#define usecs_to_cputime64(us) usecs_to_cputime(us) + +/* * Convert cputime <-> seconds */ extern u64 __cputime_sec_factor; diff -uprN linux-2.6.32/arch/powerpc/include/asm/crash.h linux-2.6.32.ovz/arch/powerpc/include/asm/crash.h --- linux-2.6.32/arch/powerpc/include/asm/crash.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/crash.h 2015-09-10 10:06:52.000000000 -0700 @@ -0,0 +1,51 @@ +#ifndef _PPC64_CRASH_H +#define _PPC64_CRASH_H + +#ifdef __KERNEL__ + +#include +#include + + +static inline void *map_virtual(u64 offset, struct page **pp) +{ + struct page *page; + unsigned long pfn; + void *vaddr; + + pfn = (unsigned long)(offset >> PAGE_SHIFT); + + if (!page_is_ram(pfn)) { + printk(KERN_INFO + "crash memory driver: !page_is_ram(pfn: %lx)\n", pfn); + return NULL; + } + + if (!pfn_valid(pfn)) { + printk(KERN_INFO + "crash memory driver: invalid pfn: %lx )\n", pfn); + return NULL; + } + + page = pfn_to_page(pfn); + + vaddr = kmap(page); + if (!vaddr) { + printk(KERN_INFO + "crash memory driver: pfn: %lx kmap(page: %lx) failed\n", + pfn, (unsigned long) page); + return NULL; + } + + *pp = page; + return (vaddr + (offset & (PAGE_SIZE - 1))); +} + +static inline void unmap_virtual(struct page *page) +{ + kunmap(page); +} + +#endif /* __KERNEL__ */ + +#endif /* _PPC64_CRASH_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/dma-mapping.h linux-2.6.32.ovz/arch/powerpc/include/asm/dma-mapping.h --- linux-2.6.32/arch/powerpc/include/asm/dma-mapping.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/dma-mapping.h 2015-09-10 10:06:29.000000000 -0700 @@ -130,19 +130,7 @@ static inline int dma_supported(struct d /* We have our own implementation of pci_set_dma_mask() */ #define HAVE_ARCH_PCI_SET_DMA_MASK -static inline int dma_set_mask(struct device *dev, u64 dma_mask) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if (unlikely(dma_ops == NULL)) - return -EIO; - if (dma_ops->set_dma_mask != NULL) - return dma_ops->set_dma_mask(dev, dma_mask); - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - *dev->dma_mask = dma_mask; - return 0; -} +extern int dma_set_mask(struct device *dev, u64 dma_mask); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) diff -uprN linux-2.6.32/arch/powerpc/include/asm/eeh.h linux-2.6.32.ovz/arch/powerpc/include/asm/eeh.h --- linux-2.6.32/arch/powerpc/include/asm/eeh.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/eeh.h 2015-09-10 10:08:00.000000000 -0700 @@ -59,8 +59,11 @@ void __init pci_addr_cache_build(void); * device (including config space i/o). Call eeh_add_device_late * to finish the eeh setup for this device. 
*/ +void eeh_add_device_early(struct device_node *); void eeh_add_device_tree_early(struct device_node *); +void eeh_add_device_late(struct pci_dev *); void eeh_add_device_tree_late(struct pci_bus *); +void eeh_remove_device(struct pci_dev *); /** * eeh_remove_device_recursive - undo EEH for device & children. @@ -101,10 +104,16 @@ static inline int eeh_dn_check_failure(s static inline void pci_addr_cache_build(void) { } +static inline void eeh_add_device_early(struct device_node *dn) { } + static inline void eeh_add_device_tree_early(struct device_node *dn) { } +static inline void eeh_add_device_late(struct pci_dev *dev) { } + static inline void eeh_add_device_tree_late(struct pci_bus *bus) { } +static inline void eeh_remove_device(struct pci_dev *dev) { } + static inline void eeh_remove_bus_device(struct pci_dev *dev) { } #define EEH_POSSIBLE_ERROR(val, type) (0) #define EEH_IO_ERROR_VALUE(size) (-1UL) diff -uprN linux-2.6.32/arch/powerpc/include/asm/elf.h linux-2.6.32.ovz/arch/powerpc/include/asm/elf.h --- linux-2.6.32/arch/powerpc/include/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/elf.h 2015-10-24 08:14:47.354501188 -0700 @@ -236,14 +236,10 @@ typedef elf_vrregset_t elf_fpxregset_t; #ifdef __powerpc64__ # define SET_PERSONALITY(ex) \ do { \ - unsigned long new_flags = 0; \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - new_flags = _TIF_32BIT; \ - if ((current_thread_info()->flags & _TIF_32BIT) \ - != new_flags) \ - set_thread_flag(TIF_ABI_PENDING); \ + set_thread_flag(TIF_32BIT); \ else \ - clear_thread_flag(TIF_ABI_PENDING); \ + clear_thread_flag(TIF_32BIT); \ if (personality(current->personality) != PER_LINUX32) \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ @@ -270,6 +266,7 @@ extern int ucache_bsize; /* vDSO has arch_setup_additional_pages */ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; +extern struct page *vdso32_pages[1]; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b); diff -uprN linux-2.6.32/arch/powerpc/include/asm/exception-64s.h linux-2.6.32.ovz/arch/powerpc/include/asm/exception-64s.h --- linux-2.6.32/arch/powerpc/include/asm/exception-64s.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/exception-64s.h 2015-09-10 10:06:17.000000000 -0700 @@ -137,7 +137,8 @@ li r10,0; \ ld r11,exception_marker@toc(r2); \ std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ + std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ + ACCOUNT_STOLEN_TIME /* * Exception vectors. diff -uprN linux-2.6.32/arch/powerpc/include/asm/fadump.h linux-2.6.32.ovz/arch/powerpc/include/asm/fadump.h --- linux-2.6.32/arch/powerpc/include/asm/fadump.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/fadump.h 2015-09-10 10:07:15.000000000 -0700 @@ -0,0 +1,223 @@ +/* + * Firmware Assisted dump header file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2011 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#ifndef __PPC64_FA_DUMP_H__ +#define __PPC64_FA_DUMP_H__ + +#ifdef CONFIG_FA_DUMP + +/* + * The RMA region will be saved for later dumping when kernel crashes. + * RMA is Real Mode Area, the first block of logical memory address owned + * by logical partition, containing the storage that may be accessed with + * translate off. + */ +#define RMA_START 0x0 +#define RMA_END (lmb.rmo_size) + +/* + * On some Power systems where RMO is 128MB, it still requires minimum of + * 256MB for kernel to boot successfully. When kdump infrastructure is + * configured to save vmcore over network, we run into OOM issue while + * loading modules related to network setup. Hence we need additional 64M + * of memory to avoid OOM issue. + */ +#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + + (0x1UL << 26)) + +#define lmb_num_regions(lmb_type) (lmb.lmb_type.cnt) + +#define for_each_lmb(region_iter, lmb_type, reg) \ + for (region_iter = 0, reg = &lmb.lmb_type.region[region_iter]; \ + region_iter < lmb.lmb_type.cnt; \ + region_iter++, reg = &lmb.lmb_type.region[region_iter]) + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +/* Firmware provided dump sections */ +#define FADUMP_CPU_STATE_DATA 0x0001 +#define FADUMP_HPTE_REGION 0x0002 +#define FADUMP_REAL_MODE_REGION 0x0011 + +/* Dump request flag */ +#define FADUMP_REQUEST_FLAG 0x00000001 + +/* FAD commands */ +#define FADUMP_REGISTER 1 +#define FADUMP_UNREGISTER 2 +#define FADUMP_INVALIDATE 3 + +/* Dump status flag */ +#define FADUMP_ERROR_FLAG 0x2000 + +#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1) + +#define CPU_UNKNOWN (~((u32)0)) + +/* Utility macros */ +#define SKIP_TO_NEXT_CPU(reg_entry) \ +({ \ + while (reg_entry->reg_id != REG_ID("CPUEND")) \ + reg_entry++; \ + reg_entry++; \ +}) + +/* Kernel Dump section info */ +struct fadump_section { + u32 request_flag; + u16 source_data_type; + u16 error_flags; + u64 source_address; + u64 source_len; + u64 bytes_dumped; + u64 destination_address; +}; + +/* ibm,configure-kernel-dump header. */ +struct fadump_section_header { + u32 dump_format_version; + u16 dump_num_sections; + u16 dump_status_flag; + u32 offset_first_dump_section; + + /* Fields for disk dump option. */ + u32 dd_block_size; + u64 dd_block_offset; + u64 dd_num_blocks; + u32 dd_offset_disk_path; + + /* Maximum time allowed to prevent an automatic dump-reboot. */ + u32 max_time_auto; +}; + +/* + * Firmware Assisted dump memory structure. This structure is required for + * registering future kernel dump with power firmware through rtas call. + * + * No disk dump option. Hence disk dump path string section is not included. + */ +struct fadump_mem_struct { + struct fadump_section_header header; + + /* Kernel dump sections */ + struct fadump_section cpu_state_data; + struct fadump_section hpte_region; + struct fadump_section rmr_region; +}; + +/* Firmware-assisted dump configuration details.
*/ +struct fw_dump { + unsigned long cpu_state_data_size; + unsigned long hpte_region_size; + unsigned long boot_memory_size; + unsigned long reserve_dump_area_start; + unsigned long reserve_dump_area_size; + /* cmd line option during boot */ + unsigned long reserve_bootvar; + + unsigned long fadumphdr_addr; + unsigned long cpu_notes_buf; + unsigned long cpu_notes_buf_size; + + int ibm_configure_kernel_dump; + + unsigned long fadump_enabled:1; + unsigned long fadump_supported:1; + unsigned long dump_active:1; + unsigned long dump_registered:1; +}; + +/* + * Copy the ascii values for first 8 characters from a string into u64 + * variable at their respective indexes. + * e.g. + * The string "FADMPINF" will be converted into 0x4641444d50494e46 + */ +static inline u64 str_to_u64(const char *str) +{ + u64 val = 0; + int i; + + for (i = 0; i < sizeof(val); i++) + val = (*str) ? (val << 8) | *str++ : val << 8; + return val; +} +#define STR_TO_HEX(x) str_to_u64(x) +#define REG_ID(x) str_to_u64(x) + +#define FADUMP_CRASH_INFO_MAGIC STR_TO_HEX("FADMPINF") +#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") + +/* The firmware-assisted dump format. + * + * The register save area is an area in the partition's memory used to preserve + * the register contents (CPU state data) for the active CPUs during a firmware + * assisted dump. The dump format contains register save area header followed + * by register entries. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". + */ + +/* Register save area header. */ +struct fadump_reg_save_area_header { + u64 magic_number; + u32 version; + u32 num_cpu_offset; +}; + +/* Register entry. */ +struct fadump_reg_entry { + u64 reg_id; + u64 reg_value; +}; + +/* fadump crash info structure */ +struct fadump_crash_info_header { + u64 magic_number; + u64 elfcorehdr_addr; + u32 crashing_cpu; + struct pt_regs regs; + struct cpumask cpu_online_mask; +}; + +/* Crash memory ranges */ +#define INIT_CRASHMEM_RANGES (MAX_LMB_REGIONS + 2) + +struct fad_crash_memory_ranges { + unsigned long long base; + unsigned long long size; +}; + +extern int early_init_dt_scan_fw_dump(unsigned long node, + const char *uname, int depth, void *data); +extern int fadump_reserve_mem(void); +extern int setup_fadump(void); +extern int is_fadump_active(void); +extern void crash_fadump(struct pt_regs *, const char *); +extern void fadump_cleanup(void); + +extern void vmcore_cleanup(void); +#else /* CONFIG_FA_DUMP */ +static inline int is_fadump_active(void) { return 0; } +static inline void crash_fadump(struct pt_regs *regs, const char *str) { } +#endif +#endif diff -uprN linux-2.6.32/arch/powerpc/include/asm/firmware.h linux-2.6.32.ovz/arch/powerpc/include/asm/firmware.h --- linux-2.6.32/arch/powerpc/include/asm/firmware.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/firmware.h 2015-09-10 10:06:35.000000000 -0700 @@ -46,6 +46,8 @@ #define FW_FEATURE_PS3_LV1 ASM_CONST(0x0000000000800000) #define FW_FEATURE_BEAT ASM_CONST(0x0000000001000000) #define FW_FEATURE_CMO ASM_CONST(0x0000000002000000) +#define FW_FEATURE_VPHN ASM_CONST(0x0000000004000000) +#define FW_FEATURE_XCMO ASM_CONST(0x0000000008000000) #ifndef __ASSEMBLY__ @@ -59,7 +61,7 @@ enum { FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN | FW_FEATURE_BULK_REMOVE | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE | FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | - FW_FEATURE_CMO, + FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_ISERIES_POSSIBLE = 
FW_FEATURE_ISERIES | FW_FEATURE_LPAR, FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, diff -uprN linux-2.6.32/arch/powerpc/include/asm/hvcall.h linux-2.6.32.ovz/arch/powerpc/include/asm/hvcall.h --- linux-2.6.32/arch/powerpc/include/asm/hvcall.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/hvcall.h 2015-09-10 10:07:23.000000000 -0700 @@ -73,7 +73,27 @@ #define H_MR_CONDITION -43 #define H_NOT_ENOUGH_RESOURCES -44 #define H_R_STATE -45 -#define H_RESCINDEND -46 +#define H_RESCINDED -46 +#define H_P2 -55 +#define H_P3 -56 +#define H_P4 -57 +#define H_P5 -58 +#define H_P6 -59 +#define H_P7 -60 +#define H_P8 -61 +#define H_P9 -62 +#define H_TOO_BIG -64 +#define H_OVERLAP -68 +#define H_INTERRUPT -69 +#define H_BAD_DATA -70 +#define H_NOT_ACTIVE -71 +#define H_SG_LIST -72 +#define H_OP_MODE -73 +#define H_COP_HW -74 +#define H_UNSUPPORTED_FLAG_START -256 +#define H_UNSUPPORTED_FLAG_END -511 +#define H_MULTI_THREADS_ACTIVE -9005 +#define H_OUTSTANDING_COP_OPS -9006 /* Long Busy is a condition that can be returned by the firmware @@ -101,6 +121,7 @@ #define H_ANDCOND (1UL<<(63-33)) #define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */ #define H_ICACHE_SYNCHRONIZE (1UL<<(63-41)) /* dcbst, icbi, etc (ignored for IO pages */ +#define H_COALESCE_CAND (1UL<<(63-42)) /* page is a good candidate for coalescing */ #define H_ZERO_PAGE (1UL<<(63-48)) /* zero the page before mapping (ignored for IO pages) */ #define H_COPY_PAGE (1UL<<(63-49)) #define H_N (1UL<<(63-61)) @@ -215,9 +236,14 @@ #define H_JOIN 0x298 #define H_VASI_STATE 0x2A4 #define H_ENABLE_CRQ 0x2B0 +#define H_GET_EM_PARMS 0x2B8 #define H_SET_MPP 0x2D0 #define H_GET_MPP 0x2D4 -#define MAX_HCALL_OPCODE H_GET_MPP +#define H_HOME_NODE_ASSOCIATIVITY 0x2EC +#define H_RANDOM 0x300 +#define H_COP 0x304 +#define H_GET_MPP_X 0x314 +#define MAX_HCALL_OPCODE H_GET_MPP_X #ifndef __ASSEMBLY__ @@ -292,6 +318,16 @@ struct hvcall_mpp_data { int h_get_mpp(struct hvcall_mpp_data *); +struct hvcall_mpp_x_data { + unsigned long coalesced_bytes; + unsigned long pool_coalesced_bytes; + unsigned long pool_purr_cycles; + unsigned long pool_spurr_cycles; + unsigned long reserved[3]; +}; + +int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data); + #ifdef CONFIG_PPC_PSERIES extern int CMO_PrPSP; extern int CMO_SecPSP; diff -uprN linux-2.6.32/arch/powerpc/include/asm/hw_irq.h linux-2.6.32.ovz/arch/powerpc/include/asm/hw_irq.h --- linux-2.6.32/arch/powerpc/include/asm/hw_irq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/hw_irq.h 2015-09-10 10:06:02.000000000 -0700 @@ -135,43 +135,5 @@ static inline int irqs_disabled_flags(un */ struct irq_chip; -#ifdef CONFIG_PERF_EVENTS - -#ifdef CONFIG_PPC64 -static inline unsigned long test_perf_event_pending(void) -{ - unsigned long x; - - asm volatile("lbz %0,%1(13)" - : "=r" (x) - : "i" (offsetof(struct paca_struct, perf_event_pending))); - return x; -} - -static inline void set_perf_event_pending(void) -{ - asm volatile("stb %0,%1(13)" : : - "r" (1), - "i" (offsetof(struct paca_struct, perf_event_pending))); -} - -static inline void clear_perf_event_pending(void) -{ - asm volatile("stb %0,%1(13)" : : - "r" (0), - "i" (offsetof(struct paca_struct, perf_event_pending))); -} -#endif /* CONFIG_PPC64 */ - -#else /* CONFIG_PERF_EVENTS */ - -static inline unsigned long test_perf_event_pending(void) -{ - return 0; -} - -static inline void clear_perf_event_pending(void) {} -#endif /* CONFIG_PERF_EVENTS */ - #endif /* 
__KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/io.h linux-2.6.32.ovz/arch/powerpc/include/asm/io.h --- linux-2.6.32/arch/powerpc/include/asm/io.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/io.h 2015-09-10 10:07:50.000000000 -0700 @@ -18,6 +18,14 @@ extern int check_legacy_ioport(unsigned #define _PNPWRP 0xa79 #define PNPBIOS_BASE 0xf000 +#if defined(CONFIG_PPC64) && defined(CONFIG_PCI) +extern struct pci_dev *isa_bridge_pcidev; +/* + * has legacy ISA devices ? + */ +#define arch_has_dev_port() (isa_bridge_pcidev != NULL) +#endif + #include #include diff -uprN linux-2.6.32/arch/powerpc/include/asm/iommu.h linux-2.6.32.ovz/arch/powerpc/include/asm/iommu.h --- linux-2.6.32/arch/powerpc/include/asm/iommu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/iommu.h 2015-09-10 10:07:33.000000000 -0700 @@ -53,6 +53,16 @@ static __inline__ __attribute_const__ in */ #define IOMAP_MAX_ORDER 13 +#define IOMMU_POOL_HASHBITS 2 +#define IOMMU_NR_POOLS (1 << IOMMU_POOL_HASHBITS) + +struct iommu_pool { + unsigned long start; + unsigned long end; + unsigned long hint; + spinlock_t lock; +} ____cacheline_aligned_in_smp; + struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ @@ -61,10 +71,10 @@ struct iommu_table { unsigned long it_index; /* which iommu table this is */ unsigned long it_type; /* type: PCI or Virtual Bus */ unsigned long it_blocksize; /* Entries in each block (cacheline) */ - unsigned long it_hint; /* Hint for next alloc */ - unsigned long it_largehint; /* Hint for large allocs */ - unsigned long it_halfpoint; /* Breaking point for small/large allocs */ - spinlock_t it_lock; /* Protects it_map */ + unsigned long poolsize; + unsigned long nr_pools; + struct iommu_pool large_pool; + struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ }; diff -uprN linux-2.6.32/arch/powerpc/include/asm/Kbuild linux-2.6.32.ovz/arch/powerpc/include/asm/Kbuild --- linux-2.6.32/arch/powerpc/include/asm/Kbuild 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/Kbuild 2015-09-10 10:07:43.000000000 -0700 @@ -22,6 +22,8 @@ header-y += sigcontext.h header-y += statfs.h header-y += ps3fb.h +generic-y += trace_clock.h + unifdef-y += bootx.h unifdef-y += byteorder.h unifdef-y += cputable.h diff -uprN linux-2.6.32/arch/powerpc/include/asm/kdump.h linux-2.6.32.ovz/arch/powerpc/include/asm/kdump.h --- linux-2.6.32/arch/powerpc/include/asm/kdump.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/kdump.h 2015-09-10 10:06:08.000000000 -0700 @@ -3,8 +3,17 @@ #include -/* Kdump kernel runs at 32 MB, change at your peril. */ +/* + * If CONFIG_RELOCATABLE is enabled we can place the kdump kernel anywhere. + * To keep enough space in the RMO for the first stage kernel on 64bit, we + * place it at 64MB. If CONFIG_RELOCATABLE is not enabled we must place + * the second stage at 32MB. + */ +#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC64) +#define KDUMP_KERNELBASE 0x4000000 +#else #define KDUMP_KERNELBASE 0x2000000 +#endif /* How many bytes to reserve at zero for kdump. The reserve limit should * be greater or equal to the trampoline's end address. 
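The kdump.h hunk above picks the capture kernel's load address purely from build configuration: 0x4000000 (64MB) when both CONFIG_RELOCATABLE and CONFIG_PPC64 are set, 0x2000000 (32MB) otherwise. A minimal stand-alone C sketch of that selection follows; it is not part of the patch, and the two CONFIG_* defines are stand-ins for the real Kconfig symbols, assumed set the way a relocatable 64-bit build would set them.

    #include <stdio.h>

    /* Stand-ins for the kernel's Kconfig symbols (assumptions for this sketch). */
    #define CONFIG_RELOCATABLE 1
    #define CONFIG_PPC64 1

    #if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC64)
    #define KDUMP_KERNELBASE 0x4000000UL   /* 64 MB, as in the hunk above */
    #else
    #define KDUMP_KERNELBASE 0x2000000UL   /* 32 MB, the non-relocatable case */
    #endif

    int main(void)
    {
            /* 0x4000000 >> 20 == 64 and 0x2000000 >> 20 == 32 */
            printf("kdump capture kernel base: 0x%lx (%lu MB)\n",
                   KDUMP_KERNELBASE, KDUMP_KERNELBASE >> 20);
            return 0;
    }

The extra headroom in the relocatable case keeps the capture kernel clear of the first 32MB that the first-stage kernel's real-mode code still needs.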
diff -uprN linux-2.6.32/arch/powerpc/include/asm/kexec.h linux-2.6.32.ovz/arch/powerpc/include/asm/kexec.h --- linux-2.6.32/arch/powerpc/include/asm/kexec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/kexec.h 2015-09-10 10:07:59.000000000 -0700 @@ -31,6 +31,10 @@ #define KEXEC_ARCH KEXEC_ARCH_PPC #endif +#define KEXEC_STATE_NONE 0 +#define KEXEC_STATE_IRQS_OFF 1 +#define KEXEC_STATE_REAL_MODE 2 + #ifndef __ASSEMBLY__ #include #include @@ -39,6 +43,22 @@ typedef void (*crash_shutdown_t)(void); #ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_AUTO_RESERVE + +#if PAGE_SHIFT==12 +#ifndef KEXEC_AUTO_THRESHOLD +#define KEXEC_AUTO_THRESHOLD (1ULL<<31) /* 2G */ +#endif +#else +#ifndef KEXEC_AUTO_THRESHOLD +#define KEXEC_AUTO_THRESHOLD (1ULL<<33) /* 8G */ +#endif +#endif /*PAGE_SHIFT == 12 */ + +#endif + +#include + /* * This function is responsible for capturing register states if coming * via panic or invoking dump using sysrq-trigger. diff -uprN linux-2.6.32/arch/powerpc/include/asm/kvm_para.h linux-2.6.32.ovz/arch/powerpc/include/asm/kvm_para.h --- linux-2.6.32/arch/powerpc/include/asm/kvm_para.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/kvm_para.h 2015-09-10 10:07:13.000000000 -0700 @@ -32,6 +32,11 @@ static inline unsigned int kvm_arch_para return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif /* __KERNEL__ */ #endif /* __POWERPC_KVM_PARA_H__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/local64.h linux-2.6.32.ovz/arch/powerpc/include/asm/local64.h --- linux-2.6.32/arch/powerpc/include/asm/local64.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/local64.h 2015-09-10 10:06:30.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/powerpc/include/asm/local.h linux-2.6.32.ovz/arch/powerpc/include/asm/local.h --- linux-2.6.32/arch/powerpc/include/asm/local.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/local.h 2015-09-10 10:06:05.000000000 -0700 @@ -4,6 +4,8 @@ #include #include +#define ARCH_USES_RELOC_ENTRIES + typedef struct { atomic_long_t a; @@ -24,7 +26,7 @@ static __inline__ long local_add_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%2 # local_add_return\n\ +"1:" PPC_LLARX(%0,0,%2,0) " # local_add_return\n\ add %0,%1,%0\n" PPC405_ERR77(0,%2) PPC_STLCX "%0,0,%2 \n\ @@ -43,7 +45,7 @@ static __inline__ long local_sub_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%2 # local_sub_return\n\ +"1:" PPC_LLARX(%0,0,%2,0) " # local_sub_return\n\ subf %0,%1,%0\n" PPC405_ERR77(0,%2) PPC_STLCX "%0,0,%2 \n\ @@ -60,7 +62,7 @@ static __inline__ long local_inc_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%1 # local_inc_return\n\ +"1:" PPC_LLARX(%0,0,%1,0) " # local_inc_return\n\ addic %0,%0,1\n" PPC405_ERR77(0,%1) PPC_STLCX "%0,0,%1 \n\ @@ -87,7 +89,7 @@ static __inline__ long local_dec_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%1 # local_dec_return\n\ +"1:" PPC_LLARX(%0,0,%1,0) " # local_dec_return\n\ addic %0,%0,-1\n" PPC405_ERR77(0,%1) PPC_STLCX "%0,0,%1\n\ @@ -117,7 +119,7 @@ static __inline__ int local_add_unless(l long t; __asm__ __volatile__ ( -"1:" PPC_LLARX "%0,0,%1 # local_add_unless\n\ +"1:" PPC_LLARX(%0,0,%1,0) " # local_add_unless\n\ cmpw 0,%0,%3 \n\ beq- 2f \n\ add %0,%2,%0 \n" @@ -147,7 +149,7 @@ static __inline__ long local_dec_if_posi long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%1 # local_dec_if_positive\n\ +"1:" 
PPC_LLARX(%0,0,%1,0) " # local_dec_if_positive\n\ cmpwi %0,1\n\ addi %0,%0,-1\n\ blt- 2f\n" diff -uprN linux-2.6.32/arch/powerpc/include/asm/lppaca.h linux-2.6.32.ovz/arch/powerpc/include/asm/lppaca.h --- linux-2.6.32/arch/powerpc/include/asm/lppaca.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/lppaca.h 2015-09-10 10:06:32.000000000 -0700 @@ -62,7 +62,10 @@ struct lppaca { volatile u32 dyn_pir; // Dynamic ProcIdReg value x20-x23 u32 dsei_data; // DSEI data x24-x27 u64 sprg3; // SPRG3 value x28-x2F - u8 reserved3[80]; // Reserved x30-x7F + u8 reserved3[40]; // Reserved x30-x57 + volatile u8 vphn_assoc_counts[8]; // Virtual processor home node + // associativity change counters x58-x5F + u8 reserved4[32]; // Reserved x60-x7F //============================================================================= // CACHE_LINE_2 0x0080 - 0x00FF Contains local read-write data @@ -100,7 +103,14 @@ struct lppaca { // Used to pass parms from the OS to PLIC for SetAsrAndRfid u64 saved_gpr3; // Saved GPR3 x20-x27 u64 saved_gpr4; // Saved GPR4 x28-x2F - u64 saved_gpr5; // Saved GPR5 x30-x37 + union { + u64 saved_gpr5; /* Saved GPR5 x30-x37 */ + struct { + u8 cede_latency_hint; /* x30 */ + u8 reserved[7]; /* x31-x36 */ + } fields; + } gpr5_dword; + u8 dtl_enable_mask; // Dispatch Trace Log mask x38-x38 u8 donate_dedicated_cpu; // Donate dedicated CPU cycles x39-x39 @@ -163,6 +173,33 @@ struct slb_shadow { extern struct slb_shadow slb_shadow[]; +/* + * Layout of entries in the hypervisor's dispatch trace log buffer. + */ +struct dtl_entry { + u8 dispatch_reason; + u8 preempt_reason; + u16 processor_id; + u32 enqueue_to_dispatch_time; + u32 ready_to_enqueue_time; + u32 waiting_to_ready_time; + u64 timebase; + u64 fault_addr; + u64 srr0; + u64 srr1; +}; + +#define DISPATCH_LOG_BYTES 4096 /* bytes per cpu */ +#define N_DISPATCH_LOG (DISPATCH_LOG_BYTES / sizeof(struct dtl_entry)) + +/* + * When CONFIG_VIRT_CPU_ACCOUNTING = y, the cpu accounting code controls + * reading from the dispatch trace log. If other code wants to consume + * DTL entries, it can set this pointer to a function that will get + * called once for each DTL entry that gets processed. 
+ */ +extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); + #endif /* CONFIG_PPC_BOOK3S */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_LPPACA_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/machdep.h linux-2.6.32.ovz/arch/powerpc/include/asm/machdep.h --- linux-2.6.32/arch/powerpc/include/asm/machdep.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/machdep.h 2015-09-10 10:06:29.000000000 -0700 @@ -102,6 +102,9 @@ struct machdep_calls { void (*pci_dma_dev_setup)(struct pci_dev *dev); void (*pci_dma_bus_setup)(struct pci_bus *bus); + /* Platform set_dma_mask override */ + int (*dma_set_mask)(struct device *dev, u64 dma_mask); + int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ void (*init_early)(void); @@ -266,6 +269,11 @@ struct machdep_calls { void (*suspend_disable_irqs)(void); void (*suspend_enable_irqs)(void); #endif + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + ssize_t (*cpu_probe)(const char *, size_t); + ssize_t (*cpu_release)(const char *, size_t); +#endif }; extern void e500_idle(void); diff -uprN linux-2.6.32/arch/powerpc/include/asm/mmu-hash64.h linux-2.6.32.ovz/arch/powerpc/include/asm/mmu-hash64.h --- linux-2.6.32/arch/powerpc/include/asm/mmu-hash64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/mmu-hash64.h 2015-09-10 10:06:13.000000000 -0700 @@ -257,7 +257,9 @@ extern int hash_page(unsigned long ea, u extern int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local, unsigned long trap); - +extern void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, unsigned long pte); extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long pstart, unsigned long prot, int psize, int ssize); diff -uprN linux-2.6.32/arch/powerpc/include/asm/mmzone.h linux-2.6.32.ovz/arch/powerpc/include/asm/mmzone.h --- linux-2.6.32/arch/powerpc/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/mmzone.h 2015-09-10 10:06:40.000000000 -0700 @@ -33,15 +33,13 @@ extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; #ifdef CONFIG_MEMORY_HOTPLUG extern unsigned long max_pfn; +u64 memory_hotplug_max(void); +#else +#define memory_hotplug_max() lmb_end_of_DRAM() #endif -/* - * Following are macros that each numa implmentation must define. 
- */ - -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) - +#else +#define memory_hotplug_max() lmb_end_of_DRAM() #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/module.h linux-2.6.32.ovz/arch/powerpc/include/asm/module.h --- linux-2.6.32/arch/powerpc/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/module.h 2015-09-10 10:05:48.000000000 -0700 @@ -60,16 +60,26 @@ struct mod_arch_specific { */ #ifdef __powerpc64__ +# define MODULES_ARE_ELF64 # define Elf_Shdr Elf64_Shdr # define Elf_Sym Elf64_Sym # define Elf_Ehdr Elf64_Ehdr +# define Elf_Rel Elf64_Rel +# define Elf_Rela Elf64_Rela +# define ELF_R_TYPE(X) ELF64_R_TYPE(X) +# define ELF_R_SYM(X) ELF64_R_SYM(X) # ifdef MODULE asm(".section .stubs,\"ax\",@nobits; .align 3; .previous"); # endif #else +# define MODULES_ARE_ELF32 # define Elf_Shdr Elf32_Shdr # define Elf_Sym Elf32_Sym # define Elf_Ehdr Elf32_Ehdr +# define Elf_Rel Elf32_Rel +# define Elf_Rela Elf32_Rela +# define ELF_R_TYPE(X) ELF32_R_TYPE(X) +# define ELF_R_SYM(X) ELF32_R_SYM(X) # ifdef MODULE asm(".section .plt,\"ax\",@nobits; .align 3; .previous"); asm(".section .init.plt,\"ax\",@nobits; .align 3; .previous"); @@ -87,5 +97,10 @@ struct exception_table_entry; void sort_ex_table(struct exception_table_entry *start, struct exception_table_entry *finish); +#ifdef CONFIG_MODVERSIONS +#define ARCH_RELOCATES_KCRCTAB + +extern const unsigned long reloc_start[]; +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MODULE_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/mutex.h linux-2.6.32.ovz/arch/powerpc/include/asm/mutex.h --- linux-2.6.32/arch/powerpc/include/asm/mutex.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/mutex.h 2015-09-10 10:07:45.000000000 -0700 @@ -82,17 +82,15 @@ __mutex_fastpath_lock(atomic_t *count, v * __mutex_fastpath_lock_retval - try to take the lock by moving the count * from 1 to a 0 value * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 * - * Change the count from 1 to a value lower than 1, and call if - * it wasn't 1 originally. This function returns 0 if the fastpath succeeds, - * or anything the slow path function returns. + * Change the count from 1 to a value lower than 1. This function returns 0 + * if the fastpath succeeds, or -1 otherwise. 
*/ static inline int -__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) +__mutex_fastpath_lock_retval(atomic_t *count) { if (unlikely(__mutex_dec_return_lock(count) < 0)) - return fail_fn(count); + return -1; return 0; } diff -uprN linux-2.6.32/arch/powerpc/include/asm/nvram.h linux-2.6.32.ovz/arch/powerpc/include/asm/nvram.h --- linux-2.6.32/arch/powerpc/include/asm/nvram.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/nvram.h 2015-09-10 10:06:37.000000000 -0700 @@ -12,6 +12,7 @@ #include +/* Max bytes to read/write in one go */ #define NVRW_CNT 0x20 #define NVRAM_HEADER_LEN 16 /* sizeof(struct nvram_header) */ #define NVRAM_BLOCK_LEN 16 @@ -35,6 +36,7 @@ #define MOTO_RTC_CONTROLA 0x1FF8 #define MOTO_RTC_CONTROLB 0x1FF9 +/* Signatures for nvram partitions */ #define NVRAM_SIG_SP 0x02 /* support processor */ #define NVRAM_SIG_OF 0x50 /* open firmware config */ #define NVRAM_SIG_FW 0x51 /* general firmware */ @@ -54,6 +56,7 @@ struct nvram_header { unsigned char signature; unsigned char checksum; unsigned short length; + /* Terminating null required only for names < 12 chars. */ char name[12]; }; @@ -68,14 +71,15 @@ struct nvram_partition { }; +#ifdef CONFIG_PPC_PSERIES extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, unsigned int * err_type, unsigned int *err_seq); extern int nvram_clear_error_log(void); -extern struct nvram_partition *nvram_find_partition(int sig, const char *name); extern int pSeries_nvram_init(void); +#endif #ifdef CONFIG_MMIO_NVRAM extern int mmio_nvram_init(void); @@ -86,6 +90,20 @@ static inline int mmio_nvram_init(void) } #endif +/* + * This function is no longer used internally, but it's exported, so we + * retain it to preserve the kABI. + */ +extern struct nvram_partition *nvram_find_partition(int sig, const char *name); + +extern int __init nvram_scan_partitions(void); +extern loff_t nvram_create_partition(const char *name, int sig, + int req_size, int min_size); +extern int nvram_remove_partition(const char *name, int sig, + const char *const exceptions[]); +extern int nvram_get_partition_size(loff_t data_index); +extern loff_t nvram_find_partition2(const char *name, int sig, int *out_size); + #endif /* __KERNEL__ */ /* PowerMac specific nvram stuffs */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/paca.h linux-2.6.32.ovz/arch/powerpc/include/asm/paca.h --- linux-2.6.32/arch/powerpc/include/asm/paca.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/paca.h 2015-09-10 10:06:51.000000000 -0700 @@ -78,6 +78,8 @@ struct paca_struct { /* this becomes non-zero. 
*/ #ifdef CONFIG_PPC_STD_MMU_64 struct slb_shadow *slb_shadow_ptr; + struct dtl_entry *dispatch_log; + struct dtl_entry *dispatch_log_end; /* * Now, starting in cacheline 2, the exception save areas @@ -122,13 +124,23 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ +#ifndef __GENKSYMS__ + u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ +#else u8 perf_event_pending; /* PM interrupt while soft-disabled */ +#endif /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ u64 system_time; /* accumulated system TB ticks */ - u64 startpurr; /* PURR/TB value snapshot */ + u64 user_time_scaled; /* accumulated usermode SPURR ticks */ + u64 starttime; /* TB value snapshot */ + u64 starttime_user; /* TB value on exit to usermode */ u64 startspurr; /* SPURR value snapshot */ + u64 utime_sspurr; /* ->user_time when ->startspurr set */ + u64 stolen_time; /* TB ticks taken by hypervisor */ + u64 dtl_ridx; /* read index in dispatch log */ + struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ }; extern struct paca_struct paca[]; diff -uprN linux-2.6.32/arch/powerpc/include/asm/page_64.h linux-2.6.32.ovz/arch/powerpc/include/asm/page_64.h --- linux-2.6.32/arch/powerpc/include/asm/page_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/page_64.h 2015-09-10 10:07:25.000000000 -0700 @@ -59,24 +59,7 @@ static __inline__ void clear_page(void * : "ctr", "memory"); } -extern void copy_4K_page(void *to, void *from); - -#ifdef CONFIG_PPC_64K_PAGES -static inline void copy_page(void *to, void *from) -{ - unsigned int i; - for (i=0; i < (1 << (PAGE_SHIFT - 12)); i++) { - copy_4K_page(to, from); - to += 4096; - from += 4096; - } -} -#else /* CONFIG_PPC_64K_PAGES */ -static inline void copy_page(void *to, void *from) -{ - copy_4K_page(to, from); -} -#endif /* CONFIG_PPC_64K_PAGES */ +extern void copy_page(void *to, void *from); /* Log 2 of page table size */ extern u64 ppc64_pft_size; diff -uprN linux-2.6.32/arch/powerpc/include/asm/page.h linux-2.6.32.ovz/arch/powerpc/include/asm/page.h --- linux-2.6.32/arch/powerpc/include/asm/page.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/page.h 2015-09-10 10:06:52.000000000 -0700 @@ -234,6 +234,7 @@ extern void clear_user_page(void *page, extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); extern int page_is_ram(unsigned long pfn); +extern int devmem_is_allowed(unsigned long pfn); #ifdef CONFIG_PPC_SMLPAR void arch_free_page(struct page *page, int order); diff -uprN linux-2.6.32/arch/powerpc/include/asm/pci-bridge.h linux-2.6.32.ovz/arch/powerpc/include/asm/pci-bridge.h --- linux-2.6.32/arch/powerpc/include/asm/pci-bridge.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pci-bridge.h 2015-09-10 10:07:23.000000000 -0700 @@ -10,58 +10,10 @@ #include #include #include +#include struct device_node; -enum { - /* Force re-assigning all resources (ignore firmware - * setup completely) - */ - PPC_PCI_REASSIGN_ALL_RSRC = 0x00000001, - - /* Re-assign all bus numbers */ - PPC_PCI_REASSIGN_ALL_BUS = 0x00000002, - - /* Do not try to assign, just use existing setup */ - PPC_PCI_PROBE_ONLY = 0x00000004, - - /* Don't bother with ISA alignment unless the bridge has - * ISA forwarding enabled - */ - PPC_PCI_CAN_SKIP_ISA_ALIGN = 0x00000008, - - /* Enable domain numbers 
in /proc */ - PPC_PCI_ENABLE_PROC_DOMAINS = 0x00000010, - /* ... except for domain 0 */ - PPC_PCI_COMPAT_DOMAIN_0 = 0x00000020, -}; -#ifdef CONFIG_PCI -extern unsigned int ppc_pci_flags; - -static inline void ppc_pci_set_flags(int flags) -{ - ppc_pci_flags = flags; -} - -static inline void ppc_pci_add_flags(int flags) -{ - ppc_pci_flags |= flags; -} - -static inline int ppc_pci_has_flag(int flag) -{ - return (ppc_pci_flags & flag); -} -#else -static inline void ppc_pci_set_flags(int flags) { } -static inline void ppc_pci_add_flags(int flags) { } -static inline int ppc_pci_has_flag(int flag) -{ - return 0; -} -#endif - - /* * Structure of a PCI controller (host bridge) */ @@ -300,7 +252,6 @@ extern void pci_process_bridge_OF_ranges /* Allocate & free a PCI host bridge structure */ extern struct pci_controller *pcibios_alloc_controller(struct device_node *dev); extern void pcibios_free_controller(struct pci_controller *phb); -extern void pcibios_setup_phb_resources(struct pci_controller *hose); #ifdef CONFIG_PCI extern unsigned long pci_address_to_pio(phys_addr_t address); diff -uprN linux-2.6.32/arch/powerpc/include/asm/pci.h linux-2.6.32.ovz/arch/powerpc/include/asm/pci.h --- linux-2.6.32/arch/powerpc/include/asm/pci.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pci.h 2015-09-10 10:07:23.000000000 -0700 @@ -44,7 +44,7 @@ struct pci_dev; * bus numbers (don't do that on ppc64 yet !) */ #define pcibios_assign_all_busses() \ - (ppc_pci_has_flag(PPC_PCI_REASSIGN_ALL_BUS)) + (pci_has_flag(PCI_REASSIGN_ALL_BUS)) static inline void pcibios_set_master(struct pci_dev *dev) { @@ -141,37 +141,7 @@ extern int pci_mmap_legacy_page_range(st #define HAVE_PCI_LEGACY 1 -#if defined(CONFIG_PPC64) || defined(CONFIG_NOT_COHERENT_CACHE) -/* - * For 64-bit kernels, pci_unmap_{single,page} is not a nop. - * For 32-bit non-coherent kernels, pci_dma_sync_single_for_cpu() and - * so on are not nops. - * and thus... - */ -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ - dma_addr_t ADDR_NAME; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ - __u32 LEN_NAME; -#define pci_unmap_addr(PTR, ADDR_NAME) \ - ((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - (((PTR)->ADDR_NAME) = (VAL)) -#define pci_unmap_len(PTR, LEN_NAME) \ - ((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - (((PTR)->LEN_NAME) = (VAL)) - -#else /* 32-bit && coherent */ - -/* pci_unmap_{page,single} is a nop so... 
*/ -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) -#define pci_unmap_addr(PTR, ADDR_NAME) (0) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define pci_unmap_len(PTR, LEN_NAME) (0) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) - -#endif /* CONFIG_PPC64 || CONFIG_NOT_COHERENT_CACHE */ +#include #ifdef CONFIG_PPC64 @@ -191,14 +161,6 @@ extern int pci_mmap_legacy_page_range(st #endif /* CONFIG_PPC64 */ -extern void pcibios_resource_to_bus(struct pci_dev *dev, - struct pci_bus_region *region, - struct resource *res); - -extern void pcibios_bus_to_resource(struct pci_dev *dev, - struct resource *res, - struct pci_bus_region *region); - extern void pcibios_claim_one_bus(struct pci_bus *b); extern void pcibios_finish_adding_to_bus(struct pci_bus *bus); diff -uprN linux-2.6.32/arch/powerpc/include/asm/perf_event_fsl_emb.h linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_fsl_emb.h --- linux-2.6.32/arch/powerpc/include/asm/perf_event_fsl_emb.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_fsl_emb.h 2015-09-10 10:06:30.000000000 -0700 @@ -0,0 +1,50 @@ +/* + * Performance event support - Freescale embedded specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * Copyright 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#define MAX_HWEVENTS 4 + +/* event flags */ +#define FSL_EMB_EVENT_VALID 1 +#define FSL_EMB_EVENT_RESTRICTED 2 + +/* upper half of event flags is PMLCb */ +#define FSL_EMB_EVENT_THRESHMUL 0x0000070000000000ULL +#define FSL_EMB_EVENT_THRESH 0x0000003f00000000ULL + +struct fsl_emb_pmu { + const char *name; + int n_counter; /* total number of counters */ + + /* + * The number of contiguous counters starting at zero that + * can hold restricted events, or zero if there are no + * restricted events. + * + * This isn't a very flexible method of expressing constraints, + * but it's very simple and is adequate for existing chips. + */ + int n_restricted; + + /* Returns event flags and PMLCb (FSL_EMB_EVENT_*) */ + u64 (*xlate_event)(u64 event_id); + + int n_generic; + int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +int register_fsl_emb_pmu(struct fsl_emb_pmu *); diff -uprN linux-2.6.32/arch/powerpc/include/asm/perf_event.h linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event.h --- linux-2.6.32/arch/powerpc/include/asm/perf_event.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event.h 2015-09-10 10:07:03.000000000 -0700 @@ -1,110 +1,40 @@ /* - * Performance event support - PowerPC-specific definitions. + * Performance event support - hardware-specific disambiguation * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * For now this is a compile-time decision, but eventually it should be + * runtime. This would allow multiplatform perf event support for e300 (fsl + * embedded perf counters) plus server/classic, and would accommodate + * devices other than the core which provide their own performance counters. + * + * Copyright 2010 Freescale Semiconductor, Inc. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include - -#include -#define MAX_HWEVENTS 8 -#define MAX_EVENT_ALTERNATIVES 8 -#define MAX_LIMITED_HWCOUNTERS 2 - -/* - * This struct provides the constants and functions needed to - * describe the PMU on a particular POWER-family CPU. - */ -struct power_pmu { - const char *name; - int n_counter; - int max_alternatives; - unsigned long add_fields; - unsigned long test_adder; - int (*compute_mmcr)(u64 events[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]); - int (*get_constraint)(u64 event_id, unsigned long *mskp, - unsigned long *valp); - int (*get_alternatives)(u64 event_id, unsigned int flags, - u64 alt[]); - void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); - int (*limited_pmc_event)(u64 event_id); - u32 flags; - int n_generic; - int *generic_events; - int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -}; - -/* - * Values for power_pmu.flags - */ -#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ -#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ - -/* - * Values for flags to get_alternatives() - */ -#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ -#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ -#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ - -extern int register_power_pmu(struct power_pmu *); +#ifdef CONFIG_PPC_PERF_CTRS +#include +#endif -struct pt_regs; -extern unsigned long perf_misc_flags(struct pt_regs *regs); -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +#ifdef CONFIG_FSL_EMB_PERF_EVENT +#include +#endif -#define PERF_EVENT_INDEX_OFFSET 1 +#ifdef CONFIG_PERF_EVENTS +#include +#include /* - * Only override the default definitions in include/linux/perf_event.h - * if we have hardware PMU support. - */ -#ifdef CONFIG_PPC_PERF_CTRS -#define perf_misc_flags(regs) perf_misc_flags(regs) + * Overload regs->result to specify whether we should use the MSR (result + * is zero) or the SIAR (result is non zero). + */ +#define perf_arch_fetch_caller_regs(regs, __ip) \ + do { \ + (regs)->result = 0; \ + (regs)->nip = __ip; \ + (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ + } while (0) #endif - -/* - * The power_pmu.get_constraint function returns a 32/64-bit value and - * a 32/64-bit mask that express the constraints between this event_id and - * other events. - * - * The value and mask are divided up into (non-overlapping) bitfields - * of three different types: - * - * Select field: this expresses the constraint that some set of bits - * in MMCR* needs to be set to a specific value for this event_id. For a - * select field, the mask contains 1s in every bit of the field, and - * the value contains a unique value for each possible setting of the - * MMCR* bits. The constraint checking code will ensure that two events - * that set the same field in their masks have the same value in their - * value dwords. - * - * Add field: this expresses the constraint that there can be at most - * N events in a particular class. A field of k bits can be used for - * N <= 2^(k-1) - 1. 
The mask has the most significant bit of the field - set (and the other bits 0), and the value has only the least significant - * bit of the field set. In addition, the 'add_fields' and 'test_adder' - * in the struct power_pmu for this processor come into play. The - * add_fields value contains 1 in the LSB of the field, and the - * test_adder contains 2^(k-1) - 1 - N in the field. - * - * NAND field: this expresses the constraint that you may not have events - * in all of a set of classes. (For example, on PPC970, you can't select - * events from the FPU, ISU and IDU simultaneously, although any two are - * possible.) For N classes, the field is N+1 bits wide, and each class - * is assigned one bit from the least-significant N bits. The mask has - * only the most-significant bit set, and the value has only the bit - * for the event_id's class set. The test_adder has the least significant - * bit set in the field. - * - * If an event_id is not subject to the constraint expressed by a particular - * field, then it will have 0 in both the mask and value for that field. - */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/perf_event_server.h linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_server.h --- linux-2.6.32/arch/powerpc/include/asm/perf_event_server.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_server.h 2015-09-10 10:07:34.000000000 -0700 @@ -0,0 +1,113 @@ +/* + * Performance event support - PowerPC classic/server specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/types.h> +#include <asm/hw_irq.h> + +#define MAX_HWEVENTS 8 +#define MAX_EVENT_ALTERNATIVES 8 +#define MAX_LIMITED_HWCOUNTERS 2 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU.
+ */ +struct power_pmu { + const char *name; + int n_counter; + int max_alternatives; + unsigned long add_fields; + unsigned long test_adder; + int (*compute_mmcr)(u64 events[], int n_ev, + unsigned int hwc[], unsigned long mmcr[]); + int (*get_constraint)(u64 event_id, unsigned long *mskp, + unsigned long *valp); + int (*get_alternatives)(u64 event_id, unsigned int flags, + u64 alt[]); + void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); + int (*limited_pmc_event)(u64 event_id); + u32 flags; + int n_generic; + int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ +#define PPMU_NO_SIPR 4 /* no SIPR/HV in MMCRA at all */ +#define PPMU_NO_CONT_SAMPLING 8 /* no continuous sampling */ +#define PPMU_SIAR_VALID 16 /* Processor has SIAR Valid bit */ +#define PPMU_HAS_SSLOT 0x00000020 /* Has sampled slot in MMCRA */ +#define PPMU_HAS_SIER 0x00000040 /* Has SIER */ + +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ + +extern int register_power_pmu(struct power_pmu *); + +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + +/* + * Only override the default definitions in include/linux/perf_event.h + * if we have hardware PMU support. + */ +#ifdef CONFIG_PPC_PERF_CTRS +#define perf_misc_flags(regs) perf_misc_flags(regs) +#endif + +/* + * The power_pmu.get_constraint function returns a 32/64-bit value and + * a 32/64-bit mask that express the constraints between this event_id and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event_id. For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits. The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class. A field of k bits can be used for + * N <= 2^(k-1) - 1. The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set. In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play. The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes. (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.) For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits. The mask has + * only the most-significant bit set, and the value has only the bit + * for the event_id's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event_id is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */
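To make the add-field rule just described concrete, a worked example (illustrative numbers only): with a k = 3 bit field permitting at most N = 2 events of a class, each such event gets mask = 0b100 (the MSB of the field) and value = 0b001 (the LSB); add_fields holds 1 in the field and test_adder holds 2^(k-1) - 1 - N = 4 - 1 - 2 = 1. Summing the value fields of n chosen events and adding test_adder yields n + 1, which first carries into the mask-checked MSB at n = 3, exactly the point where the two-event limit is exceeded.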
The mask has + * only the most-significant bit set, and the value has only the bit + * for the event_id's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event_id is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/pgalloc-64.h linux-2.6.32.ovz/arch/powerpc/include/asm/pgalloc-64.h --- linux-2.6.32/arch/powerpc/include/asm/pgalloc-64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pgalloc-64.h 2015-09-10 10:05:57.000000000 -0700 @@ -11,6 +11,12 @@ #include #include +struct vmemmap_backing { + struct vmemmap_backing *list; + unsigned long phys; + unsigned long virt_addr; +}; + #ifndef CONFIG_PPC_SUBPAGE_PROT static inline void subpage_prot_free(pgd_t *pgd) {} #endif diff -uprN linux-2.6.32/arch/powerpc/include/asm/pgtable.h linux-2.6.32.ovz/arch/powerpc/include/asm/pgtable.h --- linux-2.6.32/arch/powerpc/include/asm/pgtable.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pgtable.h 2015-09-10 10:06:35.000000000 -0700 @@ -170,6 +170,7 @@ extern int ptep_set_access_flags(struct #define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \ _PAGE_COHERENT | _PAGE_WRITETHRU)) +#define pgprot_writecombine pgprot_noncached_wc struct file; extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff -uprN linux-2.6.32/arch/powerpc/include/asm/phyp_dump.h linux-2.6.32.ovz/arch/powerpc/include/asm/phyp_dump.h --- linux-2.6.32/arch/powerpc/include/asm/phyp_dump.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/phyp_dump.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,47 +0,0 @@ -/* - * Hypervisor-assisted dump - * - * Linas Vepstas, Manish Ahuja 2008 - * Copyright 2008 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _PPC64_PHYP_DUMP_H -#define _PPC64_PHYP_DUMP_H - -#ifdef CONFIG_PHYP_DUMP - -/* The RMR region will be saved for later dumping - * whenever the kernel crashes. Set this to 256MB. */ -#define PHYP_DUMP_RMR_START 0x0 -#define PHYP_DUMP_RMR_END (1UL<<28) - -struct phyp_dump { - /* Memory that is reserved during very early boot. 
*/ - unsigned long init_reserve_start; - unsigned long init_reserve_size; - /* cmd line options during boot */ - unsigned long reserve_bootvar; - unsigned long phyp_dump_at_boot; - /* Check status during boot if dump supported, active & present*/ - unsigned long phyp_dump_configured; - unsigned long phyp_dump_is_active; - /* store cpu & hpte size */ - unsigned long cpu_state_size; - unsigned long hpte_region_size; - /* previous scratch area values */ - unsigned long reserved_scratch_addr; - unsigned long reserved_scratch_size; -}; - -extern struct phyp_dump *phyp_dump_info; - -int early_init_dt_scan_phyp_dump(unsigned long node, - const char *uname, int depth, void *data); - -#endif /* CONFIG_PHYP_DUMP */ -#endif /* _PPC64_PHYP_DUMP_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/ppc_asm.h linux-2.6.32.ovz/arch/powerpc/include/asm/ppc_asm.h --- linux-2.6.32/arch/powerpc/include/asm/ppc_asm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ppc_asm.h 2015-09-10 10:06:17.000000000 -0700 @@ -9,6 +9,7 @@ #include <asm/asm-compat.h> #include <asm/processor.h> #include <asm/ppc-opcode.h> +#include <asm/firmware.h> #ifndef __ASSEMBLY__ #error __FILE__ should only be used in assembler files @@ -26,17 +27,13 @@ #ifndef CONFIG_VIRT_CPU_ACCOUNTING #define ACCOUNT_CPU_USER_ENTRY(ra, rb) #define ACCOUNT_CPU_USER_EXIT(ra, rb) +#define ACCOUNT_STOLEN_TIME #else #define ACCOUNT_CPU_USER_ENTRY(ra, rb) \ beq 2f; /* if from kernel mode */ \ -BEGIN_FTR_SECTION; \ - mfspr ra,SPRN_PURR; /* get processor util. reg */ \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ -BEGIN_FTR_SECTION; \ - MFTB(ra); /* or get TB if no PURR */ \ -END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ - ld rb,PACA_STARTPURR(r13); \ - std ra,PACA_STARTPURR(r13); \ + MFTB(ra); /* get timebase */ \ + ld rb,PACA_STARTTIME_USER(r13); \ + std ra,PACA_STARTTIME(r13); \ subf rb,rb,ra; /* subtract start value */ \ ld ra,PACA_USER_TIME(r13); \ add ra,ra,rb; /* add on to user time */ \ @@ -44,19 +41,34 @@ END_FTR_SECTION_IFCLR(CPU_FTR_PURR); 2: #define ACCOUNT_CPU_USER_EXIT(ra, rb) \ -BEGIN_FTR_SECTION; \ - mfspr ra,SPRN_PURR; /* get processor util.
reg */ \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ -BEGIN_FTR_SECTION; \ - MFTB(ra); /* or get TB if no PURR */ \ -END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ - ld rb,PACA_STARTPURR(r13); \ - std ra,PACA_STARTPURR(r13); \ + MFTB(ra); /* get timebase */ \ + ld rb,PACA_STARTTIME(r13); \ + std ra,PACA_STARTTIME_USER(r13); \ subf rb,rb,ra; /* subtract start value */ \ ld ra,PACA_SYSTEM_TIME(r13); \ - add ra,ra,rb; /* add on to user time */ \ - std ra,PACA_SYSTEM_TIME(r13); -#endif + add ra,ra,rb; /* add on to system time */ \ + std ra,PACA_SYSTEM_TIME(r13) + +#ifdef CONFIG_PPC_SPLPAR +#define ACCOUNT_STOLEN_TIME \ +BEGIN_FW_FTR_SECTION; \ + beq 33f; \ + /* from user - see if there are any DTL entries to process */ \ + ld r10,PACALPPACAPTR(r13); /* get ptr to VPA */ \ + ld r11,PACA_DTL_RIDX(r13); /* get log read index */ \ + ld r10,LPPACA_DTLIDX(r10); /* get log write index */ \ + cmpd cr1,r11,r10; \ + beq+ cr1,33f; \ + bl .accumulate_stolen_time; \ +33: \ +END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) + +#else /* CONFIG_PPC_SPLPAR */ +#define ACCOUNT_STOLEN_TIME + +#endif /* CONFIG_PPC_SPLPAR */ + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ /* * Macros for storing registers into and loading registers from diff -uprN linux-2.6.32/arch/powerpc/include/asm/ppc-opcode.h linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-opcode.h --- linux-2.6.32/arch/powerpc/include/asm/ppc-opcode.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-opcode.h 2015-09-10 10:06:05.000000000 -0700 @@ -22,8 +22,10 @@ #define PPC_INST_DCBZL 0x7c2007ec #define PPC_INST_ISEL 0x7c00001e #define PPC_INST_ISEL_MASK 0xfc00003e +#define PPC_INST_LDARX 0x7c0000a8 #define PPC_INST_LSWI 0x7c0004aa #define PPC_INST_LSWX 0x7c00042a +#define PPC_INST_LWARX 0x7c000028 #define PPC_INST_LWSYNC 0x7c2004ac #define PPC_INST_LXVD2X 0x7c000698 #define PPC_INST_MCRXR 0x7c000400 @@ -55,15 +57,31 @@ #define __PPC_RA(a) (((a) & 0x1f) << 16) #define __PPC_RB(b) (((b) & 0x1f) << 11) #define __PPC_RS(s) (((s) & 0x1f) << 21) +#define __PPC_RT(s) __PPC_RS(s) #define __PPC_XS(s) ((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5)) #define __PPC_T_TLB(t) (((t) & 0x3) << 21) #define __PPC_WC(w) (((w) & 0x3) << 21) +/* + * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a + * larx with EH set as an illegal instruction. 
+ */ +#ifdef CONFIG_PPC64 +#define __PPC_EH(eh) (((eh) & 0x1) << 0) +#else +#define __PPC_EH(eh) 0 +#endif /* Deal with instructions that older assemblers aren't aware of */ #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ __PPC_RA(a) | __PPC_RB(b)) +#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \ + __PPC_RT(t) | __PPC_RA(a) | \ + __PPC_RB(b) | __PPC_EH(eh)) +#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \ + __PPC_RT(t) | __PPC_RA(a) | \ + __PPC_RB(b) | __PPC_EH(eh)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ __PPC_RB(b)) #define PPC_RFCI stringify_in_c(.long PPC_INST_RFCI) diff -uprN linux-2.6.32/arch/powerpc/include/asm/ppc-pci.h linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-pci.h --- linux-2.6.32/arch/powerpc/include/asm/ppc-pci.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-pci.h 2015-09-10 10:07:23.000000000 -0700 @@ -45,8 +45,6 @@ extern void init_pci_config_tokens (void extern unsigned long get_phb_buid (struct device_node *); extern int rtas_setup_phb(struct pci_controller *phb); -extern unsigned long pci_probe_only; - /* ---- EEH internal-use-only related routines ---- */ #ifdef CONFIG_EEH @@ -137,6 +135,16 @@ struct device_node * find_device_pe(stru void eeh_sysfs_add_device(struct pci_dev *pdev); void eeh_sysfs_remove_device(struct pci_dev *pdev); +static inline const char *eeh_pci_name(struct pci_dev *pdev) +{ + return pdev ? pci_name(pdev) : ""; +} + +static inline const char *eeh_driver_name(struct pci_dev *pdev) +{ + return (pdev && pdev->driver) ? pdev->driver->name : ""; +} + #endif /* CONFIG_EEH */ #else /* CONFIG_PCI */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/pSeries_reconfig.h linux-2.6.32.ovz/arch/powerpc/include/asm/pSeries_reconfig.h --- linux-2.6.32/arch/powerpc/include/asm/pSeries_reconfig.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pSeries_reconfig.h 2015-09-10 10:07:48.000000000 -0700 @@ -13,16 +13,33 @@ #define PSERIES_RECONFIG_REMOVE 0x0002 #define PSERIES_DRCONF_MEM_ADD 0x0003 #define PSERIES_DRCONF_MEM_REMOVE 0x0004 +#define PSERIES_UPDATE_PROPERTY 0x0005 + +/** + * pSeries_reconfig_notify - Notifier value structure for OFDT property updates + * + * @node: Device tree node which owns the property being updated + * @property: Updated property + */ +struct pSeries_reconfig_prop_update { + struct device_node *node; + struct property *property; +}; #ifdef CONFIG_PPC_PSERIES extern int pSeries_reconfig_notifier_register(struct notifier_block *); extern void pSeries_reconfig_notifier_unregister(struct notifier_block *); +extern struct blocking_notifier_head pSeries_reconfig_chain; +/* Not the best place to put this, will be fixed when we move some + * of the rtas suspend-me stuff to pseries */ +extern void pSeries_coalesce_init(void); #else /* !CONFIG_PPC_PSERIES */ static inline int pSeries_reconfig_notifier_register(struct notifier_block *nb) { return 0; } static inline void pSeries_reconfig_notifier_unregister(struct notifier_block *nb) { } +static inline void pSeries_coalesce_init(void) { } #endif /* CONFIG_PPC_PSERIES */ #endif /* __KERNEL__ */
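For context, a sketch of how a consumer of the new PSERIES_UPDATE_PROPERTY event above might look (hypothetical handler, not part of the patch):

	#include <linux/notifier.h>

	static int demo_prop_update(struct notifier_block *nb,
				    unsigned long action, void *data)
	{
		if (action == PSERIES_UPDATE_PROPERTY) {
			struct pSeries_reconfig_prop_update *update = data;
			/* update->node owns the updated update->property */
		}
		return NOTIFY_OK;
	}

	static struct notifier_block demo_nb = {
		.notifier_call = demo_prop_update,
	};

	/* registered with pSeries_reconfig_notifier_register(&demo_nb); */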
diff -uprN linux-2.6.32/arch/powerpc/include/asm/ptrace.h linux-2.6.32.ovz/arch/powerpc/include/asm/ptrace.h --- linux-2.6.32/arch/powerpc/include/asm/ptrace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ptrace.h 2015-09-10 10:06:36.000000000 -0700 @@ -83,7 +83,20 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) -#define regs_return_value(regs) ((regs)->gpr[3]) + +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->ccr & 0x10000000); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->gpr[3]; + else + return -regs->gpr[3]; +} + #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *regs); @@ -122,8 +135,10 @@ extern int ptrace_put_reg(struct task_st #endif /* ! __powerpc64__ */ #define TRAP(regs) ((regs)->trap & ~0xF) #ifdef __powerpc64__ +#define NV_REG_POISON 0xdeadbeefdeadbeefUL #define CHECK_FULL_REGS(regs) BUG_ON(regs->trap & 1) #else +#define NV_REG_POISON 0xdeadbeef #define CHECK_FULL_REGS(regs) \ do { \ if ((regs)->trap & 1) \ @@ -140,6 +155,8 @@ extern void user_enable_single_step(stru extern void user_enable_block_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); +#define ARCH_HAS_USER_SINGLE_STEP_INFO + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/reg.h linux-2.6.32.ovz/arch/powerpc/include/asm/reg.h --- linux-2.6.32/arch/powerpc/include/asm/reg.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/reg.h 2015-09-10 10:07:34.000000000 -0700 @@ -413,6 +413,7 @@ #define SPRN_SPRG1 0x111 /* Special Purpose Register General 1 */ #define SPRN_SPRG2 0x112 /* Special Purpose Register General 2 */ #define SPRN_SPRG3 0x113 /* Special Purpose Register General 3 */ +#define SPRN_USPRG3 0x103 /* SPRG3 userspace read */ #define SPRN_SPRG4 0x114 /* Special Purpose Register General 4 */ #define SPRN_SPRG5 0x115 /* Special Purpose Register General 5 */ #define SPRN_SPRG6 0x116 /* Special Purpose Register General 6 */ @@ -489,6 +490,8 @@ #define SPRN_MMCR1 798 #define SPRN_MMCRA 0x312 #define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */ +#define MMCRA_SDAR_DCACHE_MISS 0x40000000UL +#define MMCRA_SDAR_ERAT_MISS 0x20000000UL #define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */ #define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */ #define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */ @@ -499,6 +502,10 @@ #define POWER6_MMCRA_SIPR 0x0000020000000000ULL #define POWER6_MMCRA_THRM 0x00000020UL #define POWER6_MMCRA_OTHER 0x0000000EUL + +#define POWER7P_MMCRA_SIAR_VALID 0x10000000 /* P7+ SIAR contents valid */ +#define POWER7P_MMCRA_SDAR_VALID 0x08000000 /* P7+ SDAR contents valid */ + #define SPRN_PMC1 787 #define SPRN_PMC2 788 #define SPRN_PMC3 789 @@ -509,6 +516,11 @@ #define SPRN_PMC8 794 #define SPRN_SIAR 780 #define SPRN_SDAR 781 +#define SPRN_SIER 784 +#define SIER_SIPR 0x2000000 /* Sampled MSR_PR */ +#define SIER_SIHV 0x1000000 /* Sampled MSR_HV */ +#define SIER_SIAR_VALID 0x0400000 /* SIAR contents valid */ +#define SIER_SDAR_VALID 0x0200000 /* SDAR contents valid */ #define SPRN_PA6T_MMCR0 795 #define PA6T_MMCR0_EN0 0x0000000000000001UL @@ -650,12 +662,12 @@ * 64-bit server: * - SPRG0 unused (reserved for HV on Power4) * - SPRG2 scratch for exception vectors - * - SPRG3 unused (user visible) + * - SPRG3 CPU and NUMA node for VDSO getcpu (user visible) * * 64-bit embedded * - SPRG0 generic exception scratch * - SPRG2 TLB exception stack - * - SPRG3 unused (user visible) + * - SPRG3 CPU and NUMA node for VDSO getcpu (user visible) * - SPRG4 unused (user visible) * - SPRG6 TLB miss scratch
(user visible, sorry !) * - SPRG7 critical exception scratch @@ -858,11 +870,13 @@ #define PV_970 0x0039 #define PV_POWER5 0x003A #define PV_POWER5p 0x003B +#define PV_POWER7 0x003F #define PV_970FX 0x003C #define PV_630 0x0040 #define PV_630p 0x0041 #define PV_970MP 0x0044 #define PV_970GX 0x0045 +#define PVR_POWER7p 0x004A #define PV_BE 0x0070 #define PV_PA6T 0x0090 diff -uprN linux-2.6.32/arch/powerpc/include/asm/rtas.h linux-2.6.32.ovz/arch/powerpc/include/asm/rtas.h --- linux-2.6.32/arch/powerpc/include/asm/rtas.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/rtas.h 2015-09-10 10:07:40.000000000 -0700 @@ -63,6 +63,14 @@ struct rtas_t { struct device_node *dev; /* virtual address pointer */ }; +struct rtas_suspend_me_data { + atomic_t working; /* number of cpus accessing this struct */ + int done; + int token; /* ibm,suspend-me */ + int error; + struct completion *complete; /* wait on this until working == 0 */ +}; + /* RTAS event classes */ #define RTAS_INTERNAL_ERROR 0x80000000 /* set bit 0 */ #define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */ @@ -147,9 +155,85 @@ struct rtas_error_log { unsigned long target:4; /* Target of failed operation */ unsigned long type:8; /* General event or error*/ unsigned long extended_log_length:32; /* length in bytes */ - unsigned char buffer[1]; + unsigned char buffer[1]; /* Start of extended log */ + /* Variable length. */ +}; + +#define RTAS_V6EXT_LOG_FORMAT_EVENT_LOG 14 + +#define RTAS_V6EXT_COMPANY_ID_IBM (('I' << 24) | ('B' << 16) | ('M' << 8)) + +/* RTAS general extended event log, Version 6. The extended log starts + * from "buffer" field of struct rtas_error_log defined above. + */ +struct rtas_ext_event_log_v6 { + /* Byte 0 */ + uint32_t log_valid:1; /* 1:Log valid */ + uint32_t unrecoverable_error:1; /* 1:Unrecoverable error */ + uint32_t recoverable_error:1; /* 1:recoverable (correctable */ + /* or successfully retried) */ + uint32_t degraded_operation:1; /* 1:Unrecoverable err, bypassed*/ + /* - degraded operation (e.g. */ + /* CPU or mem taken off-line) */ + uint32_t predictive_error:1; + uint32_t new_log:1; /* 1:"New" log (Always 1 for */ + /* data returned from RTAS */ + uint32_t big_endian:1; /* 1: Big endian */ + uint32_t :1; /* reserved */ + /* Byte 1 */ + uint32_t :8; /* reserved */ + /* Byte 2 */ + uint32_t powerpc_format:1; /* Set to 1 (indicating log is */ + /* in PowerPC format */ + uint32_t :3; /* reserved */ + uint32_t log_format:4; /* Log format indicator. Define */ + /* format used for byte 12-2047 */ + /* Byte 3 */ + uint32_t :8; /* reserved */ + /* Byte 4-11 */ + uint8_t reserved[8]; /* reserved */ + /* Byte 12-15 */ + uint32_t company_id; /* Company ID of the company */ + /* that defines the format for */ + /* the vendor specific log type */ + /* Byte 16-end of log */ + uint8_t vendor_log[1]; /* Start of vendor specific log */ + /* Variable length. 
*/ }; +/* pSeries event log format */ + +/* Two bytes ASCII section IDs */ +#define PSERIES_ELOG_SECT_ID_PRIV_HDR (('P' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_HDR (('U' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_PRIMARY_SRC (('P' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_EXTENDED_UH (('E' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FAILING_MTMS (('M' << 8) | 'T') +#define PSERIES_ELOG_SECT_ID_SECONDARY_SRC (('S' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_DUMP_LOCATOR (('D' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FW_ERROR (('S' << 8) | 'W') +#define PSERIES_ELOG_SECT_ID_IMPACT_PART_ID (('L' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_LOGIC_RESOURCE_ID (('L' << 8) | 'R') +#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') +#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') +#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') +#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') + +/* Vendor specific Platform Event Log Format, Version 6, section header */ +struct pseries_errorlog { + uint16_t id; /* 0x00 2-byte ASCII section ID */ + uint16_t length; /* 0x02 Section length in bytes */ + uint8_t version; /* 0x04 Section version */ + uint8_t subtype; /* 0x05 Section subtype */ + uint16_t creator_component; /* 0x06 Creator component ID */ + uint8_t data[]; /* 0x08 Start of section data */ +}; + +struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id); + /* * This can be set by the rtas_flash module so that it can get called * as the absolutely last thing before the kernel terminates. @@ -174,6 +258,10 @@ extern int rtas_set_indicator(int indica extern int rtas_set_indicator_fast(int indicator, int index, int new_value); extern void rtas_progress(char *s, unsigned short hex); extern void rtas_initialize(void); +extern int rtas_suspend_cpu(struct rtas_suspend_me_data *data); +extern int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data); +extern int rtas_online_cpus_mask(cpumask_var_t cpus); +extern int rtas_offline_cpus_mask(cpumask_var_t cpus); struct rtc_time; extern unsigned long rtas_get_boot_time(void); @@ -188,6 +276,12 @@ extern int early_init_dt_scan_rtas(unsig extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); +#ifdef CONFIG_PPC_PSERIES +extern void rtas_cancel_event_scan(void); +#else +static inline void rtas_cancel_event_scan(void) { } +#endif + /* Error types logged. 
*/ #define ERR_FLAG_ALREADY_LOGGED 0x0 #define ERR_FLAG_BOOT 0x1 /* log was pulled from NVRAM on boot */ @@ -248,5 +342,17 @@ static inline u32 rtas_config_addr(int b extern void __cpuinit rtas_give_timebase(void); extern void __cpuinit rtas_take_timebase(void); +#ifdef CONFIG_PPC_RTAS +static inline int page_is_rtas_user_buf(unsigned long pfn) +{ + unsigned long paddr = (pfn << PAGE_SHIFT); + if (paddr >= rtas_rmo_buf && paddr < (rtas_rmo_buf + RTAS_RMOBUF_MAX)) + return 1; + return 0; +} +#else +static inline int page_is_rtas_user_buf(unsigned long pfn) { return 0;} +#endif + #endif /* __KERNEL__ */ #endif /* _POWERPC_RTAS_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/socket.h linux-2.6.32.ovz/arch/powerpc/include/asm/socket.h --- linux-2.6.32/arch/powerpc/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/socket.h 2015-09-10 10:07:41.000000000 -0700 @@ -29,7 +29,7 @@ #define SO_PRIORITY 12 #define SO_LINGER 13 #define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ +#define SO_REUSEPORT 15 #define SO_RCVLOWAT 16 #define SO_SNDLOWAT 17 #define SO_RCVTIMEO 18 @@ -67,4 +67,9 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 + +#define SO_BUSY_POLL 46 +#define SO_BPF_EXTENSIONS 48 + #endif /* _ASM_POWERPC_SOCKET_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/sparsemem.h linux-2.6.32.ovz/arch/powerpc/include/asm/sparsemem.h --- linux-2.6.32/arch/powerpc/include/asm/sparsemem.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/sparsemem.h 2015-09-10 10:06:45.000000000 -0700 @@ -16,7 +16,7 @@ #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_MEMORY_HOTPLUG -extern void create_section_mapping(unsigned long start, unsigned long end); +extern int create_section_mapping(unsigned long start, unsigned long end); extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_NUMA extern int hot_add_scn_to_nid(unsigned long scn_addr); diff -uprN linux-2.6.32/arch/powerpc/include/asm/spinlock.h linux-2.6.32.ovz/arch/powerpc/include/asm/spinlock.h --- linux-2.6.32/arch/powerpc/include/asm/spinlock.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/spinlock.h 2015-09-10 10:07:50.000000000 -0700 @@ -27,8 +27,7 @@ #endif #include <asm/asm-compat.h> #include <asm/synch.h> - -#define __raw_spin_is_locked(x) ((x)->slock != 0) +#include <asm/ppc-opcode.h> #ifdef CONFIG_PPC64 /* use 0x800000yy when locked, where yy == CPU number */ @@ -50,6 +49,12 @@ #define SYNC_IO #endif +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + smp_mb(); + return lock->slock != 0; +} + /* * This returns the old value in the lock, so we succeeded * in getting the lock if the return value is 0. @@ -60,7 +65,7 @@ static inline unsigned long arch_spin_tr token = LOCK_TOKEN; __asm__ __volatile__( -"1: lwarx %0,0,%2\n\ +"1: " PPC_LWARX(%0,0,%2,1) "\n\ cmpwi 0,%0,0\n\ bne- 2f\n\ stwcx. %1,0,%2\n\ @@ -186,7 +191,7 @@ static inline long arch_read_trylock(raw long tmp; __asm__ __volatile__( -"1: lwarx %0,0,%1\n" +"1: " PPC_LWARX(%0,0,%1,1) "\n" __DO_SIGN_EXTEND " addic. %0,%0,1\n\ ble- 2f\n" @@ -211,7 +216,7 @@ static inline long arch_write_trylock(ra token = WRLOCK_TOKEN; __asm__ __volatile__( -"1: lwarx %0,0,%2\n\ +"1: " PPC_LWARX(%0,0,%2,1) "\n\ cmpwi 0,%0,0\n\ bne- 2f\n" PPC405_ERR77(0,%1)
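The PPC_LWARX(...,1) forms substituted into the trylock paths above hand-encode lwarx so the EH reservation hint can be attached; a standalone sketch of that encoding, mirroring the __PPC_* field macros from ppc-opcode.h earlier in this patch (illustrative, not part of the patch):

	/* Builds the lwarx instruction word the way PPC_LWARX does.
	 * On 32-bit, __PPC_EH() collapses to 0, so e500v1/v2 parts that
	 * treat the hint bit as illegal never see it. */
	static unsigned int demo_lwarx_word(int t, int a, int b, int eh)
	{
		unsigned int insn = 0x7c000028;	/* PPC_INST_LWARX */
		insn |= (t & 0x1f) << 21;	/* __PPC_RT(t) */
		insn |= (a & 0x1f) << 16;	/* __PPC_RA(a) */
		insn |= (b & 0x1f) << 11;	/* __PPC_RB(b) */
	#ifdef CONFIG_PPC64
		insn |= eh & 0x1;		/* __PPC_EH(eh) */
	#endif
		return insn;
	}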
diff -uprN linux-2.6.32/arch/powerpc/include/asm/syscall.h linux-2.6.32.ovz/arch/powerpc/include/asm/syscall.h --- linux-2.6.32/arch/powerpc/include/asm/syscall.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/syscall.h 2015-09-10 10:06:38.000000000 -0700 @@ -30,7 +30,7 @@ static inline void syscall_rollback(stru static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { - return (regs->ccr & 0x1000) ? -regs->gpr[3] : 0; + return (regs->ccr & 0x10000000) ? -regs->gpr[3] : 0; } static inline long syscall_get_return_value(struct task_struct *task, @@ -44,10 +44,10 @@ static inline void syscall_set_return_va int error, long val) { if (error) { - regs->ccr |= 0x1000L; + regs->ccr |= 0x10000000L; regs->gpr[3] = -error; } else { - regs->ccr &= ~0x1000L; + regs->ccr &= ~0x10000000L; regs->gpr[3] = val; } } diff -uprN linux-2.6.32/arch/powerpc/include/asm/systbl.h linux-2.6.32.ovz/arch/powerpc/include/asm/systbl.h --- linux-2.6.32/arch/powerpc/include/asm/systbl.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/systbl.h 2015-09-10 10:07:44.000000000 -0700 @@ -326,3 +326,33 @@ SYSCALL_SPU(perf_event_open) COMPAT_SYS_SPU(preadv) COMPAT_SYS_SPU(pwritev) COMPAT_SYS(rt_tgsigqueueinfo) +SYSCALL_SPU(ni_syscall) /* fanotify_init */ +SYSCALL_SPU(ni_syscall) /* fanotify_mark */ +SYSCALL_SPU(ni_syscall) /* prlimit64 */ +SYSCALL_SPU(ni_syscall) /* socket */ +SYSCALL_SPU(ni_syscall) /* bind */ +SYSCALL_SPU(ni_syscall) /* connect */ +SYSCALL_SPU(ni_syscall) /* listen */ +SYSCALL_SPU(ni_syscall) /* accept */ +SYSCALL_SPU(ni_syscall) /* getsockname */ +SYSCALL_SPU(ni_syscall) /* getpeername */ +SYSCALL_SPU(ni_syscall) /* socketpair */ +SYSCALL_SPU(ni_syscall) /* send */ +SYSCALL_SPU(ni_syscall) /* sendto */ +SYSCALL_SPU(ni_syscall) /* recv */ +SYSCALL_SPU(ni_syscall) /* recvfrom */ +SYSCALL_SPU(ni_syscall) /* shutdown */ +SYSCALL_SPU(ni_syscall) /* setsockopt */ +SYSCALL_SPU(ni_syscall) /* getsockopt */ +SYSCALL_SPU(ni_syscall) /* sendmsg */ +SYSCALL_SPU(ni_syscall) /* recvmsg */ +SYSCALL_SPU(ni_syscall) /* recvmmsg */ +SYSCALL_SPU(ni_syscall) /* accept4 */ +SYSCALL_SPU(ni_syscall) /* name_to_handle_at */ +SYSCALL_SPU(ni_syscall) /* open_by_handle_at */ +COMPAT_SYS_SPU(clock_adjtime) +SYSCALL_SPU(syncfs) +COMPAT_SYS_SPU(sendmmsg) +SYSCALL_SPU(setns) /* setns */ +COMPAT_SYS(process_vm_readv) +COMPAT_SYS(process_vm_writev) diff -uprN linux-2.6.32/arch/powerpc/include/asm/system.h linux-2.6.32.ovz/arch/powerpc/include/asm/system.h --- linux-2.6.32/arch/powerpc/include/asm/system.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/system.h 2015-09-10 10:07:19.000000000 -0700 @@ -212,7 +212,7 @@ extern struct task_struct *_switch(struc extern unsigned int rtas_data; extern int mem_init_done; /* set on boot once kmalloc can be called */ extern int init_bootmem_done; /* set once bootmem is available */ -extern phys_addr_t memory_limit; +extern unsigned long long memory_limit; extern unsigned long klimit; extern void *alloc_maybe_bootmem(size_t size, gfp_t mask);
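Both the 0x1000 -> 0x10000000 corrections in syscall.h above and the new is_syscall_success() in ptrace.h earlier rest on the same ABI detail: the powerpc kernel flags a failed syscall by setting CR0.SO, which sits at mask 0x10000000 within regs->ccr. A condensed sketch of the convention (illustrative only):

	/* Decode a powerpc syscall return the way the fixed helpers do:
	 * CR0.SO set means gpr[3] carries a positive errno value. */
	static long demo_syscall_retval(struct pt_regs *regs)
	{
		if (regs->ccr & 0x10000000)	/* CR0.SO */
			return -regs->gpr[3];
		return regs->gpr[3];
	}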
diff -uprN linux-2.6.32/arch/powerpc/include/asm/thread_info.h linux-2.6.32.ovz/arch/powerpc/include/asm/thread_info.h --- linux-2.6.32/arch/powerpc/include/asm/thread_info.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/thread_info.h 2015-09-10 10:07:47.000000000 -0700 @@ -72,7 +72,7 @@ struct thread_info { #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -extern struct thread_info *alloc_thread_info(struct task_struct *tsk); +extern struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node); extern void free_thread_info(struct thread_info *ti); #endif /* THREAD_SHIFT < PAGE_SHIFT */ @@ -111,7 +111,6 @@ static inline struct thread_info *curren #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_FREEZE 14 /* Freezing for suspend */ #define TIF_RUNLATCH 15 /* Is the runlatch enabled? */ -#define TIF_ABI_PENDING 16 /* 32/64 bit switch needed */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) diff -uprN linux-2.6.32/arch/powerpc/include/asm/topology.h linux-2.6.32.ovz/arch/powerpc/include/asm/topology.h static inline int cpu_to_node(int cpu) @@ -57,13 +67,31 @@ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ + .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } +extern int __node_distance(int, int); +#define node_distance(a, b) __node_distance(a, b) + extern void __init dump_numa_cpu_topology(void); extern int sysfs_add_device_to_node(struct sys_device *dev, int nid); extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid); +#ifdef CONFIG_PPC_SPLPAR +extern int start_topology_update(void); +extern int stop_topology_update(void); +#else +static inline int start_topology_update(void) +{ + return 0; +} +static inline int stop_topology_update(void) +{ + return 0; +} +#endif /* CONFIG_PPC_SPLPAR */ #else static inline int of_node_to_nid(struct device_node *device) @@ -82,7 +110,6 @@ static inline void sysfs_remove_device_f int nid) { } - #endif /* CONFIG_NUMA */ #include <asm-generic/topology.h> diff -uprN linux-2.6.32/arch/powerpc/include/asm/trace_clock.h linux-2.6.32.ovz/arch/powerpc/include/asm/trace_clock.h --- linux-2.6.32/arch/powerpc/include/asm/trace_clock.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/trace_clock.h 2015-09-10 10:07:32.000000000 -0700 @@ -0,0 +1 @@ +#include <asm-generic/trace_clock.h> diff -uprN linux-2.6.32/arch/powerpc/include/asm/types.h linux-2.6.32.ovz/arch/powerpc/include/asm/types.h --- linux-2.6.32/arch/powerpc/include/asm/types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/types.h 2015-09-10 10:06:57.000000000 -0700 @@ -44,11 +44,6 @@ typedef struct { typedef __vector128 vector128; -#if defined(__powerpc64__) || defined(CONFIG_PHYS_64BIT) -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif typedef u64 dma64_addr_t; typedef struct { diff -uprN linux-2.6.32/arch/powerpc/include/asm/unistd.h linux-2.6.32.ovz/arch/powerpc/include/asm/unistd.h --- linux-2.6.32/arch/powerpc/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/unistd.h 2015-09-10 10:07:44.000000000 -0700 @@ -345,10 +345,40 @@ #define __NR_preadv 320 #define __NR_pwritev 321 #define __NR_rt_tgsigqueueinfo 322 +/* #define __NR_fanotify_init 323 */ +/* #define __NR_fanotify_mark 324 */ +/* #define __NR_prlimit64 325 */ +/* #define __NR_socket 326 */ +/* #define __NR_bind 327 */ +/* #define __NR_connect 328 */ +/* #define __NR_listen 329 */ +/* #define __NR_accept 330 */ +/* #define __NR_getsockname 331 */ +/* #define __NR_getpeername 332 */ +/* #define __NR_socketpair 333 */ +/* #define __NR_send 334 */ +/* #define __NR_sendto 335 */ +/* #define __NR_recv 336 */ +/* #define __NR_recvfrom 337 */ +/* #define __NR_shutdown 338 */ +/* #define
__NR_setsockopt 339 */ +/* #define __NR_getsockopt 340 */ +/* #define __NR_sendmsg 341 */ +/* #define __NR_recvmsg 342 */ +/* #define __NR_recvmmsg 343 */ +/* #define __NR_accept4 344 */ +/* #define __NR_name_to_handle_at 345 */ +/* #define __NR_open_by_handle_at 346 */ +#define __NR_clock_adjtime 347 +#define __NR_syncfs 348 +#define __NR_sendmmsg 349 +#define __NR_setns 350 +#define __NR_process_vm_readv 351 +#define __NR_process_vm_writev 352 #ifdef __KERNEL__ -#define __NR_syscalls 323 +#define __NR_syscalls 353 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff -uprN linux-2.6.32/arch/powerpc/include/asm/vdso_datapage.h linux-2.6.32.ovz/arch/powerpc/include/asm/vdso_datapage.h --- linux-2.6.32/arch/powerpc/include/asm/vdso_datapage.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/vdso_datapage.h 2015-09-10 10:06:08.000000000 -0700 @@ -85,6 +85,7 @@ struct vdso_data { __s32 wtom_clock_sec; /* Wall to monotonic clock */ __s32 wtom_clock_nsec; struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */ + __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */ __u32 syscall_map_64[SYSCALL_MAP_SIZE]; /* map of syscalls */ __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ }; @@ -105,6 +106,7 @@ struct vdso_data { __s32 wtom_clock_sec; /* Wall to monotonic clock */ __s32 wtom_clock_nsec; struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */ + __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */ __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ __u32 dcache_block_size; /* L1 d-cache block size */ __u32 icache_block_size; /* L1 i-cache block size */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/vdso.h linux-2.6.32.ovz/arch/powerpc/include/asm/vdso.h --- linux-2.6.32/arch/powerpc/include/asm/vdso.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/vdso.h 2015-09-10 10:07:26.000000000 -0700 @@ -21,6 +21,7 @@ extern unsigned long vdso64_rt_sigtramp; extern unsigned long vdso32_sigtramp; extern unsigned long vdso32_rt_sigtramp; +int __cpuinit vdso_getcpu_init(void); #else /* __ASSEMBLY__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/vio.h linux-2.6.32.ovz/arch/powerpc/include/asm/vio.h --- linux-2.6.32/arch/powerpc/include/asm/vio.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/vio.h 2015-09-10 10:07:23.000000000 -0700 @@ -46,6 +46,48 @@ struct iommu_table; +/* + * Platform Facilities Option (PFO)-specific data + */ + +/* Starting unit address for PFO devices on the VIO BUS */ +#define VIO_BASE_PFO_UA 0x50000000 + +/** + * vio_pfo_op - PFO operation parameters + * + * @flags: h_call subfunctions and modifiers + * @in: Input data block logical real address + * @inlen: If non-negative, the length of the input data block. If negative, + * the length of the input data descriptor list in bytes. + * @out: Output data block logical real address + * @outlen: If non-negative, the length of the input data block. If negative, + * the length of the input data descriptor list in bytes. + * @csbcpb: Logical real address of the 4k naturally-aligned storage block + * containing the CSB & optional FC field specific CPB + * @timeout: # of milliseconds to retry h_call, 0 for no timeout. 
+ * @hcall_err: pointer to return the h_call return value, else NULL + */ +struct vio_pfo_op { + u64 flags; + s64 in; + s64 inlen; + s64 out; + s64 outlen; + u64 csbcpb; + void *done; + unsigned long handle; + unsigned int timeout; + long hcall_err; +}; + +/* End PFO specific data */ + +enum vio_dev_family { + VDEVICE, /* The OF node is a child of /vdevice */ + PFO, /* The OF node is a child of /ibm,platform-facilities */ +}; + /** * vio_dev - This structure is used to describe virtual I/O devices. * @@ -58,6 +100,7 @@ struct vio_dev { const char *name; const char *type; uint32_t unit_address; + uint32_t resource_id; unsigned int irq; struct { size_t desired; @@ -65,6 +108,7 @@ struct vio_dev { size_t allocated; atomic_t allocs_failed; } cmo; + enum vio_dev_family family; struct device dev; }; @@ -87,6 +131,8 @@ extern void vio_cmo_set_dev_desired(stru extern void __devinit vio_unregister_device(struct vio_dev *dev); +extern int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op); + struct device_node; extern struct vio_dev *vio_register_device_node( diff -uprN linux-2.6.32/arch/powerpc/Kconfig linux-2.6.32.ovz/arch/powerpc/Kconfig --- linux-2.6.32/arch/powerpc/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/Kconfig 2015-09-10 10:07:15.000000000 -0700 @@ -22,6 +22,9 @@ config WORD_SIZE config ARCH_PHYS_ADDR_T_64BIT def_bool PPC64 || PHYS_64BIT +config ARCH_DMA_ADDR_T_64BIT + def_bool ARCH_PHYS_ADDR_T_64BIT + config MMU bool default y @@ -29,9 +32,6 @@ config MMU config GENERIC_CMOS_UPDATE def_bool y -config GENERIC_TIME - def_bool y - config GENERIC_TIME_VSYSCALL def_bool y @@ -129,7 +129,9 @@ config PPC select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select ARCH_HAVE_NMI_SAFE_CMPXCHG config EARLY_PRINTK bool @@ -216,7 +218,7 @@ config ARCH_HIBERNATION_POSSIBLE config ARCH_SUSPEND_POSSIBLE def_bool y - depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx + depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx || PPC_PSERIES config PPC_DCR_NATIVE bool @@ -320,6 +322,10 @@ config HOTPLUG_CPU Say N if you are unsure. +config ARCH_CPU_PROBE_RELEASE + def_bool y + depends on HOTPLUG_CPU + config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y @@ -346,6 +352,17 @@ config KEXEC support. As of this writing the exact hardware interface is strongly in flux, so no good recommendation can be made. +config KEXEC_AUTO_RESERVE + bool "automatically reserve memory for kexec kernel" + depends on KEXEC + default y + ---help--- + Automatically reserve memory for a kexec kernel, so that you don't + need to specify numbers for the "crashkernel=X@Y" boot option, + instead you can use "crashkernel=auto". To make this work, you need + to have more than 2G/8G memory. On PPC, 128M or 256M is reserved, on + PPC64 1/32 of your physical memory, but it will not exceed 4G. + config CRASH_DUMP bool "Build a kdump crash kernel" depends on PPC64 || 6xx @@ -355,13 +372,16 @@ config CRASH_DUMP The same kernel binary can be used as production kernel and dump capture kernel. -config PHYP_DUMP - bool "Hypervisor-assisted dump (EXPERIMENTAL)" - depends on PPC_PSERIES && EXPERIMENTAL - help - Hypervisor-assisted dump is meant to be a kdump replacement - offering robustness and speed not possible without system - hypervisor assistance. 
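A quick worked example of the KEXEC_AUTO_RESERVE sizing rule above (illustrative): with crashkernel=auto on a 64 GB PPC64 machine, 1/32 of physical memory means 64 GB / 32 = 2 GB is reserved; on a 256 GB machine the 1/32 share would be 8 GB, so the 4 GB cap applies instead.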
+config FA_DUMP + bool "Firmware-assisted dump" + depends on PPC64 && PPC_RTAS && CRASH_DUMP + help + A robust mechanism to get reliable kernel crash dump with + assistance from firmware. This approach does not use kexec, + instead firmware assists in booting the kdump kernel + while preserving memory contents. Firmware-assisted dump + is meant to be a kdump replacement offering robustness and + speed not possible without system firmware assistance. If unsure, say "N" @@ -627,6 +647,9 @@ config ZONE_DMA bool default y +config NEED_DMA_MAP_STATE + def_bool (PPC64 || NOT_COHERENT_CACHE) + config GENERIC_ISA_DMA bool depends on PPC64 || POWER4 || 6xx && !CPM2 diff -uprN linux-2.6.32/arch/powerpc/Kconfig.debug linux-2.6.32.ovz/arch/powerpc/Kconfig.debug --- linux-2.6.32/arch/powerpc/Kconfig.debug 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/Kconfig.debug 2015-09-10 10:06:52.000000000 -0700 @@ -286,4 +286,16 @@ config PPC_EARLY_DEBUG_CPM_ADDR platform probing is done, all platforms selected must share the same address. +config STRICT_DEVMEM + def_bool y + prompt "Filter access to /dev/mem" + help + This option restricts access to /dev/mem. If this option is + disabled, you allow userspace access to all memory, including + kernel and userspace memory. Accidental memory access is likely + to be disastrous. + Memory access is required for experts who want to debug the kernel. + + If you are unsure, say Y. + endmenu diff -uprN linux-2.6.32/arch/powerpc/kernel/align.c linux-2.6.32.ovz/arch/powerpc/kernel/align.c --- linux-2.6.32/arch/powerpc/kernel/align.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/align.c 2015-09-10 10:05:43.000000000 -0700 @@ -642,10 +642,14 @@ static int emulate_spe(struct pt_regs *r */ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, unsigned int areg, struct pt_regs *regs, - unsigned int flags, unsigned int length) + unsigned int flags, unsigned int length, + unsigned int elsize) { char *ptr; + unsigned long *lptr; int ret = 0; + int sw = 0; + int i, j; flush_vsx_to_thread(current); @@ -654,19 +658,35 @@ static int emulate_vsx(unsigned char __u else ptr = (char *) ¤t->thread.vr[reg - 32]; - if (flags & ST) - ret = __copy_to_user(addr, ptr, length); - else { - if (flags & SPLT){ - ret = __copy_from_user(ptr, addr, length); - ptr += length; + lptr = (unsigned long *) ptr; + + if (flags & SW) + sw = elsize-1; + + for (j = 0; j < length; j += elsize) { + for (i = 0; i < elsize; ++i) { + if (flags & ST) + ret |= __put_user(ptr[i^sw], addr + i); + else + ret |= __get_user(ptr[i^sw], addr + i); } - ret |= __copy_from_user(ptr, addr, length); + ptr += elsize; + addr += elsize; } - if (flags & U) - regs->gpr[areg] = regs->dar; - if (ret) + + if (!ret) { + if (flags & U) + regs->gpr[areg] = regs->dar; + + /* Splat load copies the same data to top and bottom 8 bytes */ + if (flags & SPLT) + lptr[1] = lptr[0]; + /* For 8 byte loads, zero the top 8 bytes */ + else if (!(flags & ST) && (8 == length)) + lptr[1] = 0; + } else return -EFAULT; + return 1; } #endif @@ -767,16 +787,25 @@ int fix_alignment(struct pt_regs *regs) #ifdef CONFIG_VSX if ((instruction & 0xfc00003e) == 0x7c000018) { - /* Additional register addressing bit (64 VSX vs 32 FPR/GPR */ + unsigned int elsize; + + /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */ reg |= (instruction & 0x1) << 5; /* Simple inline decoder instead of a table */ + /* VSX has only 8 and 16 byte memory accesses */ + nb = 8; if (instruction & 0x200) nb = 16; - 
else if (instruction & 0x080) - nb = 8; - else - nb = 4; + + /* Vector stores in little-endian mode swap individual + elements, so process them separately */ + elsize = 4; + if (instruction & 0x80) + elsize = 8; + flags = 0; + if (regs->msr & MSR_LE) + flags |= SW; if (instruction & 0x100) flags |= ST; if (instruction & 0x040) @@ -787,7 +816,7 @@ int fix_alignment(struct pt_regs *regs) nb = 8; } PPC_WARN_EMULATED(vsx); - return emulate_vsx(addr, reg, areg, regs, flags, nb); + return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize); } #endif /* A size of 0 indicates an instruction we don't support, with diff -uprN linux-2.6.32/arch/powerpc/kernel/asm-offsets.c linux-2.6.32.ovz/arch/powerpc/kernel/asm-offsets.c --- linux-2.6.32/arch/powerpc/kernel/asm-offsets.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/asm-offsets.c 2015-09-10 10:06:17.000000000 -0700 @@ -133,7 +133,6 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); - DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_PPC_MM_SLICES DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, @@ -181,11 +180,13 @@ int main(void) DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int)); DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int)); DEFINE(SLBSHADOW_SAVEAREA, offsetof(struct slb_shadow, save_area)); + DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx)); + DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); #endif /* CONFIG_PPC_STD_MMU_64 */ DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); - DEFINE(PACA_STARTPURR, offsetof(struct paca_struct, startpurr)); - DEFINE(PACA_STARTSPURR, offsetof(struct paca_struct, startspurr)); + DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime)); + DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user)); DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); DEFINE(PACA_DATA_OFFSET, offsetof(struct paca_struct, data_offset)); @@ -331,6 +332,7 @@ int main(void) DEFINE(WTOM_CLOCK_SEC, offsetof(struct vdso_data, wtom_clock_sec)); DEFINE(WTOM_CLOCK_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); DEFINE(STAMP_XTIME, offsetof(struct vdso_data, stamp_xtime)); + DEFINE(STAMP_SEC_FRAC, offsetof(struct vdso_data, stamp_sec_fraction)); DEFINE(CFG_ICACHE_BLOCKSZ, offsetof(struct vdso_data, icache_block_size)); DEFINE(CFG_DCACHE_BLOCKSZ, offsetof(struct vdso_data, dcache_block_size)); DEFINE(CFG_ICACHE_LOGBLOCKSZ, offsetof(struct vdso_data, icache_log_block_size)); diff -uprN linux-2.6.32/arch/powerpc/kernel/cacheinfo.c linux-2.6.32.ovz/arch/powerpc/kernel/cacheinfo.c --- linux-2.6.32/arch/powerpc/kernel/cacheinfo.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/cacheinfo.c 2015-09-10 10:06:07.000000000 -0700 @@ -642,7 +642,7 @@ static struct kobj_attribute *cache_inde &cache_assoc_attr, }; -static struct sysfs_ops cache_index_ops = { +static const struct sysfs_ops cache_index_ops = { .show = cache_index_show, }; diff -uprN linux-2.6.32/arch/powerpc/kernel/cputable.c linux-2.6.32.ovz/arch/powerpc/kernel/cputable.c --- linux-2.6.32/arch/powerpc/kernel/cputable.c 
2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/cputable.c 2015-09-10 10:07:27.000000000 -0700 @@ -440,6 +440,14 @@ static struct cpu_spec __initdata cpu_sp .oprofile_cpu_type = "ppc64/ibm-compat-v1", .platform = "power7", }, + { /* 2.07-compliant processor, i.e. Power8 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000004, + .cpu_name = "POWER8 (architected)", + .oprofile_type = PPC_OPROFILE_INVALID, + .oprofile_cpu_type = "ppc64/ibm-compat-v1", + .platform = "power8", + }, { /* Power7 */ .pvr_mask = 0xffff0000, .pvr_value = 0x003f0000, @@ -462,6 +470,31 @@ static struct cpu_spec __initdata cpu_sp POWER6_MMCRA_OTHER, .platform = "power7", }, + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .mmu_features = MMU_FTR_HPTE_TABLE | + MMU_FTR_TLBIE_206, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power7", + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "power7+", + }, + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8 (raw)", + .num_pmcs = 6, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .platform = "power8", + }, { /* Cell Broadband Engine */ .pvr_mask = 0xffff0000, .pvr_value = 0x00700000, diff -uprN linux-2.6.32/arch/powerpc/kernel/crash.c linux-2.6.32.ovz/arch/powerpc/kernel/crash.c --- linux-2.6.32/arch/powerpc/kernel/crash.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/crash.c 2015-09-10 10:06:40.000000000 -0700 @@ -162,6 +162,33 @@ static void crash_kexec_prepare_cpus(int /* Leave the IPI callback set */ } +/* wait for all the CPUs to hit real mode but timeout if they don't come in */ +static void crash_kexec_wait_realmode(int cpu) +{ + extern u8 kexec_state[NR_CPUS]; + unsigned int msecs; + int i; + + msecs = 10000; + for (i=0; i < NR_CPUS && msecs > 0; i++) { + if (i == cpu) + continue; + + while (kexec_state[i] < KEXEC_STATE_REAL_MODE) { + barrier(); + if (!cpu_possible(i)) { + break; + } + if (!cpu_online(i)) { + break; + } + msecs--; + mdelay(1); + } + } + mb(); +} + /* * This function will be called by secondary cpus or by kexec cpu * if soft-reset is activated to stop some CPUs. 
@@ -347,10 +374,12 @@ int crash_shutdown_unregister(crash_shut EXPORT_SYMBOL(crash_shutdown_unregister); static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; +static int crash_shutdown_cpu = -1; static int handle_fault(struct pt_regs *regs) { - longjmp(crash_shutdown_buf, 1); + if (crash_shutdown_cpu == smp_processor_id()) + longjmp(crash_shutdown_buf, 1); return 0; } @@ -388,6 +417,7 @@ void default_machine_crash_shutdown(stru */ old_handler = __debugger_fault_handler; __debugger_fault_handler = handle_fault; + crash_shutdown_cpu = smp_processor_id(); for (i = 0; crash_shutdown_handles[i]; i++) { if (setjmp(crash_shutdown_buf) == 0) { /* @@ -401,6 +431,7 @@ asm volatile("sync; isync"); } } + crash_shutdown_cpu = -1; __debugger_fault_handler = old_handler; /* @@ -412,6 +443,7 @@ crash_kexec_prepare_cpus(crashing_cpu); cpu_set(crashing_cpu, cpus_in_crash); crash_kexec_stop_spus(); + crash_kexec_wait_realmode(crashing_cpu); if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(1, 0); } diff -uprN linux-2.6.32/arch/powerpc/kernel/crash_dump.c linux-2.6.32.ovz/arch/powerpc/kernel/crash_dump.c --- linux-2.6.32/arch/powerpc/kernel/crash_dump.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/crash_dump.c 2015-09-10 10:08:00.000000000 -0700 @@ -19,6 +19,7 @@ #include <asm/prom.h> #include <asm/firmware.h> #include <asm/uaccess.h> +#include <asm/rtas.h> #ifdef DEBUG #include <asm/udbg.h> @@ -29,6 +30,7 @@ /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +EXPORT_SYMBOL(elfcorehdr_addr); #ifndef CONFIG_RELOCATABLE void __init reserve_kdump_trampoline(void) @@ -124,20 +126,54 @@ ssize_t copy_oldmem_page(unsigned long p size_t csize, unsigned long offset, int userbuf) { void *vaddr; + phys_addr_t paddr; if (!csize) return 0; csize = min(csize, PAGE_SIZE); + paddr = pfn << PAGE_SHIFT; - if (pfn < max_pfn) { - vaddr = __va(pfn << PAGE_SHIFT); + if (lmb_is_region_memory(paddr, csize)) { + vaddr = __va(paddr); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); } else { - vaddr = __ioremap(pfn << PAGE_SHIFT, PAGE_SIZE, 0); + vaddr = __ioremap(paddr, PAGE_SIZE, 0); csize = copy_oldmem_vaddr(vaddr, buf, csize, offset, userbuf); iounmap(vaddr); } return csize; } + +#ifdef CONFIG_PPC_RTAS +/* + * The crashkernel region will almost always overlap the RTAS region, so + * we have to be careful when shrinking the crashkernel region. + */ +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + const u32 *basep, *sizep; + unsigned int rtas_start = 0, rtas_end = 0; + + basep = of_get_property(rtas.dev, "linux,rtas-base", NULL); + sizep = of_get_property(rtas.dev, "rtas-size", NULL); + + if (basep && sizep) { + rtas_start = *basep; + rtas_end = *basep + *sizep; + } + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + /* Does this page overlap with the RTAS region?
*/ + if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start)) + continue; + + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} +#endif diff -uprN linux-2.6.32/arch/powerpc/kernel/dma.c linux-2.6.32.ovz/arch/powerpc/kernel/dma.c --- linux-2.6.32/arch/powerpc/kernel/dma.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/dma.c 2015-09-10 10:06:29.000000000 -0700 @@ -11,6 +11,7 @@ #include <linux/lmb.h> #include <asm/bug.h> #include <asm/abs_addr.h> +#include <asm/machdep.h> /* * Generic direct DMA implementation @@ -153,6 +154,23 @@ EXPORT_SYMBOL(dma_direct_ops); #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) +int dma_set_mask(struct device *dev, u64 dma_mask) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (ppc_md.dma_set_mask) + return ppc_md.dma_set_mask(dev, dma_mask); + if (unlikely(dma_ops == NULL)) + return -EIO; + if (dma_ops->set_dma_mask != NULL) + return dma_ops->set_dma_mask(dev, dma_mask); + if (!dev->dma_mask || !dma_supported(dev, dma_mask)) + return -EIO; + *dev->dma_mask = dma_mask; + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + static int __init dma_init(void) { dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); diff -uprN linux-2.6.32/arch/powerpc/kernel/e500-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/e500-pmu.c --- linux-2.6.32/arch/powerpc/kernel/e500-pmu.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/e500-pmu.c 2015-09-10 10:06:30.000000000 -0700 @@ -0,0 +1,129 @@ +/* + * Performance counter support for e500 family processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * Copyright 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/string.h> +#include <linux/perf_event.h> +#include <asm/reg.h> +#include <asm/cputable.h> + +/* + * Map of generic hardware event types to hardware events + * Zero if unsupported + */ +static int e500_generic_events[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 1, + [PERF_COUNT_HW_INSTRUCTIONS] = 2, + [PERF_COUNT_HW_CACHE_MISSES] = 41, /* Data L1 cache reloads */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 12, + [PERF_COUNT_HW_BRANCH_MISSES] = 15, +}; + +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + */ +static int e500_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + /* + * D-cache misses are not split into read/write/prefetch; + * use raw event 41. + */ + [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 27, 0 }, + [C(OP_WRITE)] = { 28, 0 }, + [C(OP_PREFETCH)] = { 29, 0 }, + }, + [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 2, 60 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { 0, 0 }, + }, + /* + * Assuming LL means L2, it's not a good match for this model. + * It allocates only on L1 castout or explicit prefetch, and + * does not have separate read/write events (but it does have + * separate instruction/data events). + */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0 }, + [C(OP_WRITE)] = { 0, 0 }, + [C(OP_PREFETCH)] = { 0, 0 }, + }, + /* + * There are data/instruction MMU misses, but that's a miss on + * the chip's internal level-one TLB which is probably not + * what the user wants.
Instead, unified level-two TLB misses + * are reported here. + */ + [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 26, 66 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 12, 15 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, +}; + +static int num_events = 128; + +/* Upper half of event id is PMLCb, for threshold events */ +static u64 e500_xlate_event(u64 event_id) +{ + u32 event_low = (u32)event_id; + u64 ret; + + if (event_low >= num_events) + return 0; + + ret = FSL_EMB_EVENT_VALID; + + if (event_low >= 76 && event_low <= 81) { + ret |= FSL_EMB_EVENT_RESTRICTED; + ret |= event_id & + (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH); + } else if (event_id & + (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH)) { + /* Threshold requested on non-threshold event */ + return 0; + } + + return ret; +} + +static struct fsl_emb_pmu e500_pmu = { + .name = "e500 family", + .n_counter = 4, + .n_restricted = 2, + .xlate_event = e500_xlate_event, + .n_generic = ARRAY_SIZE(e500_generic_events), + .generic_events = e500_generic_events, + .cache_events = &e500_cache_events, +}; + +static int init_e500_pmu(void) +{ + if (!cur_cpu_spec->oprofile_cpu_type) + return -ENODEV; + + if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500mc")) + num_events = 256; + else if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500")) + return -ENODEV; + + return register_fsl_emb_pmu(&e500_pmu); +} + +early_initcall(init_e500_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/entry_64.S linux-2.6.32.ovz/arch/powerpc/kernel/entry_64.S --- linux-2.6.32/arch/powerpc/kernel/entry_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/entry_64.S 2015-09-10 10:06:17.000000000 -0700 @@ -97,6 +97,24 @@ system_call_common: addi r9,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) std r11,-16(r9) /* "regshere" marker */ +#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR) +BEGIN_FW_FTR_SECTION + beq 33f + /* if from user, see if there are any DTL entries to process */ + ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ + ld r11,PACA_DTL_RIDX(r13) /* get log read index */ + ld r10,LPPACA_DTLIDX(r10) /* get log write index */ + cmpd cr1,r11,r10 + beq+ cr1,33f + bl .accumulate_stolen_time + REST_GPR(0,r1) + REST_4GPRS(3,r1) + REST_2GPRS(7,r1) + addi r9,r1,STACK_FRAME_OVERHEAD +33: +END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) +#endif /* CONFIG_VIRT_CPU_ACCOUNTING && CONFIG_PPC_SPLPAR */ + #ifdef CONFIG_TRACE_IRQFLAGS bl .trace_hardirqs_on REST_GPR(0,r1) @@ -556,15 +574,6 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ 2: TRACE_AND_RESTORE_IRQ(r5); -#ifdef CONFIG_PERF_EVENTS - /* check paca->perf_event_pending if we're enabling ints */ - lbz r3,PACAPERFPEND(r13) - and. r3,r3,r5 - beq 27f - bl .perf_event_do_pending -27: -#endif /* CONFIG_PERF_EVENTS */ - /* extract EE bit and use it to restore paca->hard_enabled */ ld r3,_MSR(r1) rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ diff -uprN linux-2.6.32/arch/powerpc/kernel/fadump.c linux-2.6.32.ovz/arch/powerpc/kernel/fadump.c --- linux-2.6.32/arch/powerpc/kernel/fadump.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/fadump.c 2015-09-10 10:07:19.000000000 -0700 @@ -0,0 +1,1321 @@ +/* + * Firmware Assisted dump: A robust mechanism to get reliable kernel crash + * dump with assistance from firmware. 
This approach does not use kexec,
+ * instead firmware assists in booting the kdump kernel while preserving
+ * memory contents. Most of the code implementation has been adapted
+ * from the phyp assisted dump implementation written by Linas Vepstas and
+ * Manish Ahuja
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright 2011 IBM Corporation
+ * Author: Mahesh Salgaonkar
+ */
+
+#undef DEBUG
+#define pr_fmt(fmt) "fadump: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+/*
+ * The RTAS property "ibm,configure-kernel-dump-sizes" returns dump
+ * sizes for the firmware provided dump sections (cpu state data
+ * and hpte region).
+ * For each dump section type supported, a 32-bit cell which defines
+ * the ID of a supported section followed by two 32-bit cells which
+ * give the size of the section in bytes.
+ */
+struct dump_section {
+	u32 dump_section;
+	unsigned long section_size;
+} __packed;
+
+static struct fw_dump fw_dump;
+static struct fadump_mem_struct fdm;
+static const struct fadump_mem_struct *fdm_active;
+
+static DEFINE_MUTEX(fadump_mutex);
+struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES];
+int crash_mem_ranges;
+
+/* Scan the Firmware Assisted dump configuration details. */
+int __init early_init_dt_scan_fw_dump(unsigned long node,
+			const char *uname, int depth, void *data)
+{
+	const struct dump_section *sections;
+	int i, num_sections;
+	unsigned long size;
+	const int *token;
+
+	if (depth != 1 || strcmp(uname, "rtas") != 0)
+		return 0;
+
+	/*
+	 * Check if Firmware Assisted dump is supported. If yes, check
+	 * if dump has been initiated on last reboot.
+	 */
+	token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL);
+	if (!token)
+		return 0;
+
+	fw_dump.fadump_supported = 1;
+	fw_dump.ibm_configure_kernel_dump = *token;
+
+	/*
+	 * The 'ibm,kernel-dump' rtas node is present only if there is
+	 * dump data waiting for us.
+	 */
+	fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL);
+	if (fdm_active)
+		fw_dump.dump_active = 1;
+
+	/* Get the sizes required to store dump data for the firmware provided
+	 * dump sections.
+	 */
+	sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes",
+					&size);
+
+	if (!sections)
+		return 0;
+
+	num_sections = size / sizeof(struct dump_section);
+
+	for (i = 0; i < num_sections; i++) {
+		switch (sections[i].dump_section) {
+		case FADUMP_CPU_STATE_DATA:
+			fw_dump.cpu_state_data_size = sections[i].section_size;
+			break;
+		case FADUMP_HPTE_REGION:
+			fw_dump.hpte_region_size = sections[i].section_size;
+			break;
+		}
+	}
+	return 1;
+}
+
+int is_fadump_active(void)
+{
+	return fw_dump.dump_active;
+}
+
+/* Print firmware assisted dump configurations for debugging purpose.
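Each record in "ibm,configure-kernel-dump-sizes" is thus a packed (id, size) pair, and the scan above is a plain walk over an array of them. A userspace sketch of the same walk over a fabricated buffer (the section ids and sizes here are made up, and a real flattened device tree stores the cells big-endian):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the packed layout described above: a 32-bit section id
     * followed by a 64-bit size. */
    struct dump_section {
        uint32_t id;
        uint64_t size;
    } __attribute__((packed));

    enum { CPU_STATE_DATA = 1, HPTE_REGION = 2 };   /* illustrative ids */

    int main(void)
    {
        struct dump_section sections[] = {
            { CPU_STATE_DATA, 0x10000 },
            { HPTE_REGION,    0x40000 },
        };
        size_t num = sizeof(sections) / sizeof(sections[0]);

        for (size_t i = 0; i < num; i++) {
            switch (sections[i].id) {
            case CPU_STATE_DATA:
                printf("cpu state data size: %#llx\n",
                       (unsigned long long)sections[i].size);
                break;
            case HPTE_REGION:
                printf("hpte region size:    %#llx\n",
                       (unsigned long long)sections[i].size);
                break;
            }
        }
        return 0;
    }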
*/ +static void fadump_show_config(void) +{ + pr_debug("Support for firmware-assisted dump (fadump): %s\n", + (fw_dump.fadump_supported ? "present" : "no support")); + + if (!fw_dump.fadump_supported) + return; + + pr_debug("Fadump enabled : %s\n", + (fw_dump.fadump_enabled ? "yes" : "no")); + pr_debug("Dump Active : %s\n", + (fw_dump.dump_active ? "yes" : "no")); + pr_debug("Dump section sizes:\n"); + pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); + pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); + pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); +} + +static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, + unsigned long addr) +{ + if (!fdm) + return 0; + + memset(fdm, 0, sizeof(struct fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm->header.dump_format_version = 0x00000001; + fdm->header.dump_num_sections = 3; + fdm->header.dump_status_flag = 0; + fdm->header.offset_first_dump_section = + (u32)offsetof(struct fadump_mem_struct, cpu_state_data); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm->header.dd_block_size = 0; + fdm->header.dd_block_offset = 0; + fdm->header.dd_num_blocks = 0; + fdm->header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm->header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. */ + fdm->cpu_state_data.request_flag = FADUMP_REQUEST_FLAG; + fdm->cpu_state_data.source_data_type = FADUMP_CPU_STATE_DATA; + fdm->cpu_state_data.source_address = 0; + fdm->cpu_state_data.source_len = fw_dump.cpu_state_data_size; + fdm->cpu_state_data.destination_address = addr; + addr += fw_dump.cpu_state_data_size; + + /* hpte region section */ + fdm->hpte_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->hpte_region.source_data_type = FADUMP_HPTE_REGION; + fdm->hpte_region.source_address = 0; + fdm->hpte_region.source_len = fw_dump.hpte_region_size; + fdm->hpte_region.destination_address = addr; + addr += fw_dump.hpte_region_size; + + /* RMA region section */ + fdm->rmr_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->rmr_region.source_data_type = FADUMP_REAL_MODE_REGION; + fdm->rmr_region.source_address = RMA_START; + fdm->rmr_region.source_len = fw_dump.boot_memory_size; + fdm->rmr_region.destination_address = addr; + addr += fw_dump.boot_memory_size; + + return addr; +} + +/** + * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM + * + * Function to find the largest memory size we need to reserve during early + * boot process. This will be the size of the memory that is required for a + * kernel to boot successfully. + * + * This function has been taken from phyp-assisted dump feature implementation. + * + * returns larger of 256MB or 5% rounded down to multiples of 256MB. + * + * TODO: Come up with better approach to find out more accurate memory size + * that is required for a kernel to boot successfully. + * + */ +static inline unsigned long fadump_calculate_reserve_size(void) +{ + unsigned long size; + + /* + * Check if the size is specified through fadump_reserve_mem= cmdline + * option. If yes, then use that. + */ + if (fw_dump.reserve_bootvar) + return fw_dump.reserve_bootvar; + + /* divide by 20 to get 5% of value */ + size = lmb_end_of_DRAM() / 20; + + /* round it down in multiples of 256 */ + size = size & ~0x0FFFFFFFUL; + + /* Truncate to memory_limit. 
We don't want to over reserve the memory.*/ + if (memory_limit && size > memory_limit) + size = memory_limit; + + return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); +} + +/* + * Calculate the total memory size required to be reserved for + * firmware-assisted dump registration. + */ +static unsigned long get_fadump_area_size(void) +{ + unsigned long size = 0; + + size += fw_dump.cpu_state_data_size; + size += fw_dump.hpte_region_size; + size += fw_dump.boot_memory_size; + size += sizeof(struct fadump_crash_info_header); + size += sizeof(struct elfhdr); /* ELF core header.*/ + size += sizeof(struct elf_phdr); /* place holder for cpu notes */ + /* Program headers for crash memory regions. */ + size += sizeof(struct elf_phdr) * (lmb_num_regions(memory) + 2); + + size = PAGE_ALIGN(size); + return size; +} + +int __init fadump_reserve_mem(void) +{ + unsigned long base, size, memory_boundary; + + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_INFO "Firmware-assisted dump is not supported on" + " this hardware\n"); + fw_dump.fadump_enabled = 0; + return 0; + } + /* + * Initialize boot memory size + * If dump is active then we have already calculated the size during + * first kernel. + */ + if (fdm_active) + fw_dump.boot_memory_size = fdm_active->rmr_region.source_len; + else + fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + + /* + * Calculate the memory boundary. + * If memory_limit is less than actual memory boundary then reserve + * the memory for fadump beyond the memory_limit and adjust the + * memory_limit accordingly, so that the running kernel can run with + * specified memory_limit. + */ + if (memory_limit && memory_limit < lmb_end_of_DRAM()) { + size = get_fadump_area_size(); + if ((memory_limit + size) < lmb_end_of_DRAM()) + memory_limit += size; + else + memory_limit = lmb_end_of_DRAM(); + printk(KERN_INFO "Adjusted memory_limit for firmware-assisted" + " dump, now %#016llx\n", memory_limit); + } + if (memory_limit) + memory_boundary = memory_limit; + else + memory_boundary = lmb_end_of_DRAM(); + + if (fw_dump.dump_active) { + printk(KERN_INFO "Firmware-assisted dump is active.\n"); + /* + * If last boot has crashed then reserve all the memory + * above boot_memory_size so that we don't touch it until + * dump is written to disk by userspace tool. This memory + * will be released for general use once the dump is saved. + */ + base = fw_dump.boot_memory_size; + size = memory_boundary - base; + lmb_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for saving crash dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + + fw_dump.fadumphdr_addr = + fdm_active->rmr_region.destination_address + + fdm_active->rmr_region.source_len; + pr_debug("fadumphdr_addr = %p\n", + (void *) fw_dump.fadumphdr_addr); + } else { + /* Reserve the memory at the top of memory. */ + size = get_fadump_area_size(); + base = memory_boundary - size; + lmb_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for firmware-assisted dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + } + fw_dump.reserve_dump_area_start = base; + fw_dump.reserve_dump_area_size = size; + return 1; +} + +/* Look for fadump= cmdline option. 
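The arithmetic in fadump_calculate_reserve_size() is easy to sanity-check in isolation: take 5% of RAM, round down to a 256MB multiple, clamp to memory_limit, and floor at MIN_BOOT_MEM. A sketch with sample values (the 256MB floor is an assumption standing in for MIN_BOOT_MEM, whose real value lives in the fadump header):

    #include <stdint.h>
    #include <stdio.h>

    #define MIN_BOOT_MEM (256ULL << 20)   /* assumed value of the floor */

    static uint64_t reserve_size(uint64_t dram, uint64_t memory_limit)
    {
        uint64_t size = dram / 20;        /* 5% of RAM */

        size &= ~0x0FFFFFFFULL;           /* round down to 256MB multiple */
        if (memory_limit && size > memory_limit)
            size = memory_limit;
        return size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM;
    }

    int main(void)
    {
        /* 32GB box: 5% is 1.6GB, rounded down to 1536MB */
        printf("%lluMB\n",
               (unsigned long long)(reserve_size(32ULL << 30, 0) >> 20));
        /* 4GB box: 5% is ~205MB, rounds down to 0, floored at 256MB */
        printf("%lluMB\n",
               (unsigned long long)(reserve_size(4ULL << 30, 0) >> 20));
        return 0;
    }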
*/ +static int __init early_fadump_param(char *p) +{ + if (!p) + return 1; + + if (strncmp(p, "on", 2) == 0) + fw_dump.fadump_enabled = 1; + else if (strncmp(p, "off", 3) == 0) + fw_dump.fadump_enabled = 0; + + return 0; +} +early_param("fadump", early_fadump_param); + +/* Look for fadump_reserve_mem= cmdline option */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + +static void register_fw_dump(struct fadump_mem_struct *fdm) +{ + int rc; + unsigned int wait_time; + + pr_debug("Registering for firmware-assisted kernel dump...\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_REGISTER, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case -1: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. Hardware Error(%d).\n", rc); + break; + case -3: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. Parameter Error(%d).\n", rc); + break; + case -9: + printk(KERN_ERR "firmware-assisted kernel dump is already " + " registered."); + fw_dump.dump_registered = 1; + break; + case 0: + printk(KERN_INFO "firmware-assisted kernel dump registration" + " is successful\n"); + fw_dump.dump_registered = 1; + break; + } +} + +void crash_fadump(struct pt_regs *regs, const char *str) +{ + struct fadump_crash_info_header *fdh = NULL; + + if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + return; + + fdh = __va(fw_dump.fadumphdr_addr); + crashing_cpu = smp_processor_id(); + fdh->crashing_cpu = crashing_cpu; + crash_save_vmcoreinfo(); + + if (regs) + fdh->regs = *regs; + else + ppc_save_regs(&fdh->regs); + + fdh->cpu_online_mask = *cpu_online_mask; + + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)str); +} + +#define GPR_MASK 0xffffff0000000000 +static inline int fadump_gpr_index(u64 id) +{ + int i = -1; + char str[3]; + + if ((id & GPR_MASK) == REG_ID("GPR")) { + /* get the digits at the end */ + id &= ~GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + sscanf(str, "%d", &i); + if (i > 31) + i = -1; + } + return i; +} + +static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, + u64 reg_val) +{ + int i; + + i = fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == REG_ID("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == REG_ID("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == REG_ID("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == REG_ID("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == REG_ID("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == REG_ID("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == REG_ID("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == REG_ID("DSISR")) + regs->dsisr = (unsigned long)reg_val; +} + +static struct fadump_reg_entry* +fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (reg_entry->reg_id != REG_ID("CPUEND")) { + fadump_set_regval(regs, reg_entry->reg_id, + reg_entry->reg_value); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +static u32 
*fadump_append_elf_note(u32 *buf, char *name, unsigned type,
+			void *data, size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;
+	note.n_descsz = data_len;
+	note.n_type = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+static void fadump_final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs)
+{
+	struct elf_prstatus prstatus;
+
+	memset(&prstatus, 0, sizeof(prstatus));
+	/*
+	 * FIXME: How do I get PID? Do I really need it?
+	 * prstatus.pr_pid = ????
+	 */
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+	buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+				&prstatus, sizeof(prstatus));
+	return buf;
+}
+
+static void fadump_update_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+	struct elf_phdr *phdr;
+
+	elf = (struct elfhdr *)bufp;
+	bufp += sizeof(struct elfhdr);
+
+	/* First note is a placeholder for cpu notes info. */
+	phdr = (struct elf_phdr *)bufp;
+
+	if (phdr->p_type == PT_NOTE) {
+		phdr->p_paddr = fw_dump.cpu_notes_buf;
+		phdr->p_offset = phdr->p_paddr;
+		phdr->p_filesz = fw_dump.cpu_notes_buf_size;
+		phdr->p_memsz = fw_dump.cpu_notes_buf_size;
+	}
+	return;
+}
+
+static void *fadump_cpu_notes_buf_alloc(unsigned long size)
+{
+	void *vaddr;
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+	if (!vaddr)
+		return NULL;
+
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		SetPageReserved(page + i);
+	return vaddr;
+}
+
+static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size)
+{
+	struct page *page;
+	unsigned long order, count, i;
+
+	order = get_order(size);
+	count = 1 << order;
+	page = virt_to_page(vaddr);
+	for (i = 0; i < count; i++)
+		ClearPageReserved(page + i);
+	__free_pages(page, order);
+}
+
+/*
+ * Read CPU state dump data and convert it into ELF notes.
+ * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be
+ * used to access the data to allow for additional fields to be added without
+ * affecting compatibility. Each list of registers for a CPU starts with
+ * "CPUSTRT" and ends with "CPUEND". Each register entry is 16 bytes:
+ * an 8-byte ASCII identifier and an 8-byte register value. The register
+ * entries with identifiers "CPUSTRT" and "CPUEND" contain a 4-byte cpu id
+ * as part of the register value. For more details refer to the PAPR
+ * document.
+ *
+ * Only for the crashing cpu do we ignore the CPU dump data and get the
+ * exact state from the fadump crash info structure populated by the first
+ * kernel at the time of crash.
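The (len + 3)/4 steps in fadump_append_elf_note() advance the u32 cursor past each field rounded up to a 4-byte boundary, which is exactly the ELF note layout rule. A small sketch computing the resulting note size (the 336-byte descriptor length is just a sample; the real one is sizeof(struct elf_prstatus)):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* An ELF note is three u32 header words (namesz, descsz, type)
     * followed by the name and the descriptor, each padded to 4 bytes. */
    static size_t elf_note_size(const char *name, size_t desc_len)
    {
        size_t namesz = strlen(name) + 1;   /* includes the NUL */

        return 3 * sizeof(uint32_t)
               + ((namesz + 3) & ~(size_t)3)
               + ((desc_len + 3) & ~(size_t)3);
    }

    int main(void)
    {
        /* "CORE" + NUL is 5 bytes, padded to 8: 12 + 8 + 336 = 356 */
        printf("%zu\n", elf_note_size("CORE", 336));
        return 0;
    }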
+ */
+static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm)
+{
+	struct fadump_reg_save_area_header *reg_header;
+	struct fadump_reg_entry *reg_entry;
+	struct fadump_crash_info_header *fdh = NULL;
+	void *vaddr;
+	unsigned long addr;
+	u32 num_cpus, *note_buf;
+	struct pt_regs regs;
+	int i, rc = 0, cpu = 0;
+
+	if (!fdm->cpu_state_data.bytes_dumped)
+		return -EINVAL;
+
+	addr = fdm->cpu_state_data.destination_address;
+	vaddr = __va(addr);
+
+	reg_header = vaddr;
+	if (reg_header->magic_number != REGSAVE_AREA_MAGIC) {
+		printk(KERN_ERR "Unable to read register save area.\n");
+		return -ENOENT;
+	}
+	pr_debug("--------CPU State Data------------\n");
+	pr_debug("Magic Number: %llx\n", reg_header->magic_number);
+	pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset);
+
+	vaddr += reg_header->num_cpu_offset;
+	num_cpus = *((u32 *)(vaddr));
+	pr_debug("NumCpus : %u\n", num_cpus);
+	vaddr += sizeof(u32);
+	reg_entry = (struct fadump_reg_entry *)vaddr;
+
+	/* Allocate buffer to hold cpu crash notes. */
+	fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t);
+	fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size);
+	note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size);
+	if (!note_buf) {
+		printk(KERN_ERR "Failed to allocate 0x%lx bytes for "
+			"cpu notes buffer\n", fw_dump.cpu_notes_buf_size);
+		return -ENOMEM;
+	}
+	fw_dump.cpu_notes_buf = __pa(note_buf);
+
+	pr_debug("Allocated buffer for cpu notes of size %ld at %p\n",
+		(num_cpus * sizeof(note_buf_t)), note_buf);
+
+	if (fw_dump.fadumphdr_addr)
+		fdh = __va(fw_dump.fadumphdr_addr);
+
+	for (i = 0; i < num_cpus; i++) {
+		if (reg_entry->reg_id != REG_ID("CPUSTRT")) {
+			printk(KERN_ERR "Unable to read CPU state data\n");
+			rc = -ENOENT;
+			goto error_out;
+		}
+		/* Lower 4 bytes of reg_value contain the logical cpu id */
+		cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK;
+		if (!cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) {
+			SKIP_TO_NEXT_CPU(reg_entry);
+			continue;
+		}
+		pr_debug("Reading register data for cpu %d...\n", cpu);
+		if (fdh && fdh->crashing_cpu == cpu) {
+			regs = fdh->regs;
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+			SKIP_TO_NEXT_CPU(reg_entry);
+		} else {
+			reg_entry++;
+			reg_entry = fadump_read_registers(reg_entry, &regs);
+			note_buf = fadump_regs_to_elf_notes(note_buf, &regs);
+		}
+	}
+	fadump_final_note(note_buf);
+
+	pr_debug("Updating elfcore header (%llx) with cpu notes\n",
+			fdh->elfcorehdr_addr);
+	fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr));
+	return 0;
+
+error_out:
+	fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf),
+					fw_dump.cpu_notes_buf_size);
+	fw_dump.cpu_notes_buf = 0;
+	fw_dump.cpu_notes_buf_size = 0;
+	return rc;
+
+}
+
+/*
+ * Validate and process the dump data stored by firmware before exporting
+ * it through '/proc/vmcore'.
+ */
+static int __init process_fadump(const struct fadump_mem_struct *fdm_active)
+{
+	struct fadump_crash_info_header *fdh;
+	int rc = 0;
+
+	if (!fdm_active || !fw_dump.fadumphdr_addr)
+		return -EINVAL;
+
+	/* Check if the dump data is valid. */
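The register save area format described above (16-byte entries, an 8-byte ASCII tag plus an 8-byte value, bracketed by CPUSTRT/CPUEND) can be exercised standalone. A sketch that walks one fabricated per-CPU list; the space padding of short identifiers is an assumption made for illustration:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* One 16-byte register entry, as the format comment above describes. */
    struct reg_entry {
        char     id[8];     /* ASCII tag: "CPUSTRT", "GPR00", "CPUEND"... */
        uint64_t value;
    };

    static int id_is(const struct reg_entry *e, const char *tag)
    {
        return strncmp(e->id, tag, strlen(tag)) == 0;
    }

    int main(void)
    {
        struct reg_entry regs[] = {
            { "CPUSTRT ", 2 },                      /* low bits: cpu id */
            { "GPR00   ", 0xdeadbeef },
            { "NIA     ", 0xc000000000000000ULL },
            { "CPUEND  ", 2 },
        };

        for (size_t i = 0; !id_is(&regs[i], "CPUEND"); i++)
            printf("%.8s = %#llx\n", regs[i].id,
                   (unsigned long long)regs[i].value);
        return 0;
    }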
+	if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) ||
+		(fdm_active->cpu_state_data.error_flags != 0) ||
+		(fdm_active->rmr_region.error_flags != 0)) {
+		printk(KERN_ERR "Dump taken by platform is not valid\n");
+		return -EINVAL;
+	}
+	if ((fdm_active->rmr_region.bytes_dumped !=
+			fdm_active->rmr_region.source_len) ||
+			!fdm_active->cpu_state_data.bytes_dumped) {
+		printk(KERN_ERR "Dump taken by platform is incomplete\n");
+		return -EINVAL;
+	}
+
+	/* Validate the fadump crash info header */
+	fdh = __va(fw_dump.fadumphdr_addr);
+	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
+		printk(KERN_ERR "Crash info header is not valid.\n");
+		return -EINVAL;
+	}
+
+	rc = fadump_build_cpu_notes(fdm_active);
+	if (rc)
+		return rc;
+
+	/*
+	 * We are done validating dump info and the elfcore header is now
+	 * ready to be exported. Set elfcorehdr_addr so that the vmcore
+	 * module will export the elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+
+	return 0;
+}
+
+static inline void fadump_add_crash_memory(unsigned long long base,
+					unsigned long long end)
+{
+	if (base == end)
+		return;
+
+	pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n",
+		crash_mem_ranges, base, end - 1, (end - base));
+	crash_memory_ranges[crash_mem_ranges].base = base;
+	crash_memory_ranges[crash_mem_ranges].size = end - base;
+	crash_mem_ranges++;
+}
+
+static void fadump_exclude_reserved_area(unsigned long long start,
+					unsigned long long end)
+{
+	unsigned long long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	if ((ra_start < end) && (ra_end > start)) {
+		if ((start < ra_start) && (end > ra_end)) {
+			fadump_add_crash_memory(start, ra_start);
+			fadump_add_crash_memory(ra_end, end);
+		} else if (start < ra_start) {
+			fadump_add_crash_memory(start, ra_start);
+		} else if (ra_end < end) {
+			fadump_add_crash_memory(ra_end, end);
+		}
+	} else
+		fadump_add_crash_memory(start, end);
+}
+
+static int fadump_init_elfcore_header(char *bufp)
+{
+	struct elfhdr *elf;
+
+	elf = (struct elfhdr *) bufp;
+	bufp += sizeof(struct elfhdr);
+	memcpy(elf->e_ident, ELFMAG, SELFMAG);
+	elf->e_ident[EI_CLASS] = ELF_CLASS;
+	elf->e_ident[EI_DATA] = ELF_DATA;
+	elf->e_ident[EI_VERSION] = EV_CURRENT;
+	elf->e_ident[EI_OSABI] = ELF_OSABI;
+	memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD);
+	elf->e_type = ET_CORE;
+	elf->e_machine = ELF_ARCH;
+	elf->e_version = EV_CURRENT;
+	elf->e_entry = 0;
+	elf->e_phoff = sizeof(struct elfhdr);
+	elf->e_shoff = 0;
+	elf->e_flags = ELF_CORE_EFLAGS;
+	elf->e_ehsize = sizeof(struct elfhdr);
+	elf->e_phentsize = sizeof(struct elf_phdr);
+	elf->e_phnum = 0;
+	elf->e_shentsize = 0;
+	elf->e_shnum = 0;
+	elf->e_shstrndx = 0;
+
+	return 0;
+}
+
+/*
+ * Traverse through the lmb structure and set up crash memory ranges. These
+ * ranges will be used to create PT_LOAD program headers in the elfcore
+ * header.
+ */
+static void fadump_setup_crash_memory_ranges(void)
+{
+	struct lmb_property *reg;
+	unsigned long long start, end;
+	unsigned long iter;
+
+	pr_debug("Setup crash memory ranges.\n");
+	crash_mem_ranges = 0;
+	/*
+	 * Add the first memory chunk (RMA_START through boot_memory_size) as
+	 * a separate memory chunk. The reason is that at the time of crash
+	 * firmware will move the content of this memory chunk to a different
+	 * location specified during fadump registration. We need to create a
+	 * separate program header for this chunk with the correct offset.
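fadump_exclude_reserved_area() above is a four-case interval subtraction: the reserved window can split a range, clip either end of it, swallow it, or miss it entirely. The same case analysis in a standalone form, printing the surviving pieces for two sample inputs:

    #include <stdint.h>
    #include <stdio.h>

    static void add_range(uint64_t base, uint64_t end)
    {
        if (base == end)
            return;
        printf("  [%#llx-%#llx]\n", (unsigned long long)base,
               (unsigned long long)(end - 1));
    }

    /* Emit the parts of [start, end) that lie outside [ra_start, ra_end). */
    static void exclude(uint64_t start, uint64_t end,
                        uint64_t ra_start, uint64_t ra_end)
    {
        if (ra_start < end && ra_end > start) {
            if (start < ra_start && end > ra_end) {   /* window inside */
                add_range(start, ra_start);
                add_range(ra_end, end);
            } else if (start < ra_start) {            /* clips the tail */
                add_range(start, ra_start);
            } else if (ra_end < end) {                /* clips the head */
                add_range(ra_end, end);
            }                                         /* else: swallowed */
        } else {
            add_range(start, end);                    /* disjoint */
        }
    }

    int main(void)
    {
        exclude(0x00000000, 0x40000000, 0x10000000, 0x20000000); /* split */
        exclude(0x50000000, 0x60000000, 0x10000000, 0x20000000); /* intact */
        return 0;
    }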
+ */ + fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); + + for_each_lmb(iter, memory, reg) { + start = (unsigned long long)reg->base; + end = start + (unsigned long long)reg->size; + if (start == RMA_START && end >= fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + + /* add this range excluding the reserved dump area. */ + fadump_exclude_reserved_area(start, end); + } +} + +/* + * If the given physical address falls within the boot memory region then + * return the relocated address that points to the dump region reserved + * for saving initial boot memory contents. + */ +static inline unsigned long fadump_relocate(unsigned long paddr) +{ + if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) + return fdm.rmr_region.destination_address + paddr; + else + return paddr; +} + +static int fadump_create_elfcore_headers(char *bufp) +{ + struct elfhdr *elf; + struct elf_phdr *phdr; + int i; + + fadump_init_elfcore_header(bufp); + elf = (struct elfhdr *)bufp; + bufp += sizeof(struct elfhdr); + + /* + * setup ELF PT_NOTE, place holder for cpu notes info. The notes info + * will be populated during second kernel boot after crash. Hence + * this PT_NOTE will always be the first elf note. + * + * NOTE: Any new ELF note addition should be placed after this note. + */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_offset = 0; + phdr->p_paddr = 0; + phdr->p_filesz = 0; + phdr->p_memsz = 0; + + (elf->e_phnum)++; + + /* setup ELF PT_NOTE for vmcoreinfo */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); + phdr->p_offset = phdr->p_paddr; + phdr->p_memsz = vmcoreinfo_max_size; + phdr->p_filesz = vmcoreinfo_max_size; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + + /* setup PT_LOAD sections. */ + + for (i = 0; i < crash_mem_ranges; i++) { + unsigned long long mbase, msize; + mbase = crash_memory_ranges[i].base; + msize = crash_memory_ranges[i].size; + + if (!msize) + continue; + + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mbase; + + if (mbase == RMA_START) { + /* + * The entire RMA region will be moved by firmware + * to the specified destination_address. Hence set + * the correct offset. + */ + phdr->p_offset = fdm.rmr_region.destination_address; + } + + phdr->p_paddr = mbase; + phdr->p_vaddr = (unsigned long)__va(mbase); + phdr->p_filesz = msize; + phdr->p_memsz = msize; + phdr->p_align = 0; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + } + return 0; +} + +static unsigned long init_fadump_header(unsigned long addr) +{ + struct fadump_crash_info_header *fdh; + + if (!addr) + return 0; + + fw_dump.fadumphdr_addr = addr; + fdh = __va(addr); + addr += sizeof(struct fadump_crash_info_header); + + memset(fdh, 0, sizeof(struct fadump_crash_info_header)); + fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; + fdh->elfcorehdr_addr = addr; + /* We will set the crashing cpu id in crash_fadump() during crash. */ + fdh->crashing_cpu = CPU_UNKNOWN; + + return addr; +} + +static void register_fadump(void) +{ + unsigned long addr; + void *vaddr; + + /* + * If no memory is reserved then we can not register for firmware- + * assisted dump. 
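fadump_relocate() above captures the one address-space quirk of this scheme: physical addresses inside the preserved boot memory must be read from the copy firmware made at destination_address. A sketch with hypothetical region bounds (RMA_START is assumed to be 0, as on ppc64):

    #include <stdint.h>
    #include <stdio.h>

    #define RMA_START 0x0ULL              /* assumed */

    static uint64_t dest          = 0x40000000;  /* hypothetical copy location */
    static uint64_t boot_mem_size = 0x10000000;  /* 256MB preserved region */

    /* Same shape as fadump_relocate(): redirect addresses that fall
     * inside the boot memory region into firmware's copy of it. */
    static uint64_t relocate(uint64_t paddr)
    {
        if (paddr > RMA_START && paddr < boot_mem_size)
            return dest + paddr;
        return paddr;
    }

    int main(void)
    {
        printf("%#llx\n", (unsigned long long)relocate(0x1000));      /* moved */
        printf("%#llx\n", (unsigned long long)relocate(0x20000000)); /* as-is */
        return 0;
    }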
+ */
+	if (!fw_dump.reserve_dump_area_size)
+		return;
+
+	fadump_setup_crash_memory_ranges();
+
+	addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len;
+	/* Initialize fadump crash info header. */
+	addr = init_fadump_header(addr);
+	vaddr = __va(addr);
+
+	pr_debug("Creating ELF core headers at %#016lx\n", addr);
+	fadump_create_elfcore_headers(vaddr);
+
+	/* register the future kernel dump with firmware. */
+	register_fw_dump(&fdm);
+}
+
+static int fadump_unregister_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Un-register firmware-assisted dump\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_UNREGISTER, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to un-register firmware-assisted dump."
+			" Unexpected error (%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_registered = 0;
+	return 0;
+}
+
+static int fadump_invalidate_dump(struct fadump_mem_struct *fdm)
+{
+	int rc = 0;
+	unsigned int wait_time;
+
+	pr_debug("Invalidating firmware-assisted dump registration\n");
+
+	/* TODO: Add upper time limit for the delay */
+	do {
+		rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL,
+			FADUMP_INVALIDATE, fdm,
+			sizeof(struct fadump_mem_struct));
+
+		wait_time = rtas_busy_delay_time(rc);
+		if (wait_time)
+			mdelay(wait_time);
+	} while (wait_time);
+
+	if (rc) {
+		printk(KERN_ERR "Failed to invalidate firmware-assisted dump "
+			"registration. Unexpected error (%d).\n", rc);
+		return rc;
+	}
+	fw_dump.dump_active = 0;
+	fdm_active = NULL;
+	return 0;
+}
+
+void fadump_cleanup(void)
+{
+	/* Invalidate the registration only if dump is active. */
+	if (fw_dump.dump_active) {
+		init_fadump_mem_struct(&fdm,
+			fdm_active->cpu_state_data.destination_address);
+		fadump_invalidate_dump(&fdm);
+	}
+}
+
+/*
+ * Release the memory that was reserved in early boot to preserve the memory
+ * contents. The released memory will be available for general use.
+ */
+static void fadump_release_memory(unsigned long begin, unsigned long end)
+{
+	unsigned long addr;
+	unsigned long ra_start, ra_end;
+
+	ra_start = fw_dump.reserve_dump_area_start;
+	ra_end = ra_start + fw_dump.reserve_dump_area_size;
+
+	for (addr = begin; addr < end; addr += PAGE_SIZE) {
+		/*
+		 * exclude the dump reserve area. Will reuse it for next
+		 * fadump registration.
+		 */
+		if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start))
+			continue;
+
+		ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
+		init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
+		free_page((unsigned long)__va(addr));
+		totalram_pages++;
+	}
+}
+
+static void fadump_invalidate_release_mem(void)
+{
+	unsigned long reserved_area_start, reserved_area_end;
+	unsigned long destination_address;
+
+	mutex_lock(&fadump_mutex);
+	if (!fw_dump.dump_active) {
+		mutex_unlock(&fadump_mutex);
+		return;
+	}
+
+	destination_address = fdm_active->cpu_state_data.destination_address;
+	fadump_cleanup();
+	mutex_unlock(&fadump_mutex);
+
+	/*
+	 * Save the current reserved memory bounds; we will require them
+	 * later for releasing the memory for general use.
+	 */
+	reserved_area_start = fw_dump.reserve_dump_area_start;
+	reserved_area_end = reserved_area_start +
+			fw_dump.reserve_dump_area_size;
+	/*
+	 * Setup reserve_dump_area_start and its size so that we can
+	 * reuse this reserved memory for re-registration.
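register_fw_dump(), fadump_unregister_dump() and fadump_invalidate_dump() all share the same RTAS retry shape: reissue the call for as long as rtas_busy_delay_time() converts the return code into a non-zero backoff. A condensed sketch of that pattern, with stub functions standing in for the RTAS layer:

    #include <stdio.h>

    static int attempts;

    /* Stub: report "busy" (-2) twice, then succeed. */
    static int rtas_call_stub(void)
    {
        return ++attempts < 3 ? -2 : 0;
    }

    /* Stub for rtas_busy_delay_time(): milliseconds to wait, 0 = done. */
    static unsigned int busy_delay_time(int rc)
    {
        return rc == -2 ? 1 : 0;
    }

    int main(void)
    {
        int rc;
        unsigned int wait_time;

        do {
            rc = rtas_call_stub();
            wait_time = busy_delay_time(rc);
            /* the kernel does mdelay(wait_time) here */
        } while (wait_time);

        printf("rc=%d after %d attempts\n", rc, attempts);
        return 0;
    }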
+ */ + fw_dump.reserve_dump_area_start = destination_address; + fw_dump.reserve_dump_area_size = get_fadump_area_size(); + + fadump_release_memory(reserved_area_start, reserved_area_end); + if (fw_dump.cpu_notes_buf) { + fadump_cpu_notes_buf_free( + (unsigned long)__va(fw_dump.cpu_notes_buf), + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf = 0; + fw_dump.cpu_notes_buf_size = 0; + } + /* Initialize the kernel dump memory structure for FAD registration. */ + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); +} + +static ssize_t fadump_release_memory_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!fw_dump.dump_active) + return -EPERM; + + if (buf[0] == '1') { + /* + * Take away the '/proc/vmcore'. We are releasing the dump + * memory, hence it will not be valid anymore. + */ + vmcore_cleanup(); + fadump_invalidate_release_mem(); + + } else + return -EINVAL; + return count; +} + +static ssize_t fadump_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.fadump_enabled); +} + +static ssize_t fadump_register_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.dump_registered); +} + +static ssize_t fadump_register_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret = 0; + + if (!fw_dump.fadump_enabled || fdm_active) + return -EPERM; + + mutex_lock(&fadump_mutex); + + switch (buf[0]) { + case '0': + if (fw_dump.dump_registered == 0) { + ret = -EINVAL; + goto unlock_out; + } + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); + break; + case '1': + if (fw_dump.dump_registered == 1) { + ret = -EINVAL; + goto unlock_out; + } + /* Register Firmware-assisted dump */ + register_fadump(); + break; + default: + ret = -EINVAL; + break; + } + +unlock_out: + mutex_unlock(&fadump_mutex); + return ret < 0 ? ret : count; +} + +static int fadump_region_show(struct seq_file *m, void *private) +{ + const struct fadump_mem_struct *fdm_ptr; + + if (!fw_dump.fadump_enabled) + return 0; + + mutex_lock(&fadump_mutex); + if (fdm_active) + fdm_ptr = fdm_active; + else { + mutex_unlock(&fadump_mutex); + fdm_ptr = &fdm; + } + + seq_printf(m, + "CPU : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->cpu_state_data.destination_address, + fdm_ptr->cpu_state_data.destination_address + + fdm_ptr->cpu_state_data.source_len - 1, + fdm_ptr->cpu_state_data.source_len, + fdm_ptr->cpu_state_data.bytes_dumped); + seq_printf(m, + "HPTE: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->hpte_region.destination_address, + fdm_ptr->hpte_region.destination_address + + fdm_ptr->hpte_region.source_len - 1, + fdm_ptr->hpte_region.source_len, + fdm_ptr->hpte_region.bytes_dumped); + seq_printf(m, + "DUMP: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->rmr_region.destination_address, + fdm_ptr->rmr_region.destination_address + + fdm_ptr->rmr_region.source_len - 1, + fdm_ptr->rmr_region.source_len, + fdm_ptr->rmr_region.bytes_dumped); + + if (!fdm_active || + (fw_dump.reserve_dump_area_start == + fdm_ptr->cpu_state_data.destination_address)) + goto out; + + /* Dump is active. Show reserved memory region. 
*/ + seq_printf(m, + " : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + (unsigned long long)fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - 1, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start); +out: + if (fdm_active) + mutex_unlock(&fadump_mutex); + return 0; +} + +static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem, + 0200, NULL, + fadump_release_memory_store); +static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled, + 0444, fadump_enabled_show, + NULL); +static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered, + 0644, fadump_register_show, + fadump_register_store); + +static int fadump_region_open(struct inode *inode, struct file *file) +{ + return single_open(file, fadump_region_show, inode->i_private); +} + +static const struct file_operations fadump_region_fops = { + .open = fadump_region_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void fadump_init_files(void) +{ + struct dentry *debugfs_file; + int rc = 0; + + rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_enabled (%d)\n", rc); + + rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_registered (%d)\n", rc); + + debugfs_file = debugfs_create_file("fadump_region", 0444, + powerpc_debugfs_root, NULL, + &fadump_region_fops); + if (!debugfs_file) + printk(KERN_ERR "fadump: unable to create debugfs file" + " fadump_region\n"); + + if (fw_dump.dump_active) { + rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_release_mem (%d)\n", rc); + } + return; +} + +/* + * Prepare for firmware-assisted dump. + */ +int __init setup_fadump(void) +{ + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_ERR "Firmware-assisted dump is not supported on" + " this hardware\n"); + return 0; + } + + fadump_show_config(); + /* + * If dump data is available then see if it is valid and prepare for + * saving it to the disk. + */ + if (fw_dump.dump_active) { + /* + * if dump process fails then invalidate the registration + * and release memory before proceeding for re-registration. + */ + if (process_fadump(fdm_active) < 0) + fadump_invalidate_release_mem(); + } + /* Initialize the kernel dump memory structure for FAD registration. */ + else if (fw_dump.reserve_dump_area_size) + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + fadump_init_files(); + + return 1; +} +subsys_initcall(setup_fadump); diff -uprN linux-2.6.32/arch/powerpc/kernel/head_64.S linux-2.6.32.ovz/arch/powerpc/kernel/head_64.S --- linux-2.6.32/arch/powerpc/kernel/head_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/head_64.S 2015-09-10 10:07:39.000000000 -0700 @@ -478,6 +478,7 @@ _GLOBAL(copy_and_flush) sync addi r5,r5,8 addi r6,r6,8 + isync blr .align 8 @@ -563,15 +564,21 @@ __secondary_start: /* Set thread priority to MEDIUM */ HMT_MEDIUM - /* Do early setup for that CPU (stab, slb, hash table pointer) */ - bl .early_setup_secondary - /* Initialize the kernel stack. Just a repeat for iSeries. 
*/ LOAD_REG_ADDR(r3, current_set) sldi r28,r24,3 /* get current_set[cpu#] */ - ldx r1,r3,r28 - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD - std r1,PACAKSAVE(r13) + ldx r14,r3,r28 + addi r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD + std r14,PACAKSAVE(r13) + + /* Do early setup for that CPU (stab, slb, hash table pointer) */ + bl .early_setup_secondary + + /* + * setup the new stack pointer, but *don't* use this until + * translation is on. + */ + mr r1, r14 /* Clear backchain so we get nice backtraces */ li r7,0 @@ -608,6 +615,17 @@ _GLOBAL(start_secondary_prolog) std r3,0(r1) /* Zero the stack frame pointer */ bl .start_secondary b . +/* + * Reset stack pointer and call start_secondary + * to continue with online operation when woken up + * from cede in cpu offline. + */ +_GLOBAL(start_secondary_resume) + ld r1,PACAKSAVE(r13) /* Reload kernel stack pointer */ + li r3,0 + std r3,0(r1) /* Zero the stack frame pointer */ + bl .start_secondary + b . #endif /* diff -uprN linux-2.6.32/arch/powerpc/kernel/iommu.c linux-2.6.32.ovz/arch/powerpc/kernel/iommu.c --- linux-2.6.32/arch/powerpc/kernel/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/iommu.c 2015-09-10 10:07:39.000000000 -0700 @@ -33,12 +33,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #define DBG(...) @@ -74,6 +76,26 @@ static int __init setup_iommu(char *str) __setup("protect4gb=", setup_protect4gb); __setup("iommu=", setup_iommu); +static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); + +/* + * We precalculate the hash to avoid doing it on every allocation. + * + * The hash is important to spread CPUs across all the pools. For example, + * on a POWER7 with 4 way SMT we want interrupts on the primary threads and + * with 4 pools all primary threads would map to the same pool. + */ +static int __init setup_iommu_pool_hash(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); + + return 0; +} +subsys_initcall(setup_iommu_pool_hash); + static unsigned long iommu_range_alloc(struct device *dev, struct iommu_table *tbl, unsigned long npages, @@ -87,6 +109,9 @@ static unsigned long iommu_range_alloc(s int pass = 0; unsigned long align_mask; unsigned long boundary_size; + unsigned long flags; + unsigned int pool_nr; + struct iommu_pool *pool; align_mask = 0xffffffffffffffffl >> (64 - align_order); @@ -99,36 +124,49 @@ static unsigned long iommu_range_alloc(s return DMA_ERROR_CODE; } - if (handle && *handle) - start = *handle; + /* + * We don't need to disable preemption here because any CPU can + * safely use any IOMMU pool. + */ + pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1); + + if (largealloc) + pool = &(tbl->large_pool); else - start = largealloc ? tbl->it_largehint : tbl->it_hint; + pool = &(tbl->pools[pool_nr]); - /* Use only half of the table for small allocs (15 pages or less) */ - limit = largealloc ? tbl->it_size : tbl->it_halfpoint; + spin_lock_irqsave(&(pool->lock), flags); - if (largealloc && start < tbl->it_halfpoint) - start = tbl->it_halfpoint; +again: + if ((pass == 0) && handle && *handle && + (*handle >= pool->start) && (*handle < pool->end)) + start = *handle; + else + start = pool->hint; + + limit = pool->end; /* The case below can happen if we have a small segment appended * to a large, or when the previous alloc was at the very end of * the available space. If so, go back to the initial start. */ if (start >= limit) - start = largealloc ? 
tbl->it_largehint : tbl->it_hint; - - again: + start = pool->start; if (limit + tbl->it_offset > mask) { limit = mask - tbl->it_offset + 1; /* If we're constrained on address range, first try * at the masked hint to avoid O(n) search complexity, - * but on second pass, start at 0. + * but on second pass, start at 0 in pool 0. */ - if ((start & mask) >= limit || pass > 0) - start = 0; - else + if ((start & mask) >= limit || pass > 0) { + spin_unlock(&(pool->lock)); + pool = &(tbl->pools[0]); + spin_lock(&(pool->lock)); + start = pool->start; + } else { start &= mask; + } } if (dev) @@ -142,16 +180,25 @@ static unsigned long iommu_range_alloc(s tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT, align_mask); if (n == -1) { - if (likely(pass < 2)) { - /* First failure, just rescan the half of the table. - * Second failure, rescan the other half of the table. - */ - start = (largealloc ^ pass) ? tbl->it_halfpoint : 0; - limit = pass ? tbl->it_size : limit; + if (likely(pass == 0)) { + /* First try the pool from the start */ + pool->hint = pool->start; + pass++; + goto again; + + } else if (pass <= tbl->nr_pools) { + /* Now try scanning all the other pools */ + spin_unlock(&(pool->lock)); + pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1); + pool = &tbl->pools[pool_nr]; + spin_lock(&(pool->lock)); + pool->hint = pool->start; pass++; goto again; + } else { - /* Third failure, give up */ + /* Give up */ + spin_unlock_irqrestore(&(pool->lock), flags); return DMA_ERROR_CODE; } } @@ -161,10 +208,10 @@ static unsigned long iommu_range_alloc(s /* Bump the hint to a new block for small allocs. */ if (largealloc) { /* Don't bump to new block to avoid fragmentation */ - tbl->it_largehint = end; + pool->hint = end; } else { /* Overflow will be taken care of at the next allocation */ - tbl->it_hint = (end + tbl->it_blocksize - 1) & + pool->hint = (end + tbl->it_blocksize - 1) & ~(tbl->it_blocksize - 1); } @@ -172,6 +219,8 @@ static unsigned long iommu_range_alloc(s if (handle) *handle = end; + spin_unlock_irqrestore(&(pool->lock), flags); + return n; } @@ -181,18 +230,14 @@ static dma_addr_t iommu_alloc(struct dev unsigned long mask, unsigned int align_order, struct dma_attrs *attrs) { - unsigned long entry, flags; + unsigned long entry; dma_addr_t ret = DMA_ERROR_CODE; int build_fail; - spin_lock_irqsave(&(tbl->it_lock), flags); - entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); - if (unlikely(entry == DMA_ERROR_CODE)) { - spin_unlock_irqrestore(&(tbl->it_lock), flags); + if (unlikely(entry == DMA_ERROR_CODE)) return DMA_ERROR_CODE; - } entry += tbl->it_offset; /* Offset into real TCE table */ ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */ @@ -209,8 +254,6 @@ static dma_addr_t iommu_alloc(struct dev */ if (unlikely(build_fail)) { __iommu_free(tbl, ret, npages); - - spin_unlock_irqrestore(&(tbl->it_lock), flags); return DMA_ERROR_CODE; } @@ -218,16 +261,14 @@ static dma_addr_t iommu_alloc(struct dev if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - spin_unlock_irqrestore(&(tbl->it_lock), flags); - /* Make sure updates are seen by hardware */ mb(); return ret; } -static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) +static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) { unsigned long entry, free_entry; @@ -247,20 +288,57 @@ static void __iommu_free(struct iommu_ta printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index); WARN_ON(1); } - return; + + return false; } - ppc_md.tce_free(tbl, 
entry, npages); - iommu_area_free(tbl->it_map, free_entry, npages); + return true; } -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) +static struct iommu_pool *get_pool(struct iommu_table *tbl, + unsigned long entry) { + struct iommu_pool *p; + unsigned long largepool_start = tbl->large_pool.start; + + /* The large pool is the last pool at the top of the table */ + if (entry >= largepool_start) { + p = &tbl->large_pool; + } else { + unsigned int pool_nr = entry / tbl->poolsize; + + BUG_ON(pool_nr > tbl->nr_pools); + p = &tbl->pools[pool_nr]; + } + + return p; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry, free_entry; unsigned long flags; + struct iommu_pool *pool; - spin_lock_irqsave(&(tbl->it_lock), flags); + entry = dma_addr >> IOMMU_PAGE_SHIFT; + free_entry = entry - tbl->it_offset; + + pool = get_pool(tbl, free_entry); + + if (!iommu_free_check(tbl, dma_addr, npages)) + return; + + ppc_md.tce_free(tbl, entry, npages); + + spin_lock_irqsave(&(pool->lock), flags); + bitmap_clear(tbl->it_map, free_entry, npages); + spin_unlock_irqrestore(&(pool->lock), flags); +} +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ __iommu_free(tbl, dma_addr, npages); /* Make sure TLB cache is flushed if the HW needs it. We do @@ -269,8 +347,6 @@ static void iommu_free(struct iommu_tabl */ if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - - spin_unlock_irqrestore(&(tbl->it_lock), flags); } int iommu_map_sg(struct device *dev, struct iommu_table *tbl, @@ -279,7 +355,6 @@ int iommu_map_sg(struct device *dev, str struct dma_attrs *attrs) { dma_addr_t dma_next = 0, dma_addr; - unsigned long flags; struct scatterlist *s, *outs, *segstart; int outcount, incount, i, build_fail = 0; unsigned int align; @@ -301,8 +376,6 @@ int iommu_map_sg(struct device *dev, str DBG("sg mapping %d elements:\n", nelems); - spin_lock_irqsave(&(tbl->it_lock), flags); - max_seg_size = dma_get_max_seg_size(dev); for_each_sg(sglist, s, nelems, i) { unsigned long vaddr, npages, entry, slen; @@ -384,8 +457,6 @@ int iommu_map_sg(struct device *dev, str if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - spin_unlock_irqrestore(&(tbl->it_lock), flags); - DBG("mapped %d elements:\n", outcount); /* For the sake of iommu_unmap_sg, we clear out the length in the @@ -417,7 +488,6 @@ int iommu_map_sg(struct device *dev, str if (s == outs) break; } - spin_unlock_irqrestore(&(tbl->it_lock), flags); return 0; } @@ -427,15 +497,12 @@ void iommu_unmap_sg(struct iommu_table * struct dma_attrs *attrs) { struct scatterlist *sg; - unsigned long flags; BUG_ON(direction == DMA_NONE); if (!tbl) return; - spin_lock_irqsave(&(tbl->it_lock), flags); - sg = sglist; while (nelems--) { unsigned int npages; @@ -455,13 +522,16 @@ void iommu_unmap_sg(struct iommu_table * */ if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - - spin_unlock_irqrestore(&(tbl->it_lock), flags); } static void iommu_table_clear(struct iommu_table *tbl) { - if (!is_kdump_kernel()) { + /* + * In case of firmware assisted dump system goes through clean + * reboot process at the time of system crash. Hence it's safe to + * clear the TCE entries if firmware assisted dump is active. 
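With the table carved into nr_pools small pools plus one large pool at the top, get_pool() above reduces to simple index arithmetic. A standalone sketch of that mapping for a hypothetical 64K-entry table split four ways:

    #include <stdio.h>

    #define NR_POOLS 4

    /* Same arithmetic as get_pool(): entries at or above the large pool's
     * start belong to the large pool; everything else is entry / poolsize. */
    static int pool_index(unsigned long entry, unsigned long poolsize,
                          unsigned long largepool_start)
    {
        if (entry >= largepool_start)
            return NR_POOLS;            /* the large pool */
        return entry / poolsize;
    }

    int main(void)
    {
        unsigned long it_size = 65536;
        unsigned long poolsize = (it_size * 3 / 4) / NR_POOLS;  /* 12288 */
        unsigned long large_start = poolsize * NR_POOLS;        /* 49152 */

        printf("%d\n", pool_index(100,   poolsize, large_start)); /* 0 */
        printf("%d\n", pool_index(20000, poolsize, large_start)); /* 1 */
        printf("%d\n", pool_index(60000, poolsize, large_start)); /* 4 */
        return 0;
    }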
+ */ + if (!is_kdump_kernel() || is_fadump_active()) { /* Clear the table in case firmware left allocations in it */ ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); return; @@ -504,22 +574,48 @@ struct iommu_table *iommu_init_table(str unsigned long sz; static int welcomed = 0; struct page *page; - - /* Set aside 1/4 of the table for large allocations. */ - tbl->it_halfpoint = tbl->it_size * 3 / 4; + unsigned int i; + struct iommu_pool *p; /* number of bytes needed for the bitmap */ sz = (tbl->it_size + 7) >> 3; - page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz)); + page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz)); if (!page) panic("iommu_init_table: Can't allocate %ld bytes\n", sz); tbl->it_map = page_address(page); memset(tbl->it_map, 0, sz); - tbl->it_hint = 0; - tbl->it_largehint = tbl->it_halfpoint; - spin_lock_init(&tbl->it_lock); + /* + * Reserve page 0 so it will not be used for any mappings. + * This avoids buggy drivers that consider page 0 to be invalid + * to crash the machine or even lose data. + */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + + /* We only split the IOMMU table if we have 1GB or more of space */ + if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024)) + tbl->nr_pools = IOMMU_NR_POOLS; + else + tbl->nr_pools = 1; + + /* We reserve the top 1/4 of the table for large allocations */ + tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools; + + for (i = 0; i < tbl->nr_pools; i++) { + p = &tbl->pools[i]; + spin_lock_init(&(p->lock)); + p->start = tbl->poolsize * i; + p->hint = p->start; + p->end = p->start + tbl->poolsize; + } + + p = &tbl->large_pool; + spin_lock_init(&(p->lock)); + p->start = tbl->poolsize * i; + p->hint = p->start; + p->end = tbl->it_size; iommu_table_clear(tbl); diff -uprN linux-2.6.32/arch/powerpc/kernel/irq.c linux-2.6.32.ovz/arch/powerpc/kernel/irq.c --- linux-2.6.32/arch/powerpc/kernel/irq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/irq.c 2015-09-10 10:06:02.000000000 -0700 @@ -53,7 +53,6 @@ #include #include #include -#include #include #include @@ -138,11 +137,6 @@ notrace void raw_local_irq_restore(unsig } #endif /* CONFIG_PPC_STD_MMU_64 */ - if (test_perf_event_pending()) { - clear_perf_event_pending(); - perf_event_do_pending(); - } - /* * if (get_paca()->hard_enabled) return; * But again we need to take care that gcc gets hard_enabled directly diff -uprN linux-2.6.32/arch/powerpc/kernel/lparcfg.c linux-2.6.32.ovz/arch/powerpc/kernel/lparcfg.c --- linux-2.6.32/arch/powerpc/kernel/lparcfg.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/lparcfg.c 2015-09-10 10:06:35.000000000 -0700 @@ -37,7 +37,7 @@ #include #include -#define MODULE_VERS "1.8" +#define MODULE_VERS "1.9" #define MODULE_NAME "lparcfg" /* #define LPARCFG_DEBUG */ @@ -131,34 +131,6 @@ static int iseries_lparcfg_data(struct s /* * Methods used to fetch LPAR data when running on a pSeries platform. 
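The h_get_mpp() helper removed in the next hunk decodes several fields that firmware packs into the 64-bit words returned by plpar_hcall9(). The shift-and-mask pattern is worth seeing on its own; the register values below are fabricated, and the field layout is taken from the code being removed:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical retbuf words as plpar_hcall9() would fill them in. */
        uint64_t retbuf2 = 0x0000000012340567ULL;
        uint64_t retbuf3 = 0x8001000000000100ULL;

        unsigned int group_num  = (retbuf2 >> (2 * 8)) & 0xffff;
        unsigned int pool_num   = retbuf2 & 0xffff;
        unsigned int mem_weight = (retbuf3 >> (7 * 8)) & 0xff;
        uint64_t unalloc_ent    = retbuf3 & 0xffffffffffffULL;

        printf("group=%#x pool=%#x weight=%#x unallocated=%#llx\n",
               group_num, pool_num, mem_weight,
               (unsigned long long)unalloc_ent);
        return 0;
    }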
*/ -/** - * h_get_mpp - * H_GET_MPP hcall returns info in 7 parms - */ -int h_get_mpp(struct hvcall_mpp_data *mpp_data) -{ - int rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; - - rc = plpar_hcall9(H_GET_MPP, retbuf); - - mpp_data->entitled_mem = retbuf[0]; - mpp_data->mapped_mem = retbuf[1]; - - mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; - mpp_data->pool_num = retbuf[2] & 0xffff; - - mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; - mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; - mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; - - mpp_data->pool_size = retbuf[4]; - mpp_data->loan_request = retbuf[5]; - mpp_data->backing_mem = retbuf[6]; - - return rc; -} -EXPORT_SYMBOL(h_get_mpp); struct hvcall_ppp_data { u64 entitlement; @@ -344,6 +316,30 @@ static void parse_mpp_data(struct seq_fi seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem); } +/** + * parse_mpp_x_data + * Parse out data returned from h_get_mpp_x + */ +static void parse_mpp_x_data(struct seq_file *m) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (!firmware_has_feature(FW_FEATURE_XCMO)) + return; + if (h_get_mpp_x(&mpp_x_data)) + return; + + seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes); + + if (mpp_x_data.pool_coalesced_bytes) + seq_printf(m, "pool_coalesced_bytes=%ld\n", + mpp_x_data.pool_coalesced_bytes); + if (mpp_x_data.pool_purr_cycles) + seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles); + if (mpp_x_data.pool_spurr_cycles) + seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles); +} + #define SPLPAR_CHARACTERISTICS_TOKEN 20 #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) @@ -486,6 +482,14 @@ static void splpar_dispatch_data(struct seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions); } +static void parse_em_data(struct seq_file *m) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS) + seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]); +} + static int pseries_lparcfg_data(struct seq_file *m, void *v) { int partition_potential_processors; @@ -511,6 +515,7 @@ static int pseries_lparcfg_data(struct s parse_system_parameter_string(m); parse_ppp_data(m); parse_mpp_data(m); + parse_mpp_x_data(m); pseries_cmo_data(m); splpar_dispatch_data(m); @@ -540,6 +545,8 @@ static int pseries_lparcfg_data(struct s seq_printf(m, "slb_size=%d\n", mmu_slb_size); + parse_em_data(m); + return 0; } diff -uprN linux-2.6.32/arch/powerpc/kernel/machine_kexec_64.c linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec_64.c --- linux-2.6.32/arch/powerpc/kernel/machine_kexec_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec_64.c 2015-09-10 10:06:34.000000000 -0700 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -155,33 +156,41 @@ void kexec_copy_flush(struct kimage *ima #ifdef CONFIG_SMP -/* FIXME: we should schedule this function to be called on all cpus based - * on calling the interrupts, but we would like to call it off irq level - * so that the interrupt controller is clean. 
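The rewrite below replaces the old "wait for hw_cpu_id == -1" handshake with an explicit per-CPU state machine: each secondary publishes KEXEC_STATE_IRQS_OFF, the initiating CPU waits for everyone, releases them by setting kexec_all_irq_disabled, and then waits again for KEXEC_STATE_REAL_MODE. A userspace pthread sketch of the same two-phase rendezvous (illustrative only; the kernel uses mb(), cpu_relax() and real IPIs, not volatile spins):

    #include <pthread.h>
    #include <stdio.h>

    #define NCPUS 4
    enum { STATE_NONE, STATE_IRQS_OFF, STATE_REAL_MODE };

    static volatile int state[NCPUS];
    static volatile int all_irqs_off;

    static void *secondary(void *arg)
    {
        int cpu = (int)(long)arg;

        state[cpu] = STATE_IRQS_OFF;    /* "my irqs are off" */
        while (!all_irqs_off)
            ;                           /* spin until released */
        state[cpu] = STATE_REAL_MODE;   /* "entered real mode" */
        return NULL;
    }

    static void wait_for(int wanted)
    {
        for (int i = 1; i < NCPUS; i++)
            while (state[i] < wanted)
                ;
    }

    int main(void)
    {
        pthread_t t[NCPUS];

        for (int i = 1; i < NCPUS; i++)
            pthread_create(&t[i], NULL, secondary, (void *)(long)i);

        wait_for(STATE_IRQS_OFF);    /* phase 1: all CPUs quiesced */
        all_irqs_off = 1;            /* release them into phase 2 */
        wait_for(STATE_REAL_MODE);   /* phase 2: all CPUs parked */
        puts("all secondaries quiesced");

        for (int i = 1; i < NCPUS; i++)
            pthread_join(t[i], NULL);
        return 0;
    }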
- */ +static u8 kexec_all_irq_disabled; +u8 kexec_state[NR_CPUS] = {0}; + static void kexec_smp_down(void *arg) { + int my_cpu = get_cpu(); + + local_irq_disable(); + mb(); /* make sure our irqs are disabled before we say they are */ + kexec_state[my_cpu] = KEXEC_STATE_IRQS_OFF; + while(kexec_all_irq_disabled == 0) + cpu_relax(); + mb(); /* make sure all irqs are disabled before this */ + /* + * Now every CPU has IRQs off, we can clear out any pending + * IPIs and be sure that no more will come in after this. + */ if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(0, 1); - local_irq_disable(); kexec_smp_wait(); /* NOTREACHED */ } -static void kexec_prepare_cpus(void) +static void kexec_prepare_cpus_wait(int wait_state) { int my_cpu, i, notified=-1; - smp_call_function(kexec_smp_down, NULL, /* wait */0); my_cpu = get_cpu(); - - /* check the others cpus are now down (via paca hw cpu id == -1) */ + /* Make sure each CPU has at least made it to the state we need */ for (i=0; i < NR_CPUS; i++) { if (i == my_cpu) continue; - while (paca[i].hw_cpu_id != -1) { + while (kexec_state[i] < wait_state) { barrier(); if (!cpu_possible(i)) { printk("kexec: cpu %d hw_cpu_id %d is not" @@ -201,20 +210,61 @@ static void kexec_prepare_cpus(void) } if (i != notified) { printk( "kexec: waiting for cpu %d (physical" - " %d) to go down\n", - i, paca[i].hw_cpu_id); + " %d) to enter %i state\n", + i, paca[i].hw_cpu_id, wait_state); notified = i; } } } + mb(); +} + +/* + * We need to make sure each present CPU is online. The next kernel will scan + * the device tree and assume primary threads are online and query secondary + * threads via RTAS to online them if required. If we don't online primary + * threads, they will be stuck. However, we also online secondary threads as we + * may be using 'cede offline'. In this case RTAS doesn't see the secondary + * threads as offline -- and again, these CPUs will be stuck. + * + * So, we online all CPUs that should be running, including secondary threads. + */ +static void wake_offline_cpus(void) +{ + int cpu = 0; + + for_each_present_cpu(cpu) { + if (!cpu_online(cpu)) { + printk(KERN_INFO "kexec: Waking offline cpu %d.\n", + cpu); + cpu_up(cpu); + } + } +} + +static void kexec_prepare_cpus(void) +{ + int my_cpu = get_cpu(); + + wake_offline_cpus(); + smp_call_function(kexec_smp_down, NULL, /* wait */0); + local_irq_disable(); + mb(); /* make sure IRQs are disabled before we say they are */ + kexec_state[my_cpu] = KEXEC_STATE_IRQS_OFF; + + kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF); + /* we are sure every CPU has IRQs off at this point */ + kexec_all_irq_disabled = 1; /* after we tell the others to go down */ if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(0, 0); - put_cpu(); + /* Before removing MMU mappings, + * make sure all CPUs have entered real mode */ + kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE); - local_irq_disable(); + put_cpu(); } #else /* !
SMP */ diff -uprN linux-2.6.32/arch/powerpc/kernel/machine_kexec.c linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec.c --- linux-2.6.32/arch/powerpc/kernel/machine_kexec.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec.c 2015-10-24 08:14:47.354501188 -0700 @@ -45,6 +45,18 @@ void machine_kexec_cleanup(struct kimage ppc_md.machine_kexec_cleanup(image); } +void arch_crash_save_vmcoreinfo(void) +{ + +#ifdef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(contig_page_data); +#endif +} + /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. @@ -61,6 +73,35 @@ void machine_kexec(struct kimage *image) for(;;); } +#ifdef CONFIG_KEXEC_AUTO_RESERVE +unsigned long long __init arch_default_crash_base(void) +{ +#ifndef CONFIG_RELOCATABLE + return KDUMP_KERNELBASE; +#else + return 0; +#endif +} + +unsigned long long __init +arch_crash_auto_scale(unsigned long long total_size, unsigned long long size) +{ + if (total_size < KEXEC_AUTO_THRESHOLD) + return 0; + if (total_size < (1ULL<<32)) + return 1ULL<<27; + else { +#ifdef CONFIG_64BIT + if (total_size > (1ULL<<37)) /* 128G */ + return 1ULL<<32; /* 4G */ + return 1ULL< 0) { crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; @@ -98,7 +139,7 @@ void __init reserve_crashkernel(void) /* * unspecified address, choose a region of specified size * can overlap with initrd (ignoring corruption when retained) - * ppc64 requires kernel and some stacks to be in first segemnt + * ppc64 requires kernel and some stacks to be in first segment */ crashk_res.start = KDUMP_KERNELBASE; } @@ -109,24 +150,42 @@ void __init reserve_crashkernel(void) PAGE_SIZE); crashk_res.start = crash_base; } - #endif crash_size = PAGE_ALIGN(crash_size); crashk_res.end = crashk_res.start + crash_size - 1; /* The crash region must not overlap the current kernel */ if (overlaps_crashkernel(__pa(_stext), _end - _stext)) { +#ifdef CONFIG_RELOCATABLE + do { + /* Align kdump kernel to 16MB (size of large page) */ + crashk_res.start = ALIGN(crashk_res.start + + (16 * 1024 * 1024), 0x1000000); + if (crashk_res.start + (_end - _stext) > lmb.rmo_size) { + printk(KERN_WARNING + "Not enough memory for crash kernel\n"); + crashk_res.start = crashk_res.end = 0; + return; + } + } while (overlaps_crashkernel(__pa(_stext), _end - _stext)); + + crashk_res.end = crashk_res.start + crash_size - 1; + printk(KERN_INFO + "crash kernel memory overlaps with kernel memory\n" + "Moving it to %ldMB\n", (unsigned long)(crashk_res.start >> 20)); +#else printk(KERN_WARNING "Crash kernel can not overlap current kernel\n"); crashk_res.start = crashk_res.end = 0; return; +#endif } /* Crash kernel trumps memory limit */ if (memory_limit && memory_limit <= crashk_res.end) { memory_limit = crashk_res.end + 1; printk("Adjusted memory limit for crashkernel, now 0x%llx\n", - (unsigned long long)memory_limit); + memory_limit); } printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " @@ -165,6 +224,12 @@ static struct property crashk_size_prop .value = &crashk_size, }; +static struct property memory_limit_prop = { + .name = "linux,memory-limit", + .length = sizeof(unsigned long long), + .value = &memory_limit, +}; + static void __init export_crashk_values(struct device_node *node) { struct property *prop; @@ -184,6 +249,16 @@ static void __init export_crashk_values( 
crashk_size = crashk_res.end - crashk_res.start + 1; prom_add_property(node, &crashk_size_prop); } + + /* + * memory_limit is required by the kexec-tools to limit the + * crash regions to the actual memory used. + */ + prop = of_find_property(node, memory_limit_prop.name, NULL); + if (!prop) + prom_add_property(node, &memory_limit_prop); + else + prom_update_property(node, &memory_limit_prop, prop); } static int __init kexec_setup(void) diff -uprN linux-2.6.32/arch/powerpc/kernel/Makefile linux-2.6.32.ovz/arch/powerpc/kernel/Makefile --- linux-2.6.32/arch/powerpc/kernel/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/Makefile 2015-09-10 10:07:34.000000000 -0700 @@ -53,6 +53,7 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_FA_DUMP) += fadump.o obj-$(CONFIG_E500) += idle_e500.o obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o @@ -99,7 +100,8 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_PPC_PERF_CTRS) += perf_event.o perf_callchain.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ - power5+-pmu.o power6-pmu.o power7-pmu.o + power5+-pmu.o power6-pmu.o power7-pmu.o \ + power8-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff -uprN linux-2.6.32/arch/powerpc/kernel/misc_64.S linux-2.6.32.ovz/arch/powerpc/kernel/misc_64.S --- linux-2.6.32/arch/powerpc/kernel/misc_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/misc_64.S 2015-09-10 10:07:48.000000000 -0700 @@ -24,6 +24,7 @@ #include #include #include +#include .text @@ -471,6 +472,13 @@ _GLOBAL(kexec_wait) 1: mflr r5 addi r5,r5,kexec_flag-1b + /* set kexec_state for this CPU to real mode */ + /* r3 already contains cpuid here */ + LOAD_REG_ADDR(r4, kexec_state) + li r9,KEXEC_STATE_REAL_MODE + stbx r9, r4, r6 + SYNC + 99: HMT_LOW #ifdef CONFIG_KEXEC /* use no memory without kexec */ lwz r4,0(r5) @@ -494,14 +502,12 @@ kexec_flag: * note: this is a terminal routine, it does not save lr * * get phys id from paca - * set paca id to -1 to say we got here * switch to real mode * join other cpus in kexec_wait(phys_id) */ _GLOBAL(kexec_smp_wait) lhz r3,PACAHWCPUID(r13) - li r4,-1 - sth r4,PACAHWCPUID(r13) /* let others know we left */ + lhz r6,PACAPACAINDEX(r13) bl real_mode b .kexec_wait diff -uprN linux-2.6.32/arch/powerpc/kernel/module_32.c linux-2.6.32.ovz/arch/powerpc/kernel/module_32.c --- linux-2.6.32/arch/powerpc/kernel/module_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/module_32.c 2015-09-10 10:07:52.000000000 -0700 @@ -229,7 +229,8 @@ int apply_relocate_add(Elf32_Shdr *sechd const char *strtab, unsigned int symindex, unsigned int relsec, - struct module *module) + struct module *module, + struct rheldata *rheldata) { unsigned int i; Elf32_Rela *rela = (void *)sechdrs[relsec].sh_addr; diff -uprN linux-2.6.32/arch/powerpc/kernel/module_64.c linux-2.6.32.ovz/arch/powerpc/kernel/module_64.c --- linux-2.6.32/arch/powerpc/kernel/module_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/module_64.c 2015-09-10 10:07:52.000000000 -0700 @@ -342,7 +342,8 @@ int apply_relocate_add(Elf64_Shdr *sechd const char *strtab, unsigned int symindex, unsigned int relsec, - struct module *me) + struct module *me, + struct rheldata 
*rheldata) { unsigned int i; Elf64_Rela *rela = (void *)sechdrs[relsec].sh_addr; diff -uprN linux-2.6.32/arch/powerpc/kernel/mpc7450-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/mpc7450-pmu.c --- linux-2.6.32/arch/powerpc/kernel/mpc7450-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/mpc7450-pmu.c 2015-09-10 10:06:30.000000000 -0700 @@ -414,4 +414,4 @@ static int init_mpc7450_pmu(void) return register_power_pmu(&mpc7450_pmu); } -arch_initcall(init_mpc7450_pmu); +early_initcall(init_mpc7450_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/nvram_64.c linux-2.6.32.ovz/arch/powerpc/kernel/nvram_64.c --- linux-2.6.32/arch/powerpc/kernel/nvram_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/nvram_64.c 2015-09-10 10:06:37.000000000 -0700 @@ -31,17 +31,11 @@ #include #include #include +#include #undef DEBUG_NVRAM -static struct nvram_partition * nvram_part; -static long nvram_error_log_index = -1; -static long nvram_error_log_size = 0; - -struct err_log_info { - int error_type; - unsigned int seq_num; -}; +static LIST_HEAD(nvram_partitions); static loff_t dev_nvram_llseek(struct file *file, loff_t offset, int origin) { @@ -139,8 +133,8 @@ out: } -static int dev_nvram_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static long dev_nvram_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) { switch(cmd) { #ifdef CONFIG_PPC_PMAC @@ -169,11 +163,11 @@ static int dev_nvram_ioctl(struct inode } const struct file_operations nvram_fops = { - .owner = THIS_MODULE, - .llseek = dev_nvram_llseek, - .read = dev_nvram_read, - .write = dev_nvram_write, - .ioctl = dev_nvram_ioctl, + .owner = THIS_MODULE, + .llseek = dev_nvram_llseek, + .read = dev_nvram_read, + .write = dev_nvram_write, + .unlocked_ioctl = dev_nvram_ioctl, }; static struct miscdevice nvram_dev = { @@ -184,16 +178,14 @@ static struct miscdevice nvram_dev = { #ifdef DEBUG_NVRAM -static void nvram_print_partitions(char * label) +static void __init nvram_print_partitions(char * label) { - struct list_head * p; struct nvram_partition * tmp_part; printk(KERN_WARNING "--------%s---------\n", label); printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n"); - list_for_each(p, &nvram_part->partition) { - tmp_part = list_entry(p, struct nvram_partition, partition); - printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%s\n", + list_for_each_entry(tmp_part, &nvram_partitions, partition) { + printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%.12s\n", tmp_part->index, tmp_part->header.signature, tmp_part->header.checksum, tmp_part->header.length, tmp_part->header.name); @@ -202,7 +194,7 @@ static void nvram_print_partitions(char #endif -static int nvram_write_header(struct nvram_partition * part) +static int __init nvram_write_header(struct nvram_partition * part) { loff_t tmp_index; int rc; @@ -214,7 +206,7 @@ static int nvram_write_header(struct nvr } -static unsigned char nvram_checksum(struct nvram_header *p) +static unsigned char __init nvram_checksum(struct nvram_header *p) { unsigned int c_sum, c_sum2; unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */ @@ -233,115 +225,155 @@ static unsigned char nvram_checksum(stru * Find an nvram partition, sig can be 0 for any * partition or name can be NULL for any name, else * tries to match both + * + * Note: nvram_find_partition2() is used internally, but nvram_find_partition() + * is exported, so we keep it to preserve the kABI. 
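[For in-kernel users, the partition API this rework introduces further down in the file (nvram_remove_partition(), nvram_create_partition(), nvram_get_partition_size()) is meant to be called at init time along the following lines. This is a hypothetical example, not taken from the patch; the partition name "example,log" and the exception list are made up:

    /* Hypothetical init-time caller of the new nvram partition API. */
    static const char * const keep_parts[] = { "ppc64,linux", NULL };

    static int __init example_nvram_setup(void)
    {
            loff_t data;

            /* Free every OS-signature partition except those listed */
            nvram_remove_partition(NULL, NVRAM_SIG_OS, keep_parts);

            /* Ask for 4KB of data, accept as little as 1KB */
            data = nvram_create_partition("example,log", NVRAM_SIG_OS,
                                          4096, 1024);
            if (data < 0)
                    return (int)data;
            printk(KERN_INFO "log area at nvram offset %lld, %d bytes\n",
                   data, nvram_get_partition_size(data));
            return 0;
    }

Because min_size is smaller than req_size here, the actual size must be queried afterwards with nvram_get_partition_size(), as the comment on nvram_create_partition() notes.]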
*/ struct nvram_partition *nvram_find_partition(int sig, const char *name) { struct nvram_partition * part; - struct list_head * p; - - list_for_each(p, &nvram_part->partition) { - part = list_entry(p, struct nvram_partition, partition); + list_for_each_entry(part, &nvram_partitions, partition) { if (sig && part->header.signature != sig) continue; if (name && 0 != strncmp(name, part->header.name, 12)) continue; - return part; + return part; } return NULL; } EXPORT_SYMBOL(nvram_find_partition); -static int nvram_remove_os_partition(void) +/* + * Per the criteria passed via nvram_remove_partition(), should this + * partition be removed? 1=remove, 0=keep + */ +static int nvram_can_remove_partition(struct nvram_partition *part, + const char *name, int sig, const char *const exceptions[]) { - struct list_head *i; - struct list_head *j; - struct nvram_partition * part; - struct nvram_partition * cur_part; + if (part->header.signature != sig) + return 0; + if (name) { + if (strncmp(name, part->header.name, 12)) + return 0; + } else if (exceptions) { + const char *const *except; + for (except = exceptions; *except; except++) { + if (!strncmp(*except, part->header.name, 12)) + return 0; + } + } + return 1; +} + +/** + * nvram_remove_partition - Remove one or more partitions in nvram + * @name: name of the partition to remove, or NULL for a + * signature only match + * @sig: signature of the partition(s) to remove + * @exceptions: When removing all partitions with a matching signature, + * leave these alone. + */ + +int __init nvram_remove_partition(const char *name, int sig, + const char *const exceptions[]) +{ + struct nvram_partition *part, *prev, *tmp; int rc; - list_for_each(i, &nvram_part->partition) { - part = list_entry(i, struct nvram_partition, partition); - if (part->header.signature != NVRAM_SIG_OS) + list_for_each_entry(part, &nvram_partitions, partition) { + if (!nvram_can_remove_partition(part, name, sig, exceptions)) continue; - - /* Make os partition a free partition */ + + /* Make partition a free partition */ part->header.signature = NVRAM_SIG_FREE; - sprintf(part->header.name, "wwwwwwwwwwww"); + memcpy(part->header.name, "wwwwwwwwwwww", 12); part->header.checksum = nvram_checksum(&part->header); - - /* Merge contiguous free partitions backwards */ - list_for_each_prev(j, &part->partition) { - cur_part = list_entry(j, struct nvram_partition, partition); - if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) { - break; - } - - part->header.length += cur_part->header.length; - part->header.checksum = nvram_checksum(&part->header); - part->index = cur_part->index; - - list_del(&cur_part->partition); - kfree(cur_part); - j = &part->partition; /* fixup our loop */ - } - - /* Merge contiguous free partitions forwards */ - list_for_each(j, &part->partition) { - cur_part = list_entry(j, struct nvram_partition, partition); - if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) { - break; - } - - part->header.length += cur_part->header.length; - part->header.checksum = nvram_checksum(&part->header); - - list_del(&cur_part->partition); - kfree(cur_part); - j = &part->partition; /* fixup our loop */ - } - rc = nvram_write_header(part); if (rc <= 0) { - printk(KERN_ERR "nvram_remove_os_partition: nvram_write failed (%d)\n", rc); + printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc); return rc; } + } + /* Merge contiguous ones */ + prev = NULL; + list_for_each_entry_safe(part, tmp, &nvram_partitions, partition) { + if 
(part->header.signature != NVRAM_SIG_FREE) { + prev = NULL; + continue; + } + if (prev) { + prev->header.length += part->header.length; + prev->header.checksum = nvram_checksum(&part->header); + rc = nvram_write_header(part); + if (rc <= 0) { + printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc); + return rc; + } + list_del(&part->partition); + kfree(part); + } else + prev = part; } return 0; } -/* nvram_create_os_partition +/** + * nvram_create_partition - Create a partition in nvram + * @name: name of the partition to create + * @sig: signature of the partition to create + * @req_size: size of data to allocate in bytes + * @min_size: minimum acceptable size (0 means req_size) * - * Create a OS linux partition to buffer error logs. - * Will create a partition starting at the first free - * space found if space has enough room. + * Returns a negative error code or a positive nvram index + * of the beginning of the data area of the newly created + * partition. If you provided a min_size smaller than req_size + * you need to query for the actual size yourself after the + * call using nvram_get_partition_size(). */ -static int nvram_create_os_partition(void) +loff_t __init nvram_create_partition(const char *name, int sig, + int req_size, int min_size) { struct nvram_partition *part; struct nvram_partition *new_part; struct nvram_partition *free_part = NULL; - int seq_init[2] = { 0, 0 }; loff_t tmp_index; long size = 0; int rc; - + + /* Convert sizes from bytes to blocks */ + req_size = _ALIGN_UP(req_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; + min_size = _ALIGN_UP(min_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; + + /* If no minimum size specified, make it the same as the + * requested size + */ + if (min_size == 0) + min_size = req_size; + if (min_size > req_size) + return -EINVAL; + + /* Now add one block to each for the header */ + req_size += 1; + min_size += 1; + /* Find a free partition that will give us the maximum needed size If can't find one that will give us the minimum size needed */ - list_for_each_entry(part, &nvram_part->partition, partition) { + list_for_each_entry(part, &nvram_partitions, partition) { if (part->header.signature != NVRAM_SIG_FREE) continue; - if (part->header.length >= NVRAM_MAX_REQ) { - size = NVRAM_MAX_REQ; + if (part->header.length >= req_size) { + size = req_size; free_part = part; break; } - if (!size && part->header.length >= NVRAM_MIN_REQ) { - size = NVRAM_MIN_REQ; + if (part->header.length > size && + part->header.length >= min_size) { + size = part->header.length; free_part = part; } } @@ -351,136 +383,96 @@ static int nvram_create_os_partition(voi /* Create our OS partition */ new_part = kmalloc(sizeof(*new_part), GFP_KERNEL); if (!new_part) { - printk(KERN_ERR "nvram_create_os_partition: kmalloc failed\n"); + pr_err("nvram_create_os_partition: kmalloc failed\n"); return -ENOMEM; } new_part->index = free_part->index; - new_part->header.signature = NVRAM_SIG_OS; + new_part->header.signature = sig; new_part->header.length = size; - strcpy(new_part->header.name, "ppc64,linux"); + strncpy(new_part->header.name, name, 12); new_part->header.checksum = nvram_checksum(&new_part->header); rc = nvram_write_header(new_part); if (rc <= 0) { - printk(KERN_ERR "nvram_create_os_partition: nvram_write_header \ - failed (%d)\n", rc); - return rc; - } - - /* make sure and initialize to zero the sequence number and the error - type logged */ - tmp_index = new_part->index + NVRAM_HEADER_LEN; - rc = ppc_md.nvram_write((char *)&seq_init, sizeof(seq_init),
&tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_create_os_partition: nvram_write " - "failed (%d)\n", rc); + pr_err("nvram_create_os_partition: nvram_write_header " + "failed (%d)\n", rc); return rc; } - - nvram_error_log_index = new_part->index + NVRAM_HEADER_LEN; - nvram_error_log_size = ((part->header.length - 1) * - NVRAM_BLOCK_LEN) - sizeof(struct err_log_info); - list_add_tail(&new_part->partition, &free_part->partition); - if (free_part->header.length <= size) { + /* Adjust or remove the partition we stole the space from */ + if (free_part->header.length > size) { + free_part->index += size * NVRAM_BLOCK_LEN; + free_part->header.length -= size; + free_part->header.checksum = nvram_checksum(&free_part->header); + rc = nvram_write_header(free_part); + if (rc <= 0) { + pr_err("nvram_create_os_partition: nvram_write_header " + "failed (%d)\n", rc); + return rc; + } + } else { list_del(&free_part->partition); kfree(free_part); - return 0; } - /* Adjust the partition we stole the space from */ - free_part->index += size * NVRAM_BLOCK_LEN; - free_part->header.length -= size; - free_part->header.checksum = nvram_checksum(&free_part->header); - - rc = nvram_write_header(free_part); - if (rc <= 0) { - printk(KERN_ERR "nvram_create_os_partition: nvram_write_header " - "failed (%d)\n", rc); - return rc; + /* Clear the new partition */ + for (tmp_index = new_part->index + NVRAM_HEADER_LEN; + tmp_index < ((size - 1) * NVRAM_BLOCK_LEN); + tmp_index += NVRAM_BLOCK_LEN) { + rc = ppc_md.nvram_write((char*) empty_zero_page, + NVRAM_BLOCK_LEN, &tmp_index); + if (rc <= 0) { + pr_err("nvram_create_partition: nvram_write failed (%d)\n", rc); + return rc; + } } - - return 0; + + return new_part->index + NVRAM_HEADER_LEN; } - -/* nvram_setup_partition - * - * This will setup the partition we need for buffering the - * error logs and cleanup partitions if needed. - * - * The general strategy is the following: - * 1.) If there is ppc64,linux partition large enough then use it. - * 2.) If there is not a ppc64,linux partition large enough, search - * for a free partition that is large enough. - * 3.) If there is not a free partition large enough remove - * _all_ OS partitions and consolidate the space. - * 4.) Will first try getting a chunk that will satisfy the maximum - * error log size (NVRAM_MAX_REQ). - * 5.) If the max chunk cannot be allocated then try finding a chunk - * that will satisfy the minum needed (NVRAM_MIN_REQ). +/** + * nvram_get_partition_size - Get the data size of an nvram partition + * @data_index: This is the offset of the start of the data of + * the partition. The same value that is returned by + * nvram_create_partition(). */ -static int nvram_setup_partition(void) +int nvram_get_partition_size(loff_t data_index) { - struct list_head * p; - struct nvram_partition * part; - int rc; - - /* For now, we don't do any of this on pmac, until I - * have figured out if it's worth killing some unused stuffs - * in our nvram, as Apple defined partitions use pretty much - * all of the space - */ - if (machine_is(powermac)) - return -ENOSPC; + struct nvram_partition *part; + + list_for_each_entry(part, &nvram_partitions, partition) { + if (part->index + NVRAM_HEADER_LEN == data_index) + return (part->header.length - 1) * NVRAM_BLOCK_LEN; + } + return -1; +} - /* see if we have an OS partition that meets our needs. - will try getting the max we need. If not we'll delete - partitions and try again. 
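[The byte-to-block conversion at the top of nvram_create_partition() deserves a worked example. With NVRAM_BLOCK_LEN of 16 (the BUILD_BUG_ON added to nvram_init() below pins this), a 1000-byte request rounds up to 63 data blocks plus 1 header block, and the data area reported afterwards is 1008 bytes. A standalone sketch of the arithmetic, with ALIGN_UP standing in for the kernel's _ALIGN_UP:

    #include <stdio.h>

    #define NVRAM_BLOCK_LEN 16
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            int req_bytes = 1000;
            int blocks = ALIGN_UP(req_bytes, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN;

            blocks += 1;    /* one extra block holds the partition header */
            printf("header.length = %d blocks, data area = %d bytes\n",
                   blocks, (blocks - 1) * NVRAM_BLOCK_LEN);
            return 0;
    }

This is also why nvram_get_partition_size() computes (length - 1) * NVRAM_BLOCK_LEN: the header block is counted in header.length but is not usable data.]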
*/ - list_for_each(p, &nvram_part->partition) { - part = list_entry(p, struct nvram_partition, partition); - if (part->header.signature != NVRAM_SIG_OS) - continue; - if (strcmp(part->header.name, "ppc64,linux")) - continue; +/** + * nvram_find_partition2 - Find an nvram partition by signature and name + * @name: Name of the partition or NULL for any name + * @sig: Signature to test against + * @out_size: if non-NULL, returns the size of the data part of the partition + */ +loff_t nvram_find_partition2(const char *name, int sig, int *out_size) +{ + struct nvram_partition *p; - if (part->header.length >= NVRAM_MIN_REQ) { - /* found our partition */ - nvram_error_log_index = part->index + NVRAM_HEADER_LEN; - nvram_error_log_size = ((part->header.length - 1) * - NVRAM_BLOCK_LEN) - sizeof(struct err_log_info); - return 0; + list_for_each_entry(p, &nvram_partitions, partition) { + if (p->header.signature == sig && + (!name || !strncmp(p->header.name, name, 12))) { + if (out_size) + *out_size = (p->header.length - 1) * + NVRAM_BLOCK_LEN; + return p->index + NVRAM_HEADER_LEN; } } - - /* try creating a partition with the free space we have */ - rc = nvram_create_os_partition(); - if (!rc) { - return 0; - } - - /* need to free up some space */ - rc = nvram_remove_os_partition(); - if (rc) { - return rc; - } - - /* create a partition in this new space */ - rc = nvram_create_os_partition(); - if (rc) { - printk(KERN_ERR "nvram_create_os_partition: Could not find a " - "NVRAM partition large enough\n"); - return rc; - } - return 0; } - -static int nvram_scan_partitions(void) +int __init nvram_scan_partitions(void) { loff_t cur_index = 0; struct nvram_header phead; @@ -490,7 +482,7 @@ static int nvram_scan_partitions(void) int total_size; int err; - if (ppc_md.nvram_size == NULL) + if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0) return -ENODEV; total_size = ppc_md.nvram_size(); @@ -537,12 +529,16 @@ static int nvram_scan_partitions(void) memcpy(&tmp_part->header, &phead, NVRAM_HEADER_LEN); tmp_part->index = cur_index; - list_add_tail(&tmp_part->partition, &nvram_part->partition); + list_add_tail(&tmp_part->partition, &nvram_partitions); cur_index += phead.length * NVRAM_BLOCK_LEN; } err = 0; +#ifdef DEBUG_NVRAM + nvram_print_partitions("NVRAM Partitions"); +#endif + out: kfree(header); return err; @@ -550,9 +546,10 @@ static int nvram_scan_partitions(void) static int __init nvram_init(void) { - int error; int rc; + BUILD_BUG_ON(NVRAM_BLOCK_LEN != 16); + if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0) return -ENODEV; @@ -562,29 +559,6 @@ static int __init nvram_init(void) return rc; } - /* initialize our anchor for the nvram partition list */ - nvram_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL); - if (!nvram_part) { - printk(KERN_ERR "nvram_init: Failed kmalloc\n"); - return -ENOMEM; - } - INIT_LIST_HEAD(&nvram_part->partition); - - /* Get all the NVRAM partitions */ - error = nvram_scan_partitions(); - if (error) { - printk(KERN_ERR "nvram_init: Failed nvram_scan_partitions\n"); - return error; - } - - if(nvram_setup_partition()) - printk(KERN_WARNING "nvram_init: Could not find nvram partition" - " for nvram buffered error logging.\n"); - -#ifdef DEBUG_NVRAM - nvram_print_partitions("NVRAM Partitions"); -#endif - return rc; } @@ -593,132 +567,6 @@ void __exit nvram_cleanup(void) misc_deregister( &nvram_dev ); } - -#ifdef CONFIG_PPC_PSERIES - -/* nvram_write_error_log - * - * We need to buffer the error logs into nvram to ensure that we have - * the failure 
information to decode. If we have a severe error there - * is no way to guarantee that the OS or the machine is in a state to - * get back to user land and write the error to disk. For example if - * the SCSI device driver causes a Machine Check by writing to a bad - * IO address, there is no way of guaranteeing that the device driver - * is in any state that is would also be able to write the error data - * captured to disk, thus we buffer it in NVRAM for analysis on the - * next boot. - * - * In NVRAM the partition containing the error log buffer will looks like: - * Header (in bytes): - * +-----------+----------+--------+------------+------------------+ - * | signature | checksum | length | name | data | - * |0 |1 |2 3|4 15|16 length-1| - * +-----------+----------+--------+------------+------------------+ - * - * The 'data' section would look like (in bytes): - * +--------------+------------+-----------------------------------+ - * | event_logged | sequence # | error log | - * |0 3|4 7|8 nvram_error_log_size-1| - * +--------------+------------+-----------------------------------+ - * - * event_logged: 0 if event has not been logged to syslog, 1 if it has - * sequence #: The unique sequence # for each event. (until it wraps) - * error log: The error log from event_scan - */ -int nvram_write_error_log(char * buff, int length, - unsigned int err_type, unsigned int error_log_cnt) -{ - int rc; - loff_t tmp_index; - struct err_log_info info; - - if (nvram_error_log_index == -1) { - return -ESPIPE; - } - - if (length > nvram_error_log_size) { - length = nvram_error_log_size; - } - - info.error_type = err_type; - info.seq_num = error_log_cnt; - - tmp_index = nvram_error_log_index; - - rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); - return rc; - } - - rc = ppc_md.nvram_write(buff, length, &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); - return rc; - } - - return 0; -} - -/* nvram_read_error_log - * - * Reads nvram for error log for at most 'length' - */ -int nvram_read_error_log(char * buff, int length, - unsigned int * err_type, unsigned int * error_log_cnt) -{ - int rc; - loff_t tmp_index; - struct err_log_info info; - - if (nvram_error_log_index == -1) - return -1; - - if (length > nvram_error_log_size) - length = nvram_error_log_size; - - tmp_index = nvram_error_log_index; - - rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); - return rc; - } - - rc = ppc_md.nvram_read(buff, length, &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); - return rc; - } - - *error_log_cnt = info.seq_num; - *err_type = info.error_type; - - return 0; -} - -/* This doesn't actually zero anything, but it sets the event_logged - * word to tell that this event is safely in syslog. 
- */ -int nvram_clear_error_log(void) -{ - loff_t tmp_index; - int clear_word = ERR_FLAG_ALREADY_LOGGED; - int rc; - - tmp_index = nvram_error_log_index; - - rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); - return rc; - } - - return 0; -} - -#endif /* CONFIG_PPC_PSERIES */ - module_init(nvram_init); module_exit(nvram_cleanup); MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/powerpc/kernel/pci_32.c linux-2.6.32.ovz/arch/powerpc/kernel/pci_32.c --- linux-2.6.32/arch/powerpc/kernel/pci_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci_32.c 2015-09-10 10:07:23.000000000 -0700 @@ -372,7 +372,7 @@ static int __init pcibios_init(void) printk(KERN_INFO "PCI: Probing PCI hardware\n"); - if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_BUS) + if (pci_has_flag(PCI_REASSIGN_ALL_BUS)) pci_assign_all_buses = 1; /* Scan all of the recorded PCI controllers. */ diff -uprN linux-2.6.32/arch/powerpc/kernel/pci_64.c linux-2.6.32.ovz/arch/powerpc/kernel/pci_64.c --- linux-2.6.32/arch/powerpc/kernel/pci_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci_64.c 2015-09-10 10:07:23.000000000 -0700 @@ -32,8 +32,6 @@ #include #include -unsigned long pci_probe_only = 1; - /* pci_io_base -- the base address from which io bars are offsets. * This is the lowest I/O base address (so bar values are always positive), * and it *must* be the start of ISA space if an ISA bus exists because @@ -54,13 +52,10 @@ static int __init pcibios_init(void) */ ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot; - if (pci_probe_only) - ppc_pci_flags |= PPC_PCI_PROBE_ONLY; - /* On ppc64, we always enable PCI domains and we keep domain 0 * backward compatible in /proc for video cards */ - ppc_pci_flags |= PPC_PCI_ENABLE_PROC_DOMAINS | PPC_PCI_COMPAT_DOMAIN_0; + pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); /* Scan all of the recorded PCI controllers. 
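[Throughout the PCI changes in this patch, the arch-private ppc_pci_flags bitmask is replaced by the generic pci_add_flags()/pci_has_flag() helpers. A simplified model of their semantics follows; the authoritative definitions live in include/linux/pci.h, so this sketch only illustrates why the conversion is one-for-one:

    /* Simplified model of the generic PCI flag helpers. */
    static unsigned int pci_flags;

    static inline void pci_add_flags(unsigned int flags)
    {
            pci_flags |= flags;
    }

    static inline int pci_has_flag(unsigned int flag)
    {
            return (pci_flags & flag) != 0;
    }

In other words, `ppc_pci_flags |= X` becomes pci_add_flags(X) and `ppc_pci_flags & X` becomes pci_has_flag(X), with the flag storage moving from powerpc arch code into the PCI core.]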
*/ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { @@ -130,30 +125,13 @@ EXPORT_SYMBOL_GPL(pcibios_unmap_io_space #endif /* CONFIG_HOTPLUG */ -int __devinit pcibios_map_io_space(struct pci_bus *bus) +static int __devinit pcibios_map_phb_io_space(struct pci_controller *hose) { struct vm_struct *area; unsigned long phys_page; unsigned long size_page; unsigned long io_virt_offset; - struct pci_controller *hose; - - WARN_ON(bus == NULL); - /* If this not a PHB, nothing to do, page tables still exist and - * thus HPTEs will be faulted in when needed - */ - if (bus->self) { - pr_debug("IO mapping for PCI-PCI bridge %s\n", - pci_name(bus->self)); - pr_debug(" virt=0x%016llx...0x%016llx\n", - bus->resource[0]->start + _IO_BASE, - bus->resource[0]->end + _IO_BASE); - return 0; - } - - /* Get the host bridge */ - hose = pci_bus_to_host(bus); phys_page = _ALIGN_DOWN(hose->io_base_phys, PAGE_SIZE); size_page = _ALIGN_UP(hose->pci_io_size, PAGE_SIZE); @@ -198,11 +176,30 @@ int __devinit pcibios_map_io_space(struc return 0; } + +int __devinit pcibios_map_io_space(struct pci_bus *bus) +{ + WARN_ON(bus == NULL); + + /* If this not a PHB, nothing to do, page tables still exist and + * thus HPTEs will be faulted in when needed + */ + if (bus->self) { + pr_debug("IO mapping for PCI-PCI bridge %s\n", + pci_name(bus->self)); + pr_debug(" virt=0x%016llx...0x%016llx\n", + bus->resource[0]->start + _IO_BASE, + bus->resource[0]->end + _IO_BASE); + return 0; + } + + return pcibios_map_phb_io_space(pci_bus_to_host(bus)); +} EXPORT_SYMBOL_GPL(pcibios_map_io_space); void __devinit pcibios_setup_phb_io_space(struct pci_controller *hose) { - pcibios_map_io_space(hose->bus); + pcibios_map_phb_io_space(hose); } #define IOBASE_BRIDGE_NUMBER 0 diff -uprN linux-2.6.32/arch/powerpc/kernel/pci-common.c linux-2.6.32.ovz/arch/powerpc/kernel/pci-common.c --- linux-2.6.32/arch/powerpc/kernel/pci-common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci-common.c 2015-09-10 10:07:48.000000000 -0700 @@ -46,9 +46,6 @@ static int global_phb_number; /* Global /* ISA Memory physical address */ resource_size_t isa_mem_base; -/* Default PCI flags is 0 on ppc32, modified at boot on ppc64 */ -unsigned int ppc_pci_flags = 0; - static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; @@ -854,67 +851,13 @@ int pci_proc_domain(struct pci_bus *bus) { struct pci_controller *hose = pci_bus_to_host(bus); - if (!(ppc_pci_flags & PPC_PCI_ENABLE_PROC_DOMAINS)) + if (!pci_has_flag(PCI_ENABLE_PROC_DOMAINS)) return 0; - if (ppc_pci_flags & PPC_PCI_COMPAT_DOMAIN_0) + if (pci_has_flag(PCI_COMPAT_DOMAIN_0)) return hose->global_number != 0; return 1; } -void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, - struct resource *res) -{ - resource_size_t offset = 0, mask = (resource_size_t)-1; - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - if (!hose) - return; - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - - region->start = (res->start - offset) & mask; - region->end = (res->end - offset) & mask; -} -EXPORT_SYMBOL(pcibios_resource_to_bus); - -void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, - struct pci_bus_region *region) -{ - resource_size_t offset = 0, mask = (resource_size_t)-1; - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - if (!hose) - return; - if (res->flags & IORESOURCE_IO) { - offset = 
(unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - res->start = (region->start + offset) & mask; - res->end = (region->end + offset) & mask; -} -EXPORT_SYMBOL(pcibios_bus_to_resource); - -/* Fixup a bus resource into a linux resource */ -static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - resource_size_t offset = 0, mask = (resource_size_t)-1; - - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - - res->start = (res->start + offset) & mask; - res->end = (res->end + offset) & mask; -} - - /* This header fixup will do the resource fixup for all devices as they are * probed, but not for bridge ranges */ @@ -932,13 +875,13 @@ static void __devinit pcibios_fixup_reso struct resource *res = dev->resource + i; if (!res->flags) continue; - /* On platforms that have PPC_PCI_PROBE_ONLY set, we don't + /* On platforms that have PCI_PROBE_ONLY set, we don't * consider 0 as an unassigned BAR value. It's technically * a valid value, but linux doesn't like it... so when we can * re-assign things, we do so, but if we can't, we keep it * around and hope for the best... */ - if (res->start == 0 && !(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) { + if (res->start == 0 && !pci_has_flag(PCI_PROBE_ONLY)) { pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] is unassigned\n", pci_name(dev), i, (unsigned long long)res->start, @@ -950,18 +893,11 @@ static void __devinit pcibios_fixup_reso continue; } - pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] fixup...\n", + pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]\n", pci_name(dev), i, (unsigned long long)res->start,\ (unsigned long long)res->end, (unsigned int)res->flags); - - fixup_resource(res, dev); - - pr_debug("PCI:%s %016llx-%016llx\n", - pci_name(dev), - (unsigned long long)res->start, - (unsigned long long)res->end); } /* Call machine specific resource fixup */ @@ -985,7 +921,7 @@ static int __devinit pcibios_uninitializ int i; /* We don't do anything if PCI_PROBE_ONLY is set */ - if (ppc_pci_flags & PPC_PCI_PROBE_ONLY) + if (pci_has_flag(PCI_PROBE_ONLY)) return 0; /* Job is a bit different between memory and IO */ @@ -1047,35 +983,24 @@ static void __devinit pcibios_fixup_brid struct pci_dev *dev = bus->self; - for (i = 0; i < PCI_BUS_NUM_RESOURCES; ++i) { - if ((res = bus->resource[i]) == NULL) - continue; - if (!res->flags) + pci_bus_for_each_resource(bus, res, i) { + if (!res || !res->flags) continue; if (i >= 3 && bus->self->transparent) continue; - pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x] fixup...\n", + pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x]\n", pci_name(dev), i, (unsigned long long)res->start,\ (unsigned long long)res->end, (unsigned int)res->flags); - /* Perform fixup */ - fixup_resource(res, dev); - /* Try to detect uninitialized P2P bridge resources, * and clear them out so they get re-assigned later */ if (pcibios_uninitialized_bridge_resource(bus, res)) { res->flags = 0; pr_debug("PCI:%s (unassigned)\n", pci_name(dev)); - } else { - - pr_debug("PCI:%s %016llx-%016llx\n", - pci_name(dev), - (unsigned long long)res->start, - (unsigned long long)res->end); } } } @@ -1097,6 +1022,43 @@ void __devinit pcibios_setup_bus_self(st ppc_md.pci_dma_bus_setup(bus); } +static void pcibios_setup_device(struct pci_dev *dev) +{ + 
struct dev_archdata *sd = &dev->dev.archdata; + + /* Setup OF node pointer in archdata */ + sd->of_node = pci_device_to_OF_node(dev); + + /* Fixup NUMA node as it may not be setup yet by the generic + * code and is needed by the DMA init + */ + set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); + + /* Hook up default DMA ops */ + set_dma_ops(&dev->dev, pci_dma_ops); + set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); + + /* Additional platform DMA/iommu setup */ + if (ppc_md.pci_dma_dev_setup) + ppc_md.pci_dma_dev_setup(dev); + + /* Read default IRQs and fixup if necessary */ + pci_read_irq_line(dev); + if (ppc_md.pci_irq_fixup) + ppc_md.pci_irq_fixup(dev); +} + +int pcibios_add_device(struct pci_dev *dev) +{ + /* + * We can only call pcibios_setup_device() after bus setup is complete, + * since some of the platform specific DMA setup code depends on it. + */ + if (dev->bus->is_added) + pcibios_setup_device(dev); + return 0; +} + void __devinit pcibios_setup_bus_devices(struct pci_bus *bus) { struct pci_dev *dev; @@ -1105,28 +1067,13 @@ void __devinit pcibios_setup_bus_devices bus->number, bus->self ? pci_name(bus->self) : "PHB"); list_for_each_entry(dev, &bus->devices, bus_list) { - struct dev_archdata *sd = &dev->dev.archdata; - - /* Setup OF node pointer in archdata */ - sd->of_node = pci_device_to_OF_node(dev); - - /* Fixup NUMA node as it may not be setup yet by the generic - * code and is needed by the DMA init + /* Cardbus can call us to add new devices to a bus, so ignore + * those who are already fully discovered */ - set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); + if (dev->is_added) + continue; - /* Hook up default DMA ops */ - sd->dma_ops = pci_dma_ops; - set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); - - /* Additional platform DMA/iommu setup */ - if (ppc_md.pci_dma_dev_setup) - ppc_md.pci_dma_dev_setup(dev); - - /* Read default IRQs and fixup if necessary */ - pci_read_irq_line(dev); - if (ppc_md.pci_irq_fixup) - ppc_md.pci_irq_fixup(dev); + pcibios_setup_device(dev); } } @@ -1147,9 +1094,16 @@ void __devinit pcibios_fixup_bus(struct } EXPORT_SYMBOL(pcibios_fixup_bus); +void __devinit pci_fixup_cardbus(struct pci_bus *bus) +{ + /* Now fixup devices on that bus */ + pcibios_setup_bus_devices(bus); +} + + static int skip_isa_ioresource_align(struct pci_dev *dev) { - if ((ppc_pci_flags & PPC_PCI_CAN_SKIP_ISA_ALIGN) && + if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA)) return 1; return 0; @@ -1265,9 +1219,8 @@ void pcibios_allocate_bus_resources(stru pr_debug("PCI: Allocating bus resources for %04x:%02x...\n", pci_domain_nr(bus), bus->number); - for (i = 0; i < PCI_BUS_NUM_RESOURCES; ++i) { - if ((res = bus->resource[i]) == NULL || !res->flags - || res->start > res->end || res->parent) + pci_bus_for_each_resource(bus, res, i) { + if (!res || !res->flags || res->start > res->end || res->parent) continue; if (bus->parent == NULL) pr = (res->flags & IORESOURCE_IO) ? @@ -1279,7 +1232,7 @@ void pcibios_allocate_bus_resources(stru * and as such ensure proper re-allocation * later. 
*/ - if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC) + if (pci_has_flag(PCI_REASSIGN_ALL_RSRC)) goto clear_resource; pr = pci_find_parent_resource(bus->self, res); if (pr == res) { @@ -1463,7 +1416,7 @@ void __init pcibios_resource_survey(void list_for_each_entry(b, &pci_root_buses, node) pcibios_allocate_bus_resources(b); - if (!(ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC)) { + if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC)) { pcibios_allocate_resources(0); pcibios_allocate_resources(1); } @@ -1472,7 +1425,7 @@ void __init pcibios_resource_survey(void * the low IO area and the VGA memory area if they intersect the * bus available resources to avoid allocating things on top of them */ - if (!(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) { + if (!pci_has_flag(PCI_PROBE_ONLY)) { list_for_each_entry(b, &pci_root_buses, node) pcibios_reserve_legacy_regions(b); } @@ -1480,7 +1433,7 @@ void __init pcibios_resource_survey(void /* Now, if the platform didn't decide to blindly trust the firmware, * we proceed to assigning things that were left unassigned */ - if (!(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) { + if (!pci_has_flag(PCI_PROBE_ONLY)) { pr_debug("PCI: Assigning unassigned resources...\n"); pci_assign_unassigned_resources(); } @@ -1561,14 +1514,13 @@ int pcibios_enable_device(struct pci_dev return pci_enable_resources(dev, mask); } -void __devinit pcibios_setup_phb_resources(struct pci_controller *hose) +static void __devinit pcibios_setup_phb_resources(struct pci_controller *hose, struct list_head *resources) { - struct pci_bus *bus = hose->bus; struct resource *res; int i; /* Hookup PHB IO resource */ - bus->resource[0] = res = &hose->io_resource; + res = &hose->io_resource; if (!res->flags) { printk(KERN_WARNING "PCI: I/O resource not set for host" @@ -1586,6 +1538,8 @@ void __devinit pcibios_setup_phb_resourc (unsigned long long)res->start, (unsigned long long)res->end, (unsigned long)res->flags); + pci_add_resource_offset(resources, res, + (resource_size_t) hose->io_base_virt - _IO_BASE); /* Hookup PHB Memory resources */ for (i = 0; i < 3; ++i) { @@ -1603,12 +1557,12 @@ void __devinit pcibios_setup_phb_resourc res->flags = IORESOURCE_MEM; #endif /* CONFIG_PPC32 */ } - bus->resource[i+1] = res; pr_debug("PCI: PHB MEM resource %d = %016llx-%016llx [%lx]\n", i, (unsigned long long)res->start, (unsigned long long)res->end, (unsigned long)res->flags); + pci_add_resource_offset(resources, res, hose->pci_mem_offset); } pr_debug("PCI: PHB MEM offset = %016llx\n", @@ -1700,6 +1654,7 @@ int early_find_capability(struct pci_con */ void __devinit pcibios_scan_phb(struct pci_controller *hose, void *sysdata) { + LIST_HEAD(resources); struct pci_bus *bus; struct device_node *node = hose->dn; int mode; @@ -1707,23 +1662,24 @@ void __devinit pcibios_scan_phb(struct p pr_debug("PCI: Scanning PHB %s\n", node ? 
node->full_name : ""); + /* Get some IO space for the new PHB */ + pcibios_setup_phb_io_space(hose); + + /* Wire up PHB bus resources */ + pcibios_setup_phb_resources(hose, &resources); + /* Create an empty bus for the toplevel */ - bus = pci_create_bus(hose->parent, hose->first_busno, hose->ops, - sysdata); + bus = pci_create_root_bus(hose->parent, hose->first_busno, + hose->ops, sysdata, &resources); if (bus == NULL) { pr_err("Failed to create bus for PCI domain %04x\n", hose->global_number); + pci_free_resource_list(&resources); return; } bus->secondary = hose->first_busno; hose->bus = bus; - /* Get some IO space for the new PHB */ - pcibios_setup_phb_io_space(hose); - - /* Wire up PHB bus resources */ - pcibios_setup_phb_resources(hose); - /* Get probe mode and perform scan */ mode = PCI_PROBE_NORMAL; if (node && ppc_md.pci_probe_mode) diff -uprN linux-2.6.32/arch/powerpc/kernel/pci_of_scan.c linux-2.6.32.ovz/arch/powerpc/kernel/pci_of_scan.c --- linux-2.6.32/arch/powerpc/kernel/pci_of_scan.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci_of_scan.c 2015-09-10 10:07:23.000000000 -0700 @@ -74,6 +74,7 @@ static void of_pci_parse_addrs(struct de { u64 base, size; unsigned int flags; + struct pci_bus_region region; struct resource *res; const u32 *addrs; u32 i; @@ -105,10 +106,11 @@ static void of_pci_parse_addrs(struct de printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); continue; } - res->start = base; - res->end = base + size - 1; res->flags = flags; res->name = pci_name(dev); + region.start = base; + region.end = base + size - 1; + pcibios_bus_to_resource(dev, res, ®ion); } } @@ -123,6 +125,7 @@ struct pci_dev *of_create_pci_dev(struct { struct pci_dev *dev; const char *type; + struct pci_slot *slot; dev = alloc_pci_dev(); if (!dev) @@ -140,6 +143,11 @@ struct pci_dev *of_create_pci_dev(struct dev->devfn = devfn; dev->multifunction = 0; /* maybe a lie? 
*/ dev->needs_freset = 0; /* pcie fundamental reset required */ + set_pcie_port_type(dev); + + list_for_each_entry(slot, &dev->bus->slots, list) + if (PCI_SLOT(dev->devfn) == slot->number) + dev->slot = slot; dev->vendor = get_int_prop(node, "vendor-id", 0xffff); dev->device = get_int_prop(node, "device-id", 0xffff); @@ -160,10 +168,14 @@ struct pci_dev *of_create_pci_dev(struct dev->error_state = pci_channel_io_normal; dev->dma_mask = 0xffffffff; + /* Early fixups, before probing the BARs */ + pci_fixup_device(pci_fixup_early, dev); + if (!strcmp(type, "pci") || !strcmp(type, "pciex")) { /* a PCI-PCI bridge */ dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; dev->rom_base_reg = PCI_ROM_ADDRESS1; + set_pcie_hotplug_bridge(dev); } else if (!strcmp(type, "cardbus")) { dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; } else { @@ -198,6 +210,7 @@ void __devinit of_scan_pci_bridge(struct struct pci_bus *bus; const u32 *busrange, *ranges; int len, i, mode; + struct pci_bus_region region; struct resource *res; unsigned int flags; u64 size; @@ -260,9 +273,10 @@ void __devinit of_scan_pci_bridge(struct res = bus->resource[i]; ++i; } - res->start = of_read_number(&ranges[1], 2); - res->end = res->start + size - 1; res->flags = flags; + region.start = of_read_number(&ranges[1], 2); + region.end = region.start + size - 1; + pcibios_bus_to_resource(dev, res, ®ion); } sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), bus->number); @@ -300,6 +314,8 @@ static void __devinit __of_scan_bus(stru /* Scan direct children */ for_each_child_of_node(node, child) { pr_debug(" * %s\n", child->full_name); + if (!of_device_is_available(child)) + continue; reg = of_get_property(child, "reg", ®len); if (reg == NULL || reglen < 20) continue; diff -uprN linux-2.6.32/arch/powerpc/kernel/perf_callchain.c linux-2.6.32.ovz/arch/powerpc/kernel/perf_callchain.c --- linux-2.6.32/arch/powerpc/kernel/perf_callchain.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/perf_callchain.c 2015-09-10 10:08:01.000000000 -0700 @@ -23,18 +23,6 @@ #include "ppc32.h" #endif -/* - * Store another value in a callchain_entry. - */ -static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - unsigned int nr = entry->nr; - - if (nr < PERF_MAX_STACK_DEPTH) { - entry->ip[nr] = ip; - entry->nr = nr + 1; - } -} /* * Is sp valid as the address of the next kernel stack frame after prev_sp? 
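[The local callchain_store() helper removed above is superseded by the generic perf_callchain_store() from the core perf code, which performs the same bounded append; simplified from the generic header of this era, it is roughly:

    static inline void
    perf_callchain_store(struct perf_callchain_entry *entry, u64 ip)
    {
            if (entry->nr < PERF_MAX_STACK_DEPTH)
                    entry->ip[entry->nr++] = ip;
    }

So the conversions in the hunks below are mechanical: the PERF_CONTEXT_* markers and instruction pointers land in the same slots as before, and the core now resets entry->nr before invoking the arch callbacks, which is why the per-cpu callchain buffer and the perf_callchain() entry point can be deleted.]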
@@ -58,8 +46,8 @@ static int valid_next_sp(unsigned long s return 0; } -static void perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -69,8 +57,7 @@ static void perf_callchain_kernel(struct lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->nip); + perf_callchain_store(entry, perf_instruction_pointer(regs)); if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) return; @@ -89,7 +76,7 @@ static void perf_callchain_kernel(struct next_ip = regs->nip; lr = regs->link; level = 0; - callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); } else { if (level == 0) @@ -111,7 +98,7 @@ static void perf_callchain_kernel(struct ++level; } - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); if (!valid_next_sp(next_sp, sp)) return; sp = next_sp; @@ -177,8 +164,12 @@ static int read_user_stack_64(unsigned l ((unsigned long)ptr & 7)) return -EFAULT; - if (!__get_user_inatomic(*ret, ptr)) + pagefault_disable(); + if (!__get_user_inatomic(*ret, ptr)) { + pagefault_enable(); return 0; + } + pagefault_enable(); return read_user_stack_slow(ptr, ret, 8); } @@ -189,8 +180,12 @@ static int read_user_stack_32(unsigned i ((unsigned long)ptr & 3)) return -EFAULT; - if (!__get_user_inatomic(*ret, ptr)) + pagefault_disable(); + if (!__get_user_inatomic(*ret, ptr)) { + pagefault_enable(); return 0; + } + pagefault_enable(); return read_user_stack_slow(ptr, ret, 4); } @@ -243,8 +238,8 @@ static int sane_signal_64_frame(unsigned puc == (unsigned long) &sf->uc; } -static void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -253,13 +248,10 @@ static void perf_callchain_user_64(struc struct signal_frame_64 __user *sigframe; unsigned long __user *fp, *uregs; - next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); - for (;;) { + while (entry->nr < PERF_MAX_STACK_DEPTH) { fp = (unsigned long __user *) sp; if (!valid_user_sp(sp, 1) || read_user_stack_64(fp, &next_sp)) return; @@ -286,14 +278,14 @@ static void perf_callchain_user_64(struc read_user_stack_64(&uregs[PT_R1], &sp)) return; level = 0; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); continue; } if (level == 0) next_ip = lr; - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); ++level; sp = next_sp; } @@ -318,15 +310,21 @@ static inline int current_is_64bit(void) */ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) { + int rc; + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || ((unsigned long)ptr & 3)) return -EFAULT; - return __get_user_inatomic(*ret, ptr); + pagefault_disable(); + rc = __get_user_inatomic(*ret, ptr); + pagefault_enable(); + + return rc; } -static inline void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { } @@ -445,8 +443,8 @@ static unsigned int __user *signal_frame return 
mctx->mc_gregs; } -static void perf_callchain_user_32(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_32(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned int sp, next_sp; unsigned int next_ip; @@ -454,11 +452,8 @@ static void perf_callchain_user_32(struc long level = 0; unsigned int __user *fp, *uregs; - next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); while (entry->nr < PERF_MAX_STACK_DEPTH) { fp = (unsigned int __user *) (unsigned long) sp; @@ -480,48 +475,29 @@ static void perf_callchain_user_32(struc read_user_stack_32(&uregs[PT_R1], &sp)) return; level = 0; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); continue; } if (level == 0) next_ip = lr; - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); ++level; sp = next_sp; } } -/* - * Since we can't get PMU interrupts inside a PMU interrupt handler, - * we don't need separate irq and nmi entries here. - */ -static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - struct perf_callchain_entry *entry = &__get_cpu_var(callchain); - - entry->nr = 0; + perf_callchain_store(entry, perf_instruction_pointer(regs)); - if (current->pid == 0) /* idle task? */ - return entry; + if (!current->mm) + return; - if (!user_mode(regs)) { - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - if (current_is_64bit()) - perf_callchain_user_64(regs, entry); - else - perf_callchain_user_32(regs, entry); - } - - return entry; + if (current_is_64bit()) + perf_callchain_user_64(entry, regs); + else + perf_callchain_user_32(entry, regs); } diff -uprN linux-2.6.32/arch/powerpc/kernel/perf_event.c linux-2.6.32.ovz/arch/powerpc/kernel/perf_event.c --- linux-2.6.32/arch/powerpc/kernel/perf_event.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/perf_event.c 2015-09-10 10:07:57.000000000 -0700 @@ -35,6 +35,9 @@ struct cpu_hw_events { u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + + unsigned int group_flag; + int n_txn_start; }; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); @@ -70,14 +73,27 @@ static inline u32 perf_get_misc_flags(st { return 0; } -static inline void perf_read_regs(struct pt_regs *regs) { } +static inline void perf_read_regs(struct pt_regs *regs) +{ + regs->result = 0; +} static inline int perf_intr_is_nmi(struct pt_regs *regs) { return 0; } +static inline int siar_valid(struct pt_regs *regs) +{ + return 1; +} + #endif /* CONFIG_PPC32 */ +static bool regs_use_siar(struct pt_regs *regs) +{ + return !!(regs->result & 1); +} + /* * Things that are specific to 64-bit implementations. 
*/ @@ -87,11 +103,12 @@ static inline unsigned long perf_ip_adju { unsigned long mmcra = regs->dsisr; - if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { + if ((ppmu->flags & PPMU_HAS_SSLOT) && (mmcra & MMCRA_SAMPLE_ENABLE)) { unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; if (slot > 1) return 4 * (slot - 1); } + return 0; } @@ -100,48 +117,155 @@ static inline unsigned long perf_ip_adju * If we're not doing instruction sampling, give them the SDAR * (sampled data address). If we are doing instruction sampling, then * only give them the SDAR if it corresponds to the instruction - * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC - * bit in MMCRA. + * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC or + * the [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA. */ static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { unsigned long mmcra = regs->dsisr; - unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? - POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; + unsigned long sdsync; + + if (ppmu->flags & PPMU_SIAR_VALID) + sdsync = POWER7P_MMCRA_SDAR_VALID; + else if (ppmu->flags & PPMU_ALT_SIPR) + sdsync = POWER6_MMCRA_SDSYNC; + else + sdsync = MMCRA_SDSYNC; if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) *addrp = mfspr(SPRN_SDAR); } -static inline u32 perf_get_misc_flags(struct pt_regs *regs) +static bool regs_sihv(struct pt_regs *regs) { - unsigned long mmcra = regs->dsisr; unsigned long sihv = MMCRA_SIHV; - unsigned long sipr = MMCRA_SIPR; - if (TRAP(regs) != 0xf00) - return 0; /* not a PMU interrupt */ + if (ppmu->flags & PPMU_HAS_SIER) + return !!(regs->dar & SIER_SIHV); - if (ppmu->flags & PPMU_ALT_SIPR) { + if (ppmu->flags & PPMU_ALT_SIPR) sihv = POWER6_MMCRA_SIHV; + + return !!(regs->dsisr & sihv); +} + +static bool regs_sipr(struct pt_regs *regs) +{ + unsigned long sipr = MMCRA_SIPR; + + if (ppmu->flags & PPMU_HAS_SIER) + return !!(regs->dar & SIER_SIPR); + + if (ppmu->flags & PPMU_ALT_SIPR) sipr = POWER6_MMCRA_SIPR; + + return !!(regs->dsisr & sipr); +} + +static bool regs_no_sipr(struct pt_regs *regs) +{ + return !!(regs->result & 2); +} + +static inline u32 perf_flags_from_msr(struct pt_regs *regs) +{ + if (regs->msr & MSR_PR) + return PERF_RECORD_MISC_USER; + if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV) + return PERF_RECORD_MISC_HYPERVISOR; + return PERF_RECORD_MISC_KERNEL; +} + +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + bool use_siar = regs_use_siar(regs); + + if (!use_siar) + return perf_flags_from_msr(regs); + + /* + * If we don't have flags in MMCRA, rather than using + * the MSR, we intuit the flags from the address in + * SIAR which should give slightly more reliable + * results + */ + if (regs_no_sipr(regs)) { + unsigned long siar = mfspr(SPRN_SIAR); + if (siar >= PAGE_OFFSET) + return PERF_RECORD_MISC_KERNEL; + return PERF_RECORD_MISC_USER; } /* PR has priority over HV, so order below is important */ - if (mmcra & sipr) + if (regs_sipr(regs)) return PERF_RECORD_MISC_USER; - if ((mmcra & sihv) && (freeze_events_kernel != MMCR0_FCHV)) + + if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV)) return PERF_RECORD_MISC_HYPERVISOR; + return PERF_RECORD_MISC_KERNEL; } /* * Overload regs->dsisr to store MMCRA so we only need to read it once * on each interrupt. + * Overload regs->dar to store SIER if we have it. + * Overload regs->result to specify whether we should use the MSR (result + * is zero) or the SIAR (result is non zero). 
*/ static inline void perf_read_regs(struct pt_regs *regs) { - regs->dsisr = mfspr(SPRN_MMCRA); + unsigned long mmcra = mfspr(SPRN_MMCRA); + int marked = mmcra & MMCRA_SAMPLE_ENABLE; + int use_siar; + + regs->dsisr = mmcra; + regs->result = 0; + + if (ppmu->flags & PPMU_NO_SIPR) + regs->result |= 2; + + /* + * On power8 if we're in random sampling mode, the SIER is updated. + * If we're in continuous sampling mode, we don't have SIPR. + */ + if (ppmu->flags & PPMU_HAS_SIER) { + if (marked) + regs->dar = mfspr(SPRN_SIER); + else + regs->result |= 2; + } + + /* + * If this isn't a PMU exception (e.g. a software event) the SIAR is + * not valid. Use pt_regs. + * + * If it is a marked event use the SIAR. + * + * If the PMU doesn't update the SIAR for non-marked events use + * pt_regs. + * + * If the PMU has HV/PR flags then check to see if they + * place the exception in userspace. If so, use pt_regs. In + * continuous sampling mode the SIAR and the PMU exception are + * not synchronised, so they may be many instructions apart. + * This can result in confusing backtraces. We still want + * hypervisor samples as well as samples in the kernel with + * interrupts off, hence the userspace check. + */ + if (TRAP(regs) != 0xf00) + use_siar = 0; + else if (marked) + use_siar = 1; + else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING)) + use_siar = 0; + else if (!regs_no_sipr(regs) && regs_sipr(regs)) + use_siar = 0; + else + use_siar = 1; + + regs->result |= use_siar; } /* @@ -153,6 +277,24 @@ static inline int perf_intr_is_nmi(struc return !regs->softe; } +/* + * On processors like P7+ that have the SIAR-Valid bit, marked instructions + * must be sampled only if the SIAR-valid bit is set. + * + * For unmarked instructions and for processors that don't have the SIAR-Valid + * bit, assume that SIAR is valid. + */ +static inline int siar_valid(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + int marked = mmcra & MMCRA_SAMPLE_ENABLE; + + if ((ppmu->flags & PPMU_SIAR_VALID) && marked) + return mmcra & POWER7P_MMCRA_SIAR_VALID; + + return 1; +} + #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -395,10 +537,32 @@ static int check_excludes(struct perf_ev return 0; } +static u64 check_and_compute_delta(u64 prev, u64 val) +{ + u64 delta = (val - prev) & 0xfffffffful; + + /* + * POWER7 can roll back counter values; if the new value is smaller + * than the previous value it will cause the delta and the counter to + * have bogus values unless we rolled a counter over. If a counter is + * rolled back, it will be smaller, but within 256, which is the maximum + * number of events to roll back at once. If we detect a rollback, + * return 0. This can lead to a small lack of precision in the + * counters. + */ + if (prev > val && (prev - val) < 256) + delta = 0; + + return delta; +} + static void power_pmu_read(struct perf_event *event) { s64 val, delta, prev; + if (event->hw.state & PERF_HES_STOPPED) + return; + if (!event->hw.idx) return; /* @@ -407,15 +571,31 @@ static void power_pmu_read(struct perf_e * Therefore we treat them like NMIs. 
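The rollback test in check_and_compute_delta() above is easy to get wrong, so here is the same arithmetic as a standalone userspace program (demo_delta is an invented name; 32-bit PMCs and the 256-event rollback window are taken from the patch):

#include <stdint.h>
#include <stdio.h>

/* Mirrors check_and_compute_delta(): 32-bit wraparound plus the
 * POWER7 rollback window of 256 events. */
static uint64_t demo_delta(uint64_t prev, uint64_t val)
{
	uint64_t delta = (val - prev) & 0xffffffffull;

	/* A rolled-back counter is below prev by less than 256. */
	if (prev > val && (prev - val) < 256)
		delta = 0;

	return delta;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)demo_delta(100, 150));       /* 50 */
	printf("%llu\n", (unsigned long long)demo_delta(0xffffff00, 16)); /* 272, wrapped */
	printf("%llu\n", (unsigned long long)demo_delta(150, 100));       /* 0, rollback */
	return 0;
}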
*/ do { - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + delta = check_and_compute_delta(prev, val); + if (!delta) + return; + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + + local64_add(delta, &event->count); - /* The counters are only 32 bits wide */ - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &event->hw.period_left); + /* + * A number of places program the PMC with (0x80000000 - period_left). + * We never want period_left to be less than 1 because we will program + * the PMC with a value >= 0x80000000 and an edge-detected PMC will + * roll around to 0 before taking an exception. We have seen this + * on POWER8. + * + * To fix this, clamp the minimum value of period_left to 1. + */ + do { + prev = local64_read(&event->hw.period_left); + val = prev - delta; + if (val < 1) + val = 1; + } while (local64_cmpxchg(&event->hw.period_left, prev, val) != prev); } /* @@ -441,10 +621,11 @@ static void freeze_limited_counters(stru if (!event->hw.idx) continue; val = (event->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); event->hw.idx = 0; - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + delta = check_and_compute_delta(prev, val); + if (delta) + local64_add(delta, &event->count); } } @@ -452,14 +633,16 @@ static void thaw_limited_counters(struct unsigned long pmc5, unsigned long pmc6) { struct perf_event *event; - u64 val; + u64 val, prev; int i; for (i = 0; i < cpuhw->n_limited; ++i) { event = cpuhw->limited_counter[i]; event->hw.idx = cpuhw->limited_hwidx[i]; val = (event->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&event->hw.prev_count, val); + prev = local64_read(&event->hw.prev_count); + if (check_and_compute_delta(prev, val)) + local64_set(&event->hw.prev_count, val); perf_event_update_userpage(event); } } @@ -514,7 +697,7 @@ static void write_mmcr0(struct cpu_hw_ev * Disable all events to prevent PMU interrupts and to allow * events to be added or removed. */ -void hw_perf_disable(void) +static void power_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -562,7 +745,7 @@ void hw_perf_disable(void) * If we were previously disabled and events were added, then * put the new config on the PMU. */ -void hw_perf_enable(void) +static void power_pmu_enable(struct pmu *pmu) { struct perf_event *event; struct cpu_hw_events *cpuhw; @@ -663,12 +846,14 @@ void hw_perf_enable(void) } val = 0; if (event->hw.sample_period) { - left = atomic64_read(&event->hw.period_left); + left = local64_read(&event->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; } - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); event->hw.idx = idx; + if (event->hw.state & PERF_HES_STOPPED) + val = 0; write_pmc(idx, val); perf_event_update_userpage(event); } @@ -718,73 +903,13 @@ static int collect_events(struct perf_ev return n; } -static void event_sched_in(struct perf_event *event, int cpu) -{ - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; - event->tstamp_running += event->ctx->time - event->tstamp_stopped; - if (is_software_event(event)) - event->pmu->enable(event); -} - -/* - * Called to enable a whole group of events. - * Returns 1 if the group was enabled, or -EAGAIN if it could not be. 
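The clamp loop above updates period_left without taking a lock. A rough userspace analogue using C11 atomics in place of local64_cmpxchg() (demo_sub_and_clamp is invented for illustration; the real local64_t is a per-cpu optimisation, not a general atomic):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Subtract delta from period_left lock-free, never going below 1,
 * so a later (0x80000000 - period_left) reload stays in range. */
static void demo_sub_and_clamp(_Atomic int64_t *period_left, int64_t delta)
{
	int64_t prev, next;

	do {
		prev = atomic_load(period_left);
		next = prev - delta;
		if (next < 1)
			next = 1;	/* never program the PMC past the edge */
	} while (!atomic_compare_exchange_weak(period_left, &prev, next));
}

int main(void)
{
	_Atomic int64_t left = 10;

	demo_sub_and_clamp(&left, 4);	/* 10 -> 6 */
	demo_sub_and_clamp(&left, 100);	/* clamped to 1 */
	printf("%lld\n", (long long)atomic_load(&left));
	return 0;
}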
- * Assumes the caller has disabled interrupts and has - * frozen the PMU with hw_perf_save_disable. - */ -int hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) -{ - struct cpu_hw_events *cpuhw; - long i, n, n0; - struct perf_event *sub; - - if (!ppmu) - return 0; - cpuhw = &__get_cpu_var(cpu_hw_events); - n0 = cpuhw->n_events; - n = collect_events(group_leader, ppmu->n_counter - n0, - &cpuhw->event[n0], &cpuhw->events[n0], - &cpuhw->flags[n0]); - if (n < 0) - return -EAGAIN; - if (check_excludes(cpuhw->event, cpuhw->flags, n0, n)) - return -EAGAIN; - i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); - if (i < 0) - return -EAGAIN; - cpuhw->n_events = n0 + n; - cpuhw->n_added += n; - - /* - * OK, this group can go on; update event states etc., - * and enable any software events - */ - for (i = n0; i < n0 + n; ++i) - cpuhw->event[i]->hw.config = cpuhw->events[i]; - cpuctx->active_oncpu += n; - n = 1; - event_sched_in(group_leader, cpu); - list_for_each_entry(sub, &group_leader->sibling_list, group_entry) { - if (sub->state != PERF_EVENT_STATE_OFF) { - event_sched_in(sub, cpu); - ++n; - } - } - ctx->nr_active += n; - - return 1; -} - /* * Add an event to the PMU. * If all events are not already frozen, then we disable and * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. */ -static int power_pmu_enable(struct perf_event *event) +static int power_pmu_add(struct perf_event *event, int ef_flags) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -792,7 +917,7 @@ static int power_pmu_enable(struct perf_ int ret = -EAGAIN; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); /* * Add the event to the list (if there is room) @@ -805,18 +930,39 @@ static int power_pmu_enable(struct perf_ cpuhw->event[n0] = event; cpuhw->events[n0] = event->hw.config; cpuhw->flags[n0] = event->hw.event_base; + + /* + * This event may have been disabled/stopped in record_and_restart() + * because we exceeded the ->event_limit. If re-starting the event, + * clear the ->hw.state (STOPPED and UPTODATE flags), so the user + * notification is re-enabled. + */ + if (!(ef_flags & PERF_EF_START)) + event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + else + event->hw.state = 0; + + /* + * If a group events scheduling transaction was started, + * skip the schedulability test here; it will be performed + * at commit time (->commit_txn) as a whole. + */ + if (cpuhw->group_flag & PERF_EVENT_TXN) + goto nocheck; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) goto out; if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) goto out; - event->hw.config = cpuhw->events[n0]; + +nocheck: ++cpuhw->n_events; ++cpuhw->n_added; ret = 0; out: - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; } @@ -824,22 +970,25 @@ static int power_pmu_enable(struct perf_ /* * Remove an event from the PMU. 
*/ -static void power_pmu_disable(struct perf_event *event) +static void power_pmu_del(struct perf_event *event, int ef_flags) { struct cpu_hw_events *cpuhw; long i; unsigned long flags; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); power_pmu_read(event); cpuhw = &__get_cpu_var(cpu_hw_events); for (i = 0; i < cpuhw->n_events; ++i) { if (event == cpuhw->event[i]) { - while (++i < cpuhw->n_events) + while (++i < cpuhw->n_events) { cpuhw->event[i-1] = cpuhw->event[i]; + cpuhw->events[i-1] = cpuhw->events[i]; + cpuhw->flags[i-1] = cpuhw->flags[i]; + } --cpuhw->n_events; ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr); if (event->hw.idx) { @@ -865,43 +1014,123 @@ static void power_pmu_disable(struct per cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } /* - * Re-enable interrupts on a event after they were throttled - * because they were coming too fast. + * POWER-PMU does not support disabling individual counters, hence + * program their cycle counter to their max value and ignore the interrupts. */ -static void power_pmu_unthrottle(struct perf_event *event) + +static void power_pmu_start(struct perf_event *event, int ef_flags) { - s64 val, left; unsigned long flags; + s64 left; + unsigned long val; if (!event->hw.idx || !event->hw.sample_period) return; + + if (!(event->hw.state & PERF_HES_STOPPED)) + return; + + if (ef_flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + local_irq_save(flags); - perf_disable(); - power_pmu_read(event); - left = event->hw.sample_period; - event->hw.last_period = left; + perf_pmu_disable(event->pmu); + + event->hw.state = 0; + left = local64_read(&event->hw.period_left); + val = 0; if (left < 0x80000000L) val = 0x80000000L - left; + write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } -struct pmu power_pmu = { - .enable = power_pmu_enable, - .disable = power_pmu_disable, - .read = power_pmu_read, - .unthrottle = power_pmu_unthrottle, -}; +static void power_pmu_stop(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + + if (!event->hw.idx || !event->hw.sample_period) + return; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + power_pmu_read(event); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + write_pmc(event->hw.idx, 0); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} + +/* + * Start group events scheduling transaction + * Set the flag to make pmu::enable() not perform the + * schedulability test, it will be performed at commit time + */ +void power_pmu_start_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + perf_pmu_disable(pmu); + cpuhw->group_flag |= PERF_EVENT_TXN; + cpuhw->n_txn_start = cpuhw->n_events; +} + +/* + * Stop group events scheduling transaction + * Clear the flag and pmu::enable() will perform the + * schedulability test. 
+ */ +void power_pmu_cancel_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + cpuhw->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); +} + +/* + * Commit group events scheduling transaction + * Perform the group schedulability test as a whole + * Return 0 if success + */ +int power_pmu_commit_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw; + long i, n; + + if (!ppmu) + return -EAGAIN; + cpuhw = &__get_cpu_var(cpu_hw_events); + n = cpuhw->n_events; + if (check_excludes(cpuhw->event, cpuhw->flags, 0, n)) + return -EAGAIN; + i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n); + if (i < 0) + return -EAGAIN; + + for (i = cpuhw->n_txn_start; i < n; ++i) + cpuhw->event[i]->hw.config = cpuhw->events[i]; + + cpuhw->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); + return 0; +} /* * Return 1 if we might be able to put event on a limited PMC, @@ -1003,7 +1232,7 @@ static int hw_perf_cache_event(u64 confi return 0; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +static int power_pmu_event_init(struct perf_event *event) { u64 ev; unsigned long flags; @@ -1015,25 +1244,31 @@ const struct pmu *hw_perf_event_init(str struct cpu_hw_events *cpuhw; if (!ppmu) - return ERR_PTR(-ENXIO); + return -ENOENT; + + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_HARDWARE: ev = event->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; ev = ppmu->generic_events[ev]; break; case PERF_TYPE_HW_CACHE: err = hw_perf_cache_event(event->attr.config, &ev); if (err) - return ERR_PTR(err); + return err; break; case PERF_TYPE_RAW: ev = event->attr.config; break; default: - return ERR_PTR(-EINVAL); + return -ENOENT; } + event->hw.config_base = ev; event->hw.idx = 0; @@ -1052,7 +1287,7 @@ const struct pmu *hw_perf_event_init(str * XXX we should check if the task is an idle task. */ flags = 0; - if (event->ctx->task) + if (event->attach_state & PERF_ATTACH_TASK) flags |= PPMU_ONLY_COUNT_RUN; /* @@ -1070,7 +1305,7 @@ const struct pmu *hw_perf_event_init(str */ ev = normal_pmc_alternative(ev, flags); if (!ev) - return ERR_PTR(-EINVAL); + return -EINVAL; } } @@ -1084,24 +1319,24 @@ const struct pmu *hw_perf_event_init(str n = collect_events(event->group_leader, ppmu->n_counter - 1, ctrs, events, cflags); if (n < 0) - return ERR_PTR(-EINVAL); + return -EINVAL; } events[n] = ev; ctrs[n] = event; cflags[n] = flags; if (check_excludes(ctrs, cflags, n, 1)) - return ERR_PTR(-EINVAL); + return -EINVAL; cpuhw = &get_cpu_var(cpu_hw_events); err = power_check_constraints(cpuhw, events, cflags, n + 1); put_cpu_var(cpu_hw_events); if (err) - return ERR_PTR(-EINVAL); + return -EINVAL; event->hw.config = events[n]; event->hw.event_base = cflags[n]; event->hw.last_period = event->hw.sample_period; - atomic64_set(&event->hw.period_left, event->hw.last_period); + local64_set(&event->hw.period_left, event->hw.last_period); /* * See if we need to reserve the PMU. 
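Taken together, start_txn/commit_txn/cancel_txn defer the schedulability test from power_pmu_add() to commit time, then either keep or roll back the provisionally added events. A toy model of that protocol, with a capacity of 2 standing in for the real constraint check (all toy_* names are invented; the real code also brackets this with perf_pmu_disable/enable):

#include <stdio.h>

struct toy_pmu {
	int txn_open;
	int n_events;
	int n_txn_start;
};

static void toy_start_txn(struct toy_pmu *p)
{
	p->txn_open = 1;
	p->n_txn_start = p->n_events;	/* remember where to roll back to */
}

static int toy_add(struct toy_pmu *p)
{
	p->n_events++;
	if (p->txn_open)
		return 0;		/* defer the schedulability test */
	return p->n_events <= 2 ? 0 : -1;
}

static int toy_commit_txn(struct toy_pmu *p)
{
	if (p->n_events > 2)		/* whole-group test, done once */
		return -1;
	p->txn_open = 0;
	return 0;
}

static void toy_cancel_txn(struct toy_pmu *p)
{
	p->n_events = p->n_txn_start;	/* undo everything added in the txn */
	p->txn_open = 0;
}

int main(void)
{
	struct toy_pmu pmu = { 0, 0, 0 };

	toy_start_txn(&pmu);
	toy_add(&pmu);
	toy_add(&pmu);
	toy_add(&pmu);			/* provisionally accepted */
	if (toy_commit_txn(&pmu))
		toy_cancel_txn(&pmu);	/* group didn't fit: roll back */
	printf("events scheduled: %d\n", pmu.n_events);	/* 0 */
	return 0;
}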
@@ -1121,74 +1356,91 @@ const struct pmu *hw_perf_event_init(str } event->destroy = hw_perf_event_destroy; - if (err) - return ERR_PTR(err); - return &power_pmu; + return err; } +static int power_pmu_event_idx(struct perf_event *event) +{ + return event->hw.idx; +} + +struct pmu power_pmu = { + .pmu_enable = power_pmu_enable, + .pmu_disable = power_pmu_disable, + .event_init = power_pmu_event_init, + .add = power_pmu_add, + .del = power_pmu_del, + .start = power_pmu_start, + .stop = power_pmu_stop, + .read = power_pmu_read, + .start_txn = power_pmu_start_txn, + .cancel_txn = power_pmu_cancel_txn, + .commit_txn = power_pmu_commit_txn, + .event_idx = power_pmu_event_idx, +}; + + /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled * here so there is no possibility of being interrupted. */ static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs, int nmi) + struct pt_regs *regs) { u64 period = event->hw.sample_period; s64 prev, delta, left; int record = 0; + if (event->hw.state & PERF_HES_STOPPED) { + write_pmc(event->hw.idx, 0); + return; + } + /* we don't have to worry about interrupts here */ - prev = atomic64_read(&event->hw.prev_count); - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + prev = local64_read(&event->hw.prev_count); + delta = check_and_compute_delta(prev, val); + local64_add(delta, &event->count); /* * See if the total period for this event has expired, * and update for the next period. */ val = 0; - left = atomic64_read(&event->hw.period_left) - delta; + left = local64_read(&event->hw.period_left) - delta; + if (delta == 0) + left++; if (period) { if (left <= 0) { left += period; if (left <= 0) left = period; - record = 1; + record = siar_valid(regs); + event->hw.last_period = event->hw.sample_period; } if (left < 0x80000000LL) val = 0x80000000LL - left; } + write_pmc(event->hw.idx, val); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + /* * Finally record data if requested. */ if (record) { - struct perf_sample_data data = { - .addr = 0, - .period = event->hw.last_period, - }; + struct perf_sample_data data; + + perf_sample_data_init(&data, ~0ULL, event->hw.last_period); if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); - if (perf_event_overflow(event, nmi, &data, regs)) { - /* - * Interrupts are coming too fast - throttle them - * by setting the event to 0, so it will be - * at least 2^30 cycles until the next interrupt - * (assuming each event counts at most 2 counts - * per cycle). 
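record_and_restart() above programs the PMC with 0x80000000 - left, so the next overflow interrupt arrives after exactly 'left' more events (the PMCs interrupt when bit 31 flips). A sketch of just that reload computation (demo_pmc_reload is an invented name):

#include <stdint.h>
#include <stdio.h>

/* Compute the PMC reload value for a remaining period of 'left'
 * events; out-of-range periods leave the counter at 0, as the
 * patch does when left >= 0x80000000. */
static uint32_t demo_pmc_reload(int64_t left)
{
	if (left <= 0 || left >= 0x80000000LL)
		return 0;
	return (uint32_t)(0x80000000LL - left);
}

int main(void)
{
	uint32_t val = demo_pmc_reload(1000);

	printf("program PMC with 0x%08x\n", val);
	printf("overflow after %lld events\n",
	       (long long)(0x80000000LL - val));	/* 1000 */
	return 0;
}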
- */ - val = 0; - left = ~0ULL >> 1; - } + if (perf_event_overflow(event, &data, regs)) + power_pmu_stop(event, 0); } - - write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); - perf_event_update_userpage(event); } /* @@ -1211,13 +1463,41 @@ unsigned long perf_misc_flags(struct pt_ */ unsigned long perf_instruction_pointer(struct pt_regs *regs) { - unsigned long ip; + bool use_siar = regs_use_siar(regs); - if (TRAP(regs) != 0xf00) - return regs->nip; /* not a PMU interrupt */ + if (use_siar && siar_valid(regs)) + return mfspr(SPRN_SIAR) + perf_ip_adjust(regs); + else if (use_siar) + return 0; // no valid instruction pointer + else + return regs->nip; +} + +static bool pmc_overflow_power7(unsigned long val) +{ + /* + * Events on POWER7 can roll back if a speculative event doesn't + * eventually complete. Unfortunately in some rare cases they will + * raise a performance monitor exception. We need to catch this to + * ensure we reset the PMC. In all cases the PMC will be 256 or less + * cycles from overflow. + * + * We only do this if the first pass fails to find any overflowing + * PMCs because a user might set a period of less than 256 and we + * don't want to mistakenly reset them. + */ + if ((0x80000000 - val) <= 256) + return true; + + return false; +} + +static bool pmc_overflow(unsigned long val) +{ + if ((int)val < 0) + return true; - ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - return ip; + return false; } /* @@ -1225,11 +1505,11 @@ unsigned long perf_instruction_pointer(s */ static void perf_event_interrupt(struct pt_regs *regs) { - int i; + int i, j; struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); struct perf_event *event; - unsigned long val; - int found = 0; + unsigned long val[8]; + int found, active; int nmi; if (cpuhw->n_limited) @@ -1244,33 +1524,54 @@ static void perf_event_interrupt(struct else irq_enter(); - for (i = 0; i < cpuhw->n_events; ++i) { - event = cpuhw->event[i]; - if (!event->hw.idx || is_limited_pmc(event->hw.idx)) + /* Read all the PMCs since we'll need them a bunch of times */ + for (i = 0; i < ppmu->n_counter; ++i) + val[i] = read_pmc(i + 1); + + /* Try to find what caused the IRQ */ + found = 0; + for (i = 0; i < ppmu->n_counter; ++i) { + if (!pmc_overflow(val[i])) continue; - val = read_pmc(event->hw.idx); - if ((int)val < 0) { - /* event has overflowed */ - found = 1; - record_and_restart(event, val, regs, nmi); + if (is_limited_pmc(i + 1)) + continue; /* these won't generate IRQs */ + /* + * We've found one that's overflowed. For active + * counters we need to log this. For inactive + * counters, we need to reset it anyway + */ + found = 1; + active = 0; + for (j = 0; j < cpuhw->n_events; ++j) { + event = cpuhw->event[j]; + if (event->hw.idx == (i + 1)) { + active = 1; + record_and_restart(event, val[i], regs); + break; + } } + if (!active) + /* reset non active counters that have overflowed */ + write_pmc(i + 1, 0); } - /* - * In case we didn't find and reset the event that caused - * the interrupt, scan all events and reset any that are - * negative, to avoid getting continual interrupts. - * Any that we processed in the previous loop will not be negative. 
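The two overflow predicates above differ only in how close to the edge they look: pmc_overflow() tests the sign bit, while the POWER7 variant catches counters that rolled back to just under the edge. Standalone versions (demo_* names invented; assumes 32-bit counter values):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Sign bit set: the PMC has overflowed. */
static bool demo_overflow(uint32_t val)
{
	return (int32_t)val < 0;
}

/* POWER7 quirk: a rolled-back PMC parks within 256 of the top. */
static bool demo_overflow_p7(uint32_t val)
{
	return 0x80000000u - val <= 256;
}

int main(void)
{
	printf("%d %d\n", demo_overflow(0x80000001u),
	       demo_overflow(0x7ffffff0u));		/* 1 0 */
	printf("%d\n", demo_overflow_p7(0x7ffffff0u));	/* 1: 16 shy of the edge */
	return 0;
}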
- */ - if (!found) { - for (i = 0; i < ppmu->n_counter; ++i) { - if (is_limited_pmc(i + 1)) + if (!found && __is_processor(PV_POWER7)) { + /* check active counters for special buggy p7 overflow */ + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (!event->hw.idx || is_limited_pmc(event->hw.idx)) continue; - val = read_pmc(i + 1); - if ((int)val < 0) - write_pmc(i + 1, 0); + if (pmc_overflow_power7(val[event->hw.idx - 1])) { + /* event has overflowed in a buggy way */ + found = 1; + record_and_restart(event, + val[event->hw.idx - 1], + regs); + } } } + if ((!found) && printk_ratelimit()) + printk(KERN_WARNING "Can't find PMC that caused IRQ\n"); /* * Reset MMCR0 to its normal value. This will set PMXE and @@ -1287,7 +1588,7 @@ static void perf_event_interrupt(struct irq_exit(); } -void hw_perf_event_setup(int cpu) +static void power_pmu_setup(int cpu) { struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); @@ -1297,6 +1598,23 @@ void hw_perf_event_setup(int cpu) cpuhw->mmcr[0] = MMCR0_FC; } +static int __cpuinit +power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + power_pmu_setup(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + int register_power_pmu(struct power_pmu *pmu) { if (ppmu) @@ -1314,5 +1632,8 @@ int register_power_pmu(struct power_pmu freeze_events_kernel = MMCR0_FCHV; #endif /* CONFIG_PPC64 */ + perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW); + perf_cpu_notifier(power_pmu_notifier); + return 0; } diff -uprN linux-2.6.32/arch/powerpc/kernel/perf_event_fsl_emb.c linux-2.6.32.ovz/arch/powerpc/kernel/perf_event_fsl_emb.c --- linux-2.6.32/arch/powerpc/kernel/perf_event_fsl_emb.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/perf_event_fsl_emb.c 2015-09-10 10:07:23.000000000 -0700 @@ -0,0 +1,687 @@ +/* + * Performance event support - Freescale Embedded Performance Monitor + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * Copyright 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_event.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <asm/reg_fsl_emb.h> +#include <asm/pmc.h> +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/ptrace.h> + +struct cpu_hw_events { + int n_events; + int disabled; + u8 pmcs_enabled; + struct perf_event *event[MAX_HWEVENTS]; +}; +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +static struct fsl_emb_pmu *ppmu; + +/* Number of perf_events counting hardware events */ +static atomic_t num_events; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * If interrupts were soft-disabled when a PMU interrupt occurs, treat + * it as an NMI. + */ +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ +#ifdef __powerpc64__ + return !regs->softe; +#else + return 0; +#endif +} + +static void perf_event_interrupt(struct pt_regs *regs); + +/* + * Read one performance monitor counter (PMC). 
+ */ +static unsigned long read_pmc(int idx) +{ + unsigned long val; + + switch (idx) { + case 0: + val = mfpmr(PMRN_PMC0); + break; + case 1: + val = mfpmr(PMRN_PMC1); + break; + case 2: + val = mfpmr(PMRN_PMC2); + break; + case 3: + val = mfpmr(PMRN_PMC3); + break; + default: + printk(KERN_ERR "oops trying to read PMC%d\n", idx); + val = 0; + } + return val; +} + +/* + * Write one PMC. + */ +static void write_pmc(int idx, unsigned long val) +{ + switch (idx) { + case 0: + mtpmr(PMRN_PMC0, val); + break; + case 1: + mtpmr(PMRN_PMC1, val); + break; + case 2: + mtpmr(PMRN_PMC2, val); + break; + case 3: + mtpmr(PMRN_PMC3, val); + break; + default: + printk(KERN_ERR "oops trying to write PMC%d\n", idx); + } + + isync(); +} + +/* + * Write one local control A register + */ +static void write_pmlca(int idx, unsigned long val) +{ + switch (idx) { + case 0: + mtpmr(PMRN_PMLCA0, val); + break; + case 1: + mtpmr(PMRN_PMLCA1, val); + break; + case 2: + mtpmr(PMRN_PMLCA2, val); + break; + case 3: + mtpmr(PMRN_PMLCA3, val); + break; + default: + printk(KERN_ERR "oops trying to write PMLCA%d\n", idx); + } + + isync(); +} + +/* + * Write one local control B register + */ +static void write_pmlcb(int idx, unsigned long val) +{ + switch (idx) { + case 0: + mtpmr(PMRN_PMLCB0, val); + break; + case 1: + mtpmr(PMRN_PMLCB1, val); + break; + case 2: + mtpmr(PMRN_PMLCB2, val); + break; + case 3: + mtpmr(PMRN_PMLCB3, val); + break; + default: + printk(KERN_ERR "oops trying to write PMLCB%d\n", idx); + } + + isync(); +} + +static void fsl_emb_pmu_read(struct perf_event *event) +{ + s64 val, delta, prev; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + /* + * Performance monitor interrupts come even when interrupts + * are soft-disabled, as long as interrupts are hard-enabled. + * Therefore we treat them like NMIs. + */ + do { + prev = local64_read(&event->hw.prev_count); + barrier(); + val = read_pmc(event->hw.idx); + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + + /* The counters are only 32 bits wide */ + delta = (val - prev) & 0xfffffffful; + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); +} + +/* + * Disable all events to prevent PMU interrupts and to allow + * events to be added or removed. + */ +static void fsl_emb_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + + if (!cpuhw->disabled) { + cpuhw->disabled = 1; + + /* + * Check if we ever enabled the PMU on this cpu. + */ + if (!cpuhw->pmcs_enabled) { + ppc_enable_pmcs(); + cpuhw->pmcs_enabled = 1; + } + + if (atomic_read(&num_events)) { + /* + * Set the 'freeze all counters' bit, and disable + * interrupts. The barrier is to make sure the + * mtpmr has been executed and the PMU has frozen + * the events before we return. + */ + + mtpmr(PMRN_PMGC0, PMGC0_FAC); + isync(); + } + } + local_irq_restore(flags); +} + +/* + * Re-enable all events if disable == 0. + * If we were previously disabled and events were added, then + * put the new config on the PMU. 
+ */ +static void fsl_emb_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + if (!cpuhw->disabled) + goto out; + + cpuhw->disabled = 0; + ppc_set_pmu_inuse(cpuhw->n_events != 0); + + if (cpuhw->n_events > 0) { + mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE); + isync(); + } + + out: + local_irq_restore(flags); +} + +static int collect_events(struct perf_event *group, int max_count, + struct perf_event *ctrs[]) +{ + int n = 0; + struct perf_event *event; + + if (!is_software_event(group)) { + if (n >= max_count) + return -1; + ctrs[n] = group; + n++; + } + list_for_each_entry(event, &group->sibling_list, group_entry) { + if (!is_software_event(event) && + event->state != PERF_EVENT_STATE_OFF) { + if (n >= max_count) + return -1; + ctrs[n] = event; + n++; + } + } + return n; +} + +/* context locked on entry */ +static int fsl_emb_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw; + int ret = -EAGAIN; + int num_counters = ppmu->n_counter; + u64 val; + int i; + + perf_pmu_disable(event->pmu); + cpuhw = &get_cpu_var(cpu_hw_events); + + if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) + num_counters = ppmu->n_restricted; + + /* + * Allocate counters from top-down, so that restricted-capable + * counters are kept free as long as possible. + */ + for (i = num_counters - 1; i >= 0; i--) { + if (cpuhw->event[i]) + continue; + + break; + } + + if (i < 0) + goto out; + + event->hw.idx = i; + cpuhw->event[i] = event; + ++cpuhw->n_events; + + val = 0; + if (event->hw.sample_period) { + s64 left = local64_read(&event->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + local64_set(&event->hw.prev_count, val); + + if (!(flags & PERF_EF_START)) { + event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + val = 0; + } + + write_pmc(i, val); + perf_event_update_userpage(event); + + write_pmlcb(i, event->hw.config >> 32); + write_pmlca(i, event->hw.config_base); + + ret = 0; + out: + put_cpu_var(cpu_hw_events); + perf_pmu_enable(event->pmu); + return ret; +} + +/* context locked on entry */ +static void fsl_emb_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw; + int i = event->hw.idx; + + perf_pmu_disable(event->pmu); + if (i < 0) + goto out; + + fsl_emb_pmu_read(event); + + cpuhw = &get_cpu_var(cpu_hw_events); + + WARN_ON(event != cpuhw->event[event->hw.idx]); + + write_pmlca(i, 0); + write_pmlcb(i, 0); + write_pmc(i, 0); + + cpuhw->event[i] = NULL; + event->hw.idx = -1; + + /* + * TODO: if at least one restricted event exists, and we + * just freed up a non-restricted-capable counter, and + * there is a restricted-capable counter occupied by + * a non-restricted event, migrate that event to the + * vacated counter. 
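The top-down scan in fsl_emb_pmu_add() above is what keeps the low-numbered, restricted-capable counters free for as long as possible. The same policy as a standalone sketch (demo_alloc_topdown and N_COUNTERS are invented; a restricted event simply passes a smaller num_usable):

#include <stdio.h>

#define N_COUNTERS 4

/* Return the highest-numbered free counter below num_usable, or -1. */
static int demo_alloc_topdown(const int used[N_COUNTERS], int num_usable)
{
	int i;

	for (i = num_usable - 1; i >= 0; i--)
		if (!used[i])
			return i;
	return -1;
}

int main(void)
{
	int used[N_COUNTERS] = { 0, 0, 0, 1 };

	/* Unrestricted event may use all 4 counters: picks index 2. */
	printf("%d\n", demo_alloc_topdown(used, N_COUNTERS));
	/* Restricted event limited to the first 2 counters: picks 1. */
	printf("%d\n", demo_alloc_topdown(used, 2));
	return 0;
}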
+ */ + + cpuhw->n_events--; + + out: + perf_pmu_enable(event->pmu); + put_cpu_var(cpu_hw_events); +} + +static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + s64 left; + + if (event->hw.idx < 0 || !event->hw.sample_period) + return; + + if (!(event->hw.state & PERF_HES_STOPPED)) + return; + + if (ef_flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + event->hw.state = 0; + left = local64_read(&event->hw.period_left); + write_pmc(event->hw.idx, left); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} + +static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + + if (event->hw.idx < 0 || !event->hw.sample_period) + return; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + fsl_emb_pmu_read(event); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + write_pmc(event->hw.idx, 0); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} + +/* + * Release the PMU if this is the last perf_event. + */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* + * Translate a generic cache event_id config to a raw event_id code. + */ +static int hw_perf_cache_event(u64 config, u64 *eventp) +{ + unsigned long type, op, result; + int ev; + + if (!ppmu->cache_events) + return -EINVAL; + + /* unpack config */ + type = config & 0xff; + op = (config >> 8) & 0xff; + result = (config >> 16) & 0xff; + + if (type >= PERF_COUNT_HW_CACHE_MAX || + op >= PERF_COUNT_HW_CACHE_OP_MAX || + result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ev = (*ppmu->cache_events)[type][op][result]; + if (ev == 0) + return -EOPNOTSUPP; + if (ev == -1) + return -EINVAL; + *eventp = ev; + return 0; +} + +static int fsl_emb_pmu_event_init(struct perf_event *event) +{ + u64 ev; + struct perf_event *events[MAX_HWEVENTS]; + int n; + int err; + int num_restricted; + int i; + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + ev = event->attr.config; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) + return -EOPNOTSUPP; + ev = ppmu->generic_events[ev]; + break; + + case PERF_TYPE_HW_CACHE: + err = hw_perf_cache_event(event->attr.config, &ev); + if (err) + return err; + break; + + case PERF_TYPE_RAW: + ev = event->attr.config; + break; + + default: + return -ENOENT; + } + + event->hw.config = ppmu->xlate_event(ev); + if (!(event->hw.config & FSL_EMB_EVENT_VALID)) + return -EINVAL; + + /* + * If this is in a group, check if it can go on with all the + * other hardware events in the group. We assume the event + * hasn't been linked into its leader's sibling list at this point. 
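hw_perf_cache_event() above unpacks the generic cache-event encoding, one byte each for type, op and result, exactly as the shifts in the code show. A standalone version of the unpacking (demo_unpack_cache_config is an invented name):

#include <stdint.h>
#include <stdio.h>

/* perf packs generic cache events as (type | op << 8 | result << 16). */
static void demo_unpack_cache_config(uint64_t config, unsigned *type,
				     unsigned *op, unsigned *result)
{
	*type   = config & 0xff;
	*op     = (config >> 8) & 0xff;
	*result = (config >> 16) & 0xff;
}

int main(void)
{
	/* e.g. type 0 (L1D), op 0 (read), result 1 (miss) */
	uint64_t config = 0 | (0 << 8) | (1 << 16);
	unsigned type, op, result;

	demo_unpack_cache_config(config, &type, &op, &result);
	printf("type=%u op=%u result=%u\n", type, op, result);
	return 0;
}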
+ */ + n = 0; + if (event->group_leader != event) { + n = collect_events(event->group_leader, + ppmu->n_counter - 1, events); + if (n < 0) + return -EINVAL; + } + + if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) { + num_restricted = 0; + for (i = 0; i < n; i++) { + if (events[i]->hw.config & FSL_EMB_EVENT_RESTRICTED) + num_restricted++; + } + + if (num_restricted >= ppmu->n_restricted) + return -EINVAL; + } + + event->hw.idx = -1; + + event->hw.config_base = PMLCA_CE | PMLCA_FCM1 | + (u32)((ev << 16) & PMLCA_EVENT_MASK); + + if (event->attr.exclude_user) + event->hw.config_base |= PMLCA_FCU; + if (event->attr.exclude_kernel) + event->hw.config_base |= PMLCA_FCS; + if (event->attr.exclude_idle) + return -ENOTSUPP; + + event->hw.last_period = event->hw.sample_period; + local64_set(&event->hw.period_left, event->hw.last_period); + + /* + * See if we need to reserve the PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * reserve_pmc_hardware or release_pmc_hardware. + */ + err = 0; + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && + reserve_pmc_hardware(perf_event_interrupt)) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + + mtpmr(PMRN_PMGC0, PMGC0_FAC); + isync(); + } + event->destroy = hw_perf_event_destroy; + + return err; +} + +static struct pmu fsl_emb_pmu = { + .pmu_enable = fsl_emb_pmu_enable, + .pmu_disable = fsl_emb_pmu_disable, + .event_init = fsl_emb_pmu_event_init, + .add = fsl_emb_pmu_add, + .del = fsl_emb_pmu_del, + .start = fsl_emb_pmu_start, + .stop = fsl_emb_pmu_stop, + .read = fsl_emb_pmu_read, +}; + +/* + * A counter has overflowed; update its count and record + * things if requested. Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. + */ +static void record_and_restart(struct perf_event *event, unsigned long val, + struct pt_regs *regs) +{ + u64 period = event->hw.sample_period; + s64 prev, delta, left; + int record = 0; + + if (event->hw.state & PERF_HES_STOPPED) { + write_pmc(event->hw.idx, 0); + return; + } + + /* we don't have to worry about interrupts here */ + prev = local64_read(&event->hw.prev_count); + delta = (val - prev) & 0xfffffffful; + local64_add(delta, &event->count); + + /* + * See if the total period for this event has expired, + * and update for the next period. + */ + val = 0; + left = local64_read(&event->hw.period_left) - delta; + if (period) { + if (left <= 0) { + left += period; + if (left <= 0) + left = period; + record = 1; + } + if (left < 0x80000000LL) + val = 0x80000000LL - left; + } + + write_pmc(event->hw.idx, val); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + + /* + * Finally record data if requested. 
+ */ + if (record) { + struct perf_sample_data data; + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + + if (perf_event_overflow(event, &data, regs)) + fsl_emb_pmu_stop(event, 0); + } +} + +static void perf_event_interrupt(struct pt_regs *regs) +{ + int i; + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + unsigned long val; + int found = 0; + int nmi; + + nmi = perf_intr_is_nmi(regs); + if (nmi) + nmi_enter(); + else + irq_enter(); + + for (i = 0; i < ppmu->n_counter; ++i) { + event = cpuhw->event[i]; + + val = read_pmc(i); + if ((int)val < 0) { + if (event) { + /* event has overflowed */ + found = 1; + record_and_restart(event, val, regs); + } else { + /* + * Disabled counter is negative, + * reset it just in case. + */ + write_pmc(i, 0); + } + } + } + + /* PMM will keep counters frozen until we return from the interrupt. */ + mtmsr(mfmsr() | MSR_PMM); + mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE); + isync(); + + if (nmi) + nmi_exit(); + else + irq_exit(); +} + +void hw_perf_event_setup(int cpu) +{ + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + memset(cpuhw, 0, sizeof(*cpuhw)); +} + +int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu) +{ + if (ppmu) + return -EBUSY; /* something's already registered */ + + ppmu = pmu; + pr_info("%s performance monitor hardware support registered\n", + pmu->name); + + perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW); + + return 0; +} diff -uprN linux-2.6.32/arch/powerpc/kernel/power4-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power4-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power4-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power4-pmu.c 2015-09-10 10:07:03.000000000 -0700 @@ -602,6 +602,7 @@ static struct power_pmu power4_pmu = { .n_generic = ARRAY_SIZE(p4_generic_events), .generic_events = p4_generic_events, .cache_events = &power4_cache_events, + .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; static int init_power4_pmu(void) @@ -613,4 +614,4 @@ static int init_power4_pmu(void) return register_power_pmu(&power4_pmu); } -arch_initcall(init_power4_pmu); +early_initcall(init_power4_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power5-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power5-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power5-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power5-pmu.c 2015-09-10 10:07:34.000000000 -0700 @@ -73,10 +73,6 @@ #define MMCR1_PMCSEL_MSK 0x7f /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -390,7 +386,7 @@ static int power5_compute_mmcr(u64 event unsigned int hwc[], unsigned long mmcr[]) { unsigned long mmcr1 = 0; - unsigned long mmcra = 0; + unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; unsigned int pmc, unit, byte, psel; unsigned int ttm, grp; int i, isbus, bit, grsel; @@ -614,6 +610,7 @@ static struct power_pmu power5_pmu = { .n_generic = ARRAY_SIZE(power5_generic_events), .generic_events = power5_generic_events, .cache_events = &power5_cache_events, + .flags = PPMU_HAS_SSLOT, }; static int init_power5_pmu(void) @@ -625,4 +622,4 @@ static int init_power5_pmu(void) return register_power_pmu(&power5_pmu); } -arch_initcall(init_power5_pmu); +early_initcall(init_power5_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power5+-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power5+-pmu.c --- 
linux-2.6.32/arch/powerpc/kernel/power5+-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power5+-pmu.c 2015-09-10 10:07:34.000000000 -0700 @@ -73,10 +73,6 @@ #define MMCR1_PMCSEL_MSK 0x7f /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -670,7 +666,7 @@ static struct power_pmu power5p_pmu = { .get_alternatives = power5p_get_alternatives, .disable_pmc = power5p_disable_pmc, .limited_pmc_event = power5p_limited_pmc_event, - .flags = PPMU_LIMITED_PMC5_6, + .flags = PPMU_LIMITED_PMC5_6 | PPMU_HAS_SSLOT, .n_generic = ARRAY_SIZE(power5p_generic_events), .generic_events = power5p_generic_events, .cache_events = &power5p_cache_events, @@ -686,4 +682,4 @@ static int init_power5p_pmu(void) return register_power_pmu(&power5p_pmu); } -arch_initcall(init_power5p_pmu); +early_initcall(init_power5p_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power6-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power6-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power6-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power6-pmu.c 2015-09-10 10:06:30.000000000 -0700 @@ -178,7 +178,7 @@ static int p6_compute_mmcr(u64 event[], unsigned int hwc[], unsigned long mmcr[]) { unsigned long mmcr1 = 0; - unsigned long mmcra = 0; + unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; int i; unsigned int pmc, ev, b, u, s, psel; unsigned int ttmset = 0; @@ -544,4 +544,4 @@ static int init_power6_pmu(void) return register_power_pmu(&power6_pmu); } -arch_initcall(init_power6_pmu); +early_initcall(init_power6_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power7-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power7-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power7-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power7-pmu.c 2015-09-10 10:07:20.000000000 -0700 @@ -51,10 +51,6 @@ #define MMCR1_PMCSEL_MSK 0xff /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -230,7 +226,7 @@ static int power7_compute_mmcr(u64 event unsigned int hwc[], unsigned long mmcr[]) { unsigned long mmcr1 = 0; - unsigned long mmcra = 0; + unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; unsigned int pmc, unit, combine, l2sel, psel; unsigned int pmc_inuse = 0; int i; @@ -370,7 +366,10 @@ static int init_power7_pmu(void) strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7")) return -ENODEV; + if (__is_processor(PVR_POWER7p)) + power7_pmu.flags |= PPMU_SIAR_VALID; + return register_power_pmu(&power7_pmu); } -arch_initcall(init_power7_pmu); +early_initcall(init_power7_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power8-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power8-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power8-pmu.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power8-pmu.c 2015-09-10 10:07:34.000000000 -0700 @@ -0,0 +1,497 @@ +/* + * Performance counter support for POWER8 processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * Copyright 2013 Michael Ellerman, IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/perf_event.h> +#include <asm/firmware.h> + + /* + * Some power8 event codes. + */ +#define PM_CYC 0x0001e +#define PM_GCT_NOSLOT_CYC 0x100f8 +#define PM_CMPLU_STALL 0x4000a +#define PM_INST_CMPL 0x00002 +#define PM_BRU_FIN 0x10068 +#define PM_BR_MPRED_CMPL 0x400f6 + + +/* + * Raw event encoding for POWER8: + * + * 60 56 52 48 44 40 36 32 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ thresh_cmp ] [ thresh_ctl ] + * | + * thresh start/stop OR FAB match -* + * + * 28 24 20 16 12 8 4 0 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ ] [ sample ] [cache] [ pmc ] [unit ] c m [ pmcxsel ] + * | | | | | + * | | | | *- mark + * | | *- L1/L2/L3 cache_sel | + * | | | + * | *- sampling mode for marked events *- combine + * | + * *- thresh_sel + * + * Below uses IBM bit numbering. + * + * MMCR1[x:y] = unit (PMCxUNIT) + * MMCR1[x] = combine (PMCxCOMB) + * + * if pmc == 3 and unit == 0 and pmcxsel[0:6] == 0b0101011 + * # PM_MRK_FAB_RSP_MATCH + * MMCR1[20:27] = thresh_ctl (FAB_CRESP_MATCH / FAB_TYPE_MATCH) + * else if pmc == 4 and unit == 0xf and pmcxsel[0:6] == 0b0101001 + * # PM_MRK_FAB_RSP_MATCH_CYC + * MMCR1[20:27] = thresh_ctl (FAB_CRESP_MATCH / FAB_TYPE_MATCH) + * else + * MMCRA[48:55] = thresh_ctl (THRESH START/END) + * + * if thresh_sel: + * MMCRA[45:47] = thresh_sel + * + * if thresh_cmp: + * MMCRA[22:24] = thresh_cmp[0:2] + * MMCRA[25:31] = thresh_cmp[3:9] + * + * if unit == 6 or unit == 7 + * MMCRC[53:55] = cache_sel[1:3] (L2EVENT_SEL) + * else if unit == 8 or unit == 9: + * if cache_sel[0] == 0: # L3 bank + * MMCRC[47:49] = cache_sel[1:3] (L3EVENT_SEL0) + * else if cache_sel[0] == 1: + * MMCRC[50:51] = cache_sel[2:3] (L3EVENT_SEL1) + * else if cache_sel[1]: # L1 event + * MMCR1[16] = cache_sel[2] + * MMCR1[17] = cache_sel[3] + * + * if mark: + * MMCRA[63] = 1 (SAMPLE_ENABLE) + * MMCRA[57:59] = sample[0:2] (RAND_SAMP_ELIG) + * MMCRA[61:62] = sample[3:4] (RAND_SAMP_MODE) + * + */ + +#define EVENT_THR_CMP_SHIFT 40 /* Threshold CMP value */ +#define EVENT_THR_CMP_MASK 0x3ff +#define EVENT_THR_CTL_SHIFT 32 /* Threshold control value (start/stop) */ +#define EVENT_THR_CTL_MASK 0xffull +#define EVENT_THR_SEL_SHIFT 29 /* Threshold select value */ +#define EVENT_THR_SEL_MASK 0x7 +#define EVENT_THRESH_SHIFT 29 /* All threshold bits */ +#define EVENT_THRESH_MASK 0x1fffffull +#define EVENT_SAMPLE_SHIFT 24 /* Sampling mode & eligibility */ +#define EVENT_SAMPLE_MASK 0x1f +#define EVENT_CACHE_SEL_SHIFT 20 /* L2/L3 cache select */ +#define EVENT_CACHE_SEL_MASK 0xf +#define EVENT_IS_L1 (4 << EVENT_CACHE_SEL_SHIFT) +#define EVENT_PMC_SHIFT 16 /* PMC number (1-based) */ +#define EVENT_PMC_MASK 0xf +#define EVENT_UNIT_SHIFT 12 /* Unit */ +#define EVENT_UNIT_MASK 0xf +#define EVENT_COMBINE_SHIFT 11 /* Combine bit */ +#define EVENT_COMBINE_MASK 0x1 +#define EVENT_MARKED_SHIFT 8 /* Marked bit */ +#define EVENT_MARKED_MASK 0x1 +#define EVENT_IS_MARKED (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT) +#define EVENT_PSEL_MASK 0xff /* PMCxSEL value */ + +/* + * Layout of constraint bits: + * + * 60 56 52 48 44 40 36 32 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ fab_match ] [ thresh_cmp ] [ thresh_ctl ] [ ] + * | + * thresh_sel -* + * + * 
28 24 20 16 12 8 4 0 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ ] [ sample ] [ ] [6] [5] [4] [3] [2] [1] + * | | + * L1 I/D qualifier -* | Count of events for each PMC. + * | p1, p2, p3, p4, p5, p6. + * nc - number of counters -* + * + * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints + * we want the low bit of each field to be added to any existing value. + * + * Everything else is a value field. + */ + +#define CNST_FAB_MATCH_VAL(v) (((v) & EVENT_THR_CTL_MASK) << 56) +#define CNST_FAB_MATCH_MASK CNST_FAB_MATCH_VAL(EVENT_THR_CTL_MASK) + +/* We just throw all the threshold bits into the constraint */ +#define CNST_THRESH_VAL(v) (((v) & EVENT_THRESH_MASK) << 32) +#define CNST_THRESH_MASK CNST_THRESH_VAL(EVENT_THRESH_MASK) + +#define CNST_L1_QUAL_VAL(v) (((v) & 3) << 22) +#define CNST_L1_QUAL_MASK CNST_L1_QUAL_VAL(3) + +#define CNST_SAMPLE_VAL(v) (((v) & EVENT_SAMPLE_MASK) << 16) +#define CNST_SAMPLE_MASK CNST_SAMPLE_VAL(EVENT_SAMPLE_MASK) + +/* + * For NC we are counting up to 4 events. This requires three bits, and we need + * the fifth event to overflow and set the 4th bit. To achieve that we bias the + * fields by 3 in test_adder. + */ +#define CNST_NC_SHIFT 12 +#define CNST_NC_VAL (1 << CNST_NC_SHIFT) +#define CNST_NC_MASK (8 << CNST_NC_SHIFT) +#define POWER8_TEST_ADDER (3 << CNST_NC_SHIFT) + +/* + * For the per-PMC fields we have two bits. The low bit is added, so if two + * events ask for the same PMC the sum will overflow, setting the high bit, + * indicating an error. So our mask sets the high bit. + */ +#define CNST_PMC_SHIFT(pmc) ((pmc - 1) * 2) +#define CNST_PMC_VAL(pmc) (1 << CNST_PMC_SHIFT(pmc)) +#define CNST_PMC_MASK(pmc) (2 << CNST_PMC_SHIFT(pmc)) + +/* Our add_fields is defined as: */ +#define POWER8_ADD_FIELDS \ + CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \ + CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL + + +/* Bits in MMCR1 for POWER8 */ +#define MMCR1_UNIT_SHIFT(pmc) (60 - (4 * ((pmc) - 1))) +#define MMCR1_COMBINE_SHIFT(pmc) (35 - ((pmc) - 1)) +#define MMCR1_PMCSEL_SHIFT(pmc) (24 - (((pmc) - 1)) * 8) +#define MMCR1_DC_QUAL_SHIFT 47 +#define MMCR1_IC_QUAL_SHIFT 46 + +/* Bits in MMCRA for POWER8 */ +#define MMCRA_SAMP_MODE_SHIFT 1 +#define MMCRA_SAMP_ELIG_SHIFT 4 +#define MMCRA_THR_CTL_SHIFT 8 +#define MMCRA_THR_SEL_SHIFT 16 +#define MMCRA_THR_CMP_SHIFT 32 +#define MMCRA_SDAR_MODE_TLB (1ull << 42) + + +static inline bool event_is_fab_match(u64 event) +{ + /* Only check pmc, unit and pmcxsel, ignore the edge bit (0) */ + event &= 0xff0fe; + + /* PM_MRK_FAB_RSP_MATCH & PM_MRK_FAB_RSP_MATCH_CYC */ + return (event == 0x30056 || event == 0x4f052); +} + +static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) +{ + unsigned int unit, pmc, cache; + unsigned long mask, value; + + mask = value = 0; + + pmc = (event >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; + unit = (event >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; + cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK; + + if (pmc) { + if (pmc > 6) + return -1; + + mask |= CNST_PMC_MASK(pmc); + value |= CNST_PMC_VAL(pmc); + + if (pmc >= 5 && event != 0x500fa && event != 0x600f4) + return -1; + } + + if (pmc <= 4) { + /* + * Add to number of counters in use. Note this includes events with + * a PMC of 0 - they still need a PMC, it's just assigned later. + * Don't count events on PMC 5 & 6, there is only one valid event + * on each of those counters, and they are handled above. 
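The adder-field idea described above can be seen in miniature: give each PMC two constraint bits, add the low bit per request, and let a carry into the high bit flag a conflict. A self-contained demo (macro names are shortened versions of the CNST_PMC_* ones in the patch):

#include <stdint.h>
#include <stdio.h>

#define PMC_SHIFT(pmc)	(((pmc) - 1) * 2)
#define PMC_VAL(pmc)	(1ull << PMC_SHIFT(pmc))	/* low bit: added */
#define PMC_MASK(pmc)	(2ull << PMC_SHIFT(pmc))	/* high bit: error */

int main(void)
{
	uint64_t value, mask;

	/* Two events both asking for PMC1: the add carries. */
	value = PMC_VAL(1) + PMC_VAL(1);
	mask = PMC_MASK(1);
	printf("conflict: %s\n", (value & mask) ? "yes" : "no");	/* yes */

	/* PMC1 and PMC2: no carry into either high bit. */
	value = PMC_VAL(1) + PMC_VAL(2);
	mask = PMC_MASK(1) | PMC_MASK(2);
	printf("conflict: %s\n", (value & mask) ? "yes" : "no");	/* no */
	return 0;
}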
+ */ + mask |= CNST_NC_MASK; + value |= CNST_NC_VAL; + } + + if (unit >= 6 && unit <= 9) { + /* + * L2/L3 events contain a cache selector field, which is + * supposed to be programmed into MMCRC. However MMCRC is only + * HV writable, and there is no API for guest kernels to modify + * it. The solution is for the hypervisor to initialise the + * field to zeroes, and for us to only ever allow events that + * have a cache selector of zero. + */ + if (cache) + return -1; + + } else if (event & EVENT_IS_L1) { + mask |= CNST_L1_QUAL_MASK; + value |= CNST_L1_QUAL_VAL(cache); + } + + if (event & EVENT_IS_MARKED) { + mask |= CNST_SAMPLE_MASK; + value |= CNST_SAMPLE_VAL(event >> EVENT_SAMPLE_SHIFT); + } + + /* + * Special case for PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC, where + * the threshold control bits are used for the match value. + */ + if (event_is_fab_match(event)) { + mask |= CNST_FAB_MATCH_MASK; + value |= CNST_FAB_MATCH_VAL(event >> EVENT_THR_CTL_SHIFT); + } else { + /* + * Check the mantissa upper two bits are not zero, unless the + * exponent is also zero. See the THRESH_CMP_MANTISSA doc. + */ + unsigned int cmp, exp; + + cmp = (event >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK; + exp = cmp >> 7; + + if (exp && (cmp & 0x60) == 0) + return -1; + + mask |= CNST_THRESH_MASK; + value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT); + } + + *maskp = mask; + *valp = value; + + return 0; +} + +static int power8_compute_mmcr(u64 event[], int n_ev, + unsigned int hwc[], unsigned long mmcr[]) +{ + unsigned long mmcra, mmcr1, unit, combine, psel, cache, val; + unsigned int pmc, pmc_inuse; + int i; + + pmc_inuse = 0; + + /* First pass to count resource use */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; + if (pmc) + pmc_inuse |= 1 << pmc; + } + + /* In continuous sampling mode, update SDAR on TLB miss */ + mmcra = MMCRA_SDAR_MODE_TLB; + mmcr1 = 0; + + /* Second pass: assign PMCs, set all MMCR1 fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; + unit = (event[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; + combine = (event[i] >> EVENT_COMBINE_SHIFT) & EVENT_COMBINE_MASK; + psel = event[i] & EVENT_PSEL_MASK; + + if (!pmc) { + for (pmc = 1; pmc <= 4; ++pmc) { + if (!(pmc_inuse & (1 << pmc))) + break; + } + + pmc_inuse |= 1 << pmc; + } + + if (pmc <= 4) { + mmcr1 |= unit << MMCR1_UNIT_SHIFT(pmc); + mmcr1 |= combine << MMCR1_COMBINE_SHIFT(pmc); + mmcr1 |= psel << MMCR1_PMCSEL_SHIFT(pmc); + } + + if (event[i] & EVENT_IS_L1) { + cache = event[i] >> EVENT_CACHE_SEL_SHIFT; + mmcr1 |= (cache & 1) << MMCR1_IC_QUAL_SHIFT; + cache >>= 1; + mmcr1 |= (cache & 1) << MMCR1_DC_QUAL_SHIFT; + } + + if (event[i] & EVENT_IS_MARKED) { + mmcra |= MMCRA_SAMPLE_ENABLE; + + val = (event[i] >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK; + if (val) { + mmcra |= (val & 3) << MMCRA_SAMP_MODE_SHIFT; + mmcra |= (val >> 2) << MMCRA_SAMP_ELIG_SHIFT; + } + } + + /* + * For PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC, + * the threshold bits are used for the match value. 
+ */ + if (event_is_fab_match(event[i])) { + mmcr1 |= (event[i] >> EVENT_THR_CTL_SHIFT) & + EVENT_THR_CTL_MASK; + } else { + val = (event[i] >> EVENT_THR_CTL_SHIFT) & EVENT_THR_CTL_MASK; + mmcra |= val << MMCRA_THR_CTL_SHIFT; + val = (event[i] >> EVENT_THR_SEL_SHIFT) & EVENT_THR_SEL_MASK; + mmcra |= val << MMCRA_THR_SEL_SHIFT; + val = (event[i] >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK; + mmcra |= val << MMCRA_THR_CMP_SHIFT; + } + + hwc[i] = pmc - 1; + } + + /* Return MMCRx values */ + mmcr[0] = 0; + + /* pmc_inuse is 1-based */ + if (pmc_inuse & 2) + mmcr[0] = MMCR0_PMC1CE; + + if (pmc_inuse & 0x7c) + mmcr[0] |= MMCR0_PMCjCE; + + mmcr[1] = mmcr1; + mmcr[2] = mmcra; + + return 0; +} + +#define MAX_ALT 2 + +/* Table of alternatives, sorted by column 0 */ +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x10134, 0x301e2 }, /* PM_MRK_ST_CMPL */ + { 0x10138, 0x40138 }, /* PM_BR_MRK_2PATH */ + { 0x18082, 0x3e05e }, /* PM_L3_CO_MEPF */ + { 0x1d14e, 0x401e8 }, /* PM_MRK_DATA_FROM_L2MISS */ + { 0x1e054, 0x4000a }, /* PM_CMPLU_STALL */ + { 0x20036, 0x40036 }, /* PM_BR_2PATH */ + { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ + { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ + { 0x2013c, 0x3012e }, /* PM_MRK_FILT_MATCH */ + { 0x3e054, 0x400f0 }, /* PM_LD_MISS_L1 */ + { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(u64 event) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + break; + + for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) + if (event == event_alternatives[i][j]) + return i; + } + + return -1; +} + +static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ + int i, j, num_alt = 0; + u64 alt_event; + + alt[num_alt++] = event; + + i = find_alternative(event); + if (i >= 0) { + /* Filter out the original event, it's already in alt[0] */ + for (j = 0; j < MAX_ALT; ++j) { + alt_event = event_alternatives[i][j]; + if (alt_event && alt_event != event) + alt[num_alt++] = alt_event; + } + } + + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, so PM_CYC is equivalent to + * PM_RUN_CYC and PM_INST_CMPL === PM_RUN_INST_CMPL. 
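find_alternative() above relies on the table being sorted by its first column, so the outer loop can stop as soon as the event code is smaller than the row head. The same shape as a standalone program, using three rows copied from the table in the patch (demo_* names are invented):

#include <stdio.h>

#define MAX_ALT 2
#define N_ROWS (int)(sizeof(demo_alternatives) / sizeof(demo_alternatives[0]))

/* Rows sorted by their first entry, as in event_alternatives[]. */
static const unsigned int demo_alternatives[][MAX_ALT] = {
	{ 0x10134, 0x301e2 },	/* PM_MRK_ST_CMPL */
	{ 0x1e054, 0x4000a },	/* PM_CMPLU_STALL */
	{ 0x400fa, 0x500fa },	/* PM_RUN_INST_CMPL */
};

static int demo_find_alternative(unsigned int event)
{
	int i, j;

	for (i = 0; i < N_ROWS; ++i) {
		if (event < demo_alternatives[i][0])
			break;	/* sorted: no later row can match */
		for (j = 0; j < MAX_ALT && demo_alternatives[i][j]; ++j)
			if (event == demo_alternatives[i][j])
				return i;
	}
	return -1;
}

int main(void)
{
	printf("%d\n", demo_find_alternative(0x4000a));	/* 1 */
	printf("%d\n", demo_find_alternative(0x12345));	/* -1 */
	return 0;
}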
+ */ + j = num_alt; + for (i = 0; i < num_alt; ++i) { + switch (alt[i]) { + case 0x1e: /* PM_CYC */ + alt[j++] = 0x600f4; /* PM_RUN_CYC */ + break; + case 0x600f4: /* PM_RUN_CYC */ + alt[j++] = 0x1e; + break; + case 0x2: /* PM_PPC_CMPL */ + alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ + break; + case 0x500fa: /* PM_RUN_INST_CMPL */ + alt[j++] = 0x2; /* PM_PPC_CMPL */ + break; + } + } + num_alt = j; + } + + return num_alt; +} + +static void power8_disable_pmc(unsigned int pmc, unsigned long mmcr[]) +{ + if (pmc <= 3) + mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1)); +} + +static int power8_generic_events[] = { + [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_GCT_NOSLOT_CYC, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PM_CMPLU_STALL, + [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BRU_FIN, + [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL, +}; + +static struct power_pmu power8_pmu = { + .name = "POWER8", + .n_counter = 6, + .max_alternatives = MAX_ALT + 1, + .add_fields = POWER8_ADD_FIELDS, + .test_adder = POWER8_TEST_ADDER, + .compute_mmcr = power8_compute_mmcr, + .get_constraint = power8_get_constraint, + .get_alternatives = power8_get_alternatives, + .disable_pmc = power8_disable_pmc, + .flags = PPMU_HAS_SSLOT | PPMU_HAS_SIER, + .n_generic = ARRAY_SIZE(power8_generic_events), + .generic_events = power8_generic_events, +}; + +static int __init init_power8_pmu(void) +{ + if (!cur_cpu_spec->oprofile_cpu_type || + strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) + return -ENODEV; + + return register_power_pmu(&power8_pmu); +} +early_initcall(init_power8_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/ppc970-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/ppc970-pmu.c --- linux-2.6.32/arch/powerpc/kernel/ppc970-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/ppc970-pmu.c 2015-09-10 10:07:03.000000000 -0700 @@ -84,10 +84,6 @@ static short mmcr1_adder_bits[8] = { }; /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -173,9 +169,11 @@ static int p970_marked_instr_event(u64 e switch (unit) { case PM_VPU: mask = 0x4c; /* byte 0 bits 2,3,6 */ + break; case PM_LSU0: /* byte 2 bits 0,2,3,4,6; all of byte 1 */ mask = 0x085dff00; + break; case PM_LSU1L: mask = 0x50 << 24; /* byte 3 bits 4,6 */ break; @@ -484,6 +482,7 @@ static struct power_pmu ppc970_pmu = { .n_generic = ARRAY_SIZE(ppc970_generic_events), .generic_events = ppc970_generic_events, .cache_events = &ppc970_cache_events, + .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; static int init_ppc970_pmu(void) @@ -496,4 +495,4 @@ static int init_ppc970_pmu(void) return register_power_pmu(&ppc970_pmu); } -arch_initcall(init_ppc970_pmu); +early_initcall(init_ppc970_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/ppc_ksyms.c linux-2.6.32.ovz/arch/powerpc/kernel/ppc_ksyms.c --- linux-2.6.32/arch/powerpc/kernel/ppc_ksyms.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/ppc_ksyms.c 2015-09-10 10:07:25.000000000 -0700 @@ -54,7 +54,6 @@ extern void single_step_exception(struct extern int sys_sigreturn(struct pt_regs *regs); EXPORT_SYMBOL(clear_pages); -EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(ISA_DMA_THRESHOLD); EXPORT_SYMBOL(DMA_MODE_READ); EXPORT_SYMBOL(DMA_MODE_WRITE); @@ -88,9 +87,7 @@ EXPORT_SYMBOL(__copy_tofrom_user); EXPORT_SYMBOL(__clear_user); 
EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__strnlen_user); -#ifdef CONFIG_PPC64 -EXPORT_SYMBOL(copy_4K_page); -#endif +EXPORT_SYMBOL(copy_page); #if defined(CONFIG_PCI) && defined(CONFIG_PPC32) EXPORT_SYMBOL(isa_io_base); diff -uprN linux-2.6.32/arch/powerpc/kernel/process.c linux-2.6.32.ovz/arch/powerpc/kernel/process.c --- linux-2.6.32/arch/powerpc/kernel/process.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/process.c 2015-09-10 10:07:47.000000000 -0700 @@ -402,7 +402,6 @@ struct task_struct *__switch_to(struct t account_system_vtime(current); account_process_vtime(current); - calculate_steal_time(); /* * We can't take a PMU exception inside _switch() since there is a @@ -554,18 +553,6 @@ void exit_thread(void) void flush_thread(void) { -#ifdef CONFIG_PPC64 - struct thread_info *t = current_thread_info(); - - if (test_ti_thread_flag(t, TIF_ABI_PENDING)) { - clear_ti_thread_flag(t, TIF_ABI_PENDING); - if (test_ti_thread_flag(t, TIF_32BIT)) - clear_ti_thread_flag(t, TIF_32BIT); - else - set_ti_thread_flag(t, TIF_32BIT); - } -#endif - discard_lazy_cpu_state(); if (current->thread.dabr) { @@ -922,7 +909,7 @@ int sys_execve(unsigned long a0, unsigne struct pt_regs *regs) { int error; - char *filename; + struct filename *filename; filename = getname((char __user *) a0); error = PTR_ERR(filename); @@ -931,7 +918,7 @@ int sys_execve(unsigned long a0, unsigne flush_fp_to_thread(current); flush_altivec_to_thread(current); flush_spe_to_thread(current); - error = do_execve(filename, (char __user * __user *) a1, + error = do_execve(filename->name, (char __user * __user *) a1, (char __user * __user *) a2, regs); putname(filename); out: @@ -1120,11 +1107,11 @@ void ppc64_runlatch_off(void) static struct kmem_cache *thread_info_cache; -struct thread_info *alloc_thread_info(struct task_struct *tsk) +struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { struct thread_info *ti; - ti = kmem_cache_alloc(thread_info_cache, GFP_KERNEL); + ti = kmem_cache_alloc_node(thread_info_cache, GFP_KERNEL, node); if (unlikely(ti == NULL)) return NULL; #ifdef CONFIG_DEBUG_STACK_USAGE @@ -1201,3 +1188,14 @@ unsigned long randomize_et_dyn(unsigned return ret; } + +#ifdef CONFIG_SMP +int arch_sd_sibling_asym_packing(void) +{ + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + return SD_ASYM_PACKING; + } + return 0; +} +#endif diff -uprN linux-2.6.32/arch/powerpc/kernel/prom.c linux-2.6.32.ovz/arch/powerpc/kernel/prom.c --- linux-2.6.32/arch/powerpc/kernel/prom.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/prom.c 2015-09-10 10:07:19.000000000 -0700 @@ -51,8 +51,8 @@ #include #include #include -#include #include +#include #include #ifdef DEBUG @@ -426,7 +426,7 @@ static int __init early_parse_mem(char * return 1; memory_limit = PAGE_ALIGN(memparse(p, &p)); - DBG("memory limit = 0x%llx\n", (unsigned long long)memory_limit); + DBG("memory limit = 0x%llx\n", memory_limit); return 0; } @@ -1077,87 +1077,6 @@ static void __init early_reserve_mem(voi } } -#ifdef CONFIG_PHYP_DUMP -/** - * phyp_dump_calculate_reserve_size() - reserve variable boot area 5% or arg - * - * Function to find the largest size we need to reserve - * during early boot process. - * - * It either looks for boot param and returns that OR - * returns larger of 256 or 5% rounded down to multiples of 256MB. 
- * - */ -static inline unsigned long phyp_dump_calculate_reserve_size(void) -{ - unsigned long tmp; - - if (phyp_dump_info->reserve_bootvar) - return phyp_dump_info->reserve_bootvar; - - /* divide by 20 to get 5% of value */ - tmp = lmb_end_of_DRAM(); - do_div(tmp, 20); - - /* round it down in multiples of 256 */ - tmp = tmp & ~0x0FFFFFFFUL; - - return (tmp > PHYP_DUMP_RMR_END ? tmp : PHYP_DUMP_RMR_END); -} - -/** - * phyp_dump_reserve_mem() - reserve all not-yet-dumped memory - * - * This routine may reserve memory regions in the kernel only - * if the system is supported and a dump was taken in last - * boot instance or if the hardware is supported and the - * scratch area needs to be set up. In other instances it returns - * without reserving anything. The memory in case of dump being - * active is freed when the dump is collected (by userland tools). - */ -static void __init phyp_dump_reserve_mem(void) -{ - unsigned long base, size; - unsigned long variable_reserve_size; - - if (!phyp_dump_info->phyp_dump_configured) { - printk(KERN_ERR "Phyp-dump not supported on this hardware\n"); - return; - } - - if (!phyp_dump_info->phyp_dump_at_boot) { - printk(KERN_INFO "Phyp-dump disabled at boot time\n"); - return; - } - - variable_reserve_size = phyp_dump_calculate_reserve_size(); - - if (phyp_dump_info->phyp_dump_is_active) { - /* Reserve *everything* above RMR.Area freed by userland tools*/ - base = variable_reserve_size; - size = lmb_end_of_DRAM() - base; - - /* XXX crashed_ram_end is wrong, since it may be beyond - * the memory_limit, it will need to be adjusted. */ - lmb_reserve(base, size); - - phyp_dump_info->init_reserve_start = base; - phyp_dump_info->init_reserve_size = size; - } else { - size = phyp_dump_info->cpu_state_size + - phyp_dump_info->hpte_region_size + - variable_reserve_size; - base = lmb_end_of_DRAM() - size; - lmb_reserve(base, size); - phyp_dump_info->init_reserve_start = base; - phyp_dump_info->init_reserve_size = size; - } -} -#else -static inline void __init phyp_dump_reserve_mem(void) {} -#endif /* CONFIG_PHYP_DUMP && CONFIG_PPC_RTAS */ - - void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -1172,9 +1091,9 @@ void __init early_init_devtree(void *par of_scan_flat_dt(early_init_dt_scan_rtas, NULL); #endif -#ifdef CONFIG_PHYP_DUMP - /* scan tree to see if dump occurred during last boot */ - of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL); +#ifdef CONFIG_FA_DUMP + /* scan tree to see if dump is active during last boot */ + of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); #endif /* Retrieve various information from the /chosen node of the @@ -1198,9 +1117,15 @@ void __init early_init_devtree(void *par if (PHYSICAL_START > MEMORY_START) lmb_reserve(MEMORY_START, 0x8000); reserve_kdump_trampoline(); - reserve_crashkernel(); +#ifdef CONFIG_FA_DUMP + /* + * If we fail to reserve memory for firmware-assisted dump then + * fall back to kexec-based kdump. + */ + if (fadump_reserve_mem() == 0) +#endif + reserve_crashkernel(); early_reserve_mem(); - phyp_dump_reserve_mem(); limit = memory_limit; if (!
limit) { diff -uprN linux-2.6.32/arch/powerpc/kernel/prom_init.c linux-2.6.32.ovz/arch/powerpc/kernel/prom_init.c --- linux-2.6.32/arch/powerpc/kernel/prom_init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/prom_init.c 2015-09-10 10:07:25.000000000 -0700 @@ -635,6 +635,9 @@ static void __init early_cmdline_parse(v #define OV3_VMX 0x40 /* VMX/Altivec */ #define OV3_DFP 0x20 /* decimal FP */ +/* Option vector 4: IBM PAPR implementation */ +#define OV4_MIN_ENT_CAP 0x01 /* minimum VP entitled capacity */ + /* Option vector 5: PAPR/OF options supported */ #define OV5_LPAR 0x80 /* logical partitioning supported */ #define OV5_SPLPAR 0x40 /* shared-processor LPAR supported */ @@ -650,9 +653,17 @@ static void __init early_cmdline_parse(v #endif /* CONFIG_PCI_MSI */ #ifdef CONFIG_PPC_SMLPAR #define OV5_CMO 0x80 /* Cooperative Memory Overcommitment */ +#define OV5_XCMO 0x40 /* Page Coalescing */ #else #define OV5_CMO 0x00 +#define OV5_XCMO 0x00 #endif +#define OV5_TYPE1_AFFINITY 0x80 /* Type 1 NUMA affinity */ +#define OV5_PFO_HW_RNG 0x80 /* PFO Random Number Generator */ +#define OV5_PFO_HW_ENCR 0x20 /* PFO Encryption Accelerator */ + +/* Option Vector 6: IBM PAPR hints */ +#define OV6_LINUX 0x02 /* Linux is our OS */ /* * The architecture vector has an array of PVR mask/value pairs, @@ -665,7 +676,7 @@ static unsigned char ibm_architecture_ve W(0xffffffff), W(0x0f000003), /* all 2.06-compliant */ W(0xffffffff), W(0x0f000002), /* all 2.05-compliant */ W(0xfffffffe), W(0x0f000001), /* all 2.04-compliant and earlier */ - 5 - 1, /* 5 option vectors */ + 6 - 1, /* 6 option vectors */ /* option vector 1: processor architectures supported */ 3 - 2, /* length */ @@ -693,16 +704,39 @@ static unsigned char ibm_architecture_ve OV3_FP | OV3_VMX | OV3_DFP, /* option vector 4: IBM PAPR implementation */ - 2 - 2, /* length */ + 3 - 2, /* length */ 0, /* don't halt */ + OV4_MIN_ENT_CAP, /* minimum VP entitled capacity */ /* option vector 5: PAPR/OF options */ - 5 - 2, /* length */ + 18 - 2, /* length */ 0, /* don't ignore, don't halt */ OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY | OV5_DONATE_DEDICATE_CPU | OV5_MSI, 0, - OV5_CMO, + OV5_CMO | OV5_XCMO, + OV5_TYPE1_AFFINITY, + 0, + 0, + 0, + /* WARNING: The offset of the "number of cores" field below + * must match the macro below. Update the definition if + * the structure layout changes. + */ +#define IBM_ARCH_VEC_NRCORES_OFFSET 101 + W(NR_CPUS), /* number of cores supported */ + 0, + 0, + 0, + 0, + OV5_PFO_HW_RNG | OV5_PFO_HW_ENCR, + + /* option vector 6: IBM PAPR hints */ + 4 - 2, /* length */ + 0, + 0, + OV6_LINUX, + }; /* Old method - ELF header with PT_NOTE sections */ @@ -792,13 +826,70 @@ static struct fake_elf { } }; +static int __init prom_count_smt_threads(void) +{ + phandle node; + char type[64]; + unsigned int plen; + + /* Pick up the first CPU node we can find */ + for (node = 0; prom_next_node(&node); ) { + type[0] = 0; + prom_getprop(node, "device_type", type, sizeof(type)); + + if (strcmp(type, RELOC("cpu"))) + continue; + /* + * There is an entry for each smt thread, each entry being + * 4 bytes long. All cpus should have the same number of + * smt threads, so return after finding the first.
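+ * The thread count is therefore the property length in bytes + * divided by four, which is what the plen >>= 2 below computes.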
+ */ + plen = prom_getproplen(node, "ibm,ppc-interrupt-server#s"); + if (plen == PROM_ERROR) + break; + plen >>= 2; + prom_debug("Found 0x%x smt threads per core\n", (unsigned long)plen); + + /* Sanity check */ + if (plen < 1 || plen > 64) { + prom_printf("Threads per core 0x%x out of bounds, assuming 1\n", + (unsigned long)plen); + return 1; + } + return plen; + } + prom_debug("No threads found, assuming 1 per core\n"); + + return 1; + +} + + static void __init prom_send_capabilities(void) { ihandle elfloader, root; prom_arg_t ret; + u32 *cores; root = call_prom("open", 1, 1, ADDR("/")); if (root != 0) { + /* We need to tell the FW about the number of cores we support. + * + * To do that, we count the number of threads on the first core + * (we assume this is the same for all cores) and use it to + * divide NR_CPUS. + */ + cores = (u32 *)PTRRELOC(&ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET]); + if (*cores != NR_CPUS) { + prom_printf("WARNING ! " + "ibm_architecture_vec structure inconsistent: 0x%x !\n", + *cores); + } else { + *cores = NR_CPUS / prom_count_smt_threads(); + prom_printf("Max number of cores passed to firmware: 0x%x\n", + (unsigned long)*cores); + } + /* try calling the ibm,client-architecture-support method */ prom_printf("Calling ibm,client-architecture-support..."); if (call_prom_ret("call-method", 3, 2, &ret, @@ -890,7 +981,7 @@ static unsigned long __init alloc_up(uns } if (addr == 0) return 0; - RELOC(alloc_bottom) = addr; + RELOC(alloc_bottom) = addr + size; prom_debug(" -> %x\n", addr); prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom)); @@ -1704,7 +1795,7 @@ static void __init *make_room(unsigned l chunk = alloc_up(room, 0); if (chunk == 0) prom_panic("No memory for flatten_device_tree (claim failed)"); - *mem_end = RELOC(alloc_top); + *mem_end = chunk + room; } ret = (void *)*mem_start; @@ -1923,7 +2014,7 @@ static void __init flatten_device_tree(v mem_start = (unsigned long)alloc_up(room, PAGE_SIZE); if (mem_start == 0) prom_panic("Can't allocate initial device-tree chunk\n"); - mem_end = RELOC(alloc_top); + mem_end = mem_start + room; /* Get root of tree */ root = call_prom("peer", 1, 1, (phandle)0); diff -uprN linux-2.6.32/arch/powerpc/kernel/ptrace.c linux-2.6.32.ovz/arch/powerpc/kernel/ptrace.c --- linux-2.6.32/arch/powerpc/kernel/ptrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/ptrace.c 2015-09-10 10:06:36.000000000 -0700 @@ -124,12 +124,16 @@ static int gpr_get(struct task_struct *t unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; + int i, ret; if (target->thread.regs == NULL) return -EIO; - CHECK_FULL_REGS(target->thread.regs); + if (!FULL_REGS(target->thread.regs)) { + /* We have a partial register set. Fill 14-31 with bogus values */ + for (i = 14; i < 32; i++) + target->thread.regs->gpr[i] = NV_REG_POISON; + } ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, target->thread.regs, @@ -536,11 +540,16 @@ static int gpr32_get(struct task_struct compat_ulong_t *k = kbuf; compat_ulong_t __user *u = ubuf; compat_ulong_t reg; + int i; if (target->thread.regs == NULL) return -EIO; - CHECK_FULL_REGS(target->thread.regs); + if (!FULL_REGS(target->thread.regs)) { + /* We have a partial register set. 
Fill 14-31 with bogus values */ + for (i = 14; i < 32; i++) + target->thread.regs->gpr[i] = NV_REG_POISON; + } pos /= sizeof(reg); count /= sizeof(reg); @@ -1076,9 +1085,7 @@ void do_syscall_trace_leave(struct pt_re { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, - regs->result); + audit_syscall_exit(regs); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff -uprN linux-2.6.32/arch/powerpc/kernel/rtas.c linux-2.6.32.ovz/arch/powerpc/kernel/rtas.c --- linux-2.6.32/arch/powerpc/kernel/rtas.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/rtas.c 2015-09-10 10:07:40.000000000 -0700 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -40,20 +41,14 @@ #include #include #include +#include +#include struct rtas_t rtas = { .lock = __RAW_SPIN_LOCK_UNLOCKED }; EXPORT_SYMBOL(rtas); -struct rtas_suspend_me_data { - atomic_t working; /* number of cpus accessing this struct */ - atomic_t done; - int token; /* ibm,suspend-me */ - int error; - struct completion *complete; /* wait on this until working == 0 */ -}; - DEFINE_SPINLOCK(rtas_data_buf_lock); EXPORT_SYMBOL(rtas_data_buf_lock); @@ -500,7 +495,7 @@ unsigned int rtas_busy_delay(int status) might_sleep(); ms = rtas_busy_delay_time(status); - if (ms) + if (ms && need_resched()) msleep(ms); return ms; @@ -690,10 +685,14 @@ void rtas_os_term(char *str) { int status; - if (panic_timeout) - return; - - if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term")) + /* + * Firmware with the ibm,extended-os-term property is guaranteed + * to always return from an ibm,os-term call. Earlier versions without + * this property may terminate the partition, which we want to avoid + * since it interferes with panic_timeout.
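+ * Hence the check below bails out unless both the ibm,os-term and + * ibm,extended-os-term tokens are present.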
+ */ + if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") || + RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term")) return; snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); @@ -704,20 +703,64 @@ void rtas_os_term(char *str) } while (rtas_busy_delay(status)); if (status != 0) - printk(KERN_EMERG "ibm,os-term call failed %d\n", - status); + printk(KERN_EMERG "ibm,os-term call failed %d\n", status); } static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE; #ifdef CONFIG_PPC_PSERIES -static void rtas_percpu_suspend_me(void *info) +static int __rtas_suspend_last_cpu(struct rtas_suspend_me_data *data, int wake_when_done) +{ + u16 slb_size = mmu_slb_size; + int rc = H_MULTI_THREADS_ACTIVE; + int cpu; + + slb_set_size(SLB_MIN_SIZE); + printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", smp_processor_id()); + + while (rc == H_MULTI_THREADS_ACTIVE && !data->done) { + rc = rtas_call(data->token, 0, 1, NULL); + if (rc && rc != H_MULTI_THREADS_ACTIVE) + printk(KERN_DEBUG "ibm,suspend-me returned %d\n", rc); + } + + smp_rmb(); + if (rc || data->error) + slb_set_size(slb_size); + + if (data->error) + rc = data->error; + + data->error = rc; + + if (wake_when_done) { + smp_wmb(); + data->done = 1; + + /* Ensure data->done is seen on all CPUs that are about to wake up + as a result of the H_PROD below */ + mb(); + + for_each_online_cpu(cpu) + plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + } + + if (atomic_dec_return(&data->working) == 0) + complete(data->complete); + + return rc; +} + +int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data) +{ + atomic_inc(&data->working); + return __rtas_suspend_last_cpu(data, 0); +} + +static int __rtas_suspend_cpu(struct rtas_suspend_me_data *data, int wake_when_done) { long rc = H_SUCCESS; unsigned long msr_save; - u16 slb_size = mmu_slb_size; int cpu; - struct rtas_suspend_me_data *data = - (struct rtas_suspend_me_data *)info; atomic_inc(&data->working); @@ -725,7 +768,7 @@ static void rtas_percpu_suspend_me(void msr_save = mfmsr(); mtmsr(msr_save & ~(MSR_EE)); - while (rc == H_SUCCESS && !atomic_read(&data->done)) + while (rc == H_SUCCESS && !data->done) rc = plpar_hcall_norets(H_JOIN); mtmsr(msr_save); @@ -737,34 +780,132 @@ static void rtas_percpu_suspend_me(void /* All other cpus are in H_JOIN, this cpu does * the suspend. */ - slb_set_size(SLB_MIN_SIZE); - printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", - smp_processor_id()); - data->error = rtas_call(data->token, 0, 1, NULL); - - if (data->error) { - printk(KERN_DEBUG "ibm,suspend-me returned %d\n", - data->error); - slb_set_size(slb_size); - } - } else { - printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n", - smp_processor_id(), rc); - data->error = rc; + return __rtas_suspend_last_cpu(data, wake_when_done); } - atomic_set(&data->done, 1); - - /* This cpu did the suspend or got an error; in either case, - * we need to prod all other cpus out of join state. - * Extra prods are harmless. - */ - for_each_online_cpu(cpu) - plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n", + smp_processor_id(), rc); + data->error = rc; + + if (wake_when_done) { + smp_wmb(); + data->done = 1; + /* Ensure data->done is seen on all CPUs that are about to wake up + as a result of the H_PROD below */ + mb(); + pSeries_coalesce_init(); + + /* This cpu did the suspend or got an error; in either case, * we need to prod all other cpus out of join state. * Extra prods are harmless.
+ */ + for_each_online_cpu(cpu) + plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + } out: if (atomic_dec_return(&data->working) == 0) complete(data->complete); + return rc; +} + +int rtas_suspend_cpu(struct rtas_suspend_me_data *data) +{ + return __rtas_suspend_cpu(data, 0); +} + +static void rtas_percpu_suspend_me(void *info) +{ + __rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1); +} + +enum rtas_cpu_state { + DOWN, + UP, +}; + +#ifndef CONFIG_SMP +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + if (!cpumask_empty(cpus)) { + cpumask_clear(cpus); + return -EINVAL; + } else + return 0; +} +#else +/* On return cpumask will be altered to indicate CPUs changed. + * CPUs with states changed will be set in the mask, + * CPUs with status unchanged will be unset in the mask. */ +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + int cpu; + int cpuret = 0; + int ret = 0; + + if (cpumask_empty(cpus)) + return 0; + + for_each_cpu(cpu, cpus) { + switch (state) { + case DOWN: + cpuret = cpu_down(cpu); + break; + case UP: + cpuret = cpu_up(cpu); + break; + } + if (cpuret) { + pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", + __func__, + ((state == UP) ? "up" : "down"), + cpu, cpuret); + if (!ret) + ret = cpuret; + if (state == UP) { + /* clear bits for unchanged cpus, return */ + cpumask_shift_right(cpus, cpus, cpu); + cpumask_shift_left(cpus, cpus, cpu); + break; + } else { + /* clear bit for unchanged cpu, continue */ + cpumask_clear_cpu(cpu, cpus); + } + } + } + + return ret; } +#endif + +int rtas_online_cpus_mask(cpumask_var_t cpus) +{ + int ret; + + ret = rtas_cpu_state_change_mask(UP, cpus); + + if (ret) { + cpumask_var_t tmp_mask; + + if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY)) + return ret; + + /* Use tmp_mask to preserve cpus mask from first failure */ + cpumask_copy(tmp_mask, cpus); + rtas_offline_cpus_mask(tmp_mask); + free_cpumask_var(tmp_mask); + } + + return ret; +} +EXPORT_SYMBOL(rtas_online_cpus_mask); + +int rtas_offline_cpus_mask(cpumask_var_t cpus) +{ + return rtas_cpu_state_change_mask(DOWN, cpus); +} +EXPORT_SYMBOL(rtas_offline_cpus_mask); static int rtas_ibm_suspend_me(struct rtas_args *args) { @@ -773,6 +914,8 @@ static int rtas_ibm_suspend_me(struct rt unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; struct rtas_suspend_me_data data; DECLARE_COMPLETION_ONSTACK(done); + cpumask_var_t offline_mask; + int cpuret; if (!rtas_service_present("ibm,suspend-me")) return -ENOSYS; @@ -796,12 +939,26 @@ static int rtas_ibm_suspend_me(struct rt return 0; } + if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + return -ENOMEM; + atomic_set(&data.working, 0); - atomic_set(&data.done, 0); - data.token = rtas_token("ibm,suspend-me"); + data.done = 0; data.error = 0; + data.token = rtas_token("ibm,suspend-me"); data.complete = &done; + /* All present CPUs must be online */ + cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); + cpuret = rtas_online_cpus_mask(offline_mask); + if (cpuret) { + pr_err("%s: Could not bring present CPUs online.\n", __func__); + data.error = cpuret; + goto out; + } + + stop_topology_update(); + /* Call function on all CPUs. 
One of us will make the * rtas call */ @@ -813,6 +970,16 @@ static int rtas_ibm_suspend_me(struct rt if (data.error != 0) printk(KERN_ERR "Error doing global join\n"); + start_topology_update(); + + /* Take down CPUs not online prior to suspend */ + cpuret = rtas_offline_cpus_mask(offline_mask); + if (cpuret) + pr_warn("%s: Could not restore CPUs to offline state.\n", + __func__); + +out: + free_cpumask_var(offline_mask); return data.error; } #else /* CONFIG_PPC_PSERIES */ @@ -822,6 +989,40 @@ static int rtas_ibm_suspend_me(struct rt } #endif +/** + * Find a specific pseries error log in an RTAS extended event log. + * @log: RTAS error/event log + * @section_id: two character section identifier + * + * Returns a pointer to the specified errorlog or NULL if not found. + */ +struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id) +{ + struct rtas_ext_event_log_v6 *ext_log = + (struct rtas_ext_event_log_v6 *)log->buffer; + struct pseries_errorlog *sect; + unsigned char *p, *log_end; + + /* Check that we understand the format */ + if (log->extended_log_length < sizeof(struct rtas_ext_event_log_v6) || + ext_log->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || + ext_log->company_id != RTAS_V6EXT_COMPANY_ID_IBM) + return NULL; + + log_end = log->buffer + log->extended_log_length; + p = ext_log->vendor_log; + + while (p < log_end) { + sect = (struct pseries_errorlog *)p; + if (sect->id == section_id) + return sect; + p += sect->length; + } + + return NULL; +} + asmlinkage int ppc_rtas(struct rtas_args __user *uargs) { struct rtas_args args; diff -uprN linux-2.6.32/arch/powerpc/kernel/rtas_flash.c linux-2.6.32.ovz/arch/powerpc/kernel/rtas_flash.c --- linux-2.6.32/arch/powerpc/kernel/rtas_flash.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/rtas_flash.c 2015-09-10 10:07:25.000000000 -0700 @@ -56,13 +56,31 @@ #define VALIDATE_READY -1001 /* Firmware image ready for validation */ #define VALIDATE_PARAM_ERR -3 /* RTAS Parameter Error */ #define VALIDATE_HW_ERR -1 /* RTAS Hardware Error */ -#define VALIDATE_TMP_UPDATE 0 /* Validate Return Status */ -#define VALIDATE_FLASH_AUTH 1 /* Validate Return Status */ -#define VALIDATE_INVALID_IMG 2 /* Validate Return Status */ -#define VALIDATE_CUR_UNKNOWN 3 /* Validate Return Status */ -#define VALIDATE_TMP_COMMIT_DL 4 /* Validate Return Status */ -#define VALIDATE_TMP_COMMIT 5 /* Validate Return Status */ -#define VALIDATE_TMP_UPDATE_DL 6 /* Validate Return Status */ + +/* ibm,validate-flash-image update result tokens */ +#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */ +#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replaced with new + * image, and the new image is downlevel from current image + */ +#define VALIDATE_TMP_COMMIT_DL 4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT 5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL 6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY 7 /* ibm,manage-flash-image operation tokens */ #define RTAS_REJECT_TMP_IMG 0 @@ -70,6 +88,7 @@ /* Array sizes */ #define
VALIDATE_BUF_SIZE 4096 +#define VALIDATE_MSG_LEN 256 #define RTAS_MSG_MAXLEN 64 /* Quirk - RTAS requires 4k list length and block size */ @@ -93,12 +112,8 @@ struct flash_block_list { struct flash_block_list *next; struct flash_block blocks[FLASH_BLOCKS_PER_NODE]; }; -struct flash_block_list_header { /* just the header of flash_block_list */ - unsigned long num_blocks; - struct flash_block_list *next; -}; -static struct flash_block_list_header rtas_firmware_flash_list = {0, NULL}; +static struct flash_block_list *rtas_firmware_flash_list; /* Use slab cache to guarantee 4k alignment */ static struct kmem_cache *flash_block_cache = NULL; @@ -107,13 +122,14 @@ static struct kmem_cache *flash_block_ca /* Local copy of the flash block list. * We only allow one open of the flash proc file and create this - * list as we go. This list will be put in the - * rtas_firmware_flash_list var once it is fully read. + * list as we go. The rtas_firmware_flash_list variable will be + * set once the data is fully read. * * For convenience as we build the list we use virtual addrs, * we do not fill in the version number, and the length field * is treated as the number of entries currently in the block - * (i.e. not a byte count). This is all fixed on release. + * (i.e. not a byte count). This is all fixed when calling + * the flash routine. */ /* Status int must be first member of struct */ @@ -200,16 +216,16 @@ static int rtas_flash_release(struct ino if (uf->flist) { /* File was opened in write mode for a new flash attempt */ /* Clear saved list */ - if (rtas_firmware_flash_list.next) { - free_flash_list(rtas_firmware_flash_list.next); - rtas_firmware_flash_list.next = NULL; + if (rtas_firmware_flash_list) { + free_flash_list(rtas_firmware_flash_list); + rtas_firmware_flash_list = NULL; } if (uf->status != FLASH_AUTH) uf->status = flash_list_valid(uf->flist); if (uf->status == FLASH_IMG_READY) - rtas_firmware_flash_list.next = uf->flist; + rtas_firmware_flash_list = uf->flist; else free_flash_list(uf->flist); @@ -285,12 +301,6 @@ static ssize_t rtas_flash_read(struct fi return msglen; } -/* constructor for flash_block_cache */ -void rtas_block_ctor(void *ptr) -{ - memset(ptr, 0, RTAS_BLK_SIZE); -} - /* We could be much more efficient here. But to keep this function * simple we allocate a page to the block list no matter how small the * count is.
If the system is low on memory it will be just as well @@ -315,7 +325,7 @@ static ssize_t rtas_flash_write(struct f * proc file */ if (uf->flist == NULL) { - uf->flist = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + uf->flist = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!uf->flist) return -ENOMEM; } @@ -326,7 +336,7 @@ static ssize_t rtas_flash_write(struct f next_free = fl->num_blocks; if (next_free == FLASH_BLOCKS_PER_NODE) { /* Need to allocate another block_list */ - fl->next = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + fl->next = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!fl->next) return -ENOMEM; fl = fl->next; @@ -335,7 +345,7 @@ static ssize_t rtas_flash_write(struct f if (count > RTAS_BLK_SIZE) count = RTAS_BLK_SIZE; - p = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + p = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!p) return -ENOMEM; @@ -474,7 +484,7 @@ static void validate_flash(struct rtas_v } static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, - char *msg) + char *msg, int msglen) { int n; @@ -482,7 +492,8 @@ static int get_validate_flash_msg(struct n = sprintf(msg, "%d\n", args_buf->update_results); if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) || (args_buf->update_results == VALIDATE_TMP_UPDATE)) - n += sprintf(msg + n, "%s\n", args_buf->buf); + n += snprintf(msg + n, msglen - n, "%s\n", + args_buf->buf); } else { n = sprintf(msg, "%d\n", args_buf->status); } @@ -494,7 +505,7 @@ static ssize_t validate_flash_read(struc { struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); struct rtas_validate_flash_t *args_buf; - char msg[RTAS_MSG_MAXLEN]; + char msg[VALIDATE_MSG_LEN]; int msglen; args_buf = (struct rtas_validate_flash_t *) dp->data; @@ -502,7 +513,7 @@ static ssize_t validate_flash_read(struc if (ppos && *ppos != 0) return 0; /* be cheap */ - msglen = get_validate_flash_msg(args_buf, msg); + msglen = get_validate_flash_msg(args_buf, msg, VALIDATE_MSG_LEN); if (msglen > count) msglen = count; @@ -592,7 +603,7 @@ static void rtas_flash_firmware(int rebo unsigned long rtas_block_list; int i, status, update_token; - if (rtas_firmware_flash_list.next == NULL) + if (rtas_firmware_flash_list == NULL) return; /* nothing to do */ if (reboot_type != SYS_RESTART) { @@ -609,20 +620,31 @@ static void rtas_flash_firmware(int rebo return; } - /* NOTE: the "first" block list is a global var with no data - * blocks in the kernel data segment. We do this because - * we want to ensure this block_list addr is under 4GB. + /* + * Just before starting the firmware flash, cancel the event scan work + * to avoid any soft lockup issues. */ - rtas_firmware_flash_list.num_blocks = 0; - flist = (struct flash_block_list *)&rtas_firmware_flash_list; + rtas_cancel_event_scan(); + + /* + * NOTE: the "first" block must be under 4GB, so we create + * an entry with no data blocks in the reserved buffer in + * the kernel data segment. + */ + spin_lock(&rtas_data_buf_lock); + flist = (struct flash_block_list *)&rtas_data_buf[0]; + flist->num_blocks = 0; + flist->next = rtas_firmware_flash_list; rtas_block_list = virt_to_abs(flist); if (rtas_block_list >= 4UL*1024*1024*1024) { printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n"); + spin_unlock(&rtas_data_buf_lock); return; } printk(KERN_ALERT "FLASH: preparing saved firmware image for flash\n"); /* Update the block_list in place. 
*/ + rtas_firmware_flash_list = NULL; /* too hard to back out on error */ image_size = 0; for (f = flist; f; f = next) { /* Translate data addrs to absolute */ @@ -663,6 +685,7 @@ static void rtas_flash_firmware(int rebo printk(KERN_ALERT "FLASH: unknown flash return code %d\n", status); break; } + spin_unlock(&rtas_data_buf_lock); } static void remove_flash_pde(struct proc_dir_entry *dp) @@ -798,7 +821,7 @@ static int __init rtas_flash_init(void) flash_block_cache = kmem_cache_create("rtas_flash_cache", RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0, - rtas_block_ctor); + NULL); if (!flash_block_cache) { printk(KERN_ERR "%s: failed to create block cache\n", __func__); @@ -820,6 +843,11 @@ static void __exit rtas_flash_cleanup(vo { rtas_flash_term_hook = NULL; + if (rtas_firmware_flash_list) { + free_flash_list(rtas_firmware_flash_list); + rtas_firmware_flash_list = NULL; + } + if (flash_block_cache) kmem_cache_destroy(flash_block_cache); diff -uprN linux-2.6.32/arch/powerpc/kernel/rtas_pci.c linux-2.6.32.ovz/arch/powerpc/kernel/rtas_pci.c --- linux-2.6.32/arch/powerpc/kernel/rtas_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/rtas_pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -276,7 +276,7 @@ void __init find_and_init_phbs(void) pci_devs_phb_init(); /* - * pci_probe_only and pci_assign_all_buses can be set via properties + * PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties * in chosen. */ if (of_chosen) { @@ -284,14 +284,18 @@ void __init find_and_init_phbs(void) prop = of_get_property(of_chosen, "linux,pci-probe-only", NULL); - if (prop) - pci_probe_only = *prop; + if (prop) { + if (*prop) + pci_add_flags(PCI_PROBE_ONLY); + else + pci_clear_flags(PCI_PROBE_ONLY); + } #ifdef CONFIG_PPC32 /* Will be made generic soon */ prop = of_get_property(of_chosen, "linux,pci-assign-all-buses", NULL); if (prop && *prop) - ppc_pci_flags |= PPC_PCI_REASSIGN_ALL_BUS; + pci_add_flags(PCI_REASSIGN_ALL_BUS); #endif /* CONFIG_PPC32 */ } } diff -uprN linux-2.6.32/arch/powerpc/kernel/setup_32.c linux-2.6.32.ovz/arch/powerpc/kernel/setup_32.c --- linux-2.6.32/arch/powerpc/kernel/setup_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/setup_32.c 2015-09-10 10:08:00.000000000 -0700 @@ -345,7 +345,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_SWIOTLB if (ppc_swiotlb_enable) - swiotlb_init(); + swiotlb_init(1); #endif paging_init(); diff -uprN linux-2.6.32/arch/powerpc/kernel/setup_64.c linux-2.6.32.ovz/arch/powerpc/kernel/setup_64.c --- linux-2.6.32/arch/powerpc/kernel/setup_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/setup_64.c 2015-09-10 10:08:00.000000000 -0700 @@ -96,7 +96,7 @@ int ucache_bsize; #ifdef CONFIG_SMP -static int smt_enabled_cmdline; +static char *smt_enabled_cmdline = NULL; /* Look for ibm,smt-enabled OF option */ static void check_smt_enabled(void) @@ -104,37 +104,46 @@ static void check_smt_enabled(void) struct device_node *dn; const char *smt_option; - /* Allow the command line to overrule the OF option */ - if (smt_enabled_cmdline) - return; - - dn = of_find_node_by_path("/options"); + /* Default to enabling all threads */ + smt_enabled_at_boot = threads_per_core; - if (dn) { - smt_option = of_get_property(dn, "ibm,smt-enabled", NULL); + /* Allow the command line to overrule the OF option */ + if (smt_enabled_cmdline) { + if (!strcmp(smt_enabled_cmdline, "on")) + smt_enabled_at_boot = threads_per_core; + else if (!strcmp(smt_enabled_cmdline, "off")) + smt_enabled_at_boot = 0;
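+ /* any other value is parsed as a numeric thread count below */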
+ else { + long smt; + int rc; + + rc = strict_strtol(smt_enabled_cmdline, 10, &smt); + if (!rc) + smt_enabled_at_boot = + min(threads_per_core, (int)smt); + } + } else { + dn = of_find_node_by_path("/options"); + if (dn) { + smt_option = of_get_property(dn, "ibm,smt-enabled", + NULL); + + if (smt_option) { + if (!strcmp(smt_option, "on")) + smt_enabled_at_boot = threads_per_core; + else if (!strcmp(smt_option, "off")) + smt_enabled_at_boot = 0; + } - if (smt_option) { - if (!strcmp(smt_option, "on")) - smt_enabled_at_boot = 1; - else if (!strcmp(smt_option, "off")) - smt_enabled_at_boot = 0; - } - } + of_node_put(dn); + } + } } /* Look for smt-enabled= cmdline option */ static int __init early_smt_enabled(char *p) { - smt_enabled_cmdline = 1; - - if (!p) - return 0; - - if (!strcmp(p, "on") || !strcmp(p, "1")) - smt_enabled_at_boot = 1; - else if (!strcmp(p, "off") || !strcmp(p, "0")) - smt_enabled_at_boot = 0; - + smt_enabled_cmdline = p; return 0; } early_param("smt-enabled", early_smt_enabled); @@ -398,8 +407,8 @@ void __init setup_system(void) */ xmon_setup(); - check_smt_enabled(); smp_setup_cpu_maps(); + check_smt_enabled(); #ifdef CONFIG_SMP /* Release secondary cpus out of their spinloops at 0x60 now that @@ -432,9 +441,18 @@ void __init setup_system(void) DBG(" <- setup_system()\n"); } +static u64 slb0_limit(void) +{ + if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) { + return 1UL << SID_SHIFT_1T; + } + return 1UL << SID_SHIFT; +} + #ifdef CONFIG_IRQSTACKS static void __init irqstack_early_init(void) { + u64 limit = slb0_limit(); unsigned int i; /* @@ -444,10 +462,10 @@ static void __init irqstack_early_init(v for_each_possible_cpu(i) { softirq_ctx[i] = (struct thread_info *) __va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); + THREAD_SIZE, limit)); hardirq_ctx[i] = (struct thread_info *) __va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); + THREAD_SIZE, limit)); } } #else @@ -478,7 +496,7 @@ static void __init exc_lvl_early_init(vo */ static void __init emergency_stack_init(void) { - unsigned long limit; + u64 limit; unsigned int i; /* @@ -490,7 +508,7 @@ static void __init emergency_stack_init( * bringup, we need to get at them in real mode. This means they * must also be within the RMO region. */ - limit = min(0x10000000ULL, lmb.rmo_size); + limit = min(slb0_limit(), lmb.rmo_size); for_each_possible_cpu(i) { unsigned long sp; @@ -550,7 +568,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_SWIOTLB if (ppc_swiotlb_enable) - swiotlb_init(); + swiotlb_init(1); #endif paging_init(); diff -uprN linux-2.6.32/arch/powerpc/kernel/setup-common.c linux-2.6.32.ovz/arch/powerpc/kernel/setup-common.c --- linux-2.6.32/arch/powerpc/kernel/setup-common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/setup-common.c 2015-09-10 10:07:15.000000000 -0700 @@ -60,6 +60,7 @@ #include #include #include +#include #include "setup.h" @@ -102,6 +103,14 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs); /* also used by kexec */ void machine_shutdown(void) { +#ifdef CONFIG_FA_DUMP + /* + * If fadump is active, clean up the fadump registration before we + * shut down. + */ + fadump_cleanup(); +#endif + if (ppc_md.machine_shutdown) ppc_md.machine_shutdown(); } @@ -603,6 +612,11 @@ EXPORT_SYMBOL(check_legacy_ioport); static int ppc_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything else.
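+ * crash_fadump() simply returns when no dump has been registered, + * so the ppc_md.panic() call below still runs in that case.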
+ */ + crash_fadump(NULL, ptr); ppc_md.panic(ptr); /* May not return */ return NOTIFY_DONE; } diff -uprN linux-2.6.32/arch/powerpc/kernel/signal_32.c linux-2.6.32.ovz/arch/powerpc/kernel/signal_32.c --- linux-2.6.32/arch/powerpc/kernel/signal_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/signal_32.c 2015-09-10 10:07:39.000000000 -0700 @@ -444,6 +444,12 @@ static int save_user_regs(struct pt_regs #endif /* CONFIG_ALTIVEC */ if (copy_fpr_to_user(&frame->mc_fregs, current)) return 1; + + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; #ifdef CONFIG_VSX /* * Copy VSR 0-31 upper half from thread_struct to local diff -uprN linux-2.6.32/arch/powerpc/kernel/signal_64.c linux-2.6.32.ovz/arch/powerpc/kernel/signal_64.c --- linux-2.6.32/arch/powerpc/kernel/signal_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/signal_64.c 2015-09-10 10:07:39.000000000 -0700 @@ -116,6 +116,12 @@ static long setup_sigcontext(struct sigc flush_fp_to_thread(current); /* copy fpr regs and fpscr */ err |= copy_fpr_to_user(&sc->fp_regs, current); + + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; #ifdef CONFIG_VSX /* * Copy VSX low doubleword to local buffer for formatting, diff -uprN linux-2.6.32/arch/powerpc/kernel/signal.c linux-2.6.32.ovz/arch/powerpc/kernel/signal.c --- linux-2.6.32/arch/powerpc/kernel/signal.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/signal.c 2015-09-10 10:06:57.000000000 -0700 @@ -196,6 +196,8 @@ void do_signal(struct pt_regs *regs, uns if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff -uprN linux-2.6.32/arch/powerpc/kernel/smp.c linux-2.6.32.ovz/arch/powerpc/kernel/smp.c --- linux-2.6.32/arch/powerpc/kernel/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/smp.c 2015-09-10 10:07:26.000000000 -0700 @@ -49,6 +49,7 @@ #ifdef CONFIG_PPC64 #include #endif +#include #ifdef DEBUG #include @@ -98,6 +99,7 @@ void smp_message_recv(int msg) break; case PPC_MSG_RESCHEDULE: /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -127,6 +129,7 @@ static irqreturn_t call_function_action( static irqreturn_t reschedule_action(int irq, void *data) { + scheduler_ipi(); /* we just need the return path side effect of checking need_resched */ return IRQ_HANDLED; } @@ -501,11 +504,8 @@ int __devinit start_secondary(void *unus if (smp_ops->take_timebase) smp_ops->take_timebase(); - if (system_state > SYSTEM_BOOTING) - snapshot_timebase(); - secondary_cpu_time_init(); - + vdso_getcpu_init(); ipi_call_lock(); notify_cpu_starting(cpu); set_cpu_online(cpu, true); @@ -565,8 +565,6 @@ void __init smp_cpus_done(unsigned int m set_cpus_allowed(current, old_mask); - snapshot_timebases(); - dump_numa_cpu_topology(); } diff -uprN linux-2.6.32/arch/powerpc/kernel/syscalls.c linux-2.6.32.ovz/arch/powerpc/kernel/syscalls.c --- linux-2.6.32/arch/powerpc/kernel/syscalls.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/syscalls.c 2015-09-10 10:05:47.000000000 -0700 @@ -140,7 +140,6 @@ static inline 
unsigned long do_mmap2(uns unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - struct file * file = NULL; unsigned long ret = -EINVAL; if (!arch_validate_prot(prot)) @@ -151,20 +150,8 @@ static inline unsigned long do_mmap2(uns goto out; off >>= shift; } - - ret = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - if (!(file = fget(fd))) - goto out; - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, off); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off); out: return ret; } diff -uprN linux-2.6.32/arch/powerpc/kernel/sysfs.c linux-2.6.32.ovz/arch/powerpc/kernel/sysfs.c --- linux-2.6.32/arch/powerpc/kernel/sysfs.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/sysfs.c 2015-09-10 10:05:44.000000000 -0700 @@ -461,6 +461,25 @@ static void unregister_cpu_online(unsign cacheinfo_cpu_offline(cpu); } + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE +ssize_t arch_cpu_probe(const char *buf, size_t count) +{ + if (ppc_md.cpu_probe) + return ppc_md.cpu_probe(buf, count); + + return -EINVAL; +} + +ssize_t arch_cpu_release(const char *buf, size_t count) +{ + if (ppc_md.cpu_release) + return ppc_md.cpu_release(buf, count); + + return -EINVAL; +} +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ + #endif /* CONFIG_HOTPLUG_CPU */ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, diff -uprN linux-2.6.32/arch/powerpc/kernel/sys_ppc32.c linux-2.6.32.ovz/arch/powerpc/kernel/sys_ppc32.c --- linux-2.6.32/arch/powerpc/kernel/sys_ppc32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/sys_ppc32.c 2015-09-10 10:07:24.000000000 -0700 @@ -191,7 +191,7 @@ long compat_sys_execve(unsigned long a0, struct pt_regs *regs) { int error; - char * filename; + struct filename *filename; filename = getname((char __user *) a0); error = PTR_ERR(filename); @@ -200,7 +200,7 @@ long compat_sys_execve(unsigned long a0, flush_fp_to_thread(current); flush_altivec_to_thread(current); - error = compat_do_execve(filename, compat_ptr(a1), compat_ptr(a2), regs); + error = compat_do_execve(filename->name, compat_ptr(a1), compat_ptr(a2), regs); putname(filename); diff -uprN linux-2.6.32/arch/powerpc/kernel/time.c linux-2.6.32.ovz/arch/powerpc/kernel/time.c --- linux-2.6.32/arch/powerpc/kernel/time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/time.c 2015-09-10 10:06:57.000000000 -0700 @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -186,6 +186,8 @@ u64 __cputime_jiffies_factor; EXPORT_SYMBOL(__cputime_jiffies_factor); u64 __cputime_msec_factor; EXPORT_SYMBOL(__cputime_msec_factor); +u64 __cputime_usec_factor; +EXPORT_SYMBOL(__cputime_usec_factor); u64 __cputime_sec_factor; EXPORT_SYMBOL(__cputime_sec_factor); u64 __cputime_clockt_factor; @@ -195,14 +197,16 @@ DEFINE_PER_CPU(unsigned long, cputime_sc cputime_t cputime_one_jiffy; +void (*dtl_consumer)(struct dtl_entry *, u64); + static void calc_cputime_factors(void) { struct div_result res; div128_by_32(HZ, 0, tb_ticks_per_sec, &res); __cputime_jiffies_factor = res.result_low; - div128_by_32(1000, 0, tb_ticks_per_sec, &res); - __cputime_msec_factor = res.result_low; + div128_by_32(1000000, 0, tb_ticks_per_sec, &res); + __cputime_usec_factor = res.result_low; div128_by_32(1, 0, tb_ticks_per_sec, &res); __cputime_sec_factor = res.result_low; div128_by_32(USER_HZ, 0, tb_ticks_per_sec, 
&res); @@ -210,62 +214,156 @@ static void calc_cputime_factors(void) } /* - * Read the PURR on systems that have it, otherwise the timebase. + * Read the SPURR on systems that have it, otherwise the PURR, + * or if that doesn't exist return the timebase value passed in. */ -static u64 read_purr(void) +static u64 read_spurr(u64 tb) { + if (cpu_has_feature(CPU_FTR_SPURR)) + return mfspr(SPRN_SPURR); if (cpu_has_feature(CPU_FTR_PURR)) return mfspr(SPRN_PURR); - return mftb(); + return tb; } +#ifdef CONFIG_PPC_SPLPAR + /* - * Read the SPURR on systems that have it, otherwise the purr + * Scan the dispatch trace log and count up the stolen time. + * Should be called with interrupts disabled. */ -static u64 read_spurr(u64 purr) +static u64 scan_dispatch_log(u64 stop_tb) { - /* - * cpus without PURR won't have a SPURR - * We already know the former when we use this, so tell gcc - */ - if (cpu_has_feature(CPU_FTR_PURR) && cpu_has_feature(CPU_FTR_SPURR)) - return mfspr(SPRN_SPURR); - return purr; + u64 i = local_paca->dtl_ridx; + struct dtl_entry *dtl = local_paca->dtl_curr; + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + u64 tb_delta; + u64 stolen = 0; + u64 dtb; + + if (!dtl) + return 0; + + if (i == vpa->dtl_idx) + return 0; + while (i < vpa->dtl_idx) { + if (dtl_consumer) + dtl_consumer(dtl, i); + dtb = dtl->timebase; + tb_delta = dtl->enqueue_to_dispatch_time + + dtl->ready_to_enqueue_time; + barrier(); + if (i + N_DISPATCH_LOG < vpa->dtl_idx) { + /* buffer has overflowed */ + i = vpa->dtl_idx - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + if (dtb > stop_tb) + break; + stolen += tb_delta; + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + local_paca->dtl_ridx = i; + local_paca->dtl_curr = dtl; + return stolen; } /* + * Accumulate stolen time by scanning the dispatch trace log. + * Called on entry from user mode. + */ +void accumulate_stolen_time(void) +{ + u64 sst, ust; + + sst = scan_dispatch_log(get_paca()->starttime_user); + ust = scan_dispatch_log(get_paca()->starttime); + get_paca()->system_time -= sst; + get_paca()->user_time -= ust; + get_paca()->stolen_time += ust + sst; +} + +static inline u64 calculate_stolen_time(u64 stop_tb) +{ + u64 stolen = 0; + + if (get_paca()->dtl_ridx != get_paca()->lppaca_ptr->dtl_idx) { + stolen = scan_dispatch_log(stop_tb); + get_paca()->system_time -= stolen; + } + + stolen += get_paca()->stolen_time; + get_paca()->stolen_time = 0; + return stolen; +} + +#else /* CONFIG_PPC_SPLPAR */ +static inline u64 calculate_stolen_time(u64 stop_tb) +{ + return 0; +} + +#endif /* CONFIG_PPC_SPLPAR */ + +/* * Account time for a transition between system, hard irq * or soft irq state. */ void account_system_vtime(struct task_struct *tsk) { - u64 now, nowscaled, delta, deltascaled, sys_time; + u64 now, nowscaled, delta, deltascaled; unsigned long flags; + u64 stolen, udelta, sys_scaled, user_scaled; local_irq_save(flags); - now = read_purr(); + now = mftb(); nowscaled = read_spurr(now); - delta = now - get_paca()->startpurr; + get_paca()->system_time += now - get_paca()->starttime; + get_paca()->starttime = now; deltascaled = nowscaled - get_paca()->startspurr; - get_paca()->startpurr = now; get_paca()->startspurr = nowscaled; - if (!in_interrupt()) { - /* deltascaled includes both user and system time. 
- * Hence scale it based on the purr ratio to estimate - * the system time */ - sys_time = get_paca()->system_time; - if (get_paca()->user_time) - deltascaled = deltascaled * sys_time / - (sys_time + get_paca()->user_time); - delta += sys_time; - get_paca()->system_time = 0; + + stolen = calculate_stolen_time(now); + + delta = get_paca()->system_time; + get_paca()->system_time = 0; + udelta = get_paca()->user_time - get_paca()->utime_sspurr; + get_paca()->utime_sspurr = get_paca()->user_time; + + /* + * Because we don't read the SPURR on every kernel entry/exit, + * deltascaled includes both user and system SPURR ticks. + * Apportion these ticks to system SPURR ticks and user + * SPURR ticks in the same ratio as the system time (delta) + * and user time (udelta) values obtained from the timebase + * over the same interval. The system ticks get accounted here; + * the user ticks get saved up in paca->user_time_scaled to be + * used by account_process_tick. + */ + sys_scaled = delta; + user_scaled = udelta; + if (deltascaled != delta + udelta) { + if (udelta) { + sys_scaled = deltascaled * delta / (delta + udelta); + user_scaled = deltascaled - sys_scaled; + } else { + sys_scaled = deltascaled; + } + } + get_paca()->user_time_scaled += user_scaled; + + if (in_interrupt() || idle_task(smp_processor_id()) != tsk) { + account_system_time(tsk, 0, delta, sys_scaled); + if (stolen) + account_steal_time(stolen); + } else { + account_idle_time(delta + stolen); } - if (in_irq() || idle_task(smp_processor_id()) != tsk) - account_system_time(tsk, 0, delta, deltascaled); - else - account_idle_time(delta); - per_cpu(cputime_last_delta, smp_processor_id()) = delta; - per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled; local_irq_restore(flags); } @@ -274,125 +372,26 @@ void account_system_vtime(struct task_st * by the exception entry and exit code to the generic process * user and system time records. * Must be called with interrupts disabled. + * Assumes that account_system_vtime() has been called recently + * (i.e. since the last entry from usermode) so that + * get_paca()->user_time_scaled is up to date. */ void account_process_tick(struct task_struct *tsk, int user_tick) { cputime_t utime, utimescaled; utime = get_paca()->user_time; + utimescaled = get_paca()->user_time_scaled; get_paca()->user_time = 0; - utimescaled = cputime_to_scaled(utime); + get_paca()->user_time_scaled = 0; + get_paca()->utime_sspurr = 0; account_user_time(tsk, utime, utimescaled); } -/* - * Stuff for accounting stolen time. - */ -struct cpu_purr_data { - int initialized; /* thread is running */ - u64 tb; /* last TB value read */ - u64 purr; /* last PURR value read */ - u64 spurr; /* last SPURR value read */ -}; - -/* - * Each entry in the cpu_purr_data array is manipulated only by its - * "owner" cpu -- usually in the timer interrupt but also occasionally - * in process context for cpu online. As long as cpus do not touch - * each others' cpu_purr_data, disabling local interrupts is - * sufficient to serialize accesses. - */ -static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data); - -static void snapshot_tb_and_purr(void *data) -{ - unsigned long flags; - struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data); - - local_irq_save(flags); - p->tb = get_tb_or_rtc(); - p->purr = mfspr(SPRN_PURR); - wmb(); - p->initialized = 1; - local_irq_restore(flags); -} - -/* - * Called during boot when all cpus have come up. 
- */ -void snapshot_timebases(void) -{ - if (!cpu_has_feature(CPU_FTR_PURR)) - return; - on_each_cpu(snapshot_tb_and_purr, NULL, 1); -} - -/* - * Must be called with interrupts disabled. - */ -void calculate_steal_time(void) -{ - u64 tb, purr; - s64 stolen; - struct cpu_purr_data *pme; - - pme = &__get_cpu_var(cpu_purr_data); - if (!pme->initialized) - return; /* !CPU_FTR_PURR or early in early boot */ - tb = mftb(); - purr = mfspr(SPRN_PURR); - stolen = (tb - pme->tb) - (purr - pme->purr); - if (stolen > 0) { - if (idle_task(smp_processor_id()) != current) - account_steal_time(stolen); - else - account_idle_time(stolen); - } - pme->tb = tb; - pme->purr = purr; -} - -#ifdef CONFIG_PPC_SPLPAR -/* - * Must be called before the cpu is added to the online map when - * a cpu is being brought up at runtime. - */ -static void snapshot_purr(void) -{ - struct cpu_purr_data *pme; - unsigned long flags; - - if (!cpu_has_feature(CPU_FTR_PURR)) - return; - local_irq_save(flags); - pme = &__get_cpu_var(cpu_purr_data); - pme->tb = mftb(); - pme->purr = mfspr(SPRN_PURR); - pme->initialized = 1; - local_irq_restore(flags); -} - -#endif /* CONFIG_PPC_SPLPAR */ - #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ #define calc_cputime_factors() -#define calculate_steal_time() do { } while (0) #endif -#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR)) -#define snapshot_purr() do { } while (0) -#endif - -/* - * Called when a cpu comes up after the system has finished booting, - * i.e. as a result of a hotplug cpu action. - */ -void snapshot_timebase(void) -{ - __get_cpu_var(last_jiffy) = get_tb_or_rtc(); - snapshot_purr(); -} - void __delay(unsigned long loops) { unsigned long start; @@ -422,7 +421,9 @@ void udelay(unsigned long usecs) EXPORT_SYMBOL(udelay); static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) + u64 new_tb_to_xs, struct timespec *now, + struct timespec *wall_time, + u32 frac_sec) { /* * tb_update_count is used to allow the userspace gettimeofday code @@ -438,9 +439,10 @@ static inline void update_gtod(u64 new_t vdso_data->tb_orig_stamp = new_tb_stamp; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; - vdso_data->stamp_xtime = xtime; + vdso_data->wtom_clock_sec = now->tv_sec; + vdso_data->wtom_clock_nsec = now->tv_nsec; + vdso_data->stamp_xtime = *wall_time; + vdso_data->stamp_sec_fraction = frac_sec; smp_wmb(); ++(vdso_data->tb_update_count); } @@ -530,25 +532,60 @@ void __init iSeries_time_init_early(void } #endif /* CONFIG_PPC_ISERIES */ -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32) -DEFINE_PER_CPU(u8, perf_event_pending); +#ifdef CONFIG_IRQ_WORK -void set_perf_event_pending(void) +/* + * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... 
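+ * On 64-bit, r13 always holds the PACA pointer, so the helpers below + * can test and update the flag with a single lbz/stb relative to r13.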
+ */ +#ifdef CONFIG_PPC64 +static inline unsigned long test_irq_work_pending(void) { - get_cpu_var(perf_event_pending) = 1; - set_dec(1); - put_cpu_var(perf_event_pending); + unsigned long x; + + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, irq_work_pending))); + return x; } -#define test_perf_event_pending() __get_cpu_var(perf_event_pending) -#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 +static inline void set_irq_work_pending_flag(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (1), + "i" (offsetof(struct paca_struct, irq_work_pending))); +} -#else /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ +static inline void clear_irq_work_pending(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (0), + "i" (offsetof(struct paca_struct, irq_work_pending))); +} -#define test_perf_event_pending() 0 -#define clear_perf_event_pending() +#else /* 32-bit */ -#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ +DEFINE_PER_CPU(u8, irq_work_pending); + +#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1 +#define test_irq_work_pending() __get_cpu_var(irq_work_pending) +#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 + +#endif /* 32 vs 64 bit */ + +void set_irq_work_pending(void) +{ + preempt_disable(); + set_irq_work_pending_flag(); + set_dec(1); + preempt_enable(); +} + +#else /* CONFIG_IRQ_WORK */ + +#define test_irq_work_pending() 0 +#define clear_irq_work_pending() + +#endif /* CONFIG_IRQ_WORK */ /* * For iSeries shared processors, we have to let the hypervisor @@ -576,10 +613,6 @@ void timer_interrupt(struct pt_regs * re set_dec(DECREMENTER_MAX); #ifdef CONFIG_PPC32 - if (test_perf_event_pending()) { - clear_perf_event_pending(); - perf_event_do_pending(); - } if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); #endif @@ -595,7 +628,10 @@ void timer_interrupt(struct pt_regs * re old_regs = set_irq_regs(regs); irq_enter(); - calculate_steal_time(); + if (test_irq_work_pending()) { + clear_irq_work_pending(); + irq_work_run(); + } #ifdef CONFIG_PPC_ISERIES if (firmware_has_feature(FW_FEATURE_ISERIES)) @@ -828,9 +864,11 @@ static cycle_t timebase_read(struct cloc return (cycle_t)get_tb(); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { u64 t2x, stamp_xsec; + u32 frac_sec; if (clock != &clocksource_timebase) return; @@ -841,11 +879,18 @@ void update_vsyscall(struct timespec *wa /* XXX this assumes clock->shift == 22 */ /* 4611686018 ~= 2^(20+64-22) / 1e9 */ - t2x = (u64) clock->mult * 4611686018ULL; - stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; + t2x = (u64) mult * 4611686018ULL; + stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; do_div(stamp_xsec, 1000000000); - stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - update_gtod(clock->cycle_last, stamp_xsec, t2x); + stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + + if (wall_time->tv_nsec >= 1000000000) { + ++wall_time->tv_sec; + wall_time->tv_nsec -= 1000000000; + } + /* this is tv_nsec / 1e9 as a 0.32 fraction */ + frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; + update_gtod(clock->cycle_last, stamp_xsec, t2x, wtm, wall_time, frac_sec); } void update_vsyscall_tz(void) @@ -1032,8 +1077,6 @@ void __init time_init(void) /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ boot_tb = get_tb_or_rtc(); - write_seqlock_irqsave(&xtime_lock, flags); - /* If platform provided a 
timezone (pmac), we correct the time */ if (timezone_offset) { sys_tz.tz_minuteswest = -timezone_offset / 60; @@ -1043,11 +1086,9 @@ void __init time_init(void) vdso_data->tb_orig_stamp = tb_last_jiffy; vdso_data->tb_update_count = 0; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; + vdso_data->stamp_xsec = (u64) get_seconds() * XSEC_PER_SEC; vdso_data->tb_to_xs = tb_to_xs; - write_sequnlock_irqrestore(&xtime_lock, flags); - /* Start the decrementer on CPUs that have manual control * such as BookE */ diff -uprN linux-2.6.32/arch/powerpc/kernel/traps.c linux-2.6.32.ovz/arch/powerpc/kernel/traps.c --- linux-2.6.32/arch/powerpc/kernel/traps.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/traps.c 2015-09-10 10:07:15.000000000 -0700 @@ -58,6 +58,7 @@ #ifdef CONFIG_FSL_BOOKE #include #endif +#include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) int (*__debugger)(struct pt_regs *regs); @@ -157,6 +158,8 @@ int die(const char *str, struct pt_regs add_taint(TAINT_DIE); spin_unlock_irqrestore(&die.lock, flags); + crash_fadump(regs, str); + if (kexec_should_crash(current) || kexec_sr_activated(smp_processor_id())) crash_kexec(regs); @@ -174,6 +177,15 @@ int die(const char *str, struct pt_regs return 0; } +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, siginfo_t *info) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = TRAP_TRACE; + info->si_addr = (void __user *)regs->nip; +} + void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) { siginfo_t info; diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/getcpu.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/getcpu.S --- linux-2.6.32/arch/powerpc/kernel/vdso32/getcpu.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/getcpu.S 2015-09-10 10:07:26.000000000 -0700 @@ -0,0 +1,45 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include +#include + + .text +/* + * Exact prototype of getcpu + * + * int __kernel_getcpu(unsigned *cpu, unsigned *node); + * + */ +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + mfspr r5,SPRN_USPRG3 + cmpdi cr0,r3,0 + cmpdi cr1,r4,0 + clrlwi r6,r5,16 + rlwinm r7,r5,16,31-15,31-0 + beq cr0,1f + stw r6,0(r3) +1: beq cr1,2f + stw r7,0(r4) +2: crclr cr0*4+so + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/gettimeofday.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/gettimeofday.S --- linux-2.6.32/arch/powerpc/kernel/vdso32/gettimeofday.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/gettimeofday.S 2015-09-10 10:06:08.000000000 -0700 @@ -19,8 +19,10 @@ /* Offset for the low 32-bit part of a field of long type */ #ifdef CONFIG_PPC64 #define LOPART 4 +#define TSPEC_TV_SEC TSPC64_TV_SEC+LOPART #else #define LOPART 0 +#define TSPEC_TV_SEC TSPC32_TV_SEC #endif .text @@ -41,23 +43,11 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) mr r9, r3 /* datapage ptr in r9 */ cmplwi r10,0 /* check if tv is NULL */ beq 3f - bl __do_get_xsec@local /* get xsec from tb & kernel */ - bne- 2f /* out of line -> do syscall */ - - /* seconds are xsec >> 20 */ - rlwinm r5,r4,12,20,31 - rlwimi r5,r3,12,0,19 - stw r5,TVAL32_TV_SEC(r10) - - /* get remaining xsec and convert to usec. we scale - * up remaining xsec by 12 bits and get the top 32 bits - * of the multiplication - */ - rlwinm r5,r4,12,0,19 - lis r6,1000000@h - ori r6,r6,1000000@l - mulhwu r5,r5,r6 - stw r5,TVAL32_TV_USEC(r10) + lis r7,1000000@ha /* load up USEC_PER_SEC */ + addi r7,r7,1000000@l /* so we get microseconds in r4 */ + bl __do_get_tspec@local /* get sec/usec from tb & kernel */ + stw r3,TVAL32_TV_SEC(r10) + stw r4,TVAL32_TV_USEC(r10) 3: cmplwi r11,0 /* check if tz is NULL */ beq 1f @@ -70,14 +60,6 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) crclr cr0*4+so li r3,0 blr - -2: - mtlr r12 - mr r3,r10 - mr r4,r11 - li r0,__NR_gettimeofday - sc - blr .cfi_endproc V_FUNCTION_END(__kernel_gettimeofday) @@ -100,7 +82,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mr r11,r4 /* r11 saves tp */ bl __get_datapage@local /* get data page */ mr r9,r3 /* datapage ptr in r9 */ - + lis r7,NSEC_PER_SEC@h /* want nanoseconds */ + ori r7,r7,NSEC_PER_SEC@l 50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ bne cr1,80f /* not monotonic -> all done */ @@ -198,83 +181,12 @@ V_FUNCTION_END(__kernel_clock_getres) /* - * This is the core of gettimeofday() & friends, it returns the xsec - * value in r3 & r4 and expects the datapage ptr (non clobbered) - * in r9. clobbers r0,r4,r5,r6,r7,r8. - * When returning, r8 contains the counter value that can be reused - * by the monotonic clock implementation - */ -__do_get_xsec: - .cfi_startproc - /* Check for update count & load values. We use the low - * order 32 bits of the update count - */ -1: lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - andi. r0,r8,1 /* pending update ? loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r9,r9,r0 - - /* Load orig stamp (offset to TB) */ - lwz r5,CFG_TB_ORIG_STAMP(r9) - lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) - - /* Get a stable TB value */ -2: mftbu r3 - mftbl r4 - mftbu r0 - cmpl cr0,r3,r0 - bne- 2b - - /* Substract tb orig stamp. If the high part is non-zero, we jump to - * the slow path which call the syscall. 
- * If it's ok, then we have our 32 bits tb_ticks value in r7 - */ - subfc r7,r6,r4 - subfe. r0,r5,r3 - bne- 3f - - /* Load scale factor & do multiplication */ - lwz r5,CFG_TB_TO_XS(r9) /* load values */ - lwz r6,(CFG_TB_TO_XS+4)(r9) - mulhwu r4,r7,r5 - mulhwu r6,r7,r6 - mullw r0,r7,r5 - addc r6,r6,r0 - - /* At this point, we have the scaled xsec value in r4 + XER:CA - * we load & add the stamp since epoch - */ - lwz r5,CFG_STAMP_XSEC(r9) - lwz r6,(CFG_STAMP_XSEC+4)(r9) - adde r4,r4,r6 - addze r3,r5 - - /* We now have our result in r3,r4. We create a fake dependency - * on that result and re-check the counter - */ - or r6,r4,r3 - xor r0,r6,r6 - add r9,r9,r0 - lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - cmpl cr0,r8,r0 /* check if updated */ - bne- 1b - - /* Warning ! The caller expects CR:EQ to be set to indicate a - * successful calculation (so it won't fallback to the syscall - * method). We have overriden that CR bit in the counter check, - * but fortunately, the loop exit condition _is_ CR:EQ set, so - * we can exit safely here. If you change this code, be careful - * of that side effect. - */ -3: blr - .cfi_endproc - -/* - * This is the core of clock_gettime(), it returns the current - * time in seconds and nanoseconds in r3 and r4. + * This is the core of clock_gettime() and gettimeofday(), + * it returns the current time in r3 (seconds) and r4. + * On entry, r7 gives the resolution of r4, either USEC_PER_SEC + * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds. * It expects the datapage ptr in r9 and doesn't clobber it. - * It clobbers r0, r5, r6, r10 and returns NSEC_PER_SEC in r7. + * It clobbers r0, r5 and r6. * On return, r8 contains the counter value that can be reused. * This clobbers cr0 but not any other cr field. */ @@ -297,70 +209,58 @@ __do_get_tspec: 2: mftbu r3 mftbl r4 mftbu r0 - cmpl cr0,r3,r0 + cmplw cr0,r3,r0 bne- 2b /* Subtract tb orig stamp and shift left 12 bits. */ - subfc r7,r6,r4 + subfc r4,r6,r4 subfe r0,r5,r3 slwi r0,r0,12 - rlwimi. r0,r7,12,20,31 - slwi r7,r7,12 + rlwimi. r0,r4,12,20,31 + slwi r4,r4,12 - /* Load scale factor & do multiplication */ + /* + * Load scale factor & do multiplication. + * We only use the high 32 bits of the tb_to_xs value. + * Even with a 1GHz timebase clock, the high 32 bits of + * tb_to_xs will be at least 4 million, so the error from + * ignoring the low 32 bits will be no more than 0.25ppm. + * The error will just make the clock run very very slightly + * slow until the next time the kernel updates the VDSO data, + * at which point the clock will catch up to the kernel's value, + * so there is no long-term error accumulation. + */ lwz r5,CFG_TB_TO_XS(r9) /* load values */ - lwz r6,(CFG_TB_TO_XS+4)(r9) - mulhwu r3,r7,r6 - mullw r10,r7,r5 - mulhwu r4,r7,r5 - addc r10,r3,r10 + mulhwu r4,r4,r5 li r3,0 beq+ 4f /* skip high part computation if 0 */ mulhwu r3,r0,r5 - mullw r7,r0,r5 - mulhwu r5,r0,r6 - mullw r6,r0,r6 - adde r4,r4,r7 - addze r3,r3 + mullw r5,r0,r5 addc r4,r4,r5 addze r3,r3 - addc r10,r10,r6 - -4: addze r4,r4 /* add in carry */ - lis r7,NSEC_PER_SEC@h - ori r7,r7,NSEC_PER_SEC@l - mulhwu r4,r4,r7 /* convert to nanoseconds */ - - /* At this point, we have seconds & nanoseconds since the xtime - * stamp in r3+CA and r4. Load & add the xtime stamp. 
- */ -#ifdef CONFIG_PPC64 - lwz r5,STAMP_XTIME+TSPC64_TV_SEC+LOPART(r9) - lwz r6,STAMP_XTIME+TSPC64_TV_NSEC+LOPART(r9) -#else - lwz r5,STAMP_XTIME+TSPC32_TV_SEC(r9) - lwz r6,STAMP_XTIME+TSPC32_TV_NSEC(r9) -#endif - add r4,r4,r6 +4: + /* At this point, we have seconds since the xtime stamp + * as a 32.32 fixed-point number in r3 and r4. + * Load & add the xtime stamp. + */ + lwz r5,STAMP_XTIME+TSPEC_TV_SEC(r9) + lwz r6,STAMP_SEC_FRAC(r9) + addc r4,r4,r6 adde r3,r3,r5 - /* We now have our result in r3,r4. We create a fake dependency - * on that result and re-check the counter + /* We create a fake dependency on the result in r3/r4 + * and re-check the counter */ or r6,r4,r3 xor r0,r6,r6 add r9,r9,r0 lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - cmpl cr0,r8,r0 /* check if updated */ + cmplw cr0,r8,r0 /* check if updated */ bne- 1b - /* check for nanosecond overflow and adjust if necessary */ - cmpw r4,r7 - bltlr /* all done if no overflow */ - subf r4,r7,r4 /* adjust if overflow */ - addi r3,r3,1 + mulhwu r4,r4,r7 /* convert to micro or nanoseconds */ blr .cfi_endproc diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/Makefile linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/Makefile --- linux-2.6.32/arch/powerpc/kernel/vdso32/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/Makefile 2015-09-10 10:07:26.000000000 -0700 @@ -1,7 +1,9 @@ # List of files in the vdso, has to be asm only for now -obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o +obj-vdso32-$(CONFIG_PPC64) = getcpu.o +obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \ + $(obj-vdso32-y) # Build rules @@ -41,7 +43,8 @@ $(obj-vdso32): %.o: %.S # actual build commands quiet_cmd_vdso32ld = VDSO32L $@ - cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ + cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ \ + $(if $(AFTER_LINK),; $(AFTER_LINK)) quiet_cmd_vdso32as = VDSO32A $@ cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $< diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/vdso32.lds.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/vdso32.lds.S --- linux-2.6.32/arch/powerpc/kernel/vdso32/vdso32.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/vdso32.lds.S 2015-09-10 10:07:26.000000000 -0700 @@ -147,6 +147,9 @@ VERSION __kernel_sync_dicache_p5; __kernel_sigtramp32; __kernel_sigtramp_rt32; +#ifdef CONFIG_PPC64 + __kernel_getcpu; +#endif local: *; }; diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/getcpu.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/getcpu.S --- linux-2.6.32/arch/powerpc/kernel/vdso64/getcpu.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/getcpu.S 2015-09-10 10:07:26.000000000 -0700 @@ -0,0 +1,45 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include +#include + + .text +/* + * Exact prototype of getcpu + * + * int __kernel_getcpu(unsigned *cpu, unsigned *node); + * + */ +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + mfspr r5,SPRN_USPRG3 + cmpdi cr0,r3,0 + cmpdi cr1,r4,0 + clrlwi r6,r5,16 + rlwinm r7,r5,16,31-15,31-0 + beq cr0,1f + stw r6,0(r3) +1: beq cr1,2f + stw r7,0(r4) +2: crclr cr0*4+so + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/gettimeofday.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/gettimeofday.S --- linux-2.6.32/arch/powerpc/kernel/vdso64/gettimeofday.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/gettimeofday.S 2015-09-10 10:06:08.000000000 -0700 @@ -33,18 +33,11 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) bl V_LOCAL_FUNC(__get_datapage) /* get data page */ cmpldi r11,0 /* check if tv is NULL */ beq 2f - bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ - ori r7,r7,16960 - rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ - rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ - std r5,TVAL64_TV_SEC(r11) /* store sec in tv */ - subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / - * XSEC_PER_SEC - */ - rldicl r0,r0,44,20 - std r0,TVAL64_TV_USEC(r11) /* store usec in tv */ + lis r7,1000000@ha /* load up USEC_PER_SEC */ + addi r7,r7,1000000@l + bl V_LOCAL_FUNC(__do_get_tspec) /* get sec/us from tb & kernel */ + std r4,TVAL64_TV_SEC(r11) /* store sec in tv */ + std r5,TVAL64_TV_USEC(r11) /* store usec in tv */ 2: cmpldi r10,0 /* check if tz is NULL */ beq 1f lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */ @@ -77,6 +70,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) .cfi_register lr,r12 mr r11,r4 /* r11 saves tp */ bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + lis r7,NSEC_PER_SEC@h /* want nanoseconds */ + ori r7,r7,NSEC_PER_SEC@l 50: bl V_LOCAL_FUNC(__do_get_tspec) /* get time from tb & kernel */ bne cr1,80f /* if not monotonic, all done */ @@ -171,49 +166,12 @@ V_FUNCTION_END(__kernel_clock_getres) /* - * This is the core of gettimeofday(), it returns the xsec - * value in r4 and expects the datapage ptr (non clobbered) - * in r3. clobbers r0,r4,r5,r6,r7,r8 - * When returning, r8 contains the counter value that can be reused - */ -V_FUNCTION_BEGIN(__do_get_xsec) - .cfi_startproc - /* check for update count & load values */ -1: ld r8,CFG_TB_UPDATE_COUNT(r3) - andi. r0,r8,1 /* pending update ? loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r3,r3,r0 - - /* Get TB & offset it. We use the MFTB macro which will generate - * workaround code for Cell. - */ - MFTB(r7) - ld r9,CFG_TB_ORIG_STAMP(r3) - subf r7,r9,r7 - - /* Scale result */ - ld r5,CFG_TB_TO_XS(r3) - mulhdu r7,r7,r5 - - /* Add stamp since epoch */ - ld r6,CFG_STAMP_XSEC(r3) - add r4,r6,r7 - - xor r0,r4,r4 - add r3,r3,r0 - ld r0,CFG_TB_UPDATE_COUNT(r3) - cmpld cr0,r0,r8 /* check if updated */ - bne- 1b - blr - .cfi_endproc -V_FUNCTION_END(__do_get_xsec) - -/* - * This is the core of clock_gettime(), it returns the current - * time in seconds and nanoseconds in r4 and r5. + * This is the core of clock_gettime() and gettimeofday(), + * it returns the current time in r4 (seconds) and r5. + * On entry, r7 gives the resolution of r5, either USEC_PER_SEC + * or NSEC_PER_SEC, giving r5 in microseconds or nanoseconds. 
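+ * Internally the elapsed time is kept as a 32.32 fixed-point count
+ * of seconds (the mulhdu against tb_to_xs below yields units of
+ * 2^-32 s), so a single final mulhwu against r7 converts the
+ * fractional part to microseconds or nanoseconds.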
* It expects the datapage ptr in r3 and doesn't clobber it. - * It clobbers r0 and r6 and returns NSEC_PER_SEC in r7. + * It clobbers r0, r6 and r9. * On return, r8 contains the counter value that can be reused. * This clobbers cr0 but not any other cr field. */ @@ -229,18 +187,18 @@ V_FUNCTION_BEGIN(__do_get_tspec) /* Get TB & offset it. We use the MFTB macro which will generate * workaround code for Cell. */ - MFTB(r7) + MFTB(r6) ld r9,CFG_TB_ORIG_STAMP(r3) - subf r7,r9,r7 + subf r6,r9,r6 /* Scale result */ ld r5,CFG_TB_TO_XS(r3) - sldi r7,r7,12 /* compute time since stamp_xtime */ - mulhdu r6,r7,r5 /* in units of 2^-32 seconds */ + sldi r6,r6,12 /* compute time since stamp_xtime */ + mulhdu r6,r6,r5 /* in units of 2^-32 seconds */ /* Add stamp since epoch */ ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) - ld r5,STAMP_XTIME+TSPC64_TV_NSEC(r3) + lwz r5,STAMP_SEC_FRAC(r3) or r0,r4,r5 or r0,r0,r6 xor r0,r0,r0 @@ -250,17 +208,11 @@ V_FUNCTION_BEGIN(__do_get_tspec) bne- 1b /* reload if so */ /* convert to seconds & nanoseconds and add to stamp */ - lis r7,NSEC_PER_SEC@h - ori r7,r7,NSEC_PER_SEC@l - mulhwu r0,r6,r7 /* compute nanoseconds and */ + add r6,r6,r5 /* add on fractional seconds of xtime */ + mulhwu r5,r6,r7 /* compute micro or nanoseconds and */ srdi r6,r6,32 /* seconds since stamp_xtime */ - clrldi r0,r0,32 - add r5,r5,r0 /* add nanoseconds together */ - cmpd r5,r7 /* overflow? */ + clrldi r5,r5,32 add r4,r4,r6 - bltlr /* all done if no overflow */ - subf r5,r7,r5 /* if overflow, adjust */ - addi r4,r4,1 blr .cfi_endproc V_FUNCTION_END(__do_get_tspec) diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/Makefile linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/Makefile --- linux-2.6.32/arch/powerpc/kernel/vdso64/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/Makefile 2015-09-10 10:07:26.000000000 -0700 @@ -1,6 +1,6 @@ # List of files in the vdso, has to be asm only for now -obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o +obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o # Build rules @@ -36,7 +36,8 @@ $(obj-vdso64): %.o: %.S # actual build commands quiet_cmd_vdso64ld = VDSO64L $@ - cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ + cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ \ + $(if $(AFTER_LINK),; $(AFTER_LINK)) quiet_cmd_vdso64as = VDSO64A $@ cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/vdso64.lds.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/vdso64.lds.S --- linux-2.6.32/arch/powerpc/kernel/vdso64/vdso64.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/vdso64.lds.S 2015-09-10 10:07:26.000000000 -0700 @@ -146,6 +146,7 @@ VERSION __kernel_sync_dicache; __kernel_sync_dicache_p5; __kernel_sigtramp_rt64; + __kernel_getcpu; local: *; }; diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso.c linux-2.6.32.ovz/arch/powerpc/kernel/vdso.c --- linux-2.6.32/arch/powerpc/kernel/vdso.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso.c 2015-09-10 10:07:26.000000000 -0700 @@ -264,17 +264,11 @@ int arch_setup_additional_pages(struct l * the "data" page of the vDSO or you'll stop getting kernel updates * and your nice userland gettimeofday will be totally dead. * It's fine to use that for setting breakpoints in the vDSO code - * pages though - * - * Make sure the vDSO gets into every core dump. 
- * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. + * pages though. */ rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pagelist); if (rc) { current->mm->context.vdso_base = 0; @@ -714,6 +708,31 @@ static void __init vdso_setup_syscall_ma } } +#ifdef CONFIG_PPC64 +int __cpuinit vdso_getcpu_init(void) +{ + unsigned long cpu, node, val; + + /* + * SPRG3 contains the CPU in the bottom 16 bits and the NUMA node in + * the next 16 bits. The VDSO uses this to implement getcpu(). + */ + cpu = get_cpu(); + WARN_ON_ONCE(cpu > 0xffff); + + node = cpu_to_node(cpu); + WARN_ON_ONCE(node > 0xffff); + + val = (cpu & 0xfff) | ((node & 0xffff) << 16); + mtspr(SPRN_SPRG3, val); + + put_cpu(); + + return 0; +} +/* We need to call this before SMP init */ +early_initcall(vdso_getcpu_init); +#endif static int __init vdso_init(void) { @@ -820,17 +839,17 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } diff -uprN linux-2.6.32/arch/powerpc/kernel/vector.S linux-2.6.32.ovz/arch/powerpc/kernel/vector.S --- linux-2.6.32/arch/powerpc/kernel/vector.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vector.S 2015-09-10 10:05:42.000000000 -0700 @@ -58,7 +58,7 @@ _GLOBAL(load_up_altivec) * all 1's */ mfspr r4,SPRN_VRSAVE - cmpdi 0,r4,0 + cmpwi 0,r4,0 bne+ 1f li r4,-1 mtspr SPRN_VRSAVE,r4 diff -uprN linux-2.6.32/arch/powerpc/kernel/vio.c linux-2.6.32.ovz/arch/powerpc/kernel/vio.c --- linux-2.6.32/arch/powerpc/kernel/vio.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vio.c 2015-09-10 10:07:23.000000000 -0700 @@ -14,7 +14,9 @@ * 2 of the License, or (at your option) any later version. */ +#include #include +#include #include #include #include @@ -699,13 +701,26 @@ static int vio_cmo_bus_probe(struct vio_ struct vio_driver *viodrv = to_vio_driver(dev->driver); unsigned long flags; size_t size; + bool dma_capable = false; - /* - * Check to see that device has a DMA window and configure - * entitlement for the device. - */ - if (of_get_property(viodev->dev.archdata.of_node, - "ibm,my-dma-window", NULL)) { + /* A device requires entitlement if it has a DMA window property */ + switch (viodev->family) { + case VDEVICE: + if (of_get_property(viodev->dev.archdata.of_node, + "ibm,my-dma-window", NULL)) + dma_capable = true; + break; + case PFO: + dma_capable = false; + break; + default: + dev_warn(dev, "unknown device family: %d\n", viodev->family); + BUG(); + break; + } + + /* Configure entitlement for the device. 
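+	 * Only VDEVICE nodes that carry an "ibm,my-dma-window" property
+	 * are treated as DMA-capable; PFO devices never take CMO
+	 * entitlement.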
	 */
+	if (dma_capable) {
 		/* Check that the driver is CMO enabled and get desired DMA */
 		if (!viodrv->get_desired_dma) {
 			dev_err(dev, "%s: device driver does not support CMO\n",
@@ -1039,6 +1054,94 @@ static void vio_cmo_sysfs_init(void) { }
 EXPORT_SYMBOL(vio_cmo_entitlement_update);
 EXPORT_SYMBOL(vio_cmo_set_dev_desired);
+
+/*
+ * Platform Facilities Option (PFO) support
+ */
+
+/**
+ * vio_h_cop_sync - Perform a synchronous PFO co-processor operation
+ *
+ * @vdev - Pointer to a struct vio_dev for device
+ * @op - Pointer to a struct vio_pfo_op for the operation parameters
+ *
+ * Calls the hypervisor to synchronously perform the PFO operation
+ * described in @op. In the case of a busy response from the hypervisor,
+ * the operation will be re-submitted indefinitely unless a non-zero timeout
+ * is specified or an error occurs. The timeout places a limit on when to
+ * stop re-submitting an operation; the total time can still be exceeded if
+ * an operation is in progress.
+ *
+ * On return, op->hcall_err holds the return code from the last H_COP
+ * hcall (0, i.e. H_SUCCESS, only if the operation actually succeeded).
+ *
+ * Returns:
+ * 0 on success,
+ * -EINVAL if the hcall fails due to an invalid parameter,
+ * -E2BIG if the hcall cannot be performed synchronously,
+ * -EBUSY if a timeout is specified and has elapsed,
+ * -EACCES if the memory area for data/status has been rescinded, or
+ * -EPERM if a hardware fault has been indicated
+ */
+int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op)
+{
+	struct device *dev = &vdev->dev;
+	unsigned long deadline = 0;
+	long hret = 0;
+	int ret = 0;
+
+	if (op->timeout)
+		deadline = jiffies + msecs_to_jiffies(op->timeout);
+
+	while (true) {
+		hret = plpar_hcall_norets(H_COP, op->flags,
+				vdev->resource_id,
+				op->in, op->inlen, op->out,
+				op->outlen, op->csbcpb);
+
+		if (hret == H_SUCCESS ||
+		    (hret != H_NOT_ENOUGH_RESOURCES &&
+		     hret != H_BUSY && hret != H_RESOURCE) ||
+		    (op->timeout && time_after(jiffies, deadline)))
+			break;
+
+		dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret);
+	}
+
+	switch (hret) {
+	case H_SUCCESS:
+		ret = 0;
+		break;
+	case H_OP_MODE:
+	case H_TOO_BIG:
+		ret = -E2BIG;
+		break;
+	case H_RESCINDED:
+		ret = -EACCES;
+		break;
+	case H_HARDWARE:
+		ret = -EPERM;
+		break;
+	case H_NOT_ENOUGH_RESOURCES:
+	case H_RESOURCE:
+	case H_BUSY:
+		ret = -EBUSY;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (ret)
+		dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n",
+			__func__, ret, hret);
+
+	op->hcall_err = hret;
+	return ret;
+}
+EXPORT_SYMBOL(vio_h_cop_sync);
+
 static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev)
 {
 	const unsigned char *dma_window;
@@ -1053,7 +1156,7 @@ static struct iommu_table *vio_build_iom
 	if (!dma_window)
 		return NULL;
-	tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
+	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
 	if (tbl == NULL)
 		return NULL;
@@ -1066,6 +1169,7 @@ static struct iommu_table *vio_build_iom
 	tbl->it_offset = offset >> IOMMU_PAGE_SHIFT;
 	tbl->it_busno = 0;
 	tbl->it_type = TCE_VB;
+	tbl->it_blocksize = 16;
 	return iommu_init_table(tbl, -1);
 }
@@ -1194,35 +1298,87 @@ static void __devinit vio_dev_release(st
 struct vio_dev *vio_register_device_node(struct device_node *of_node)
 {
 	struct vio_dev *viodev;
+	struct device_node *parent_node;
 	const unsigned int *unit_address;
+	const unsigned int *pfo_resid = NULL;
+	enum vio_dev_family family;
+	const char *of_node_name = of_node->name ?
+					of_node->name : "<unknown>";
-	/* we need the 'device_type' property, in order to match with drivers */
-	if (of_node->type == NULL) {
-		printk(KERN_WARNING "%s: node %s missing 'device_type'\n",
-				__func__,
-				of_node->name ? of_node->name : "<unknown>");
+	/*
+	 * Determine if this node is under the /vdevice node or under the
+	 * /ibm,platform-facilities node. This decides the device's family.
+	 */
+	parent_node = of_get_parent(of_node);
+	if (parent_node) {
+		if (!strcmp(parent_node->full_name, "/ibm,platform-facilities"))
+			family = PFO;
+		else if (!strcmp(parent_node->full_name, "/vdevice"))
+			family = VDEVICE;
+		else {
+			pr_warn("%s: parent(%s) of %s not recognized.\n",
+					__func__,
+					parent_node->full_name,
+					of_node_name);
+			of_node_put(parent_node);
+			return NULL;
+		}
+		of_node_put(parent_node);
+	} else {
+		pr_warn("%s: could not determine the parent of node %s.\n",
+				__func__, of_node_name);
 		return NULL;
 	}
-	unit_address = of_get_property(of_node, "reg", NULL);
-	if (unit_address == NULL) {
-		printk(KERN_WARNING "%s: node %s missing 'reg'\n",
-				__func__,
-				of_node->name ? of_node->name : "<unknown>");
-		return NULL;
+	if (family == PFO) {
+		if (of_get_property(of_node, "interrupt-controller", NULL)) {
+			pr_debug("%s: Skipping the interrupt controller %s.\n",
+					__func__, of_node_name);
+			return NULL;
+		}
 	}
 	/* allocate a vio_dev for this node */
 	viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL);
-	if (viodev == NULL)
+	if (viodev == NULL) {
+		pr_warn("%s: allocation failure for VIO device.\n", __func__);
 		return NULL;
+	}
-	viodev->irq = irq_of_parse_and_map(of_node, 0);
+	/* we need the 'device_type' property, in order to match with drivers */
+	viodev->family = family;
+	if (viodev->family == VDEVICE) {
+		if (of_node->type != NULL)
+			viodev->type = of_node->type;
+		else {
+			pr_warn("%s: node %s is missing the 'device_type' "
+					"property.\n", __func__, of_node_name);
+			goto out;
+		}
+
+		unit_address = of_get_property(of_node, "reg", NULL);
+		if (unit_address == NULL) {
+			pr_warn("%s: node %s missing 'reg'\n",
+					__func__, of_node_name);
+			goto out;
+		}
+		dev_set_name(&viodev->dev, "%x", *unit_address);
+		viodev->irq = irq_of_parse_and_map(of_node, 0);
+		viodev->unit_address = *unit_address;
+	} else {
+		/* PFO devices need their resource_id for submitting COP_OPs.
+		 * This is an optional field for devices, but is required when
+		 * performing synchronous ops */
+		pfo_resid = of_get_property(of_node, "ibm,resource-id", NULL);
+		if (pfo_resid != NULL)
+			viodev->resource_id = *pfo_resid;
+
+		unit_address = NULL;
+		dev_set_name(&viodev->dev, "%s", of_node_name);
+		viodev->type = of_node_name;
+		viodev->irq = 0;
+	}
-	dev_set_name(&viodev->dev, "%x", *unit_address);
 	viodev->name = of_node->name;
-	viodev->type = of_node->type;
-	viodev->unit_address = *unit_address;
 	if (firmware_has_feature(FW_FEATURE_ISERIES)) {
 		unit_address = of_get_property(of_node,
 				"linux,unit_address", NULL);
@@ -1231,11 +1387,6 @@ struct vio_dev *vio_register_device_node
 	}
 	viodev->dev.archdata.of_node = of_node_get(of_node);
-	if (firmware_has_feature(FW_FEATURE_CMO))
-		vio_cmo_set_dma_ops(viodev);
-	else
-		viodev->dev.archdata.dma_ops = &dma_iommu_ops;
-	set_iommu_table_base(&viodev->dev, vio_build_iommu_table(viodev));
 	set_dev_node(&viodev->dev, of_node_to_nid(of_node));
 	/* init generic 'struct device' fields: */
@@ -1243,6 +1394,21 @@ struct vio_dev *vio_register_device_node
 	viodev->dev.bus = &vio_bus_type;
 	viodev->dev.release = vio_dev_release;
+	if (of_get_property(viodev->dev.archdata.of_node,
+			"ibm,my-dma-window", NULL)) {
+		if (firmware_has_feature(FW_FEATURE_CMO))
+			vio_cmo_set_dma_ops(viodev);
+		else
+			set_dma_ops(&viodev->dev, &dma_iommu_ops);
+
+		set_iommu_table_base(&viodev->dev,
+				vio_build_iommu_table(viodev));
+
+		/* needed to ensure proper operation of coherent allocations
+		 * later, in case driver doesn't set it explicitly */
+		dma_set_mask(&viodev->dev, DMA_BIT_MASK(64));
+		dma_set_coherent_mask(&viodev->dev, DMA_BIT_MASK(64));
+	}
 	/* register with generic device framework */
 	if (device_register(&viodev->dev)) {
 		printk(KERN_ERR "%s: failed to register device %s\n",
@@ -1253,16 +1419,51 @@ struct vio_dev *vio_register_device_node
 	}
 	return viodev;
+
+out:	/* Use this exit point for any return prior to device_register */
+	kfree(viodev);
+
+	return NULL;
 }
 EXPORT_SYMBOL(vio_register_device_node);
+/*
+ * vio_bus_scan_register_devices - Scan OF and register each child device
+ * @root_name - OF node name for the root of the subtree to search.
+ *		This must be non-NULL
+ *
+ * Starting from the root node provided, register the device node for
+ * each child beneath the root.
+ */
+static void vio_bus_scan_register_devices(char *root_name)
+{
+	struct device_node *node_root, *node_child;
+
+	if (!root_name)
+		return;
+
+	node_root = of_find_node_by_name(NULL, root_name);
+	if (node_root) {
+
+		/*
+		 * Create struct vio_devices for each virtual device in
+		 * the device tree. Drivers will associate with them later.
+		 */
+		node_child = of_get_next_child(node_root, NULL);
+		while (node_child) {
+			vio_register_device_node(node_child);
+			node_child = of_get_next_child(node_root, node_child);
+		}
+		of_node_put(node_root);
+	}
+}
+
 /**
 * vio_bus_init: - Initialize the virtual IO bus
 */
static int __init vio_bus_init(void)
 {
 	int err;
-	struct device_node *node_vroot;
 	if (firmware_has_feature(FW_FEATURE_CMO))
 		vio_cmo_sysfs_init();
@@ -1287,19 +1488,8 @@ static int __init vio_bus_init(void)
 	if (firmware_has_feature(FW_FEATURE_CMO))
 		vio_cmo_bus_init();
-	node_vroot = of_find_node_by_name(NULL, "vdevice");
-	if (node_vroot) {
-		struct device_node *of_node;
-
-		/*
-		 * Create struct vio_devices for each virtual device in
-		 * the device tree. Drivers will associate with them later.
-		 */
-		for (of_node = node_vroot->child; of_node != NULL;
-				of_node = of_node->sibling)
-			vio_register_device_node(of_node);
-		of_node_put(node_vroot);
-	}
+	vio_bus_scan_register_devices("vdevice");
+	vio_bus_scan_register_devices("ibm,platform-facilities");
 	return 0;
 }
@@ -1319,9 +1509,27 @@ static ssize_t devspec_show(struct devic
 	return sprintf(buf, "%s\n", of_node ? of_node->full_name : "none");
 }
+static ssize_t modalias_show(struct device *dev, struct device_attribute *attr,
+			     char *buf)
+{
+	const struct vio_dev *vio_dev = to_vio_dev(dev);
+	struct device_node *dn;
+	const char *cp;
+
+	dn = dev->archdata.of_node;
+	if (!dn)
+		return -ENODEV;
+	cp = of_get_property(dn, "compatible", NULL);
+	if (!cp)
+		return -ENODEV;
+
+	return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp);
+}
+
 static struct device_attribute vio_dev_attrs[] = {
 	__ATTR_RO(name),
 	__ATTR_RO(devspec),
+	__ATTR_RO(modalias),
 	__ATTR_NULL
 };
@@ -1357,6 +1565,29 @@ static int vio_hotplug(struct device *de
 	return 0;
 }
+static int vio_pm_suspend(struct device *dev)
+{
+	const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL;
+
+	if (pm && pm->suspend)
+		return pm->suspend(dev);
+	return 0;
+}
+
+static int vio_pm_resume(struct device *dev)
+{
+	const struct dev_pm_ops *pm = dev->driver ?
dev->driver->pm : NULL; + + if (pm && pm->resume) + return pm->resume(dev); + return 0; +} + +const struct dev_pm_ops vio_dev_pm_ops = { + .suspend = vio_pm_suspend, + .resume = vio_pm_resume, +}; + static struct bus_type vio_bus_type = { .name = "vio", .dev_attrs = vio_dev_attrs, @@ -1364,6 +1595,7 @@ static struct bus_type vio_bus_type = { .match = vio_bus_match, .probe = vio_bus_probe, .remove = vio_bus_remove, + .pm = &vio_dev_pm_ops, }; /** @@ -1404,12 +1636,28 @@ struct vio_dev *vio_find_node(struct dev { const uint32_t *unit_address; char kobj_name[20]; + struct device_node *vnode_parent; + const char *dev_type; + + vnode_parent = of_get_parent(vnode); + if (!vnode_parent) + return NULL; + + dev_type = of_get_property(vnode_parent, "device_type", NULL); + of_node_put(vnode_parent); + if (!dev_type) + return NULL; /* construct the kobject name from the device node */ - unit_address = of_get_property(vnode, "reg", NULL); - if (!unit_address) + if (!strcmp(dev_type, "vdevice")) { + unit_address = of_get_property(vnode, "reg", NULL); + if (!unit_address) + return NULL; + snprintf(kobj_name, sizeof(kobj_name), "%x", *unit_address); + } else if (!strcmp(dev_type, "ibm,platform-facilities")) + snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name); + else return NULL; - snprintf(kobj_name, sizeof(kobj_name), "%x", *unit_address); return vio_find_name(kobj_name); } diff -uprN linux-2.6.32/arch/powerpc/kernel/vmlinux.lds.S linux-2.6.32.ovz/arch/powerpc/kernel/vmlinux.lds.S --- linux-2.6.32/arch/powerpc/kernel/vmlinux.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vmlinux.lds.S 2015-09-10 10:05:48.000000000 -0700 @@ -38,6 +38,9 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { + . = 0; + reloc_start = .; + . = KERNELBASE; /* diff -uprN linux-2.6.32/arch/powerpc/kvm/Kconfig linux-2.6.32.ovz/arch/powerpc/kvm/Kconfig --- linux-2.6.32/arch/powerpc/kvm/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kvm/Kconfig 2015-09-10 10:05:38.000000000 -0700 @@ -58,6 +58,7 @@ config KVM_E500 If unsure, say N. 
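+# vhost lives in drivers/vhost rather than drivers/virtio, so its
+# options (e.g. vhost-net) must be sourced here explicitly.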
+source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff -uprN linux-2.6.32/arch/powerpc/kvm/powerpc.c linux-2.6.32.ovz/arch/powerpc/kvm/powerpc.c --- linux-2.6.32/arch/powerpc/kvm/powerpc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kvm/powerpc.c 2015-09-10 10:06:30.000000000 -0700 @@ -78,8 +78,9 @@ int kvmppc_emulate_mmio(struct kvm_run * return r; } -void kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void *garbage) { + return 0; } void kvm_arch_hardware_disable(void *garbage) @@ -135,6 +136,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm { kvmppc_free_vcpus(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -160,14 +162,24 @@ long kvm_arch_dev_ioctl(struct file *fil return -EINVAL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + + void kvm_arch_flush_shadow(struct kvm *kvm) { } diff -uprN linux-2.6.32/arch/powerpc/lib/checksum_64.S linux-2.6.32.ovz/arch/powerpc/lib/checksum_64.S --- linux-2.6.32/arch/powerpc/lib/checksum_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/checksum_64.S 2015-09-10 10:07:25.000000000 -0700 @@ -65,165 +65,393 @@ _GLOBAL(csum_tcpudp_magic) srwi r3,r3,16 blr +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + /* * Computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit). * - * This code assumes at least halfword alignment, though the length - * can be any number of bytes. The sum is accumulated in r5. - * * csum_partial(r3=buff, r4=len, r5=sum) */ _GLOBAL(csum_partial) - subi r3,r3,8 /* we'll offset by 8 for the loads */ - srdi. r6,r4,3 /* divide by 8 for doubleword count */ - addic r5,r5,0 /* clear carry */ - beq 3f /* if we're doing < 8 bytes */ - andi. r0,r3,2 /* aligned on a word boundary already? */ - beq+ 1f - lhz r6,8(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r4,r4,2 - addc r5,r5,r6 - srdi. r6,r4,3 /* recompute number of doublewords */ - beq 3f /* any left? */ -1: mtctr r6 -2: ldu r6,8(r3) /* main sum loop */ - adde r5,r5,r6 - bdnz 2b - andi. r4,r4,7 /* compute bytes left to sum after doublewords */ -3: cmpwi 0,r4,4 /* is at least a full word left? */ - blt 4f - lwz r6,8(r3) /* sum this word */ + addic r0,r5,0 /* clear carry */ + + srdi. r6,r4,3 /* less than 8 bytes? */ + beq .Lcsum_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcsum_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: + lhz r6,0(r3) /* align to doubleword */ + subi r4,r4,2 + addi r3,r3,2 + adde r0,r0,r6 + bdnz 1b + +.Lcsum_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. 
r6,r4,7 + beq .Lcsum_tail_doublewords /* len < 128 */ + + srdi r6,r4,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + ld r6,0(r3) + ld r9,8(r3) + + ld r10,16(r3) + ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + + adde r0,r0,r11 + + adde r0,r0,r12 + + adde r0,r0,r14 + + adde r0,r0,r15 + ld r6,0(r3) + ld r9,8(r3) + + adde r0,r0,r16 + ld r10,16(r3) + ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + adde r0,r0,r11 + adde r0,r0,r12 + adde r0,r0,r14 + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r4,r4,63 + +.Lcsum_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r4,3 + beq .Lcsum_tail_word + + mtctr r6 +3: + ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 + bdnz 3b + + andi. r4,r4,7 + +.Lcsum_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r4,2 + beq .Lcsum_tail_halfword + + lwz r6,0(r3) addi r3,r3,4 + adde r0,r0,r6 subi r4,r4,4 - adde r5,r5,r6 -4: cmpwi 0,r4,2 /* is at least a halfword left? */ - blt+ 5f - lhz r6,8(r3) /* sum this halfword */ - addi r3,r3,2 - subi r4,r4,2 - adde r5,r5,r6 -5: cmpwi 0,r4,1 /* is at least a byte left? */ - bne+ 6f - lbz r6,8(r3) /* sum this byte */ - slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */ - adde r5,r5,r6 -6: addze r5,r5 /* add in final carry */ - rldicl r4,r5,32,0 /* fold two 32-bit halves together */ - add r3,r4,r5 - srdi r3,r3,32 - blr + +.Lcsum_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r4,1 + beq .Lcsum_tail_byte + + lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 + subi r4,r4,2 + +.Lcsum_tail_byte: /* Up to 1 byte to go */ + andi. r6,r4,1 + beq .Lcsum_finish + + lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 + +.Lcsum_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + + + .macro source +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Lsrc_error + .previous + .endm + + .macro dest +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldest_error + .previous + .endm /* * Computes the checksum of a memory block at src, length len, * and adds in "sum" (32-bit), while copying the block to dst. * If an access exception occurs on src or dst, it stores -EFAULT - * to *src_err or *dst_err respectively, and (for an error on - * src) zeroes the rest of dst. - * - * This code needs to be reworked to take advantage of 64 bit sum+copy. - * However, due to tokenring halfword alignment problems this will be very - * tricky. For now we'll leave it until we instrument it somehow. + * to *src_err or *dst_err respectively. The caller must take any action + * required in this case (zeroing memory, recalculating partial checksum etc). * * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) */ _GLOBAL(csum_partial_copy_generic) - addic r0,r6,0 - subi r3,r3,4 - subi r4,r4,4 - srwi. 
r6,r5,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r9,r4,2 /* Align dst to longword boundary */ - beq+ 1f -81: lhz r6,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 + addic r0,r6,0 /* clear carry */ + + srdi. r6,r5,3 /* less than 8 bytes? */ + beq .Lcopy_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + * + * If the source and destination are relatively unaligned we only + * align the source. This keeps things simple. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcopy_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: +source; lhz r6,0(r3) /* align to doubleword */ subi r5,r5,2 -91: sth r6,4(r4) - addi r4,r4,2 - addc r0,r0,r6 - srwi. r6,r5,2 /* # words to do */ - beq 3f -1: mtctr r6 -82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ -92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ - adde r0,r0,r6 - bdnz 82b - andi. r5,r5,3 -3: cmpwi 0,r5,2 - blt+ 4f -83: lhz r6,4(r3) addi r3,r3,2 - subi r5,r5,2 -93: sth r6,4(r4) + adde r0,r0,r6 +dest; sth r6,0(r4) addi r4,r4,2 + bdnz 1b + +.Lcopy_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r5,7 + beq .Lcopy_tail_doublewords /* len < 128 */ + + srdi r6,r5,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + +source; ld r6,0(r3) +source; ld r9,8(r3) + +source; ld r10,16(r3) +source; ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: adde r0,r0,r6 -4: cmpwi 0,r5,1 - bne+ 5f -84: lbz r6,4(r3) -94: stb r6,4(r4) - slwi r6,r6,8 /* Upper byte of word */ - adde r0,r0,r6 -5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */ - rldicl r4,r3,32,0 /* fold 64 bit value */ - add r3,r4,r3 - srdi r3,r3,32 - blr +source; ld r12,32(r3) +source; ld r14,40(r3) -/* These shouldn't go in the fixup section, since that would - cause the ex_table addresses to get out of order. */ + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 +source; ld r6,0(r3) +source; ld r9,8(r3) + + adde r0,r0,r16 +source; ld r10,16(r3) +source; ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. 
r5,r5,63 + +.Lcopy_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r5,3 + beq .Lcopy_tail_word - .globl src_error_1 -src_error_1: - li r6,0 - subi r5,r5,2 -95: sth r6,4(r4) - addi r4,r4,2 - srwi. r6,r5,2 - beq 3f mtctr r6 - .globl src_error_2 -src_error_2: - li r6,0 -96: stwu r6,4(r4) - bdnz 96b -3: andi. r5,r5,3 - beq src_error - .globl src_error_3 -src_error_3: - li r6,0 - mtctr r5 - addi r4,r4,3 -97: stbu r6,1(r4) - bdnz 97b - .globl src_error -src_error: +3: +source; ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 +dest; std r6,0(r4) + addi r4,r4,8 + bdnz 3b + + andi. r5,r5,7 + +.Lcopy_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r5,2 + beq .Lcopy_tail_halfword + +source; lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 +dest; stw r6,0(r4) + addi r4,r4,4 + subi r5,r5,4 + +.Lcopy_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r5,1 + beq .Lcopy_tail_byte + +source; lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 +dest; sth r6,0(r4) + addi r4,r4,2 + subi r5,r5,2 + +.Lcopy_tail_byte: /* Up to 1 byte to go */ + andi. r6,r5,1 + beq .Lcopy_finish + +source; lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 +dest; stb r6,0(r4) + +.Lcopy_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + +.Lsrc_error: cmpdi 0,r7,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r7) -1: addze r3,r0 blr - .globl dst_error -dst_error: +.Ldest_error: cmpdi 0,r8,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r8) -1: addze r3,r0 blr - -.section __ex_table,"a" - .align 3 - .llong 81b,src_error_1 - .llong 91b,dst_error - .llong 82b,src_error_2 - .llong 92b,dst_error - .llong 83b,src_error_3 - .llong 93b,dst_error - .llong 84b,src_error_3 - .llong 94b,dst_error - .llong 95b,dst_error - .llong 96b,dst_error - .llong 97b,dst_error diff -uprN linux-2.6.32/arch/powerpc/lib/checksum_wrappers_64.c linux-2.6.32.ovz/arch/powerpc/lib/checksum_wrappers_64.c --- linux-2.6.32/arch/powerpc/lib/checksum_wrappers_64.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/checksum_wrappers_64.c 2015-09-10 10:07:25.000000000 -0700 @@ -0,0 +1,102 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard + */ +#include +#include +#include +#include +#include + +__wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) { + *err_ptr = -EFAULT; + csum = (__force unsigned int)sum; + goto out; + } + + csum = csum_partial_copy_generic((void __force *)src, dst, + len, sum, err_ptr, NULL); + + if (unlikely(*err_ptr)) { + int missing = __copy_from_user(dst, src, len); + + if (missing) { + memset(dst + len - missing, 0, missing); + *err_ptr = -EFAULT; + } else { + *err_ptr = 0; + } + + csum = csum_partial(dst, len, sum); + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_from_user); + +__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, + __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + goto out; + } + + csum = csum_partial_copy_generic(src, (void __force *)dst, + len, sum, NULL, err_ptr); + + if (unlikely(*err_ptr)) { + csum = csum_partial(src, len, sum); + + if (copy_to_user(dst, src, len)) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + } + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_to_user); diff -uprN linux-2.6.32/arch/powerpc/lib/copypage_64.S linux-2.6.32.ovz/arch/powerpc/lib/copypage_64.S --- linux-2.6.32/arch/powerpc/lib/copypage_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copypage_64.S 2015-09-10 10:07:25.000000000 -0700 @@ -6,6 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -15,9 +16,13 @@ PPC64_CACHES: .tc ppc64_caches[TC],ppc64_caches .section ".text" - -_GLOBAL(copy_4K_page) - li r5,4096 /* 4K page size */ +_GLOBAL(copy_page) +BEGIN_FTR_SECTION + lis r5,PAGE_SIZE@h +FTR_SECTION_ELSE + b .copypage_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) + ori r5,r5,PAGE_SIZE@l BEGIN_FTR_SECTION ld r10,PPC64_CACHES@toc(r2) lwz r11,DCACHEL1LOGLINESIZE(r10) /* log2 of cache line size */ diff -uprN linux-2.6.32/arch/powerpc/lib/copypage_power7.S linux-2.6.32.ovz/arch/powerpc/lib/copypage_power7.S --- linux-2.6.32/arch/powerpc/lib/copypage_power7.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copypage_power7.S 2015-09-10 10:07:25.000000000 -0700 @@ -0,0 +1,168 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include +#include + +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + +_GLOBAL(copypage_power7) + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. Since source and destination are page + * aligned we don't need to clear the bottom 7 bits of either + * address. + */ + ori r9,r3,1 /* stream=1 */ + +#ifdef CONFIG_PPC_64K_PAGES + lis r7,0x0E01 /* depth=7, units=512 */ +#else + lis r7,0x0E00 /* depth=7 */ + ori r7,r7,0x1000 /* units=32 */ +#endif + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r4,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + +#ifdef CONFIG_ALTIVEC + mflr r0 + std r3,48(r1) + std r4,56(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl .enter_vmx_copy + cmpwi r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STACKFRAMESIZE+48(r1) + ld r4,STACKFRAMESIZE+56(r1) + mtlr r0 + + li r0,(PAGE_SIZE/128) + mtctr r0 + + beq .Lnonvmx_copy + + addi r1,r1,STACKFRAMESIZE + + li r6,16 + li r7,32 + li r8,48 + li r9,64 + li r10,80 + li r11,96 + li r12,112 + + .align 5 +1: lvx vr7,r0,r4 + lvx vr6,r4,r6 + lvx vr5,r4,r7 + lvx vr4,r4,r8 + lvx vr3,r4,r9 + lvx vr2,r4,r10 + lvx vr1,r4,r11 + lvx vr0,r4,r12 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r6 + stvx vr5,r3,r7 + stvx vr4,r3,r8 + stvx vr3,r3,r9 + stvx vr2,r3,r10 + stvx vr1,r3,r11 + stvx vr0,r3,r12 + addi r3,r3,128 + bdnz 1b + + b .exit_vmx_copy /* tail call optimise */ + +#else + li r0,(PAGE_SIZE/128) + mtctr r0 + + stdu r1,-STACKFRAMESIZE(r1) +#endif + +.Lnonvmx_copy: + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + std r17,STK_REG(r17)(r1) + std r18,STK_REG(r18)(r1) + std r19,STK_REG(r19)(r1) + std r20,STK_REG(r20)(r1) + +1: ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + ld r8,32(r4) + ld r9,40(r4) + ld r10,48(r4) + ld r11,56(r4) + ld r12,64(r4) + ld r14,72(r4) + ld r15,80(r4) + ld r16,88(r4) + ld r17,96(r4) + ld r18,104(r4) + ld r19,112(r4) + ld r20,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + std r8,32(r3) + std r9,40(r3) + std r10,48(r3) + std r11,56(r3) + std r12,64(r3) + std r14,72(r3) + std r15,80(r3) + std r16,88(r3) + std r17,96(r3) + std r18,104(r3) + std r19,112(r3) + std r20,120(r3) + addi r3,r3,128 + bdnz 1b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + ld r17,STK_REG(r17)(r1) + ld r18,STK_REG(r18)(r1) + ld r19,STK_REG(r19)(r1) + ld r20,STK_REG(r20)(r1) + addi r1,r1,STACKFRAMESIZE + blr diff -uprN linux-2.6.32/arch/powerpc/lib/copyuser_64.S linux-2.6.32.ovz/arch/powerpc/lib/copyuser_64.S --- linux-2.6.32/arch/powerpc/lib/copyuser_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copyuser_64.S 2015-09-10 10:06:45.000000000 -0700 @@ -11,6 +11,12 @@ .align 7 _GLOBAL(__copy_tofrom_user) +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + b __copy_tofrom_user_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +_GLOBAL(__copy_tofrom_user_base) /* first check for a whole page copy on a page boundary */ cmpldi cr1,r5,16 cmpdi cr6,r5,4096 @@ -44,37 +50,55 @@ BEGIN_FTR_SECTION andi. r0,r4,7 bne .Lsrc_unaligned END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) - srdi r7,r5,4 -20: ld r9,0(r4) - addi r4,r4,-8 - mtctr r7 - andi. 
r5,r5,7 - bf cr7*4+0,22f - addi r3,r3,8 - addi r4,r4,8 - mr r8,r9 - blt cr1,72f -21: ld r9,8(r4) -70: std r8,8(r3) -22: ldu r8,16(r4) -71: stdu r9,16(r3) + blt cr1,.Ldo_tail /* if < 16 bytes to copy */ + srdi r0,r5,5 + cmpdi cr1,r0,0 +20: ld r7,0(r4) +220: ld r6,8(r4) + addi r4,r4,16 + mtctr r0 + andi. r0,r5,0x10 + beq 22f + addi r3,r3,16 + addi r4,r4,-16 + mr r9,r7 + mr r8,r6 + beq cr1,72f +21: ld r7,16(r4) +221: ld r6,24(r4) + addi r4,r4,32 +70: std r9,0(r3) +270: std r8,8(r3) +22: ld r9,0(r4) +222: ld r8,8(r4) +71: std r7,16(r3) +271: std r6,24(r3) + addi r3,r3,32 bdnz 21b -72: std r8,8(r3) +72: std r9,0(r3) +272: std r8,8(r3) + andi. r5,r5,0xf beq+ 3f - addi r3,r3,16 + addi r4,r4,16 .Ldo_tail: - bf cr7*4+1,1f -23: lwz r9,8(r4) + addi r3,r3,16 + bf cr7*4+0,246f +244: ld r9,0(r4) + addi r4,r4,8 +245: std r9,0(r3) + addi r3,r3,8 +246: bf cr7*4+1,1f +23: lwz r9,0(r4) addi r4,r4,4 73: stw r9,0(r3) addi r3,r3,4 1: bf cr7*4+2,2f -44: lhz r9,8(r4) +44: lhz r9,0(r4) addi r4,r4,2 74: sth r9,0(r3) addi r3,r3,2 2: bf cr7*4+3,3f -45: lbz r9,8(r4) +45: lbz r9,0(r4) 75: stb r9,0(r3) 3: li r3,0 blr @@ -220,7 +244,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ 131: addi r3,r3,8 120: +320: 122: +322: 124: 125: 126: @@ -229,9 +255,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ 129: 133: addi r3,r3,8 -121: 132: addi r3,r3,8 +121: +321: +344: 134: 135: 138: @@ -303,18 +331,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ 183: add r3,r3,r7 b 1f +371: 180: addi r3,r3,8 171: 177: addi r3,r3,8 -170: -172: +370: +372: 176: 178: addi r3,r3,4 185: addi r3,r3,4 +170: +172: +345: 173: 174: 175: @@ -341,11 +373,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ .section __ex_table,"a" .align 3 .llong 20b,120b + .llong 220b,320b .llong 21b,121b + .llong 221b,321b .llong 70b,170b + .llong 270b,370b .llong 22b,122b + .llong 222b,322b .llong 71b,171b + .llong 271b,371b .llong 72b,172b + .llong 272b,372b + .llong 244b,344b + .llong 245b,345b .llong 23b,123b .llong 73b,173b .llong 44b,144b diff -uprN linux-2.6.32/arch/powerpc/lib/copyuser_power7.S linux-2.6.32.ovz/arch/powerpc/lib/copyuser_power7.S --- linux-2.6.32/arch/powerpc/lib/copyuser_power7.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copyuser_power7.S 2015-09-10 10:07:25.000000000 -0700 @@ -0,0 +1,714 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2011 + * + * Author: Anton Blanchard + */ +#include <asm/ppc_asm.h> + +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + +#ifdef CONFIG_ALTIVEC + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + + .macro err4 +400: + .section __ex_table,"a" + .align 3 + .llong 400b,.Ldo_err4 + .previous + .endm + + +.Ldo_err4: + ld r16,STK_REG(r16)(r1) + ld r15,STK_REG(r15)(r1) + ld r14,STK_REG(r14)(r1) +.Ldo_err3: + bl .exit_vmx_usercopy + ld r0,STACKFRAMESIZE+16(r1) + mtlr r0 + b .Lexit +#endif /* CONFIG_ALTIVEC */ + +.Ldo_err2: + ld r22,STK_REG(r22)(r1) + ld r21,STK_REG(r21)(r1) + ld r20,STK_REG(r20)(r1) + ld r19,STK_REG(r19)(r1) + ld r18,STK_REG(r18)(r1) + ld r17,STK_REG(r17)(r1) + ld r16,STK_REG(r16)(r1) + ld r15,STK_REG(r15)(r1) + ld r14,STK_REG(r14)(r1) +.Lexit: + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + ld r3,48(r1) + ld r4,56(r1) + ld r5,64(r1) + b __copy_tofrom_user_base + + +_GLOBAL(__copy_tofrom_user_power7) +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,48(r1) + std r4,56(r1) + std r5,64(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,48(r1) + std r4,56(r1) + std r5,64(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + std r17,STK_REG(r17)(r1) + std r18,STK_REG(r18)(r1) + std r19,STK_REG(r19)(r1) + std r20,STK_REG(r20)(r1) + std r21,STK_REG(r21)(r1) + std r22,STK_REG(r22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. 
*/ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r7,16(r4) +err2; ld r8,24(r4) +err2; ld r9,32(r4) +err2; ld r10,40(r4) +err2; ld r11,48(r4) +err2; ld r12,56(r4) +err2; ld r14,64(r4) +err2; ld r15,72(r4) +err2; ld r16,80(r4) +err2; ld r17,88(r4) +err2; ld r18,96(r4) +err2; ld r19,104(r4) +err2; ld r20,112(r4) +err2; ld r21,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r7,16(r3) +err2; std r8,24(r3) +err2; std r9,32(r3) +err2; std r10,40(r3) +err2; std r11,48(r3) +err2; std r12,56(r3) +err2; std r14,64(r3) +err2; std r15,72(r3) +err2; std r16,80(r3) +err2; std r17,88(r3) +err2; std r18,96(r3) +err2; std r19,104(r3) +err2; std r20,112(r3) +err2; std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + ld r17,STK_REG(r17)(r1) + ld r18,STK_REG(r18)(r1) + ld r19,STK_REG(r19)(r1) + ld r20,STK_REG(r20)(r1) + ld r21,STK_REG(r21)(r1) + ld r22,STK_REG(r22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) +err1; ld r9,32(r4) +err1; ld r10,40(r4) +err1; ld r11,48(r4) +err1; ld r12,56(r4) + addi r4,r4,64 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) +err1; std r9,32(r3) +err1; std r10,40(r3) +err1; std r11,48(r3) +err1; std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl .enter_vmx_usercopy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STACKFRAMESIZE+48(r1) + ld r4,STACKFRAMESIZE+56(r1) + ld r5,STACKFRAMESIZE+64(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r6,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. 
r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx vr7,r0,r4 +err4; lvx vr6,r4,r9 +err4; lvx vr5,r4,r10 +err4; lvx vr4,r4,r11 +err4; lvx vr3,r4,r12 +err4; lvx vr2,r4,r14 +err4; lvx vr1,r4,r15 +err4; lvx vr0,r4,r16 + addi r4,r4,128 +err4; stvx vr7,r0,r3 +err4; stvx vr6,r3,r9 +err4; stvx vr5,r3,r10 +err4; stvx vr4,r3,r11 +err4; stvx vr3,r3,r12 +err4; stvx vr2,r3,r14 +err4; stvx vr1,r3,r15 +err4; stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b .exit_vmx_usercopy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r7,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned 
*/ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + lvsl vr16,0,r4 /* Setup permute control vector */ +err3; lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 +err3; lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 +err3; lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 +err3; lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 +err3; lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx vr7,r0,r4 + vperm vr8,vr0,vr7,vr16 +err4; lvx vr6,r4,r9 + vperm vr9,vr7,vr6,vr16 +err4; lvx vr5,r4,r10 + vperm vr10,vr6,vr5,vr16 +err4; lvx vr4,r4,r11 + vperm vr11,vr5,vr4,vr16 +err4; lvx vr3,r4,r12 + vperm vr12,vr4,vr3,vr16 +err4; lvx vr2,r4,r14 + vperm vr13,vr3,vr2,vr16 +err4; lvx vr1,r4,r15 + vperm vr14,vr2,vr1,vr16 +err4; lvx vr0,r4,r16 + vperm vr15,vr1,vr0,vr16 + addi r4,r4,128 +err4; stvx vr8,r0,r3 +err4; stvx vr9,r3,r9 +err4; stvx vr10,r3,r10 +err4; stvx vr11,r3,r11 +err4; stvx vr12,r3,r12 +err4; stvx vr13,r3,r14 +err4; stvx vr14,r3,r15 +err4; stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 +err3; lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 +err3; lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 +err3; lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 +err3; lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r6,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b .exit_vmx_usercopy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff -uprN linux-2.6.32/arch/powerpc/lib/locks.c linux-2.6.32.ovz/arch/powerpc/lib/locks.c --- linux-2.6.32/arch/powerpc/lib/locks.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/locks.c 2015-09-10 10:07:50.000000000 -0700 @@ -84,12 +84,16 @@ void __rw_yield(raw_rwlock_t *rw) void 
__raw_spin_unlock_wait(raw_spinlock_t *lock) { + smp_mb(); + while (lock->slock) { HMT_low(); if (SHARED_PROCESSOR) __spin_yield(lock); } HMT_medium(); + + smp_mb(); } EXPORT_SYMBOL(__raw_spin_unlock_wait); diff -uprN linux-2.6.32/arch/powerpc/lib/Makefile linux-2.6.32.ovz/arch/powerpc/lib/Makefile --- linux-2.6.32/arch/powerpc/lib/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/Makefile 2015-09-10 10:07:25.000000000 -0700 @@ -17,12 +17,15 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o obj-$(CONFIG_HAS_IOMEM) += devres.o obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ - memcpy_64.o usercopy_64.o mem_64.o string.o + memcpy_64.o usercopy_64.o mem_64.o string.o \ + checksum_wrappers_64.o copyuser_power7.o \ + copypage_power7.o memcpy_power7.o obj-$(CONFIG_XMON) += sstep.o obj-$(CONFIG_KPROBES) += sstep.o ifeq ($(CONFIG_PPC64),y) obj-$(CONFIG_SMP) += locks.o +obj-$(CONFIG_ALTIVEC) += vmx-helper.o endif obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o diff -uprN linux-2.6.32/arch/powerpc/lib/memcpy_64.S linux-2.6.32.ovz/arch/powerpc/lib/memcpy_64.S --- linux-2.6.32/arch/powerpc/lib/memcpy_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/memcpy_64.S 2015-09-10 10:07:25.000000000 -0700 @@ -11,7 +11,11 @@ .align 7 _GLOBAL(memcpy) +BEGIN_FTR_SECTION std r3,48(r1) /* save destination pointer for return value */ +FTR_SECTION_ELSE + b memcpy_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) PPC_MTOCRF 0x01,r5 cmpldi cr1,r5,16 neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry diff -uprN linux-2.6.32/arch/powerpc/lib/memcpy_power7.S linux-2.6.32.ovz/arch/powerpc/lib/memcpy_power7.S --- linux-2.6.32/arch/powerpc/lib/memcpy_power7.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/memcpy_power7.S 2015-09-10 10:07:25.000000000 -0700 @@ -0,0 +1,650 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include + +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + +_GLOBAL(memcpy_power7) +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,48(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,48(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + std r17,STK_REG(r17)(r1) + std r18,STK_REG(r18)(r1) + std r19,STK_REG(r19)(r1) + std r20,STK_REG(r20)(r1) + std r21,STK_REG(r21)(r1) + std r22,STK_REG(r22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + ld r14,64(r4) + ld r15,72(r4) + ld r16,80(r4) + ld r17,88(r4) + ld r18,96(r4) + ld r19,104(r4) + ld r20,112(r4) + ld r21,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + std r14,64(r3) + std r15,72(r3) + std r16,80(r3) + std r17,88(r3) + std r18,96(r3) + std r19,104(r3) + std r20,112(r3) + std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + ld r17,STK_REG(r17)(r1) + ld r18,STK_REG(r18)(r1) + ld r19,STK_REG(r19)(r1) + ld r20,STK_REG(r20)(r1) + ld r21,STK_REG(r21)(r1) + ld r22,STK_REG(r22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + addi r4,r4,64 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + addi r4,r4,32 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f + ld r0,0(r4) + ld r6,8(r4) + addi r4,r4,16 + std r0,0(r3) + std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: ld r3,48(r1) + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r4,56(r1) + std r5,64(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl .enter_vmx_copy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STACKFRAMESIZE+48(r1) + ld r4,STACKFRAMESIZE+56(r1) + ld 
r5,STACKFRAMESIZE+64(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r6,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the destination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. 
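+ * Each iteration of the loop below therefore moves one full 128B + * cacheline through eight 16-byte VMX registers (vr0-vr7).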
+ */ + .align 5 +8: + lvx vr7,r0,r4 + lvx vr6,r4,r9 + lvx vr5,r4,r10 + lvx vr4,r4,r11 + lvx vr3,r4,r12 + lvx vr2,r4,r14 + lvx vr1,r4,r15 + lvx vr0,r4,r16 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r9 + stvx vr5,r3,r10 + stvx vr4,r3,r11 + stvx vr3,r3,r12 + stvx vr2,r3,r14 + stvx vr1,r3,r15 + stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,48(r1) + b .exit_vmx_copy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r7,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + lvsl vr16,0,r4 /* Setup permute control vector */ + lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 + lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 + lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 + lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. 
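+ * Here, in the unaligned variant, each store vector is first assembled + * by vperm from two adjacent source vectors, using the lvsl-generated + * control vector in vr16 to compensate for the source misalignment.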
+ */ + .align 5 +8: + lvx vr7,r0,r4 + vperm vr8,vr0,vr7,vr16 + lvx vr6,r4,r9 + vperm vr9,vr7,vr6,vr16 + lvx vr5,r4,r10 + vperm vr10,vr6,vr5,vr16 + lvx vr4,r4,r11 + vperm vr11,vr5,vr4,vr16 + lvx vr3,r4,r12 + vperm vr12,vr4,vr3,vr16 + lvx vr2,r4,r14 + vperm vr13,vr3,vr2,vr16 + lvx vr1,r4,r15 + vperm vr14,vr2,vr1,vr16 + lvx vr0,r4,r16 + vperm vr15,vr1,vr0,vr16 + addi r4,r4,128 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + stvx vr12,r3,r12 + stvx vr13,r3,r14 + stvx vr14,r3,r15 + stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 + lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 + lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 + lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,48(r1) + b .exit_vmx_copy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff -uprN linux-2.6.32/arch/powerpc/lib/string.S linux-2.6.32.ovz/arch/powerpc/lib/string.S --- linux-2.6.32/arch/powerpc/lib/string.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/string.S 2015-09-10 10:06:06.000000000 -0700 @@ -28,7 +28,7 @@ _GLOBAL(strcpy) /* This clears out any unused part of the destination buffer, just as the libc version does. -- paulus */ _GLOBAL(strncpy) - cmpwi 0,r5,0 + PPC_LCMPI 0,r5,0 beqlr mtctr r5 addi r6,r3,-1 @@ -39,7 +39,7 @@ _GLOBAL(strncpy) bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ bnelr /* if we didn't hit a null char, we're done */ mfctr r5 - cmpwi 0,r5,0 /* any space left in destination buffer? */ + PPC_LCMPI 0,r5,0 /* any space left in destination buffer? 
*/ beqlr /* we know r0 == 0 here */ 2: stbu r0,1(r6) /* clear it out if so */ bdnz 2b @@ -70,8 +70,8 @@ _GLOBAL(strcmp) blr _GLOBAL(strncmp) - PPC_LCMPI r5,0 - beqlr + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r5,r3,-1 addi r4,r4,-1 @@ -82,6 +82,8 @@ _GLOBAL(strncmp) beqlr 1 bdnzt eq,1b blr +2: li r3,0 + blr _GLOBAL(strlen) addi r4,r3,-1 @@ -92,8 +94,8 @@ _GLOBAL(strlen) blr _GLOBAL(memcmp) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r6,r3,-1 addi r4,r4,-1 @@ -106,8 +108,8 @@ _GLOBAL(memcmp) blr _GLOBAL(memchr) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r3,r3,-1 1: lbzu r0,1(r3) diff -uprN linux-2.6.32/arch/powerpc/lib/vmx-helper.c linux-2.6.32.ovz/arch/powerpc/lib/vmx-helper.c --- linux-2.6.32/arch/powerpc/lib/vmx-helper.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/vmx-helper.c 2015-09-10 10:07:25.000000000 -0700 @@ -0,0 +1,73 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Sukadev Bhattiprolu + * Anton Blanchard + */ +#include <linux/uaccess.h> +#include <linux/hardirq.h> + +int enter_vmx_usercopy(void) +{ + if (in_interrupt()) + return 0; + + /* This acts as preempt_disable() as well, which lets us safely + * call enable_kernel_altivec(). We need to disable page faults + * as they can call schedule and thus make us lose the VMX + * context. So on page faults, we just fail, which will cause + * a fallback to the normal non-vmx copy. + */ + pagefault_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * This function must return 0 because we tail call optimise when calling + * from __copy_tofrom_user_power7 which returns 0 on success. + */ +int exit_vmx_usercopy(void) +{ + pagefault_enable(); + return 0; +} + +int enter_vmx_copy(void) +{ + if (in_interrupt()) + return 0; + + preempt_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * All calls to this function will be optimised into tail calls. We are + * passed a pointer to the destination which we return as required by a + * memcpy implementation. + */ +void *exit_vmx_copy(void *dest) +{ + preempt_enable(); + return dest; +}
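Taken together, the helpers above define the bracketing protocol for every VMX-accelerated copy in this patch: enter_vmx_usercopy()/exit_vmx_usercopy() wrap user copies (page faults are disabled, so a faulting access simply falls back to the scalar path), while enter_vmx_copy()/exit_vmx_copy() wrap plain memcpy and only toggle preemption. As a rough illustration, here is a minimal C sketch of the calling convention that memcpy_power7 implements in assembly; scalar_copy_loop() and vmx_copy_loop() are hypothetical stand-ins for the real copy bodies:

	#include <stddef.h>

	int enter_vmx_copy(void);		/* from vmx-helper.c above */
	void *exit_vmx_copy(void *dest);	/* from vmx-helper.c above */
	/* Hypothetical copy bodies standing in for the assembly loops. */
	void *scalar_copy_loop(void *dest, const void *src, size_t n);
	void *vmx_copy_loop(void *dest, const void *src, size_t n);

	void *power7_memcpy_sketch(void *dest, const void *src, size_t n)
	{
		/* enter_vmx_copy() returns 0 in interrupt context, where
		 * the VMX register state must not be touched. */
		if (!enter_vmx_copy())
			return scalar_copy_loop(dest, src, n);

		vmx_copy_loop(dest, src, n);

		/* exit_vmx_copy() re-enables preemption and returns dest,
		 * which is why the assembly version can tail-call it with
		 * r3 already holding the saved destination pointer. */
		return exit_vmx_copy(dest);
	}

The usercopy variant has the same shape, except that exit_vmx_usercopy() returns 0 so that __copy_tofrom_user_power7 can tail-call it on the success path.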
diff -uprN linux-2.6.32/arch/powerpc/Makefile linux-2.6.32.ovz/arch/powerpc/Makefile --- linux-2.6.32/arch/powerpc/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/Makefile 2015-09-10 10:06:21.000000000 -0700 @@ -158,9 +158,11 @@ drivers-$(CONFIG_OPROFILE) += arch/power # Default to zImage, override when needed all: zImage -BOOT_TARGETS = zImage zImage.initrd uImage zImage% dtbImage% treeImage.% cuImage.% simpleImage.% +# With make 3.82 we cannot mix normal and wildcard targets +BOOT_TARGETS1 := zImage zImage.initrd uImage +BOOT_TARGETS2 := zImage% dtbImage% treeImage.% cuImage.% simpleImage.% -PHONY += $(BOOT_TARGETS) +PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2) boot := arch/$(ARCH)/boot @@ -175,10 +177,16 @@ relocs_check: arch/powerpc/relocs_check. zImage: relocs_check endif -$(BOOT_TARGETS): vmlinux +$(BOOT_TARGETS1): vmlinux + $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) +$(BOOT_TARGETS2): vmlinux + $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) + + +bootwrapper_install: $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) -bootwrapper_install %.dtb: +%.dtb: $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) define archhelp diff -uprN linux-2.6.32/arch/powerpc/mm/fault.c linux-2.6.32.ovz/arch/powerpc/mm/fault.c --- linux-2.6.32/arch/powerpc/mm/fault.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/fault.c 2015-09-10 10:07:52.000000000 -0700 @@ -125,6 +125,7 @@ int __kprobes do_page_fault(struct pt_re int is_write = 0, ret; int trap = TRAP(regs); int is_exec = trap == 0x400; + unsigned int flags = 0; #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) /* @@ -141,6 +142,12 @@ int __kprobes do_page_fault(struct pt_re is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ + if (is_write) + flags |= FAULT_FLAG_WRITE; + + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (notify_page_fault(regs)) return 0; @@ -171,7 +178,7 @@ int __kprobes do_page_fault(struct pt_re die("Weird page fault", regs, SIGSEGV); } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -302,7 +309,7 @@ good_area: * the fault. */ survive: - ret = handle_mm_fault(mm, vma, address, is_write ? 
FAULT_FLAG_WRITE : 0); + ret = handle_mm_fault(mm, vma, address, flags); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) goto out_of_memory; @@ -312,7 +319,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +330,7 @@ good_area: #endif } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); @@ -353,15 +360,10 @@ bad_area_nosemaphore: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_group_exit(SIGKILL); - return SIGKILL; + if (!user_mode(regs)) + return SIGKILL; + pagefault_out_of_memory(); + return 0; do_sigbus: up_read(&mm->mmap_sem); diff -uprN linux-2.6.32/arch/powerpc/mm/fsl_booke_mmu.c linux-2.6.32.ovz/arch/powerpc/mm/fsl_booke_mmu.c --- linux-2.6.32/arch/powerpc/mm/fsl_booke_mmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/fsl_booke_mmu.c 2015-09-10 10:06:12.000000000 -0700 @@ -131,15 +131,10 @@ void settlbcam(int index, unsigned long TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); -#ifndef CONFIG_KGDB /* want user access for breakpoints */ if (flags & _PAGE_USER) { TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); } -#else - TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); -#endif tlbcam_addrs[index].start = virt; tlbcam_addrs[index].limit = virt + size - 1; diff -uprN linux-2.6.32/arch/powerpc/mm/gup.c linux-2.6.32.ovz/arch/powerpc/mm/gup.c --- linux-2.6.32/arch/powerpc/mm/gup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/gup.c 2015-09-10 10:06:45.000000000 -0700 @@ -62,7 +62,7 @@ static noinline int gup_huge_pte(pte_t * { unsigned long mask; unsigned long pte_end; - struct page *head, *page; + struct page *head, *page, *tail; pte_t pte; int refs; @@ -82,6 +82,7 @@ static noinline int gup_huge_pte(pte_t * refs = 0; head = pte_page(pte); page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -96,10 +97,20 @@ static noinline int gup_huge_pte(pte_t * } if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ - while (*nr) { - put_page(page); - (*nr)--; - } + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; diff -uprN linux-2.6.32/arch/powerpc/mm/hash_utils_64.c linux-2.6.32.ovz/arch/powerpc/mm/hash_utils_64.c --- linux-2.6.32/arch/powerpc/mm/hash_utils_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/hash_utils_64.c 2015-09-10 10:07:15.000000000 -0700 @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef DEBUG #define DBG(fmt...) 
udbg_printf(fmt) @@ -541,11 +542,11 @@ static unsigned long __init htab_get_tab } #ifdef CONFIG_MEMORY_HOTPLUG -void create_section_mapping(unsigned long start, unsigned long end) +int create_section_mapping(unsigned long start, unsigned long end) { - BUG_ON(htab_bolt_mapping(start, end, __pa(start), + return htab_bolt_mapping(start, end, __pa(start), pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize)); + mmu_kernel_ssize); } int remove_section_mapping(unsigned long start, unsigned long end) @@ -625,6 +626,16 @@ static void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; +#ifdef CONFIG_FA_DUMP + /* + * If firmware assisted dump is active firmware preserves + * the contents of htab along with entire partition memory. + * Clear the htab if firmware assisted dump is active so + * that we dont end up using old mappings. + */ + if (is_fadump_active() && ppc_md.hpte_clear_all) + ppc_md.hpte_clear_all(); +#endif } else { /* Find storage for the HPT. Must be contiguous in * the absolute address space. On cell we want it to be @@ -879,6 +890,18 @@ static inline int subpage_protection(pgd } #endif +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d psize=%d pte=0x%lx\n", + trap, vsid, ssize, psize, pte); +} + /* Result code is: * 0 - handled * 1 - normal page fault @@ -1039,6 +1062,12 @@ int hash_page(unsigned long ea, unsigned local, ssize, spp); } + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + pte_val(*ptep)); #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); #else @@ -1057,8 +1086,7 @@ void hash_preload(struct mm_struct *mm, void *pgdir; pte_t *ptep; unsigned long flags; - int local = 0; - int ssize; + int rc, ssize, local = 0; BUG_ON(REGION_ID(ea) != USER_REGION_ID); @@ -1104,11 +1132,18 @@ void hash_preload(struct mm_struct *mm, /* Hash it in */ #ifdef CONFIG_PPC_HAS_HASH_64K if (mm->context.user_psize == MMU_PAGE_64K) - __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); + rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); else #endif /* CONFIG_PPC_HAS_HASH_64K */ - __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, - subpage_protection(pgdir, ea)); + rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, + subpage_protection(pgdir, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm->context.user_psize, pte_val(*ptep)); local_irq_restore(flags); } diff -uprN linux-2.6.32/arch/powerpc/mm/hugetlbpage.c linux-2.6.32.ovz/arch/powerpc/mm/hugetlbpage.c --- linux-2.6.32/arch/powerpc/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/hugetlbpage.c 2015-09-10 10:07:03.000000000 -0700 @@ -226,7 +226,7 @@ pte_t *huge_pte_offset(struct mm_struct } pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) + unsigned long addr, unsigned long sz, bool *shared) { pgd_t *pg; pud_t *pu; @@ -556,32 +556,27 @@ int 
hash_huge_page(struct mm_struct *mm, unsigned long old_pte, new_pte; unsigned long va, rflags, pa, sz; long slot; - int err = 1; int ssize = user_segment_size(ea); unsigned int mmu_psize; int shift; + mmu_psize = get_slice_psize(mm, ea); if (!mmu_huge_psizes[mmu_psize]) - goto out; + return 1; ptep = huge_pte_offset(mm, ea); /* Search the Linux page table for a match with va */ va = hpt_va(ea, vsid, ssize); - /* - * If no pte found or not present, send the problem up to - * do_page_fault + /* If no pte found send the problem up to do_page_fault */ - if (unlikely(!ptep || pte_none(*ptep))) - goto out; + if (unlikely(!ptep)) + return 1; + + /* We need _PAGE_PRESENT as part of our access check */ + access |= _PAGE_PRESENT; - /* - * Check the user's access rights to the page. If access should be - * prevented then send the problem up to do_page_fault. - */ - if (unlikely(access & ~pte_val(*ptep))) - goto out; /* * At this point, we have a pte (old_pte) which can be used to build * or update an HPTE. There are 2 cases: @@ -593,13 +588,19 @@ int hash_huge_page(struct mm_struct *mm, * because we are doing software DIRTY bit management and the * page is currently not DIRTY. */ - - do { old_pte = pte_val(*ptep); - if (old_pte & _PAGE_BUSY) - goto out; + /* If PTE busy, retry the access */ + if (unlikely(old_pte & _PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(access & ~old_pte)) + return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pte |= _PAGE_DIRTY; } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, old_pte, new_pte)); @@ -671,8 +672,16 @@ repeat: } } - if (unlikely(slot == -2)) - panic("hash_huge_page: pte_insert failed\n"); + /* + * Hypervisor failure. 
Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, old_pte); + return -1; + } new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); } @@ -681,11 +690,7 @@ repeat: * No need to use ldarx/stdcx here */ *ptep = __pte(new_pte & ~_PAGE_BUSY); - - err = 0; - - out: - return err; + return 0; } static void __init set_huge_psize(int psize) diff -uprN linux-2.6.32/arch/powerpc/mm/init_64.c linux-2.6.32.ovz/arch/powerpc/mm/init_64.c --- linux-2.6.32/arch/powerpc/mm/init_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/init_64.c 2015-09-10 10:07:38.000000000 -0700 @@ -77,7 +77,9 @@ #endif /* CONFIG_PPC_STD_MMU_64 */ phys_addr_t memstart_addr = ~0; +EXPORT_SYMBOL_GPL(memstart_addr); phys_addr_t kernstart_addr; +EXPORT_SYMBOL_GPL(kernstart_addr); void free_initmem(void) { @@ -217,19 +219,55 @@ static void __meminit vmemmap_create_map } #endif /* CONFIG_PPC_BOOK3E */ -int __meminit vmemmap_populate(struct page *start_page, - unsigned long nr_pages, int node) +struct vmemmap_backing *vmemmap_list; + +static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) +{ + static struct vmemmap_backing *next; + static int num_left; + + /* allocate a page when required and hand out chunks */ + if (!next || !num_left) { + next = vmemmap_alloc_block(PAGE_SIZE, node); + if (unlikely(!next)) { + WARN_ON(1); + return NULL; + } + num_left = PAGE_SIZE / sizeof(struct vmemmap_backing); + } + + num_left--; + + return next++; +} + +static __meminit void vmemmap_list_populate(unsigned long phys, + unsigned long start, + int node) +{ + struct vmemmap_backing *vmem_back; + + vmem_back = vmemmap_list_alloc(node); + if (unlikely(!vmem_back)) { + WARN_ON(1); + return; + } + + vmem_back->phys = phys; + vmem_back->virt_addr = start; + vmem_back->list = vmemmap_list; + + vmemmap_list = vmem_back; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { - unsigned long start = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + nr_pages); unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; /* Align to the page size of the linear mapping. 
*/ start = _ALIGN_DOWN(start, page_size); - pr_debug("vmemmap_populate page %p, %ld pages, node %d\n", - start_page, nr_pages, node); - pr_debug(" -> map %lx..%lx\n", start, end); + pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); for (; start < end; start += page_size) { void *p; @@ -241,6 +279,8 @@ int __meminit vmemmap_populate(struct pa if (!p) return -ENOMEM; + vmemmap_list_populate(__pa(p), start, node); + pr_debug(" * %016lx..%016lx allocated at %p\n", start, start + page_size, p); diff -uprN linux-2.6.32/arch/powerpc/mm/mem.c linux-2.6.32.ovz/arch/powerpc/mm/mem.c --- linux-2.6.32/arch/powerpc/mm/mem.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/mem.c 2015-09-10 10:07:19.000000000 -0700 @@ -47,6 +47,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -57,7 +58,7 @@ int init_bootmem_done; int mem_init_done; -phys_addr_t memory_limit; +unsigned long long memory_limit; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; @@ -127,7 +128,8 @@ int arch_add_memory(int nid, u64 start, pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); - create_section_mapping(start, start + size); + if (create_section_mapping(start, start + size)) + return -EINVAL; /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; @@ -511,3 +513,23 @@ void update_mmu_cache(struct vm_area_str hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ } + +#ifdef CONFIG_STRICT_DEVMEM +/* + * devmem_is_allowed(): check to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pfn) +{ + if (iomem_is_exclusive(pfn << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pfn)) + return 1; + if (page_is_rtas_user_buf(pfn)) + return 1; + return 0; +} +#endif /* CONFIG_STRICT_DEVMEM */ diff -uprN linux-2.6.32/arch/powerpc/mm/numa.c linux-2.6.32.ovz/arch/powerpc/mm/numa.c --- linux-2.6.32/arch/powerpc/mm/numa.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/numa.c 2015-09-10 10:07:44.000000000 -0700 @@ -20,10 +20,15 @@ #include #include #include +#include +#include #include #include #include #include +#include +#include +#include static int numa_enabled = 1; @@ -42,6 +47,12 @@ EXPORT_SYMBOL(node_data); static int min_common_depth; static int n_mem_addr_cells, n_mem_size_cells; +static int form1_affinity; + +#define MAX_DISTANCE_REF_POINTS 4 +static int distance_ref_points_depth; +static const unsigned int *distance_ref_points; +static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, unsigned int *nid) @@ -132,7 +143,7 @@ static void __init get_node_active_regio work_with_active_regions(nid, get_active_region_work_fn, node_ar); } -static void __cpuinit map_cpu_to_node(int cpu, int node) +static void map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; @@ -142,7 +153,7 @@ static void __cpuinit map_cpu_to_node(in cpu_set(cpu, numa_cpumask_lookup_table[node]); } -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -156,7 +167,7 @@ static void unmap_cpu_from_node(unsigned cpu, node); } } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* 
CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ /* must hold reference to node during call */ static const int *of_get_associativity(struct device_node *dev) @@ -179,31 +190,78 @@ static const u32 *of_get_usable_memory(s return prop; } +int __node_distance(int a, int b) +{ + int i; + int distance = LOCAL_DISTANCE; + + if (!form1_affinity) + return distance; + + for (i = 0; i < distance_ref_points_depth; i++) { + if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) + break; + + /* Double the distance for each NUMA level */ + distance *= 2; + } + + return distance; +} +EXPORT_SYMBOL(__node_distance); + +static void initialize_distance_lookup_table(int nid, + const unsigned int *associativity) +{ + int i; + + if (!form1_affinity) + return; + + for (i = 0; i < distance_ref_points_depth; i++) { + distance_lookup_table[nid][i] = + associativity[distance_ref_points[i]]; + } +} + /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. */ -static int of_node_to_nid_single(struct device_node *device) +static int associativity_to_nid(const unsigned int *associativity) { int nid = -1; - const unsigned int *tmp; if (min_common_depth == -1) goto out; - tmp = of_get_associativity(device); - if (!tmp) - goto out; - - if (tmp[0] >= min_common_depth) - nid = tmp[min_common_depth]; + if (associativity[0] >= min_common_depth) + nid = associativity[min_common_depth]; /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= MAX_NUMNODES) nid = -1; + + if (nid > 0 && associativity[0] >= distance_ref_points_depth) + initialize_distance_lookup_table(nid, associativity); + out: return nid; } +/* Returns the nid associated with the given device tree node, + * or -1 if not found. + */ +static int of_node_to_nid_single(struct device_node *device) +{ + int nid = -1; + const unsigned int *tmp; + + tmp = of_get_associativity(device); + if (tmp) + nid = associativity_to_nid(tmp); + return nid; +} + /* Walk the device tree upwards, looking for an associativity id */ int of_node_to_nid(struct device_node *device) { @@ -226,26 +284,12 @@ int of_node_to_nid(struct device_node *d } EXPORT_SYMBOL_GPL(of_node_to_nid); -/* - * In theory, the "ibm,associativity" property may contain multiple - * associativity lists because a resource may be multiply connected - * into the machine. This resource then has different associativity - * characteristics relative to its multiple connections. We ignore - * this for now. We also assume that all cpu and memory sets have - * their distances represented at a common level. This won't be - * true for hierarchical NUMA. - * - * In any case the ibm,associativity-reference-points should give - * the correct depth for a normal NUMA system. - * - * - Dave Hansen - */ static int __init find_min_common_depth(void) { int depth; - const unsigned int *ref_points; struct device_node *rtas_root; - unsigned int len; + struct device_node *chosen; + const char *vec5; rtas_root = of_find_node_by_path("/rtas"); @@ -253,23 +297,67 @@ static int __init find_min_common_depth( return -1; /* - * this property is 2 32-bit integers, each representing a level of - * depth in the associativity nodes. The first is for an SMP - * configuration (should be all 0's) and the second is for a normal - * NUMA configuration. + * This property is a set of 32-bit integers, each representing + * an index into the ibm,associativity nodes. 
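+ * Each index picks out one level of the associativity hierarchy that + * the platform treats as a significant NUMA boundary.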
+ * + * With form 0 affinity the first integer is for an SMP configuration + * (should be all 0's) and the second is for a normal NUMA + * configuration. We have only one level of NUMA. + * + * With form 1 affinity the first integer is the most significant + * NUMA boundary and the following are progressively less significant + * boundaries. There can be more than one level of NUMA. */ - ref_points = of_get_property(rtas_root, - "ibm,associativity-reference-points", &len); + distance_ref_points = of_get_property(rtas_root, + "ibm,associativity-reference-points", + &distance_ref_points_depth); - if ((len >= 2 * sizeof(unsigned int)) && ref_points) { - depth = ref_points[1]; - } else { + if (!distance_ref_points) { dbg("NUMA: ibm,associativity-reference-points not found.\n"); - depth = -1; + goto err; + } + + distance_ref_points_depth /= sizeof(int); + +#define VEC5_AFFINITY_BYTE 5 +#define VEC5_AFFINITY 0x80 + chosen = of_find_node_by_path("/chosen"); + if (chosen) { + vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL); + if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) { + dbg("Using form 1 affinity\n"); + form1_affinity = 1; + } + } + + if (form1_affinity) { + depth = distance_ref_points[0]; + } else { + if (distance_ref_points_depth < 2) { + printk(KERN_WARNING "NUMA: " + "short ibm,associativity-reference-points\n"); + goto err; + } + + depth = distance_ref_points[1]; + } + + /* + * Warn and cap if the hardware supports more than + * MAX_DISTANCE_REF_POINTS domains. + */ + if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { + printk(KERN_WARNING "NUMA: distance array capped at " + "%d entries\n", MAX_DISTANCE_REF_POINTS); + distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } - of_node_put(rtas_root); + of_node_put(rtas_root); return depth; + +err: + of_node_put(rtas_root); + return -1; } static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) @@ -1141,4 +1229,270 @@ int hot_add_scn_to_nid(unsigned long scn return nid; } +static u64 hot_add_drconf_memory_max(void) +{ + struct device_node *memory = NULL; + unsigned int drconf_cell_cnt = 0; + u64 lmb_size = 0; + const u32 *dm = 0; + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + drconf_cell_cnt = of_get_drconf_memory(memory, &dm); + lmb_size = of_get_lmb_size(memory); + of_node_put(memory); + } + return lmb_size * drconf_cell_cnt; +} + +/* + * memory_hotplug_max - return max address of memory that may be added + * + * This is currently only used on systems that support drconfig memory + * hotplug. + */ +u64 memory_hotplug_max(void) +{ + return max(hot_add_drconf_memory_max(), lmb_end_of_DRAM()); +} #endif /* CONFIG_MEMORY_HOTPLUG */ + +/* Virtual Processor Home Node (VPHN) support */ +#ifdef CONFIG_PPC_SPLPAR +#define VPHN_NR_CHANGE_CTRS (8) +static u8 vphn_cpu_change_counts[NR_CPUS][VPHN_NR_CHANGE_CTRS]; +static cpumask_t cpu_associativity_changes_mask; +static int vphn_enabled; +static void set_topology_timer(void); + +/* + * Store the current values of the associativity change counters in the + * hypervisor. 
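+ * These cached values are later compared against the live counters in + * each cpu's VPA to detect home-node associativity changes.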
+ */ +static void setup_cpu_associativity_change_counters(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + int i; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < VPHN_NR_CHANGE_CTRS; i++) + counts[i] = hypervisor_counts[i]; + } +} + +/* + * The hypervisor maintains a set of 8 associativity change counters in + * the VPA of each cpu that correspond to the associativity levels in the + * ibm,associativity-reference-points property. When an associativity + * level changes, the corresponding counter is incremented. + * + * Set a bit in cpu_associativity_changes_mask for each cpu whose home + * node associativity levels have changed. + * + * Returns the number of cpus with unhandled associativity changes. + */ +static int update_cpu_associativity_changes_mask(void) +{ + int cpu, nr_cpus = 0; + cpumask_t *changes = &cpu_associativity_changes_mask; + + cpumask_clear(changes); + + for_each_possible_cpu(cpu) { + int i, changed = 0; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < VPHN_NR_CHANGE_CTRS; i++) { + if (hypervisor_counts[i] > counts[i]) { + counts[i] = hypervisor_counts[i]; + changed = 1; + } + } + if (changed) { + cpumask_set_cpu(cpu, changes); + nr_cpus++; + } + } + + return nr_cpus; +} + +static void topology_work_fn(struct work_struct *work) +{ + rebuild_sched_domains(); +} +static DECLARE_WORK(topology_work, topology_work_fn); + +void topology_schedule_update(void) +{ + schedule_work(&topology_work); +} + +static void topology_timer_fn(unsigned long ignored) +{ + if (!vphn_enabled) + return; + if (update_cpu_associativity_changes_mask() > 0) + topology_schedule_update(); + set_topology_timer(); +} +static struct timer_list topology_timer = + TIMER_INITIALIZER(topology_timer_fn, 0, 0); + +static void set_topology_timer(void) +{ + topology_timer.data = 0; + topology_timer.expires = jiffies + 60 * HZ; + add_timer(&topology_timer); +} + +/* + * Start polling for VPHN associativity changes. + */ +int start_topology_update(void) +{ + int rc = 0; + + /* Disabled until races with load balancing are fixed */ + if (0 && firmware_has_feature(FW_FEATURE_VPHN)) { + vphn_enabled = 1; + setup_cpu_associativity_change_counters(); + init_timer_deferrable(&topology_timer); + set_topology_timer(); + rc = 1; + } + + return rc; +} +__initcall(start_topology_update); + +/* + * Disable polling for VPHN associativity changes. + */ +int stop_topology_update(void) +{ + vphn_enabled = 0; + return del_timer_sync(&topology_timer); +} +#endif /* CONFIG_PPC_SPLPAR */ + +/* 6 64-bit registers unpacked into 12 32-bit associativity values */ +#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32)) + +/* + * Convert the associativity domain numbers returned from the hypervisor + * to the sequence they would appear in the ibm,associativity property. + */ +static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked) +{ + int i, nr_assoc_doms = 0; + const u16 *field = (const u16*) packed; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + for (i = 0; i < VPHN_ASSOC_BUFSIZE; i++) { + if (*field == VPHN_FIELD_UNUSED) { + /* All significant fields processed, and remaining + * fields contain the reserved value of all 1's. + * Just store them. 
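+ * (Each 32-bit output slot consumes two 16-bit input fields here.)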
+ */ + unpacked[i] = *((u32*)field); + field += 2; + } else if (*field & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[i] = *field & VPHN_FIELD_MASK; + field++; + nr_assoc_doms++; + } else { + /* Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + unpacked[i] = *((u32*)field); + field += 2; + nr_assoc_doms++; + } + } + + return nr_assoc_doms; +} + +/* + * Retrieve the new associativity information for a virtual processor's + * home node. + */ +static long hcall_vphn(unsigned long cpu, unsigned int *associativity) +{ + long rc; + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + u64 flags = 1; + int hwcpu = get_hard_smp_processor_id(cpu); + + rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); + vphn_unpack_associativity(retbuf, associativity); + + return rc; +} + +static long vphn_get_associativity(unsigned long cpu, + unsigned int *associativity) +{ + long rc = hcall_vphn(cpu, associativity); + + switch (rc) { + case H_FUNCTION: + printk(KERN_INFO + "VPHN is not supported. Disabling polling...\n"); + stop_topology_update(); + break; + case H_HARDWARE: + printk(KERN_ERR + "hcall_vphn() experienced a hardware fault " + "preventing VPHN. Disabling polling...\n"); + stop_topology_update(); + } + + return rc; +} + +/* + * Update the node maps and sysfs entries for each cpu whose home node + * has changed. + */ +int arch_update_cpu_topology(void) +{ + int cpu, nid, old_nid; + unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0}; + struct sys_device *sysdev; + + for_each_cpu_mask(cpu, cpu_associativity_changes_mask) { + vphn_get_associativity(cpu, associativity); + nid = associativity_to_nid(associativity); + + if (nid < 0 || !node_online(nid)) + nid = first_online_node; + + old_nid = numa_cpu_lookup_table[cpu]; + + /* Disable hotplug while we update the cpu + * masks and sysfs. 
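+ * get_online_cpus()/put_online_cpus() hold the hotplug lock around + * the node-map and sysfs updates.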
+ */ + get_online_cpus(); + unregister_cpu_under_node(cpu, old_nid); + unmap_cpu_from_node(cpu); + map_cpu_to_node(cpu, nid); + register_cpu_under_node(cpu, nid); + put_online_cpus(); + + sysdev = get_cpu_sysdev(cpu); + if (sysdev) + kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); + } + + return 1; +} diff -uprN linux-2.6.32/arch/powerpc/mm/slice.c linux-2.6.32.ovz/arch/powerpc/mm/slice.c --- linux-2.6.32/arch/powerpc/mm/slice.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/slice.c 2015-09-10 10:07:40.000000000 -0700 @@ -230,9 +230,10 @@ static unsigned long slice_find_area_bot unsigned long start_addr, addr; struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned int unmap_factor = sysctl_unmap_area_factor; if (use_cache) { - if (len <= mm->cached_hole_size) { + if (len <= mm->cached_hole_size && !unmap_factor) { start_addr = addr = TASK_UNMAPPED_BASE; mm->cached_hole_size = 0; } else @@ -264,7 +265,9 @@ full_search: mm->free_area_cache = addr + len; return addr; } - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) + + if (use_cache && !unmap_factor && + (addr + mm->cached_hole_size) < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; } @@ -272,7 +275,8 @@ full_search: /* Make sure we didn't miss any holes */ if (use_cache && start_addr != TASK_UNMAPPED_BASE) { start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; + if (likely(!unmap_factor)) + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -287,10 +291,13 @@ static unsigned long slice_find_area_top unsigned long addr; struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned int unmap_factor = sysctl_unmap_area_factor; + int firsttime = 1; + again: /* check if free_area_cache is useful for us */ if (use_cache) { - if (len <= mm->cached_hole_size) { + if (len <= mm->cached_hole_size && !unmap_factor) { mm->cached_hole_size = 0; mm->free_area_cache = mm->mmap_base; } @@ -344,7 +351,8 @@ static unsigned long slice_find_area_top } /* remember the largest hole we saw so far */ - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) + if (use_cache && !unmap_factor && + (addr + mm->cached_hole_size) < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; /* try just below the current vma->vm_start */ @@ -352,6 +360,18 @@ static unsigned long slice_find_area_top } /* + * Using the next-fit algorithm, it is possible we started + * searching below usable address space holes. Go back to the + * top and start over. + */ + if (unmap_factor && firsttime) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + firsttime = 0; + goto again; + } + + /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. 
This scenario * can happen with large stack limits and large mmap() @@ -364,7 +384,8 @@ static unsigned long slice_find_area_top */ if (use_cache) { mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; + if (likely(!unmap_factor)) + mm->cached_hole_size = ~0UL; } return addr; @@ -424,7 +445,7 @@ unsigned long slice_get_unmapped_area(un if (fixed && (addr & ((1ul << pshift) - 1))) return -EINVAL; if (fixed && addr > (mm->task_size - len)) - return -EINVAL; + return -ENOMEM; /* If hint, make sure it matches our alignment restrictions */ if (!fixed && addr) { diff -uprN linux-2.6.32/arch/powerpc/oprofile/op_model_power4.c linux-2.6.32.ovz/arch/powerpc/oprofile/op_model_power4.c --- linux-2.6.32/arch/powerpc/oprofile/op_model_power4.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/oprofile/op_model_power4.c 2015-09-10 10:07:23.000000000 -0700 @@ -22,6 +22,13 @@ #include #define dbg(args...) +#define OPROFILE_PM_PMCSEL_MSK 0xffULL +#define OPROFILE_PM_UNIT_SHIFT 60 +#define OPROFILE_PM_UNIT_MSK 0xfULL +#define OPROFILE_MAX_PMC_NUM 3 +#define OPROFILE_PMSEL_FIELD_WIDTH 8 +#define OPROFILE_UNIT_FIELD_WIDTH 4 +#define MMCRA_SIAR_VALID_MASK 0x10000000ULL static unsigned long reset_value[OP_MAX_COUNTER]; @@ -32,6 +39,61 @@ static int use_slot_nums; static u32 mmcr0_val; static u64 mmcr1_val; static u64 mmcra_val; +static u32 cntr_marked_events; + +static int power7_marked_instr_event(u64 mmcr1) +{ + u64 psel, unit; + int pmc, cntr_marked_events = 0; + + /* Given the MMCR1 value, look at the field for each counter to + * determine if it is a marked event. Code based on the function + * power7_marked_instr_event() in file arch/powerpc/perf/power7-pmu.c. + */ + for (pmc = 0; pmc < 4; pmc++) { + psel = mmcr1 & (OPROFILE_PM_PMCSEL_MSK + << (OPROFILE_MAX_PMC_NUM - pmc) + * OPROFILE_PMSEL_FIELD_WIDTH); + psel = (psel >> ((OPROFILE_MAX_PMC_NUM - pmc) + * OPROFILE_PMSEL_FIELD_WIDTH)) & ~1ULL; + unit = mmcr1 & (OPROFILE_PM_UNIT_MSK + << (OPROFILE_PM_UNIT_SHIFT + - (pmc * OPROFILE_PMSEL_FIELD_WIDTH))); + unit = unit >> (OPROFILE_PM_UNIT_SHIFT + - (pmc * OPROFILE_PMSEL_FIELD_WIDTH)); + + switch (psel >> 4) { + case 2: + cntr_marked_events |= (pmc == 1 || pmc == 3) << pmc; + break; + case 3: + if (psel == 0x3c) { + cntr_marked_events |= (pmc == 0) << pmc; + break; + } + + if (psel == 0x3e) { + cntr_marked_events |= (pmc != 1) << pmc; + break; + } + + cntr_marked_events |= 1 << pmc; + break; + case 4: + case 5: + cntr_marked_events |= (unit == 0xd) << pmc; + break; + case 6: + if (psel == 0x64) + cntr_marked_events |= (pmc >= 2) << pmc; + break; + case 8: + cntr_marked_events |= (unit == 0xd) << pmc; + break; + } + } + return cntr_marked_events; +} static int power4_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, @@ -48,6 +110,23 @@ static int power4_reg_setup(struct op_co mmcr1_val = sys->mmcr1; mmcra_val = sys->mmcra; + /* Power 7+ and newer architectures: + * Determine which counter events in the group (the group of events is + * specified by the bit settings in the MMCR1 register) are marked + * events for use in the interrupt handler. Do the calculation once + * before OProfile starts. Information is used in the interrupt + * handler. Starting with Power 7+ we only record the sample for + * marked events if the SIAR valid bit is set. For non marked events + * the sample is always recorded. 
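+ * The result is cached in cntr_marked_events, a bitmap with + * bit i set when PMC i counts a marked event.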
+ */ + if (__is_processor(PVR_POWER7p)) + cntr_marked_events = power7_marked_instr_event(mmcr1_val); + else + cntr_marked_events = 0; /* For older processors, set the bit map + * to zero so the sample will always + * be recorded. + */ + for (i = 0; i < cur_cpu_spec->num_pmcs; ++i) reset_value[i] = 0x80000000UL - ctr[i].count; @@ -261,6 +340,28 @@ static int get_kernel(unsigned long pc, return is_kernel; } +static bool pmc_overflow(unsigned long val) +{ + if ((int)val < 0) + return true; + + /* + * Events on POWER7 can roll back if a speculative event doesn't + * eventually complete. Unfortunately in some rare cases they will + * raise a performance monitor exception. We need to catch this to + * ensure we reset the PMC. In all cases the PMC will be 256 or less + * cycles from overflow. + * + * We only do this if the first pass fails to find any overflowing + * PMCs because a user might set a period of less than 256 and we + * don't want to mistakenly reset them. + */ + if (__is_processor(PV_POWER7) && ((0x80000000 - val) <= 256)) + return true; + + return false; +} + static void power4_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr) { @@ -270,6 +371,7 @@ static void power4_handle_interrupt(stru int i; unsigned int mmcr0; unsigned long mmcra; + bool siar_valid = false; mmcra = mfspr(SPRN_MMCRA); @@ -279,11 +381,29 @@ static void power4_handle_interrupt(stru /* set the PMM bit (see comment below) */ mtmsrd(mfmsr() | MSR_PMM); + /* Check that the SIAR valid bit in MMCRA is set to 1. */ + if ((mmcra & MMCRA_SIAR_VALID_MASK) == MMCRA_SIAR_VALID_MASK) + siar_valid = true; + for (i = 0; i < cur_cpu_spec->num_pmcs; ++i) { val = classic_ctr_read(i); - if (val < 0) { + if (pmc_overflow(val)) { if (oprofile_running && ctr[i].enabled) { - oprofile_add_ext_sample(pc, regs, i, is_kernel); + /* Power 7+ and newer architectures: + * If the event is a marked event, then only + * save the sample if the SIAR valid bit is + * set. If the event is not marked, then + * always save the sample. + * Note, the Sample enable bit in the MMCRA + * register must be set to 1 if the group + * contains a marked event. 
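+ * In short, a sample is dropped only when its event + * is marked and SIAR is not valid.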
+ */ + if ((siar_valid && + (cntr_marked_events & (1 << i))) + || !(cntr_marked_events & (1 << i))) + oprofile_add_ext_sample(pc, regs, i, + is_kernel); + classic_ctr_write(i, reset_value[i]); } else { classic_ctr_write(i, 0); diff -uprN linux-2.6.32/arch/powerpc/platforms/40x/ep405.c linux-2.6.32.ovz/arch/powerpc/platforms/40x/ep405.c --- linux-2.6.32/arch/powerpc/platforms/40x/ep405.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/40x/ep405.c 2015-09-10 10:07:23.000000000 -0700 @@ -100,7 +100,7 @@ static void __init ep405_setup_arch(void /* Find & init the BCSR CPLD */ ep405_init_bcsr(); - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); } static int __init ep405_probe(void) diff -uprN linux-2.6.32/arch/powerpc/platforms/40x/ppc40x_simple.c linux-2.6.32.ovz/arch/powerpc/platforms/40x/ppc40x_simple.c --- linux-2.6.32/arch/powerpc/platforms/40x/ppc40x_simple.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/40x/ppc40x_simple.c 2015-09-10 10:07:23.000000000 -0700 @@ -65,7 +65,7 @@ static int __init ppc40x_probe(void) for (i = 0; i < ARRAY_SIZE(board); i++) { if (of_flat_dt_is_compatible(root, board[i])) { - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } } diff -uprN linux-2.6.32/arch/powerpc/platforms/40x/walnut.c linux-2.6.32.ovz/arch/powerpc/platforms/40x/walnut.c --- linux-2.6.32/arch/powerpc/platforms/40x/walnut.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/40x/walnut.c 2015-09-10 10:07:23.000000000 -0700 @@ -51,7 +51,7 @@ static int __init walnut_probe(void) if (!of_flat_dt_is_compatible(root, "ibm,walnut")) return 0; - ppc_pci_flags = PPC_PCI_REASSIGN_ALL_RSRC; + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/44x/ebony.c linux-2.6.32.ovz/arch/powerpc/platforms/44x/ebony.c --- linux-2.6.32/arch/powerpc/platforms/44x/ebony.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/44x/ebony.c 2015-09-10 10:07:23.000000000 -0700 @@ -54,7 +54,7 @@ static int __init ebony_probe(void) if (!of_flat_dt_is_compatible(root, "ibm,ebony")) return 0; - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/44x/ppc44x_simple.c linux-2.6.32.ovz/arch/powerpc/platforms/44x/ppc44x_simple.c --- linux-2.6.32/arch/powerpc/platforms/44x/ppc44x_simple.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/44x/ppc44x_simple.c 2015-09-10 10:07:23.000000000 -0700 @@ -71,7 +71,7 @@ static int __init ppc44x_probe(void) for (i = 0; i < ARRAY_SIZE(board); i++) { if (of_flat_dt_is_compatible(root, board[i])) { - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } } diff -uprN linux-2.6.32/arch/powerpc/platforms/44x/sam440ep.c linux-2.6.32.ovz/arch/powerpc/platforms/44x/sam440ep.c --- linux-2.6.32/arch/powerpc/platforms/44x/sam440ep.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/44x/sam440ep.c 2015-09-10 10:07:23.000000000 -0700 @@ -51,7 +51,7 @@ static int __init sam440ep_probe(void) if (!of_flat_dt_is_compatible(root, "acube,sam440ep")) return 0; - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/52xx/mpc52xx_pci.c 
linux-2.6.32.ovz/arch/powerpc/platforms/52xx/mpc52xx_pci.c --- linux-2.6.32/arch/powerpc/platforms/52xx/mpc52xx_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/52xx/mpc52xx_pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -371,7 +371,7 @@ mpc52xx_add_bridge(struct device_node *n pr_debug("Adding MPC52xx PCI host bridge %s\n", node->full_name); - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); if (of_address_to_resource(node, 0, &rsrc) != 0) { printk(KERN_ERR "Can't get %s resources\n", node->full_name); diff -uprN linux-2.6.32/arch/powerpc/platforms/82xx/pq2.c linux-2.6.32.ovz/arch/powerpc/platforms/82xx/pq2.c --- linux-2.6.32/arch/powerpc/platforms/82xx/pq2.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/82xx/pq2.c 2015-09-10 10:07:23.000000000 -0700 @@ -53,7 +53,7 @@ static void __init pq2_pci_add_bridge(st if (of_address_to_resource(np, 0, &r) || r.end - r.start < 0x10b) goto err; - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); hose = pcibios_alloc_controller(np); if (!hose) diff -uprN linux-2.6.32/arch/powerpc/platforms/cell/iommu.c linux-2.6.32.ovz/arch/powerpc/platforms/cell/iommu.c --- linux-2.6.32/arch/powerpc/platforms/cell/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/cell/iommu.c 2015-09-10 10:07:33.000000000 -0700 @@ -476,7 +476,7 @@ cell_iommu_setup_window(struct cbe_iommu ioid = cell_iommu_get_ioid(np); - window = kmalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid); + window = kzalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid); BUG_ON(window == NULL); window->offset = offset; @@ -518,7 +518,6 @@ cell_iommu_setup_window(struct cbe_iommu __set_bit(0, window->table.it_map); tce_build_cell(&window->table, window->table.it_offset, 1, (unsigned long)iommu->pad_page, DMA_TO_DEVICE, NULL); - window->table.it_hint = window->table.it_blocksize; return window; } diff -uprN linux-2.6.32/arch/powerpc/platforms/cell/spufs/syscalls.c linux-2.6.32.ovz/arch/powerpc/platforms/cell/spufs/syscalls.c --- linux-2.6.32/arch/powerpc/platforms/cell/spufs/syscalls.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/cell/spufs/syscalls.c 2015-09-10 10:07:24.000000000 -0700 @@ -61,7 +61,7 @@ out: static long do_spu_create(const char __user *pathname, unsigned int flags, mode_t mode, struct file *neighbor) { - char *tmp; + struct filename *tmp; int ret; tmp = getname(pathname); @@ -69,7 +69,7 @@ static long do_spu_create(const char __u if (!IS_ERR(tmp)) { struct nameidata nd; - ret = path_lookup(tmp, LOOKUP_PARENT, &nd); + ret = path_lookup(tmp->name, LOOKUP_PARENT, &nd); if (!ret) { nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE; ret = spufs_create(&nd, flags, mode, neighbor); diff -uprN linux-2.6.32/arch/powerpc/platforms/chrp/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/chrp/pci.c --- linux-2.6.32/arch/powerpc/platforms/chrp/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/chrp/pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -199,7 +199,7 @@ static void __init setup_peg2(struct pci printk ("RTAS supporting Pegasos OF not found, please upgrade" " your firmware\n"); } - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); /* keep the reference to the root node */ } diff -uprN linux-2.6.32/arch/powerpc/platforms/fsl_uli1575.c linux-2.6.32.ovz/arch/powerpc/platforms/fsl_uli1575.c --- linux-2.6.32/arch/powerpc/platforms/fsl_uli1575.c 
2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/fsl_uli1575.c 2015-09-10 10:06:44.000000000 -0700 @@ -222,6 +222,7 @@ static void __devinit quirk_final_uli524 int i; u8 *dummy; struct pci_bus *bus = dev->bus; + struct resource *res; resource_size_t end = 0; for (i = PCI_BRIDGE_RESOURCES; i < PCI_BRIDGE_RESOURCES+3; i++) { @@ -230,13 +231,12 @@ static void __devinit quirk_final_uli524 end = pci_resource_end(dev, i); } - for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) { - if ((bus->resource[i]) && - (bus->resource[i]->flags & IORESOURCE_MEM)) { - if (bus->resource[i]->end == end) - dummy = ioremap(bus->resource[i]->start, 0x4); + pci_bus_for_each_resource(bus, res, i) { + if (res && res->flags & IORESOURCE_MEM) { + if (res->end == end) + dummy = ioremap(res->start, 0x4); else - dummy = ioremap(bus->resource[i]->end - 3, 0x4); + dummy = ioremap(res->end - 3, 0x4); if (dummy) { in_8(dummy); iounmap(dummy); diff -uprN linux-2.6.32/arch/powerpc/platforms/iseries/iommu.c linux-2.6.32.ovz/arch/powerpc/platforms/iseries/iommu.c --- linux-2.6.32/arch/powerpc/platforms/iseries/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/iseries/iommu.c 2015-09-10 10:06:44.000000000 -0700 @@ -183,7 +183,7 @@ static void pci_dma_dev_setup_iseries(st BUG_ON(lsn == NULL); - tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL); + tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); iommu_table_getparms_iSeries(pdn->busno, *lsn, 0, tbl); diff -uprN linux-2.6.32/arch/powerpc/platforms/iseries/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/iseries/pci.c --- linux-2.6.32/arch/powerpc/platforms/iseries/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/iseries/pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -868,7 +868,7 @@ void __init iSeries_pcibios_init(void) /* Install IO hooks */ ppc_pci_io = iseries_pci_io; - pci_probe_only = 1; + pci_add_flags(PCI_PROBE_ONLY); /* iSeries has no IO space in the common sense, it needs to set * the IO base to 0 diff -uprN linux-2.6.32/arch/powerpc/platforms/maple/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/maple/pci.c --- linux-2.6.32/arch/powerpc/platforms/maple/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/maple/pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -565,7 +565,7 @@ void __init maple_pci_init(void) } /* Tell pci.c to not change any resource allocations. 
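+ * PCI_PROBE_ONLY makes the generic code keep the + * resource assignments done by firmware.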
*/ - pci_probe_only = 1; + pci_add_flags(PCI_PROBE_ONLY); } int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel) diff -uprN linux-2.6.32/arch/powerpc/platforms/pasemi/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/pasemi/pci.c --- linux-2.6.32/arch/powerpc/platforms/pasemi/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pasemi/pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -231,7 +231,7 @@ void __init pas_pci_init(void) pci_devs_phb_init(); /* Use the common resource allocation mechanism */ - pci_probe_only = 1; + pci_add_flags(PCI_PROBE_ONLY); } void __iomem *pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset) diff -uprN linux-2.6.32/arch/powerpc/platforms/powermac/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/powermac/pci.c --- linux-2.6.32/arch/powerpc/platforms/powermac/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/powermac/pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -731,7 +731,7 @@ static void __init setup_bandit(struct p static int __init setup_uninorth(struct pci_controller *hose, struct resource *addr) { - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); has_uninorth = 1; hose->ops = &macrisc_pci_ops; hose->cfg_addr = ioremap(addr->start + 0x800000, 0x1000); @@ -998,7 +998,7 @@ void __init pmac_pci_init(void) struct device_node *np, *root; struct device_node *ht = NULL; - ppc_pci_set_flags(PPC_PCI_CAN_SKIP_ISA_ALIGN); + pci_set_flags(PCI_CAN_SKIP_ISA_ALIGN); root = of_find_node_by_path("/"); if (root == NULL) { @@ -1045,9 +1045,6 @@ void __init pmac_pci_init(void) } /* pmac_check_ht_link(); */ - /* We can allocate missing resources if any */ - pci_probe_only = 0; - #else /* CONFIG_PPC64 */ init_p2pbridge(); init_second_ohare(); @@ -1057,7 +1054,7 @@ void __init pmac_pci_init(void) * some offset between bus number and domains for now when we * assign all busses should help for now */ - if (ppc_pci_has_flag(PPC_PCI_REASSIGN_ALL_BUS)) + if (pci_has_flag(PCI_REASSIGN_ALL_BUS)) pcibios_assign_bus_offset = 0x10; #endif } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/cmm.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/cmm.c --- linux-2.6.32/arch/powerpc/platforms/pseries/cmm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/cmm.c 2015-09-10 10:06:29.000000000 -0700 @@ -38,19 +38,28 @@ #include #include #include +#include #include "plpar_wrappers.h" #define CMM_DRIVER_VERSION "1.0.0" #define CMM_DEFAULT_DELAY 1 +#define CMM_HOTPLUG_DELAY 5 #define CMM_DEBUG 0 #define CMM_DISABLE 0 #define CMM_OOM_KB 1024 #define CMM_MIN_MEM_MB 256 #define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10)) #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) +/* + * The priority level tries to ensure that this notifier is called as + * late as possible to reduce thrashing in the shared memory pool. + */ +#define CMM_MEM_HOTPLUG_PRI 1 +#define CMM_MEM_ISOLATE_PRI 15 static unsigned int delay = CMM_DEFAULT_DELAY; +static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY; static unsigned int oom_kb = CMM_OOM_KB; static unsigned int cmm_debug = CMM_DEBUG; static unsigned int cmm_disabled = CMM_DISABLE; @@ -65,6 +74,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION); module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. 
" "[Default=" __stringify(CMM_DEFAULT_DELAY) "]"); +module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove " + "before loaning resumes. " + "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]"); module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. " "[Default=" __stringify(CMM_OOM_KB) "]"); @@ -92,6 +105,9 @@ static unsigned long oom_freed_pages; static struct cmm_page_array *cmm_page_list; static DEFINE_SPINLOCK(cmm_lock); +static DEFINE_MUTEX(hotplug_mutex); +static int hotplug_occurred; /* protected by the hotplug mutex */ + static struct task_struct *cmm_thread_ptr; /** @@ -110,6 +126,17 @@ static long cmm_alloc_pages(long nr) cmm_dbg("Begin request for %ld pages\n", nr); while (nr) { + /* Exit if a hotplug operation is in progress or occurred */ + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + mutex_unlock(&hotplug_mutex); + break; + } + mutex_unlock(&hotplug_mutex); + } else { + break; + } + addr = __get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC); if (!addr) @@ -119,8 +146,9 @@ static long cmm_alloc_pages(long nr) if (!pa || pa->index >= CMM_NR_PAGES) { /* Need a new page for the page list. */ spin_unlock(&cmm_lock); - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY | __GFP_NOMEMALLOC); + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); if (!npa) { pr_info("%s: Can not allocate new page list\n", __func__); free_page(addr); @@ -229,8 +257,9 @@ static void cmm_get_mpp(void) { int rc; struct hvcall_mpp_data mpp_data; - unsigned long active_pages_target; - signed long page_loan_request; + signed long active_pages_target, page_loan_request, target; + signed long total_pages = totalram_pages + loaned_pages; + signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -238,17 +267,25 @@ static void cmm_get_mpp(void) return; page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE); - loaned_pages_target = page_loan_request + loaned_pages; - if (loaned_pages_target > oom_freed_pages) - loaned_pages_target -= oom_freed_pages; + target = page_loan_request + (signed long)loaned_pages; + + if (target < 0 || total_pages < min_mem_pages) + target = 0; + + if (target > oom_freed_pages) + target -= oom_freed_pages; else - loaned_pages_target = 0; + target = 0; + + active_pages_target = total_pages - target; - active_pages_target = totalram_pages + loaned_pages - loaned_pages_target; + if (min_mem_pages > active_pages_target) + target = total_pages - min_mem_pages; - if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE)) - loaned_pages_target = totalram_pages + loaned_pages - - ((min_mem_mb * 1024 * 1024) / PAGE_SIZE); + if (target < 0) + target = 0; + + loaned_pages_target = target; cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, @@ -273,9 +310,28 @@ static int cmm_thread(void *dummy) while (1) { timeleft = msleep_interruptible(delay * 1000); - if (kthread_should_stop() || timeleft) { - loaned_pages_target = loaned_pages; + if (kthread_should_stop() || timeleft) break; + + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + hotplug_occurred = 0; + mutex_unlock(&hotplug_mutex); + cmm_dbg("Hotplug operation has occurred, " + "loaning 
activity suspended " + "for %d seconds.\n", + hotplug_delay); + timeleft = msleep_interruptible(hotplug_delay * + 1000); + if (kthread_should_stop() || timeleft) + break; + continue; + } + mutex_unlock(&hotplug_mutex); + } else { + cmm_dbg("Hotplug operation in progress, activity " + "suspended\n"); + continue; } cmm_get_mpp(); @@ -405,6 +461,193 @@ static struct notifier_block cmm_reboot_ }; /** + * cmm_count_pages - Count the number of pages loaned in a particular range. + * + * @arg: memory_isolate_notify structure with address range and count + * + * Return value: + * 0 on success + **/ +static unsigned long cmm_count_pages(void *arg) +{ + struct memory_isolate_notify *marg = arg; + struct cmm_page_array *pa; + unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end = start + (marg->nr_pages << PAGE_SHIFT); + unsigned long idx; + + spin_lock(&cmm_lock); + pa = cmm_page_list; + while (pa) { + if ((unsigned long)pa >= start && (unsigned long)pa < end) + marg->pages_found++; + for (idx = 0; idx < pa->index; idx++) + if (pa->page[idx] >= start && pa->page[idx] < end) + marg->pages_found++; + pa = pa->next; + } + spin_unlock(&cmm_lock); + return 0; +} + +/** + * cmm_memory_isolate_cb - Handle memory isolation notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_isolate_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + **/ +static int cmm_memory_isolate_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + if (action == MEM_ISOLATE_COUNT) + ret = cmm_count_pages(arg); + + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + + return ret; +} + +static struct notifier_block cmm_mem_isolate_nb = { + .notifier_call = cmm_memory_isolate_cb, + .priority = CMM_MEM_ISOLATE_PRI +}; + +/** + * cmm_mem_going_offline - Unloan pages where memory is to be removed + * @arg: memory_notify structure with page range to be offlined + * + * Return value: + * 0 on success + **/ +static int cmm_mem_going_offline(void *arg) +{ + struct memory_notify *marg = arg; + unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT); + struct cmm_page_array *pa_curr, *pa_last, *npa; + unsigned long idx; + unsigned long freed = 0; + + cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n", + start_page, marg->nr_pages); + spin_lock(&cmm_lock); + + /* Search the page list for pages in the range to be offlined */ + pa_last = pa_curr = cmm_page_list; + while (pa_curr) { + for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) { + if ((pa_curr->page[idx] < start_page) || + (pa_curr->page[idx] >= end_page)) + continue; + + plpar_page_set_active(__pa(pa_curr->page[idx])); + free_page(pa_curr->page[idx]); + freed++; + loaned_pages--; + totalram_pages++; + pa_curr->page[idx] = pa_last->page[--pa_last->index]; + if (pa_last->index == 0) { + if (pa_curr == pa_last) + pa_curr = pa_last->next; + pa_last = pa_last->next; + free_page((unsigned long)cmm_page_list); + cmm_page_list = pa_last; + continue; + } + } + pa_curr = pa_curr->next; + } + + /* Search for page list structures in the range to be offlined */ + pa_last = NULL; + pa_curr = cmm_page_list; + while (pa_curr) { + if (((unsigned long)pa_curr >= start_page) && + ((unsigned long)pa_curr < end_page)) { + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | 
__GFP_NOMEMALLOC); + if (!npa) { + spin_unlock(&cmm_lock); + cmm_dbg("Failed to allocate memory for list " + "management. Memory hotplug " + "failed.\n"); + return ENOMEM; + } + memcpy(npa, pa_curr, PAGE_SIZE); + if (pa_curr == cmm_page_list) + cmm_page_list = npa; + if (pa_last) + pa_last->next = npa; + free_page((unsigned long) pa_curr); + freed++; + pa_curr = npa; + } + + pa_last = pa_curr; + pa_curr = pa_curr->next; + } + + spin_unlock(&cmm_lock); + cmm_dbg("Released %ld pages in the search range.\n", freed); + + return 0; +} + +/** + * cmm_memory_cb - Handle memory hotplug notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + * + **/ +static int cmm_memory_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + switch (action) { + case MEM_GOING_OFFLINE: + mutex_lock(&hotplug_mutex); + hotplug_occurred = 1; + ret = cmm_mem_going_offline(arg); + break; + case MEM_OFFLINE: + case MEM_CANCEL_OFFLINE: + mutex_unlock(&hotplug_mutex); + cmm_dbg("Memory offline operation complete.\n"); + break; + case MEM_GOING_ONLINE: + case MEM_ONLINE: + case MEM_CANCEL_ONLINE: + break; + } + + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + + return ret; +} + +static struct notifier_block cmm_mem_nb = { + .notifier_call = cmm_memory_cb, + .priority = CMM_MEM_HOTPLUG_PRI +}; + +/** * cmm_init - Module initialization * * Return value: @@ -426,18 +669,24 @@ static int cmm_init(void) if ((rc = cmm_sysfs_register(&cmm_sysdev))) goto out_reboot_notifier; + if (register_memory_notifier(&cmm_mem_nb) || + register_memory_isolate_notifier(&cmm_mem_isolate_nb)) + goto out_unregister_notifier; + if (cmm_disabled) return rc; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (IS_ERR(cmm_thread_ptr)) { rc = PTR_ERR(cmm_thread_ptr); - goto out_unregister_sysfs; + goto out_unregister_notifier; } return rc; -out_unregister_sysfs: +out_unregister_notifier: + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); cmm_unregister_sysfs(&cmm_sysdev); out_reboot_notifier: unregister_reboot_notifier(&cmm_reboot_nb); @@ -458,6 +707,8 @@ static void cmm_exit(void) kthread_stop(cmm_thread_ptr); unregister_oom_notifier(&cmm_oom_nb); unregister_reboot_notifier(&cmm_reboot_nb); + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); cmm_free_pages(loaned_pages); cmm_unregister_sysfs(&cmm_sysdev); } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/dlpar.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dlpar.c --- linux-2.6.32/arch/powerpc/platforms/pseries/dlpar.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dlpar.c 2015-09-10 10:06:13.000000000 -0700 @@ -0,0 +1,563 @@ +/* + * Support for dynamic reconfiguration for PCI, Memory, and CPU + * Hotplug and Dynamic Logical Partitioning on RPA platforms. + * + * Copyright (C) 2009 Nathan Fontenot + * Copyright (C) 2009 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include "offline_states.h" + +#include +#include +#include +#include +#include + +struct cc_workarea { + u32 drc_index; + u32 zero; + u32 name_offset; + u32 prop_length; + u32 prop_offset; +}; + +static void dlpar_free_cc_property(struct property *prop) +{ + kfree(prop->name); + kfree(prop->value); + kfree(prop); +} + +static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) +{ + struct property *prop; + char *name; + char *value; + + prop = kzalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return NULL; + + name = (char *)ccwa + ccwa->name_offset; + prop->name = kstrdup(name, GFP_KERNEL); + if (!prop->name) { + dlpar_free_cc_property(prop); + return NULL; + } + + prop->length = ccwa->prop_length; + value = (char *)ccwa + ccwa->prop_offset; + prop->value = kzalloc(prop->length, GFP_KERNEL); + if (!prop->value) { + dlpar_free_cc_property(prop); + return NULL; + } + + memcpy(prop->value, value, prop->length); + return prop; +} + +static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) +{ + struct device_node *dn; + char *name; + + dn = kzalloc(sizeof(*dn), GFP_KERNEL); + if (!dn) + return NULL; + + /* The configure connector reported name does not contain a + * preceding '/', so we allocate a buffer large enough to + * prepend this to the full_name. + */ + name = (char *)ccwa + ccwa->name_offset; + dn->full_name = kmalloc(strlen(name) + 2, GFP_KERNEL); + if (!dn->full_name) { + kfree(dn); + return NULL; + } + + sprintf(dn->full_name, "/%s", name); + return dn; +} + +static void dlpar_free_one_cc_node(struct device_node *dn) +{ + struct property *prop; + + while (dn->properties) { + prop = dn->properties; + dn->properties = prop->next; + dlpar_free_cc_property(prop); + } + + kfree(dn->full_name); + kfree(dn); +} + +static void dlpar_free_cc_nodes(struct device_node *dn) +{ + if (dn->child) + dlpar_free_cc_nodes(dn->child); + + if (dn->sibling) + dlpar_free_cc_nodes(dn->sibling); + + dlpar_free_one_cc_node(dn); +} + +#define NEXT_SIBLING 1 +#define NEXT_CHILD 2 +#define NEXT_PROPERTY 3 +#define PREV_PARENT 4 +#define MORE_MEMORY 5 +#define CALL_AGAIN -2 +#define ERR_CFG_USE -9003 + +struct device_node *dlpar_configure_connector(u32 drc_index) +{ + struct device_node *dn; + struct device_node *first_dn = NULL; + struct device_node *last_dn = NULL; + struct property *property; + struct property *last_property = NULL; + struct cc_workarea *ccwa; + int cc_token; + int rc; + + cc_token = rtas_token("ibm,configure-connector"); + if (cc_token == RTAS_UNKNOWN_SERVICE) + return NULL; + + spin_lock(&rtas_data_buf_lock); + ccwa = (struct cc_workarea *)&rtas_data_buf[0]; + ccwa->drc_index = drc_index; + ccwa->zero = 0; + + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + while (rc) { + switch (rc) { + case NEXT_SIBLING: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + dn->parent = last_dn->parent; + last_dn->sibling = dn; + last_dn = dn; + break; + + case NEXT_CHILD: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + if (!first_dn) + first_dn = dn; + else { + dn->parent = last_dn; + if (last_dn) + last_dn->child = dn; + } + + last_dn = dn; + break; + + case NEXT_PROPERTY: + property = dlpar_parse_cc_property(ccwa); + if (!property) + goto cc_error; + + if (!last_dn->properties) + last_dn->properties = property; + else + last_property->next = property; + + last_property = property; + break; + + case PREV_PARENT: + last_dn = last_dn->parent; + break; + + case 
CALL_AGAIN: + break; + + case MORE_MEMORY: + case ERR_CFG_USE: + default: + printk(KERN_ERR "Unexpected Error (%d) " + "returned from configure-connector\n", rc); + goto cc_error; + } + + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + } + + spin_unlock(&rtas_data_buf_lock); + return first_dn; + +cc_error: + if (first_dn) + dlpar_free_cc_nodes(first_dn); + spin_unlock(&rtas_data_buf_lock); + return NULL; +} + +static struct device_node *derive_parent(const char *path) +{ + struct device_node *parent; + char *last_slash; + + last_slash = strrchr(path, '/'); + if (last_slash == path) { + parent = of_find_node_by_path("/"); + } else { + char *parent_path; + int parent_path_len = last_slash - path + 1; + parent_path = kmalloc(parent_path_len, GFP_KERNEL); + if (!parent_path) + return NULL; + + strlcpy(parent_path, path, parent_path_len); + parent = of_find_node_by_path(parent_path); + kfree(parent_path); + } + + return parent; +} + +int dlpar_attach_node(struct device_node *dn) +{ + struct proc_dir_entry *ent; + int rc; + + of_node_set_flag(dn, OF_DYNAMIC); + kref_init(&dn->kref); + dn->parent = derive_parent(dn->full_name); + if (!dn->parent) + return -ENOMEM; + + rc = blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_ADD, dn); + if (rc == NOTIFY_BAD) { + printk(KERN_ERR "Failed to add device node %s\n", + dn->full_name); + return -ENOMEM; /* For now, safe to assume kmalloc failure */ + } + + of_attach_node(dn); + +#ifdef CONFIG_PROC_DEVICETREE + ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde); + if (ent) + proc_device_tree_add_node(dn, ent); +#endif + + of_node_put(dn->parent); + return 0; +} + +int dlpar_detach_node(struct device_node *dn) +{ + struct device_node *parent = dn->parent; + struct property *prop = dn->properties; + +#ifdef CONFIG_PROC_DEVICETREE + while (prop) { + remove_proc_entry(prop->name, dn->pde); + prop = prop->next; + } + + if (dn->pde) + remove_proc_entry(dn->pde->name, parent->pde); +#endif + + blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_REMOVE, dn); + of_detach_node(dn); + of_node_put(dn); /* Must decrement the refcount */ + + return 0; +} + +#define DR_ENTITY_SENSE 9003 +#define DR_ENTITY_PRESENT 1 +#define DR_ENTITY_UNUSABLE 2 +#define ALLOCATION_STATE 9003 +#define ALLOC_UNUSABLE 0 +#define ALLOC_USABLE 1 +#define ISOLATION_STATE 9001 +#define ISOLATE 0 +#define UNISOLATE 1 + +int dlpar_acquire_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_UNUSABLE) + return rc ? rc : -EIO; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE); + if (rc) + return rc; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + if (rc) { + rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + return rc; + } + + return 0; +} + +int dlpar_release_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_PRESENT) + return rc ? 
rc : -EIO; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE); + if (rc) + return rc; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + if (rc) { + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + return rc; + } + + return 0; +} + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + +static DEFINE_MUTEX(pseries_cpu_hotplug_mutex); + +void cpu_hotplug_driver_lock(void) +__acquires(pseries_cpu_hotplug_mutex) +{ + mutex_lock(&pseries_cpu_hotplug_mutex); +} + +void cpu_hotplug_driver_unlock(void) +__releases(pseries_cpu_hotplug_mutex) +{ + mutex_unlock(&pseries_cpu_hotplug_mutex); +} + +static int dlpar_online_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + BUG_ON(get_cpu_current_state(cpu) + != CPU_STATE_OFFLINE); + cpu_maps_update_done(); + rc = cpu_up(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to online " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_probe(const char *buf, size_t count) +{ + struct device_node *dn; + unsigned long drc_index; + char *cpu_name; + int rc; + + cpu_hotplug_driver_lock(); + rc = strict_strtoul(buf, 0, &drc_index); + if (rc) { + rc = -EINVAL; + goto out; + } + + dn = dlpar_configure_connector(drc_index); + if (!dn) { + rc = -EINVAL; + goto out; + } + + /* configure-connector reports cpus as living in the base + * directory of the device tree. CPUs actually live in the + * cpus directory so we need to fixup the full_name. + */ + cpu_name = kasprintf(GFP_KERNEL, "/cpus%s", dn->full_name); + if (!cpu_name) { + dlpar_free_cc_nodes(dn); + rc = -ENOMEM; + goto out; + } + + kfree(dn->full_name); + dn->full_name = cpu_name; + + rc = dlpar_acquire_drc(drc_index); + if (rc) { + dlpar_free_cc_nodes(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_attach_node(dn); + if (rc) { + dlpar_release_drc(drc_index); + dlpar_free_cc_nodes(dn); + goto out; + } + + rc = dlpar_online_cpu(dn); +out: + cpu_hotplug_driver_unlock(); + + return rc ? rc : count; +} + +static int dlpar_offline_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + + if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + break; + + if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + cpu_maps_update_done(); + rc = cpu_down(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + break; + + } + + /* + * The cpu is in CPU_STATE_INACTIVE. + * Upgrade its state to CPU_STATE_OFFLINE. 
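+ * The H_PROD below wakes the ceded thread so it can + * notice the new preferred state and stop itself.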
+ */ + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + BUG_ON(plpar_hcall_norets(H_PROD, intserv[i]) + != H_SUCCESS); + __cpu_die(cpu); + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to offline " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_release(const char *buf, size_t count) +{ + struct device_node *dn; + const u32 *drc_index; + int rc; + + dn = of_find_node_by_path(buf); + if (!dn) + return -EINVAL; + + drc_index = of_get_property(dn, "ibm,my-drc-index", NULL); + if (!drc_index) { + of_node_put(dn); + return -EINVAL; + } + + cpu_hotplug_driver_lock(); + rc = dlpar_offline_cpu(dn); + if (rc) { + of_node_put(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_release_drc(*drc_index); + if (rc) { + of_node_put(dn); + goto out; + } + + rc = dlpar_detach_node(dn); + if (rc) { + dlpar_acquire_drc(*drc_index); + goto out; + } + + of_node_put(dn); +out: + cpu_hotplug_driver_unlock(); + return rc ? rc : count; +} + +static int __init pseries_dlpar_init(void) +{ + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; + + return 0; +} +machine_device_initcall(pseries, pseries_dlpar_init); + +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/dtl.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dtl.c --- linux-2.6.32/arch/powerpc/platforms/pseries/dtl.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dtl.c 2015-09-10 10:06:45.000000000 -0700 @@ -22,37 +22,22 @@ #include #include +#include #include #include #include #include +#include #include "plpar_wrappers.h" -/* - * Layout of entries in the hypervisor's DTL buffer. Although we don't - * actually access the internals of an entry (we only need to know the size), - * we might as well define it here for reference. - */ -struct dtl_entry { - u8 dispatch_reason; - u8 preempt_reason; - u16 processor_id; - u32 enqueue_to_dispatch_time; - u32 ready_to_enqueue_time; - u32 waiting_to_ready_time; - u64 timebase; - u64 fault_addr; - u64 srr0; - u64 srr1; -}; - struct dtl { struct dtl_entry *buf; struct dentry *file; int cpu; int buf_entries; u64 last_idx; + spinlock_t lock; }; static DEFINE_PER_CPU(struct dtl, dtl); @@ -71,25 +56,97 @@ static u8 dtl_event_mask = 0x7; static int dtl_buf_entries = (16 * 85); -static int dtl_enable(struct dtl *dtl) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +struct dtl_ring { + u64 write_index; + struct dtl_entry *write_ptr; + struct dtl_entry *buf; + struct dtl_entry *buf_end; + u8 saved_dtl_mask; +}; + +static DEFINE_PER_CPU(struct dtl_ring, dtl_rings); + +static atomic_t dtl_count; + +/* + * The cpu accounting code controls the DTL ring buffer, and we get + * given entries as they are processed. 
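+ * consume_dtle() copies each entry into our ring and only + * publishes it by advancing write_index afterwards.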
+ */ +static void consume_dtle(struct dtl_entry *dtle, u64 index) { - unsigned long addr; - int ret, hwcpu; + struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings); + struct dtl_entry *wp = dtlr->write_ptr; + struct lppaca *vpa = local_paca->lppaca_ptr; + + if (!wp) + return; + + *wp = *dtle; + barrier(); + + /* check for hypervisor ring buffer overflow, ignore this entry if so */ + if (index + N_DISPATCH_LOG < vpa->dtl_idx) + return; + + ++wp; + if (wp == dtlr->buf_end) + wp = dtlr->buf; + dtlr->write_ptr = wp; - /* only allow one reader */ - if (dtl->buf) - return -EBUSY; + /* incrementing write_index makes the new entry visible */ + smp_wmb(); + ++dtlr->write_index; +} - /* we need to store the original allocation size for use during read */ - dtl->buf_entries = dtl_buf_entries; +static int dtl_start(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); - dtl->buf = kmalloc_node(dtl->buf_entries * sizeof(struct dtl_entry), - GFP_KERNEL, cpu_to_node(dtl->cpu)); - if (!dtl->buf) { - printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", - __func__, dtl->cpu); - return -ENOMEM; - } + dtlr->buf = dtl->buf; + dtlr->buf_end = dtl->buf + dtl->buf_entries; + dtlr->write_index = 0; + + /* setting write_ptr enables logging into our buffer */ + smp_wmb(); + dtlr->write_ptr = dtl->buf; + + /* enable event logging */ + dtlr->saved_dtl_mask = lppaca[dtl->cpu].dtl_enable_mask; + lppaca[dtl->cpu].dtl_enable_mask |= dtl_event_mask; + + dtl_consumer = consume_dtle; + atomic_inc(&dtl_count); + return 0; +} + +static void dtl_stop(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); + + dtlr->write_ptr = NULL; + smp_wmb(); + + dtlr->buf = NULL; + + /* restore dtl_enable_mask */ + lppaca[dtl->cpu].dtl_enable_mask = dtlr->saved_dtl_mask; + + if (atomic_dec_and_test(&dtl_count)) + dtl_consumer = NULL; +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return per_cpu(dtl_rings, dtl->cpu).write_index; +} + +#else /* CONFIG_VIRT_CPU_ACCOUNTING */ + +static int dtl_start(struct dtl *dtl) +{ + unsigned long addr; + int ret, hwcpu; /* Register our dtl buffer with the hypervisor. 
The HV expects the * buffer size to be passed in the second word of the buffer */ @@ -101,12 +158,11 @@ static int dtl_enable(struct dtl *dtl) if (ret) { printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) " "failed with %d\n", __func__, dtl->cpu, hwcpu, ret); - kfree(dtl->buf); return -EIO; } /* set our initial buffer indices */ - dtl->last_idx = lppaca[dtl->cpu].dtl_idx = 0; + lppaca[dtl->cpu].dtl_idx = 0; /* ensure that our updates to the lppaca fields have occurred before * we actually enable the logging */ @@ -118,17 +174,67 @@ static int dtl_enable(struct dtl *dtl) return 0; } -static void dtl_disable(struct dtl *dtl) +static void dtl_stop(struct dtl *dtl) { int hwcpu = get_hard_smp_processor_id(dtl->cpu); lppaca[dtl->cpu].dtl_enable_mask = 0x0; - unregister_dtl(hwcpu, __pa(dtl->buf)); + unregister_dtl(hwcpu); +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return lppaca[dtl->cpu].dtl_idx; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + +static int dtl_enable(struct dtl *dtl) +{ + long int n_entries; + long int rc; + struct dtl_entry *buf = NULL; + + /* only allow one reader */ + if (dtl->buf) + return -EBUSY; + + n_entries = dtl_buf_entries; + buf = kmalloc_node(n_entries * sizeof(struct dtl_entry), + GFP_KERNEL, cpu_to_node(dtl->cpu)); + if (!buf) { + printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", + __func__, dtl->cpu); + return -ENOMEM; + } + + spin_lock(&dtl->lock); + rc = -EBUSY; + if (!dtl->buf) { + /* store the original allocation size for use during read */ + dtl->buf_entries = n_entries; + dtl->buf = buf; + dtl->last_idx = 0; + rc = dtl_start(dtl); + if (rc) + dtl->buf = NULL; + } + spin_unlock(&dtl->lock); + + if (rc) + kfree(buf); + return rc; +} + +static void dtl_disable(struct dtl *dtl) +{ + spin_lock(&dtl->lock); + dtl_stop(dtl); kfree(dtl->buf); dtl->buf = NULL; dtl->buf_entries = 0; + spin_unlock(&dtl->lock); } /* file interface */ @@ -156,8 +262,9 @@ static int dtl_file_release(struct inode static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { - int rc, cur_idx, last_idx, n_read, n_req, read_size; + long int rc, n_read, n_req, read_size; struct dtl *dtl; + u64 cur_idx, last_idx, i; if ((len % sizeof(struct dtl_entry)) != 0) return -EINVAL; @@ -170,41 +277,48 @@ static ssize_t dtl_file_read(struct file /* actual number of entries read */ n_read = 0; - cur_idx = lppaca[dtl->cpu].dtl_idx; + spin_lock(&dtl->lock); + + cur_idx = dtl_current_index(dtl); last_idx = dtl->last_idx; - if (cur_idx - last_idx > dtl->buf_entries) { - pr_debug("%s: hv buffer overflow for cpu %d, samples lost\n", - __func__, dtl->cpu); - } + if (last_idx + dtl->buf_entries <= cur_idx) + last_idx = cur_idx - dtl->buf_entries + 1; - cur_idx %= dtl->buf_entries; - last_idx %= dtl->buf_entries; + if (last_idx + n_req > cur_idx) + n_req = cur_idx - last_idx; + + if (n_req > 0) + dtl->last_idx = last_idx + n_req; + + spin_unlock(&dtl->lock); + + if (n_req <= 0) + return 0; + + i = last_idx % dtl->buf_entries; /* read the tail of the buffer if we've wrapped */ - if (last_idx > cur_idx) { - read_size = min(n_req, dtl->buf_entries - last_idx); + if (i + n_req > dtl->buf_entries) { + read_size = dtl->buf_entries - i; - rc = copy_to_user(buf, &dtl->buf[last_idx], + rc = copy_to_user(buf, &dtl->buf[i], read_size * sizeof(struct dtl_entry)); if (rc) return -EFAULT; - last_idx = 0; + i = 0; n_req -= read_size; n_read += read_size; buf += read_size * sizeof(struct dtl_entry); } /* .. 
and now the head */ - read_size = min(n_req, cur_idx - last_idx); - rc = copy_to_user(buf, &dtl->buf[last_idx], - read_size * sizeof(struct dtl_entry)); + rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry)); if (rc) return -EFAULT; - n_read += read_size; - dtl->last_idx += n_read; + n_read += n_req; return n_read * sizeof(struct dtl_entry); } @@ -236,9 +350,6 @@ static int dtl_init(void) struct dentry *event_mask_file, *buf_entries_file; int rc, i; - if (!firmware_has_feature(FW_FEATURE_SPLPAR)) - return -ENODEV; - /* set up common debugfs structure */ rc = -ENOMEM; @@ -262,6 +373,7 @@ static int dtl_init(void) /* set up the per-cpu log structures */ for_each_possible_cpu(i) { struct dtl *dtl = &per_cpu(dtl, i); + spin_lock_init(&dtl->lock); dtl->cpu = i; rc = dtl_setup_file(dtl); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/eeh.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh.c --- linux-2.6.32/arch/powerpc/platforms/pseries/eeh.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh.c 2015-09-10 10:08:00.000000000 -0700 @@ -95,6 +95,7 @@ static int ibm_slot_error_detail; static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_bridge; +static int ibm_configure_pe; int eeh_subsystem_enabled; EXPORT_SYMBOL(eeh_subsystem_enabled); @@ -263,6 +264,8 @@ void eeh_slot_error_detail(struct pci_dn pci_regs_buf[0] = 0; rtas_pci_enable(pdn, EEH_THAW_MMIO); + rtas_configure_bridge(pdn); + eeh_restore_bars(pdn); loglen = gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN); rtas_slot_error_detail(pdn, severity, pci_regs_buf, loglen); @@ -450,6 +453,39 @@ void eeh_clear_slot (struct device_node spin_unlock_irqrestore(&confirm_error_lock, flags); } +void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset) +{ + struct device_node *dn; + + for_each_child_of_node(parent, dn) { + if (PCI_DN(dn)) { + + struct pci_dev *dev = PCI_DN(dn)->pcidev; + + if (dev && dev->driver) + *freset |= dev->needs_freset; + + __eeh_set_pe_freset(dn, freset); + } + } +} + +void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset) +{ + struct pci_dev *dev; + dn = find_device_pe(dn); + + /* Back up one, since config addrs might be shared */ + if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) + dn = dn->parent; + + dev = PCI_DN(dn)->pcidev; + if (dev) + *freset |= dev->needs_freset; + + __eeh_set_pe_freset(dn, freset); +} + /** * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze * @dn device node @@ -491,7 +527,7 @@ int eeh_dn_check_failure(struct device_n pdn->eeh_mode & EEH_MODE_NOCHECK) { ignored_check++; pr_debug("EEH: Ignored check (%x) for %s %s\n", - pdn->eeh_mode, pci_name (dev), dn->full_name); + pdn->eeh_mode, eeh_pci_name(dev), dn->full_name); return 0; } @@ -515,9 +551,9 @@ int eeh_dn_check_failure(struct device_n printk (KERN_ERR "EEH: %d reads ignored for recovering device at " "location=%s driver=%s pci addr=%s\n", pdn->eeh_check_count, location, - dev->driver->name, pci_name(dev)); + eeh_driver_name(dev), eeh_pci_name(dev)); printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n", - dev->driver->name); + eeh_driver_name(dev)); dump_stack(); } goto dn_unlock; @@ -620,8 +656,6 @@ unsigned long eeh_check_failure(const vo dn = pci_device_to_OF_node(dev); eeh_dn_check_failure (dn, dev); - - pci_dev_put(dev); return val; } @@ -694,15 +728,24 @@ rtas_pci_slot_reset(struct pci_dn *pdn, if (pdn->eeh_pe_config_addr) config_addr = 
pdn->eeh_pe_config_addr; - rc = rtas_call(ibm_set_slot_reset,4,1, NULL, + rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, config_addr, BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid), state); - if (rc) - printk (KERN_WARNING "EEH: Unable to reset the failed slot," - " (%d) #RST=%d dn=%s\n", - rc, state, pdn->node->full_name); + + /* Fundamental-reset not supported on this PE, try hot-reset */ + if (rc == -8 && state == 3) { + rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), 1); + if (rc) + printk(KERN_WARNING + "EEH: Unable to reset the failed slot," + " #RST=%d dn=%s\n", + rc, pdn->node->full_name); + } } /** @@ -738,18 +781,21 @@ int pcibios_set_pcie_reset_state(struct /** * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second * @pdn: pci device node to be reset. - * - * Return 0 if success, else a non-zero value. */ static void __rtas_set_slot_reset(struct pci_dn *pdn) { - struct pci_dev *dev = pdn->pcidev; + unsigned int freset = 0; - /* Determine type of EEH reset required by device, - * default hot reset or fundamental reset + /* Determine type of EEH reset required for + * Partitionable Endpoint, a hot-reset (1) + * or a fundamental reset (3). + * A fundamental reset required by any device under + * Partitionable Endpoint trumps hot-reset. */ - if (dev->needs_freset) + eeh_set_pe_freset(pdn->node, &freset); + + if (freset) rtas_pci_slot_reset(pdn, 3); else rtas_pci_slot_reset(pdn, 1); @@ -897,13 +943,20 @@ rtas_configure_bridge(struct pci_dn *pdn { int config_addr; int rc; + int token; /* Use PE configuration address, if present */ config_addr = pdn->eeh_config_addr; if (pdn->eeh_pe_config_addr) config_addr = pdn->eeh_pe_config_addr; - rc = rtas_call(ibm_configure_bridge,3,1, NULL, + /* Use new configure-pe function, if supported */ + if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) + token = ibm_configure_pe; + else + token = ibm_configure_bridge; + + rc = rtas_call(token, 3, 1, NULL, config_addr, BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid)); @@ -1079,6 +1132,7 @@ void __init eeh_init(void) ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); ibm_configure_bridge = rtas_token ("ibm,configure-bridge"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) return; @@ -1125,7 +1179,7 @@ void __init eeh_init(void) * on the CEC architecture, type of the device, on earlier boot * command-line arguments & etc. */ -static void eeh_add_device_early(struct device_node *dn) +void eeh_add_device_early(struct device_node *dn) { struct pci_controller *phb; struct eeh_early_enable_info info; @@ -1160,7 +1214,7 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_ea * This routine must be used to complete EEH initialization for PCI * devices that were added after system boot (e.g. hotplug, dlpar). */ -static void eeh_add_device_late(struct pci_dev *dev) +void eeh_add_device_late(struct pci_dev *dev) { struct device_node *dn; struct pci_dn *pdn; @@ -1178,7 +1232,6 @@ static void eeh_add_device_late(struct p } WARN_ON(pdn->pcidev); - pci_dev_get (dev); pdn->pcidev = dev; pci_addr_cache_insert_device(dev); @@ -1210,7 +1263,7 @@ EXPORT_SYMBOL_GPL(eeh_add_device_tree_la * this device will no longer be detected after this call; thus, * i/o errors affecting this slot may leave this device unusable. 
*/ -static void eeh_remove_device(struct pci_dev *dev) +void eeh_remove_device(struct pci_dev *dev) { struct device_node *dn; if (!dev || !eeh_subsystem_enabled) @@ -1225,7 +1278,6 @@ static void eeh_remove_device(struct pci return; } PCI_DN(dn)->pcidev = NULL; - pci_dev_put (dev); pci_addr_cache_remove_device(dev); eeh_sysfs_remove_device(dev); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/eeh_cache.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_cache.c --- linux-2.6.32/arch/powerpc/platforms/pseries/eeh_cache.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_cache.c 2015-09-10 10:08:00.000000000 -0700 @@ -69,16 +69,12 @@ static inline struct pci_dev *__pci_get_ struct pci_io_addr_range *piar; piar = rb_entry(n, struct pci_io_addr_range, rb_node); - if (addr < piar->addr_lo) { + if (addr < piar->addr_lo) n = n->rb_left; - } else { - if (addr > piar->addr_hi) { - n = n->rb_right; - } else { - pci_dev_get(piar->pcidev); - return piar->pcidev; - } - } + else if (addr > piar->addr_hi) + n = n->rb_right; + else + return piar->pcidev; } return NULL; @@ -157,7 +153,6 @@ pci_addr_cache_insert(struct pci_dev *de if (!piar) return NULL; - pci_dev_get(dev); piar->addr_lo = alo; piar->addr_hi = ahi; piar->pcidev = dev; @@ -245,7 +240,6 @@ restart: if (piar->pcidev == dev) { rb_erase(n, &pci_io_addr_cache_root.rb_root); - pci_dev_put(piar->pcidev); kfree(piar); goto restart; } @@ -294,7 +288,6 @@ void __init pci_addr_cache_build(void) dn = pci_device_to_OF_node(dev); if (!dn) continue; - pci_dev_get(dev); /* matching put is in eeh_remove_device() */ PCI_DN(dn)->pcidev = dev; eeh_sysfs_add_device(dev); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/eeh_driver.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_driver.c --- linux-2.6.32/arch/powerpc/platforms/pseries/eeh_driver.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_driver.c 2015-09-10 10:06:44.000000000 -0700 @@ -344,7 +344,7 @@ struct pci_dn * handle_eeh_events (struc struct pci_bus *frozen_bus; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; - const char *location, *pci_str, *drv_str; + const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str; frozen_dn = find_device_pe(event->dn); if (!frozen_dn) { @@ -353,7 +353,7 @@ struct pci_dn * handle_eeh_events (struc location = location ? 
location : "unknown"; printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " "for location=%s pci addr=%s\n", - location, pci_name(event->dev)); + location, eeh_pci_name(event->dev)); return NULL; } @@ -380,22 +380,26 @@ struct pci_dn * handle_eeh_events (struc frozen_pdn = PCI_DN(frozen_dn); frozen_pdn->eeh_freeze_count++; - if (frozen_pdn->pcidev) { - pci_str = pci_name (frozen_pdn->pcidev); - drv_str = pcid_name (frozen_pdn->pcidev); - } else { - pci_str = pci_name (event->dev); - drv_str = pcid_name (event->dev); - } - + pci_str = eeh_pci_name(event->dev); + drv_str = pcid_name(event->dev); + if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; printk(KERN_WARNING "EEH: This PCI device has failed %d times in the last hour:\n", frozen_pdn->eeh_freeze_count); + + if (frozen_pdn->pcidev) { + bus_pci_str = pci_name(frozen_pdn->pcidev); + bus_drv_str = pcid_name(frozen_pdn->pcidev); + printk(KERN_WARNING + "EEH: Bus location=%s driver=%s pci addr=%s\n", + location, bus_drv_str, bus_pci_str); + } + printk(KERN_WARNING - "EEH: location=%s driver=%s pci addr=%s\n", + "EEH: Device location=%s driver=%s pci addr=%s\n", location, drv_str, pci_str); /* Walk the various device drivers attached to this slot through diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/eeh_event.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_event.c --- linux-2.6.32/arch/powerpc/platforms/pseries/eeh_event.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_event.c 2015-09-10 10:08:00.000000000 -0700 @@ -80,12 +80,11 @@ static int eeh_event_handler(void * dumm eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - pci_name(event->dev)); + eeh_pci_name(event->dev)); pdn = handle_eeh_events(event); eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); - pci_dev_put(event->dev); kfree(event); mutex_unlock(&eeh_event_mutex); @@ -137,9 +136,6 @@ int eeh_send_failure_event (struct devic return 1; } - if (dev) - pci_dev_get(dev); - event->dn = dn; event->dev = dev; diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/firmware.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/firmware.c --- linux-2.6.32/arch/powerpc/platforms/pseries/firmware.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/firmware.c 2015-09-10 10:06:32.000000000 -0700 @@ -55,6 +55,7 @@ firmware_features_table[FIRMWARE_MAX_FEA {FW_FEATURE_XDABR, "hcall-xdabr"}, {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, {FW_FEATURE_SPLPAR, "hcall-splpar"}, + {FW_FEATURE_VPHN, "hcall-vphn"}, }; /* Build up the firmware features bitmask using the contents of diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-cpu.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-cpu.c --- linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-cpu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-cpu.c 2015-09-10 10:06:23.000000000 -0700 @@ -30,6 +30,7 @@ #include #include "xics.h" #include "plpar_wrappers.h" +#include "offline_states.h" /* This version can't take the spinlock, because it never returns */ static struct rtas_args rtas_stop_self_args = { @@ -39,6 +40,55 @@ static struct rtas_args rtas_stop_self_a .rets = &rtas_stop_self_args.args[0], }; +static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = + CPU_STATE_OFFLINE; +static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; + +static enum cpu_state_vals 
default_offline_state = CPU_STATE_OFFLINE; + +static int cede_offline_enabled __read_mostly = 1; + +/* + * Enable/disable cede_offline when available. + */ +static int __init setup_cede_offline(char *str) +{ + if (!strcmp(str, "off")) + cede_offline_enabled = 0; + else if (!strcmp(str, "on")) + cede_offline_enabled = 1; + else + return 0; + return 1; +} + +__setup("cede_offline=", setup_cede_offline); + +enum cpu_state_vals get_cpu_current_state(int cpu) +{ + return per_cpu(current_state, cpu); +} + +void set_cpu_current_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(current_state, cpu) = state; +} + +enum cpu_state_vals get_preferred_offline_state(int cpu) +{ + return per_cpu(preferred_offline_state, cpu); +} + +void set_preferred_offline_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(preferred_offline_state, cpu) = state; +} + +void set_default_offline_state(int cpu) +{ + per_cpu(preferred_offline_state, cpu) = default_offline_state; +} + static void rtas_stop_self(void) { struct rtas_args *args = &rtas_stop_self_args; @@ -56,38 +106,54 @@ static void rtas_stop_self(void) static void pseries_mach_cpu_die(void) { + unsigned int cpu = smp_processor_id(); + unsigned int hwcpu = hard_smp_processor_id(); + u8 cede_latency_hint = 0; + local_irq_disable(); idle_task_exit(); xics_teardown_cpu(); - unregister_slb_shadow(hard_smp_processor_id(), __pa(get_slb_shadow())); - rtas_stop_self(); - /* Should never get here... */ - BUG(); - for(;;); -} -static int qcss_tok; /* query-cpu-stopped-state token */ + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + set_cpu_current_state(cpu, CPU_STATE_INACTIVE); + pseries_suspend_cpu(); + + cede_latency_hint = 2; + + get_lppaca()->idle = 1; + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 1; -/* Get state of physical CPU. - * Return codes: - * 0 - The processor is in the RTAS stopped state - * 1 - stop-self is in progress - * 2 - The processor is not in the RTAS stopped state - * -1 - Hardware Error - * -2 - Hardware Busy, Try again later. - */ -static int query_cpu_stopped(unsigned int pcpu) -{ - int cpu_status, status; + while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + extended_cede_processor(cede_latency_hint); + } - status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu); - if (status != 0) { - printk(KERN_ERR - "RTAS query-cpu-stopped-state failed: %i\n", status); - return status; + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->idle = 0; + + if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { + unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + + /* + * Call to start_secondary_resume() will not return. + * Kernel stack will be reset and start_secondary() + * will be called to continue the online operation. + */ + start_secondary_resume(); + } } - return cpu_status; + /* Requested state is CPU_STATE_OFFLINE at this point */ + WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); + + set_cpu_current_state(cpu, CPU_STATE_OFFLINE); + unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + rtas_stop_self(); + + /* Should never get here... */ + BUG(); + for(;;); } static int pseries_cpu_disable(void) @@ -106,18 +172,44 @@ static int pseries_cpu_disable(void) return 0; } +/* + * pseries_cpu_die: Wait for the cpu to die. + * @cpu: logical processor id of the CPU whose death we're awaiting. + * + * This function is called from the context of the thread which is performing + * the cpu-offline. 
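/*
 * A hypothetical caller of the offline-state accessors above (not part
 * of this patch): the hot-unplug path would pick the preferred state
 * before offlining, and the dying CPU then cedes instead of calling
 * stop-self. example_cede_offline_cpu() is a made-up name.
 */
static int example_cede_offline_cpu(int cpu)
{
	set_preferred_offline_state(cpu, CPU_STATE_INACTIVE);
	return cpu_down(cpu);	/* ends up in pseries_mach_cpu_die() */
}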
Here we wait for long enough to allow the cpu in question + * to self-destroy so that the cpu-offline thread can send the CPU_DEAD + * notifications. + * + * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to + * self-destruct. + */ static void pseries_cpu_die(unsigned int cpu) { int tries; - int cpu_status; + int cpu_status = 1; unsigned int pcpu = get_hard_smp_processor_id(cpu); - for (tries = 0; tries < 25; tries++) { - cpu_status = query_cpu_stopped(pcpu); - if (cpu_status == 0 || cpu_status == -1) - break; - cpu_relax(); + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 1; + for (tries = 0; tries < 5000; tries++) { + if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 0; + break; + } + msleep(1); + } + } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + + for (tries = 0; tries < 25; tries++) { + cpu_status = smp_query_cpu_stopped(pcpu); + if (cpu_status == QCSS_STOPPED || + cpu_status == QCSS_HARDWARE_ERROR) + break; + cpu_relax(); + } } + if (cpu_status != 0) { printk("Querying DEAD? cpu %i (%i) shows %i\n", cpu, pcpu, cpu_status); @@ -252,10 +344,30 @@ static struct notifier_block pseries_smp .notifier_call = pseries_smp_notifier, }; +#define MAX_CEDE_LATENCY_LEVELS 4 +#define CEDE_LATENCY_PARAM_LENGTH 10 +#define CEDE_LATENCY_PARAM_MAX_LENGTH \ + (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) +#define CEDE_LATENCY_TOKEN 45 + +static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; + +static int parse_cede_parameters(void) +{ + memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); + return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CEDE_LATENCY_TOKEN, + __pa(cede_parameters), + CEDE_LATENCY_PARAM_MAX_LENGTH); +} + static int __init pseries_cpu_hotplug_init(void) { struct device_node *np; const char *typep; + int cpu; + int qcss_tok; for_each_node_by_name(np, "interrupt-controller") { typep = of_get_property(np, "compatible", NULL); @@ -283,8 +395,16 @@ static int __init pseries_cpu_hotplug_in smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (firmware_has_feature(FW_FEATURE_LPAR)) { pSeries_reconfig_notifier_register(&pseries_smp_nb); + cpu_maps_update_begin(); + if (cede_offline_enabled && parse_cede_parameters() == 0) { + default_offline_state = CPU_STATE_INACTIVE; + for_each_online_cpu(cpu) + set_default_offline_state(cpu); + } + cpu_maps_update_done(); + } return 0; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-memory.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-memory.c --- linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-memory.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-memory.c 2015-09-10 10:06:13.000000000 -0700 @@ -11,11 +11,55 @@ #include #include +#include #include #include #include #include +u32 memory_block_size_bytes(void) +{ + struct device_node *np; + unsigned int memblock_size = 0; + + np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (np) { + const unsigned long *size; + + size = of_get_property(np, "ibm,lmb-size", NULL); + memblock_size = size ? *size : 0; + + of_node_put(np); + } else { + unsigned int memzero_size = 0; + const unsigned int *regs; + + np = of_find_node_by_path("/memory@0"); + if (np) { + regs = of_get_property(np, "reg", NULL); + memzero_size = regs ? 
regs[3] : 0; + of_node_put(np); + } + + if (memzero_size) { + /* We now know the size of memory@0, use this to find + * the first memoryblock and get its size. + */ + char buf[64]; + + sprintf(buf, "/memory@%x", memzero_size); + np = of_find_node_by_path(buf); + if (np) { + regs = of_get_property(np, "reg", NULL); + memblock_size = regs ? regs[3] : 0; + of_node_put(np); + } + } + } + + return memblock_size; +} + static int pseries_remove_lmb(unsigned long base, unsigned int lmb_size) { unsigned long start, start_pfn; @@ -54,6 +98,12 @@ static int pseries_remove_lmb(unsigned l */ start = (unsigned long)__va(base); ret = remove_section_mapping(start, start + lmb_size); + + /* Ensure all vmalloc mappings are flushed in case they also + * hit that section of memory + */ + vm_unmap_aliases(); + return ret; } @@ -120,30 +170,22 @@ static int pseries_add_memory(struct dev static int pseries_drconf_memory(unsigned long *base, unsigned int action) { - struct device_node *np; - const unsigned long *lmb_size; + unsigned long memblock_size; int rc; - np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (!np) - return -EINVAL; - - lmb_size = of_get_property(np, "ibm,lmb-size", NULL); - if (!lmb_size) { - of_node_put(np); - return -EINVAL; - } + memblock_size = memory_block_size_bytes(); + if (!memblock_size) + return -EINVAL; if (action == PSERIES_DRCONF_MEM_ADD) { - rc = lmb_add(*base, *lmb_size); + rc = lmb_add(*base, memblock_size); rc = (rc < 0) ? -EINVAL : 0; } else if (action == PSERIES_DRCONF_MEM_REMOVE) { - rc = pseries_remove_lmb(*base, *lmb_size); + rc = pseries_remove_lmb(*base, memblock_size); } else { rc = -EINVAL; } - of_node_put(np); return rc; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/hvconsole.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hvconsole.c --- linux-2.6.32/arch/powerpc/platforms/pseries/hvconsole.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hvconsole.c 2015-09-10 10:06:45.000000000 -0700 @@ -73,7 +73,7 @@ int hvc_put_chars(uint32_t vtermno, cons if (ret == H_SUCCESS) return count; if (ret == H_BUSY) - return 0; + return -EAGAIN; return -EIO; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/iommu.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/iommu.c --- linux-2.6.32/arch/powerpc/platforms/pseries/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/iommu.c 2015-09-10 10:07:33.000000000 -0700 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ #include #include #include +#include #include "plpar_wrappers.h" @@ -154,12 +156,15 @@ static int tce_buildmulti_pSeriesLP(stru long l, limit; long tcenum_start = tcenum, npages_start = npages; int ret = 0; + unsigned long flags; if (npages == 1) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } + local_irq_save(flags); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); /* This is safe to do since interrupts are off when we're called @@ -169,6 +174,7 @@ static int tce_buildmulti_pSeriesLP(stru tcep = (u64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { + local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } @@ -202,6 +208,8 @@ static int tce_buildmulti_pSeriesLP(stru tcenum += limit; } while (npages > 0 && !rc); + local_irq_restore(flags); + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = 
(int)rc; tce_freemulti_pSeriesLP(tbl, tcenum_start, @@ -270,6 +278,152 @@ static unsigned long tce_get_pSeriesLP(s return tce_ret; } +/* this is compatible with cells for the device tree property */ +struct dynamic_dma_window_prop { + __be32 liobn; /* tce table number */ + __be64 dma_base; /* address hi,lo */ + __be32 tce_shift; /* ilog2(tce_page_size) */ + __be32 window_shift; /* ilog2(tce_window_size) */ +}; + +struct direct_window { + struct device_node *device; + const struct dynamic_dma_window_prop *prop; + struct list_head list; +}; + +/* Dynamic DMA Window support */ +struct ddw_query_response { + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; +}; + +struct ddw_create_response { + u32 liobn; + u32 addr_hi; + u32 addr_lo; +}; + +static LIST_HEAD(direct_window_list); +/* prevents races between memory on/offline and window creation */ +static DEFINE_SPINLOCK(direct_window_list_lock); +/* protects initializing window twice for same device */ +static DEFINE_MUTEX(direct_window_init_mutex); +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" + +static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + int rc; + u64 tce_size, num_tce, dma_offset, next; + u32 tce_shift; + long limit; + + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* convert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 512); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), + dma_offset, + 0, limit); + num_tce -= limit; + } while (num_tce > 0 && !rc); + + return rc; +} + +static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn; + u32 tce_shift; + u64 rc = 0; + long l, limit; + + local_irq_disable(); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); + + if (!tcep) { + tcep = (u64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { + local_irq_enable(); + return -ENOMEM; + } + __get_cpu_var(tce_page) = tcep; + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + + liobn = (u64)be32_to_cpu(maprange->liobn); + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* convert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values.
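/*
 * Worked example for the rounding above (illustrative numbers): with
 * tce_shift = 24 (16MB TCE pages), start_pfn = 0x1000 and num_pfn = 0x5000
 * (80MB of 4kB pages starting at the 16MB boundary):
 *
 *	next    = 0x1000 << 12 = 0x1000000	(already 16MB-aligned)
 *	num_tce = 0x5000 << 12 = 0x5000000	bytes
 *	num_tce = (0x5000000 | 0xffffff) >> 24 = 5	TCEs
 *
 * so the loop issues hypervisor calls for five 16MB TCEs, at most one
 * page's worth of entries per call.
 */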
+ */ + limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + for (l = 0; l < limit; l++) { + tcep[l] = proto_tce | next; + next += tce_size; + } + + rc = plpar_tce_put_indirect(liobn, + dma_offset, + (u64)virt_to_abs(tcep), + limit); + + num_tce -= limit; + } while (num_tce > 0 && !rc); + + /* error cleanup: caller will clear whole range */ + + local_irq_enable(); + return rc; +} + +static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, + unsigned long num_pfn, void *arg) +{ + return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); +} + + #ifdef CONFIG_PCI static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, @@ -403,7 +557,7 @@ static void pci_dma_bus_setup_pSeries(st pci->phb->dma_window_size = 0x8000000ul; pci->phb->dma_window_base_cur = 0x8000000ul; - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, pci->phb->node); iommu_table_setparms(pci->phb, dn, tbl); @@ -448,16 +602,13 @@ static void pci_dma_bus_setup_pSeriesLP( pdn->full_name, ppci->iommu_table); if (!ppci->iommu_table) { - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, ppci->phb->node); iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window, bus->number); ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); pr_debug(" created table: %p\n", ppci->iommu_table); } - - if (pdn != dn) - PCI_DN(dn)->iommu_table = ppci->iommu_table; } @@ -478,7 +629,7 @@ static void pci_dma_dev_setup_pSeries(st struct pci_controller *phb = PCI_DN(dn)->phb; pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, phb->node); iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); @@ -500,6 +651,334 @@ static void pci_dma_dev_setup_pSeries(st pci_name(dev)); } +static int __read_mostly disable_ddw; + +static int __init disable_ddw_setup(char *str) +{ + disable_ddw = 1; + printk(KERN_INFO "ppc iommu: disabling ddw.\n"); + + return 0; +} + +early_param("disable_ddw", disable_ddw_setup); + +static void remove_ddw(struct device_node *np) +{ + struct dynamic_dma_window_prop *dwp; + struct property *win64; + const u32 *ddr_avail; + u64 liobn; + int len, ret; + + ddr_avail = of_get_property(np, "ibm,ddw-applicable", &len); + win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); + if (!win64 || !ddr_avail || len < 3 * sizeof(u32)) + return; + + dwp = win64->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + /* clear the whole window, note the arg is in kernel pages */ + ret = tce_clearrange_multi_pSeriesLP(0, + 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); + if (ret) + pr_warning("%s failed to clear tces in window.\n", + np->full_name); + else + pr_debug("%s successfully cleared tces in window.\n", + np->full_name); + + ret = rtas_call(ddr_avail[2], 1, 1, NULL, liobn); + if (ret) + pr_warning("%s: failed to remove direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddr_avail[2], liobn); + else + pr_debug("%s: successfully removed direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddr_avail[2], liobn); +} + + +static u64 dupe_ddw_if_already_created(struct pci_dev *dev, struct device_node *pdn) +{ + struct device_node *dn; + 
struct pci_dn *pcidn; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + spin_lock(&direct_window_list_lock); + /* check if we already created a window and dupe that config if so */ + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == pdn) { + direct64 = window->prop; + dma_addr = direct64->dma_base; + break; + } + } + spin_unlock(&direct_window_list_lock); + + return dma_addr; +} + +static u64 dupe_ddw_if_kexec(struct pci_dev *dev, struct device_node *pdn) +{ + struct device_node *dn; + struct pci_dn *pcidn; + int len; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); + if (direct64) { + if (len < sizeof(struct dynamic_dma_window_prop)) { + remove_ddw(pdn); + } else { + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) { + remove_ddw(pdn); + } else { + window->device = pdn; + window->prop = direct64; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + dma_addr = direct64->dma_base; + } + } + } + + return dma_addr; +} + +static int query_ddw(struct pci_dev *dev, const u32 *ddr_avail, + struct ddw_query_response *query) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + ret = rtas_call(ddr_avail[0], 3, 5, (u32 *)query, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" + " returned %d\n", ddr_avail[0], cfg_addr, (u32)BUID_HI(buid), + (u32)BUID_LO(buid), ret); + return ret; +} + +static int create_ddw(struct pci_dev *dev, const u32 *ddr_avail, + struct ddw_create_response *create, int page_shift, + int window_shift) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + + do { + /* extra outputs are LIOBN and dma-addr (hi, lo) */ + ret = rtas_call(ddr_avail[1], 5, 4, (u32 *)create, cfg_addr, + BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + } while (rtas_busy_delay(ret)); + dev_info(&dev->dev, + "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " + "(liobn = 0x%x starting addr = %x %x)\n", ddr_avail[1], + cfg_addr, (u32)BUID_HI(buid), (u32)BUID_LO(buid), page_shift, + window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + + return ret; +} + +/* + * If the PE supports dynamic dma windows, and there is space for a table + * that can map all pages in a linear offset, then setup such a table, + * and record the dma-offset in the struct device. 
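/*
 * create_ddw() above shows the standard retry idiom for RTAS services
 * that can return a busy status: rtas_busy_delay() returns non-zero for
 * RTAS_BUSY and the 990x extended-delay codes, sleeping as required
 * before the caller retries. A generic sketch, assuming a single-input,
 * single-output call whose token was already looked up;
 * example_rtas_retry() is a made-up name:
 */
static int example_rtas_retry(int token, int input)
{
	int rc;

	do {
		rc = rtas_call(token, 1, 1, NULL, input);
	} while (rtas_busy_delay(rc));	/* sleeps on busy/extended delay */

	return rc;
}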
+ * + * dev: the pci device we are checking + * pdn: the parent pe node with the ibm,dma_window property + * Future: also check if we can remap the base window for our base page size + * + * returns the dma offset for use by dma_set_mask + */ +static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +{ + int len, ret; + struct ddw_query_response query; + struct ddw_create_response create; + int page_shift; + u64 dma_addr, max_addr; + struct device_node *dn; + const u32 *uninitialized_var(ddr_avail); + struct direct_window *window; + struct property *win64; + struct dynamic_dma_window_prop *ddwprop; + + mutex_lock(&direct_window_init_mutex); + + dma_addr = dupe_ddw_if_already_created(dev, pdn); + if (dma_addr != 0) + goto out_unlock; + + dma_addr = dupe_ddw_if_kexec(dev, pdn); + if (dma_addr != 0) + goto out_unlock; + + /* + * the ibm,ddw-applicable property holds the tokens for: + * ibm,query-pe-dma-window + * ibm,create-pe-dma-window + * ibm,remove-pe-dma-window + * for the given node in that order. + * the property is actually in the parent, not the PE + */ + ddr_avail = of_get_property(pdn, "ibm,ddw-applicable", &len); + if (!ddr_avail || len < 3 * sizeof(u32)) + goto out_unlock; + + /* + * Query if there is a second window of size to map the + * whole partition. Query returns number of windows, largest + * block assigned to PE (partition endpoint), and two bitmasks + * of page sizes: supported and supported for migrate-dma. + */ + dn = pci_device_to_OF_node(dev); + ret = query_ddw(dev, ddr_avail, &query); + if (ret != 0) + goto out_unlock; + + if (query.windows_available == 0) { + /* + * no additional windows are available for this device. + * We might be able to reallocate the existing window, + * trading in for a larger page size. + */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_unlock; + } + if (query.page_size & 4) { + page_shift = 24; /* 16MB */ + } else if (query.page_size & 2) { + page_shift = 16; /* 64kB */ + } else if (query.page_size & 1) { + page_shift = 12; /* 4kB */ + } else { + dev_dbg(&dev->dev, "no supported direct page size in mask %x", + query.page_size); + goto out_unlock; + } + /* verify the window * number of ptes will map the partition */ + /* check largest block * page size > max memory hotplug addr */ + max_addr = memory_hotplug_max(); + if (query.largest_available_block < (max_addr >> page_shift)) { + dev_dbg(&dev->dev, "can't map partition max 0x%llx with %u " + "%llu-sized pages\n", max_addr, query.largest_available_block, + 1ULL << page_shift); + goto out_unlock; + } + len = order_base_2(max_addr); + win64 = kzalloc(sizeof(struct property), GFP_KERNEL); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property for 64bit dma window\n"); + goto out_unlock; + } + win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); + win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + dev_info(&dev->dev, + "couldn't allocate property name and value\n"); + goto out_free_prop; + } + + ret = create_ddw(dev, ddr_avail, &create, page_shift, len); + if (ret != 0) + goto out_free_prop; + + ddwprop->liobn = cpu_to_be32(create.liobn); + ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2)); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(len); + + dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n", + create.liobn, dn->full_name); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto
out_clear_window; + + ret = walk_system_ram_range(0, lmb_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %s: %d\n", + dn->full_name, ret); + goto out_clear_window; + } + + ret = prom_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %s: %d", + pdn->full_name, ret); + goto out_clear_window; + } + + window->device = pdn; + window->prop = ddwprop; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + + dma_addr = of_read_number(&create.addr_hi, 2); + goto out_unlock; + +out_clear_window: + remove_ddw(pdn); + +out_free_prop: + kfree(win64->name); + kfree(win64->value); + kfree(win64); + +out_unlock: + mutex_unlock(&direct_window_init_mutex); + return dma_addr; +} + static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; @@ -544,7 +1023,7 @@ static void pci_dma_dev_setup_pSeriesLP( pci = PCI_DN(pdn); if (!pci->iommu_table) { - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, pci->phb->node); iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window, pci->phb->bus->number); @@ -556,24 +1035,145 @@ static void pci_dma_dev_setup_pSeriesLP( set_iommu_table_base(&dev->dev, pci->iommu_table); } + +static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +{ + bool ddw_enabled = false; + struct device_node *pdn, *dn; + struct pci_dev *pdev; + const void *dma_window = NULL; + u64 dma_offset; + + if (!dev->dma_mask) + return -EIO; + + if (!dev_is_pci(dev)) + goto check_mask; + + pdev = to_pci_dev(dev); + + /* only attempt to use a new window if 64-bit DMA is requested */ + if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { + dn = pci_device_to_OF_node(pdev); + dev_dbg(dev, "node is %s\n", dn->full_name); + + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated.
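/*
 * The tail of enable_ddw() above also demonstrates how a new device-tree
 * property is published at runtime: allocate the struct property and its
 * name/value, then attach it with prom_add_property(). A minimal sketch
 * (error handling trimmed; "example-prop" and the function name are
 * placeholders):
 */
static int example_add_dt_property(struct device_node *np, u32 payload)
{
	struct property *prop;
	u32 *val;

	prop = kzalloc(sizeof(*prop), GFP_KERNEL);
	val = kmalloc(sizeof(*val), GFP_KERNEL);
	if (!prop || !val) {
		kfree(prop);
		kfree(val);
		return -ENOMEM;
	}

	*val = payload;
	prop->name = kstrdup("example-prop", GFP_KERNEL);
	prop->value = val;
	prop->length = sizeof(*val);

	return prom_add_property(np, prop);	/* 0 on success */
}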
+ */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + if (pdn && PCI_DN(pdn)) { + dma_offset = enable_ddw(pdev, pdn); + if (dma_offset != 0) { + dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); + set_dma_offset(dev, dma_offset); + set_dma_ops(dev, &dma_direct_ops); + ddw_enabled = true; + } + } + } + + /* fall back on iommu ops, restore table pointer with ops */ + if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { + dev_info(dev, "Restoring 32-bit DMA via iommu\n"); + set_dma_ops(dev, &dma_iommu_ops); + pci_dma_dev_setup_pSeriesLP(pdev); + } + +check_mask: + if (!dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + return 0; +} + #else /* CONFIG_PCI */ #define pci_dma_bus_setup_pSeries NULL #define pci_dma_dev_setup_pSeries NULL #define pci_dma_bus_setup_pSeriesLP NULL #define pci_dma_dev_setup_pSeriesLP NULL +#define dma_set_mask_pSeriesLP NULL #endif /* !CONFIG_PCI */ +static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct direct_window *window; + struct memory_notify *arg = data; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + default: + break; + } + if (ret && action != MEM_CANCEL_ONLINE) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static struct notifier_block iommu_mem_nb = { + .notifier_call = iommu_mem_notifier, +}; + static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) { int err = NOTIFY_OK; struct device_node *np = node; struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; switch (action) { case PSERIES_RECONFIG_REMOVE: - if (pci && pci->iommu_table && - of_get_property(np, "ibm,dma-window", NULL)) + if (pci && pci->iommu_table) iommu_free_table(pci->iommu_table, np->full_name); + + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == np) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&direct_window_list_lock); + + /* + * Because the notifier runs after isolation of the + * slot, we are guaranteed any DMA window has already + * been revoked and the TCEs have been marked invalid, + * so we don't need a call to remove_ddw(np). However, + * if an additional notifier action is added before the + * isolate call, we should update this code for + * completeness with such a call. 
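/*
 * The iommu_mem_notifier above follows the stock memory-hotplug hook
 * shape: a notifier_block registered with register_memory_notifier() is
 * invoked with a struct memory_notify describing the pfn range on every
 * online/offline transition. A minimal consumer, with made-up names:
 */
static int example_mem_event(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	struct memory_notify *arg = data;

	if (action == MEM_GOING_ONLINE || action == MEM_OFFLINE)
		pr_debug("pfn range %lx..%lx changed state\n",
			 arg->start_pfn, arg->start_pfn + arg->nr_pages - 1);

	return NOTIFY_OK;
}

static struct notifier_block example_mem_nb = {
	.notifier_call = example_mem_event,
};
/* registered once at init: register_memory_notifier(&example_mem_nb); */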
+ */ break; default: err = NOTIFY_DONE; @@ -608,6 +1208,7 @@ void iommu_init_early_pSeries(void) ppc_md.tce_get = tce_get_pSeriesLP; ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP; ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP; + ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; } else { ppc_md.tce_build = tce_build_pSeries; ppc_md.tce_free = tce_free_pSeries; @@ -618,6 +1219,7 @@ void iommu_init_early_pSeries(void) pSeries_reconfig_notifier_register(&iommu_reconfig_nb); + register_memory_notifier(&iommu_mem_nb); set_pci_dma_ops(&dma_iommu_ops); } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/Kconfig linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Kconfig --- linux-2.6.32/arch/powerpc/platforms/pseries/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Kconfig 2015-09-10 10:06:28.000000000 -0700 @@ -30,6 +30,16 @@ config PSERIES_MSI depends on PCI_MSI && EEH default y +config PSERIES_ENERGY + tristate "pSeries energy management capabilities driver" + depends on PPC_PSERIES + default m + help + Provides interface to platform energy management capabilities + on supported PSERIES platforms. + Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list + and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint + config SCANLOG tristate "Scanlog dump interface" depends on RTAS_PROC && PPC_PSERIES @@ -59,7 +69,7 @@ config PPC_SMLPAR config CMM tristate "Collaborative memory management" - depends on PPC_SMLPAR && !CRASH_DUMP + depends on PPC_SMLPAR default y help Select this option, if you want to enable the kernel interface diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/kexec.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/kexec.c --- linux-2.6.32/arch/powerpc/platforms/pseries/kexec.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/kexec.c 2015-09-10 10:06:45.000000000 -0700 @@ -23,6 +23,17 @@ static void pseries_kexec_cpu_down(int c /* Don't risk a hypervisor call if we're crashing */ if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { unsigned long addr; + int ret; + + if (get_lppaca()->dtl_enable_mask) { + ret = unregister_dtl(hard_smp_processor_id()); + if (ret) { + pr_err("WARNING: DTL deregistration for cpu " + "%d (hw %d) failed with %d\n", + smp_processor_id(), + hard_smp_processor_id(), ret); + } + } addr = __pa(get_slb_shadow()); if (unregister_slb_shadow(hard_smp_processor_id(), addr)) diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/lpar.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/lpar.c --- linux-2.6.32/arch/powerpc/platforms/pseries/lpar.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/lpar.c 2015-09-10 10:07:23.000000000 -0700 @@ -247,6 +247,8 @@ void vpa_init(int cpu) int hwcpu = get_hard_smp_processor_id(cpu); unsigned long addr; long ret; + struct paca_struct *pp; + struct dtl_entry *dtl; if (cpu_has_feature(CPU_FTR_ALTIVEC)) lppaca[cpu].vmxregs_in_use = 1; @@ -273,6 +275,25 @@ void vpa_init(int cpu) "registration for cpu %d (hw %d) of area %lx " "returns %ld\n", cpu, hwcpu, addr, ret); } + + /* + * Register dispatch trace log, if one has been allocated. 
+ */ + pp = &paca[cpu]; + dtl = pp->dispatch_log; + if (dtl) { + pp->dtl_ridx = 0; + pp->dtl_curr = dtl; + lppaca[cpu].dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + ret = register_dtl(hwcpu, __pa(dtl)); + if (ret) + pr_warning("DTL registration failed for cpu %d (%ld)\n", + cpu, ret); + lppaca[cpu].dtl_enable_mask = 2; + } } static long pSeries_lpar_hpte_insert(unsigned long hpte_group, @@ -307,6 +328,8 @@ static long pSeries_lpar_hpte_insert(uns /* Make pHyp happy */ if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU)) hpte_r &= ~_PAGE_COHERENT; + if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) + flags |= H_COALESCE_CAND; lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot); if (unlikely(lpar_rc == H_PTEG_FULL)) { @@ -353,7 +376,13 @@ static long pSeries_lpar_hpte_remove(uns (0x1UL << 4), &dummy1, &dummy2); if (lpar_rc == H_SUCCESS) return i; - BUG_ON(lpar_rc != H_NOT_FOUND); + + /* + * The test for adjunct partition is performed before the + * ANDCOND test. H_RESOURCE may be returned, so we need to + * check for that as well. + */ + BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE); slot_offset++; slot_offset &= 0x7; @@ -368,7 +397,7 @@ static void pSeries_lpar_hptab_clear(voi unsigned long hpte_count = size_bytes >> 4; unsigned long dummy1, dummy2, dword0; long lpar_rc; - int i; + unsigned long i; /* TODO: Use bulk call */ for (i = 0; i < hpte_count; i++) { @@ -661,3 +690,47 @@ void arch_free_page(struct page *page, i EXPORT_SYMBOL(arch_free_page); #endif + +/** + * h_get_mpp + * H_GET_MPP hcall returns info in 7 parms + */ +int h_get_mpp(struct hvcall_mpp_data *mpp_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_MPP, retbuf); + + mpp_data->entitled_mem = retbuf[0]; + mpp_data->mapped_mem = retbuf[1]; + + mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + mpp_data->pool_num = retbuf[2] & 0xffff; + + mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; + mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; + mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; + + mpp_data->pool_size = retbuf[4]; + mpp_data->loan_request = retbuf[5]; + mpp_data->backing_mem = retbuf[6]; + + return rc; +} +EXPORT_SYMBOL(h_get_mpp); + +int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 }; + + rc = plpar_hcall9(H_GET_MPP_X, retbuf); + + mpp_x_data->coalesced_bytes = retbuf[0]; + mpp_x_data->pool_coalesced_bytes = retbuf[1]; + mpp_x_data->pool_purr_cycles = retbuf[2]; + mpp_x_data->pool_spurr_cycles = retbuf[3]; + + return rc; +} diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/Makefile linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Makefile --- linux-2.6.32/arch/powerpc/platforms/pseries/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Makefile 2015-09-10 10:07:15.000000000 -0700 @@ -8,7 +8,7 @@ endif obj-y := lpar.o hvCall.o nvram.o reconfig.o \ setup.o iommu.o ras.o rtasd.o \ - firmware.o power.o + firmware.o power.o dlpar.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_SCANLOG) += scanlog.o @@ -16,6 +16,7 @@ obj-$(CONFIG_EEH) += eeh.o eeh_cache.o e obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_PCI) += pci.o pci_dlpar.o obj-$(CONFIG_PSERIES_MSI) += msi.o +obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o 
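/*
 * h_get_mpp() above unpacks several fields from each 64-bit return
 * register of H_GET_MPP by shifting and masking; retbuf[3], for
 * instance, is split as (per the code above):
 *
 *	bits 63..56  mem_weight              (retbuf[3] >> 56) & 0xff
 *	bits 55..48  unallocated_mem_weight  (retbuf[3] >> 48) & 0xff
 *	bits 47..0   unallocated_entitlement retbuf[3] & 0xffffffffffff
 *
 * e.g. a retbuf[3] of 0x80400000deadbeef (illustrative value) yields
 * mem_weight 0x80, unallocated weight 0x40 and entitlement 0xdeadbeef.
 */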
obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o @@ -23,6 +24,9 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug- obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o -obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DTL) += dtl.o + +ifeq ($(CONFIG_PPC_PSERIES),y) +obj-$(CONFIG_SUSPEND) += suspend.o +endif diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/msi.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/msi.c --- linux-2.6.32/arch/powerpc/platforms/pseries/msi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/msi.c 2015-09-10 10:06:34.000000000 -0700 @@ -93,8 +93,18 @@ static void rtas_disable_msi(struct pci_ if (!pdn) return; - if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) - pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + /* + * disabling MSI with the explicit interface also disables MSI-X + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) { + /* + * may have failed because explicit interface is not + * present + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) { + pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + } + } } static int rtas_query_irq_number(struct pci_dn *pdn, int offset) diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/nvram.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/nvram.c --- linux-2.6.32/arch/powerpc/platforms/pseries/nvram.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/nvram.c 2015-09-10 10:07:48.000000000 -0700 @@ -17,17 +17,110 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +/* + * Set oops header version to distinguish between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header.
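/*
 * A sketch of how a reader distinguishes the two layouts (mirroring the
 * logic in nvram_pstore_read() further below): the old format began with
 * a u16 holding the text length itself, which could never reach 4000, so
 * any leading u16 >= OOPS_HDR_VERSION marks the new header.
 * example_oops_hdr_size() is a made-up name; it returns the header size
 * and reports the text length through *text_len:
 */
static size_t example_oops_hdr_size(char *buff, size_t *text_len)
{
	struct oops_log_info *hdr = (struct oops_log_info *)buff;

	if (hdr->version >= OOPS_HDR_VERSION) {	/* new: full header */
		*text_len = hdr->report_length;
		return sizeof(*hdr);
	}
	/* old: the leading u16 was the length; no timestamp available */
	*text_len = hdr->version;
	return sizeof(u16);
}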
+ */ +#define OOPS_HDR_VERSION 5000 + static unsigned int nvram_size; static int nvram_fetch, nvram_store; static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ static DEFINE_SPINLOCK(nvram_lock); +struct err_log_info { + int error_type; + unsigned int seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +static struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; + +static struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *const pseries_nvram_os_partitions[] = { + "ibm,rtas-log", + "lnx,oops-log", + NULL +}; + +struct oops_log_info { + u16 version; + u16 report_length; + u64 timestamp; +} __attribute__((packed)); + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* See clobbering_unread_rtas_event() */ +#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ +static unsigned long last_unread_rtas_event; /* timestamp */ + +/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */ +static char *oops_data; +static size_t oops_data_sz; +static char *oops_buf; + +#ifdef CONFIG_PSTORE +static struct nvram_os_partition of_config_partition = { + .name = "of-config", + .index = -1, + .os_partition = false +}; + +static struct nvram_os_partition common_partition = { + .name = "common", + .index = -1, + .os_partition = false +}; + +static enum pstore_type_id nvram_type_ids[] = { + PSTORE_TYPE_DMESG, + PSTORE_TYPE_PPC_RTAS, + PSTORE_TYPE_PPC_OF, + PSTORE_TYPE_PPC_COMMON, + -1 +}; +static int read_type; +static unsigned long last_rtas_event; +#endif static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) { @@ -120,6 +213,476 @@ static ssize_t pSeries_nvram_get_size(vo return nvram_size ? nvram_size : -ENODEV; } + +/* nvram_write_os_partition, nvram_write_error_log + * + * We need to buffer the error logs into nvram to ensure that we have + * the failure information to decode. If we have a severe error there + * is no way to guarantee that the OS or the machine is in a state to + * get back to user land and write the error to disk. For example if + * the SCSI device driver causes a Machine Check by writing to a bad + * IO address, there is no way of guaranteeing that the device driver + * is in any state that it would also be able to write the error data + * captured to disk, thus we buffer it in NVRAM for analysis on the + * next boot.
+ * + * In NVRAM the partition containing the error log buffer will look like: + * Header (in bytes): + * +-----------+----------+--------+------------+------------------+ + * | signature | checksum | length | name | data | + * |0 |1 |2 3|4 15|16 length-1| + * +-----------+----------+--------+------------+------------------+ + * + * The 'data' section would look like (in bytes): + * +--------------+------------+-----------------------------------+ + * | event_logged | sequence # | error log | + * |0 3|4 7|8 error_log_size-1| + * +--------------+------------+-----------------------------------+ + * + * event_logged: 0 if event has not been logged to syslog, 1 if it has + * sequence #: The unique sequence # for each event. (until it wraps) + * error log: The error log from event_scan + */ +int nvram_write_os_partition(struct nvram_os_partition *part, char * buff, + int length, unsigned int err_type, unsigned int error_log_cnt) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (part->index < 0) { + return -ESPIPE; + } + + if (length > part->size) { + length = part->size; + } + + info.error_type = err_type; + info.seq_num = error_log_cnt; + + tmp_index = part->index; + + rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + return rc; + } + + rc = ppc_md.nvram_write(buff, length, &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + return rc; + } + + return 0; +} + +int nvram_write_error_log(char * buff, int length, + unsigned int err_type, unsigned int error_log_cnt) +{ + int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); + if (!rc) { + last_unread_rtas_event = get_seconds(); +#ifdef CONFIG_PSTORE + last_rtas_event = get_seconds(); +#endif + } + + return rc; +} + +/* nvram_read_partition + * + * Reads nvram partition for at most 'length' + */ +int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (part->index == -1) + return -1; + + if (length > part->size) + length = part->size; + + tmp_index = part->index; + + if (part->os_partition) { + rc = ppc_md.nvram_read((char *)&info, + sizeof(struct err_log_info), + &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, + rc); + return rc; + } + } + + rc = ppc_md.nvram_read(buff, length, &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc); + return rc; + } + + if (part->os_partition) { + *error_log_cnt = info.seq_num; + *err_type = info.error_type; + } + + return 0; +} + +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char *buff, int length, + unsigned int *err_type, unsigned int *error_log_cnt) +{ + return nvram_read_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); } + + +/* This doesn't actually zero anything, but it sets the event_logged + * word to tell that this event is safely in syslog.
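/*
 * Usage sketch for the pair of helpers above (hypothetical caller, not
 * part of this patch): persist an event when it happens, then recover
 * it, its type and its sequence number on the next boot.
 */
static void example_log_roundtrip(char *buf, int len)
{
	unsigned int err_type, seq;

	/* at event time */
	nvram_write_error_log(buf, len, ERR_TYPE_RTAS_LOG, 1);

	/* on the next boot */
	if (nvram_read_error_log(buf, len, &err_type, &seq) == 0)
		pr_info("recovered event %u, type %u\n", seq, err_type);
}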
+ */ +int nvram_clear_error_log(void) +{ + loff_t tmp_index; + int clear_word = ERR_FLAG_ALREADY_LOGGED; + int rc; + + if (rtas_log_partition.index < 0) + return -1; + + tmp_index = rtas_log_partition.index; + + rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); + return rc; + } + last_unread_rtas_event = 0; + + return 0; +} + +/* pseries_nvram_init_os_partition + * + * This sets up a partition with an "OS" signature. + * + * The general strategy is the following: + * 1.) If a partition with the indicated name already exists... + * - If it's large enough, use it. + * - Otherwise, recycle it and keep going. + * 2.) Search for a free partition that is large enough. + * 3.) If there's not a free partition large enough, recycle any obsolete + * OS partitions and try again. + * 4.) Will first try getting a chunk that will satisfy the requested size. + * 5.) If a chunk of the requested size cannot be allocated, then try finding + * a chunk that will satisfy the minimum needed. + * + * Returns 0 on success, else -1. + */ +static int __init pseries_nvram_init_os_partition(struct nvram_os_partition + *part) +{ + loff_t p; + int size; + + /* Scan nvram for partitions */ + nvram_scan_partitions(); + + /* Look for ours */ + p = nvram_find_partition2(part->name, NVRAM_SIG_OS, &size); + + /* Found one but too small, remove it */ + if (p && size < part->min_size) { + pr_info("nvram: Found too small %s partition," + " removing it...\n", part->name); + nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL); + p = 0; + } + + /* Create one if we didn't find */ + if (!p) { + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); + if (p == -ENOSPC) { + pr_info("nvram: No room to create %s partition, " + "deleting any obsolete OS partitions...\n", + part->name); + nvram_remove_partition(NULL, NVRAM_SIG_OS, + pseries_nvram_os_partitions); + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); + } + } + + if (p <= 0) { + pr_err("nvram: Failed to find or create %s" + " partition, err %d\n", part->name, (int)p); + return -1; + } + + part->index = p; + part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info); + + return 0; +} + +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +static int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && get_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +#ifdef CONFIG_PSTORE +static int nvram_pstore_open(struct pstore_info *psi) +{ + /* Reset the iterator to start reading partitions again */ + read_type = -1; + return 0; +} + +/** + * nvram_pstore_write - pstore write callback for nvram + * @type: Type of message logged + * @reason: reason behind dump (oops/panic) + * @id: identifier to indicate the write performed + * @part: pstore writes data to registered buffer in parts, + * part number will indicate the same. 
+ * @count: Indicates oops count + * @size: number of bytes written to the registered buffer + * @psi: registered pstore_info structure + * + * Called by pstore_dump() when an oops or panic report is logged in the + * printk buffer. + * Returns 0 on successful write. + */ +static int nvram_pstore_write(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, int count, + size_t size, struct pstore_info *psi) +{ + int rc; + struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; + + /* part 1 has the recent messages from printk buffer */ + if (part > 1 || type != PSTORE_TYPE_DMESG || + clobbering_unread_rtas_event()) + return -1; + + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) size; + oops_hdr->timestamp = get_seconds(); + rc = nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC, + count); + + if (rc != 0) + return rc; + + *id = part; + return 0; +} + +/* + * Reads the oops/panic report, rtas, of-config and common partition. + * Returns the length of the data we read from each partition. + * Returns 0 if we've been called before. + */ +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, char **buf, + struct pstore_info *psi) +{ + struct oops_log_info *oops_hdr; + unsigned int err_type, id_no, size = 0; + struct nvram_os_partition *part = NULL; + char *buff = NULL; + int sig = 0; + loff_t p; + + read_type++; + + switch (nvram_type_ids[read_type]) { + case PSTORE_TYPE_DMESG: + part = &oops_log_partition; + *type = PSTORE_TYPE_DMESG; + break; + case PSTORE_TYPE_PPC_RTAS: + part = &rtas_log_partition; + *type = PSTORE_TYPE_PPC_RTAS; + time->tv_sec = last_rtas_event; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_OF: + sig = NVRAM_SIG_OF; + part = &of_config_partition; + *type = PSTORE_TYPE_PPC_OF; + *id = PSTORE_TYPE_PPC_OF; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_COMMON: + sig = NVRAM_SIG_SYS; + part = &common_partition; + *type = PSTORE_TYPE_PPC_COMMON; + *id = PSTORE_TYPE_PPC_COMMON; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + default: + return 0; + } + + if (!part->os_partition) { + p = nvram_find_partition2(part->name, sig, &size); + if (p <= 0) { + pr_err("nvram: Failed to find partition %s, " + "err %d\n", part->name, (int)p); + return 0; + } + part->index = p; + part->size = size; + } + + buff = kmalloc(part->size, GFP_KERNEL); + + if (!buff) + return -ENOMEM; + + if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) { + kfree(buff); + return 0; + } + + *count = 0; + + if (part->os_partition) + *id = id_no; + + if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { + int length; + size_t hdr_size; + + oops_hdr = (struct oops_log_info *)buff; + if (oops_hdr->version < OOPS_HDR_VERSION) { + hdr_size = sizeof(u16); + length = oops_hdr->version; + time->tv_sec = 0; + time->tv_nsec = 0; + } else { + hdr_size = sizeof(*oops_hdr); + length = oops_hdr->report_length; + time->tv_sec = oops_hdr->timestamp; + time->tv_nsec = 0; + } + + *buf = kmalloc(length, GFP_KERNEL); + if (*buf == NULL) + return -ENOMEM; + memcpy(*buf, buff + hdr_size, length); + kfree(buff); + return length; + } + + *buf = buff; + return part->size; +} + +static struct pstore_info nvram_pstore_info = { + .owner = THIS_MODULE, + .name = "nvram", + .open = nvram_pstore_open, + .read = nvram_pstore_read, + .write = nvram_pstore_write, +}; + +static int nvram_pstore_init(void) +{ + int 
rc = 0; + + nvram_pstore_info.buf = oops_data; + nvram_pstore_info.bufsize = oops_data_sz; + + rc = pstore_register(&nvram_pstore_info); + if (rc != 0) + pr_err("nvram: pstore_register() failed, defaults to " + "kmsg_dump; returned %d\n", rc); + + return rc; +} +#else +static int nvram_pstore_init(void) +{ + return -1; +} +#endif + +static void __init nvram_init_oops_partition(int rtas_partition_exists) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&oops_log_partition); + if (rc != 0) { + if (!rtas_partition_exists) + return; + pr_notice("nvram: Using %s partition to log both" + " RTAS errors and oops/panic reports\n", + rtas_log_partition.name); + memcpy(&oops_log_partition, &rtas_log_partition, + sizeof(rtas_log_partition)); + } + oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL); + if (!oops_buf) { + pr_err("nvram: No memory for %s partition\n", + oops_log_partition.name); + return; + } + oops_data = oops_buf + sizeof(struct oops_log_info); + oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info); + + rc = nvram_pstore_init(); + + if (!rc) + return; + + rc = kmsg_dump_register(&nvram_kmsg_dumper); + if (rc != 0) { + pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); + kfree(oops_buf); + return; + } +} + +static int __init pseries_nvram_init_log_partitions(void) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&rtas_log_partition); + nvram_init_oops_partition(rc == 0); + return 0; +} +machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); + int __init pSeries_nvram_init(void) { struct device_node *nvram; @@ -149,3 +712,50 @@ int __init pSeries_nvram_init(void) return 0; } + +/* + * Try to capture the last capture_len bytes of the printk buffer. Return + * the amount actually captured. + */ +static size_t capture_last_msgs(const char *old_msgs, size_t old_len, + const char *new_msgs, size_t new_len, + char *captured, size_t capture_len) +{ + if (new_len >= capture_len) { + memcpy(captured, new_msgs + (new_len - capture_len), + capture_len); + return capture_len; + } else { + /* Grab the end of old_msgs. 
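+		 *
+		 * Worked example (illustrative values only): with
+		 * old_msgs = "abc" (old_len = 3), new_msgs = "de"
+		 * (new_len = 2) and capture_len = 4, old_tail_len below is
+		 * min(3, 4 - 2) = 2, so "bc" is copied first, "de" is
+		 * appended after it, and the function reports 4 bytes
+		 * captured.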
*/ + size_t old_tail_len = min(old_len, capture_len - new_len); + memcpy(captured, old_msgs + (old_len - old_tail_len), + old_tail_len); + memcpy(captured + old_tail_len, new_msgs, new_len); + return old_tail_len + new_len; + } +} + + +/* our kmsg_dump callback */ +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len) +{ + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + static unsigned int oops_count = 0; + size_t text_len; + + if (clobbering_unread_rtas_event()) + return; + + text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len, + oops_data, oops_data_sz); + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) text_len; + oops_hdr->timestamp = get_seconds(); + + nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + text_len), + ERR_TYPE_KERNEL_PANIC, ++oops_count); +} diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/offline_states.h linux-2.6.32.ovz/arch/powerpc/platforms/pseries/offline_states.h --- linux-2.6.32/arch/powerpc/platforms/pseries/offline_states.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/offline_states.h 2015-09-10 10:06:00.000000000 -0700 @@ -0,0 +1,19 @@ +#ifndef _OFFLINE_STATES_H_ +#define _OFFLINE_STATES_H_ + +/* Cpu offline states go here */ +enum cpu_state_vals { + CPU_STATE_OFFLINE, + CPU_STATE_INACTIVE, + CPU_STATE_ONLINE, + CPU_MAX_OFFLINE_STATES +}; + +extern enum cpu_state_vals get_cpu_current_state(int cpu); +extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); +extern enum cpu_state_vals get_preferred_offline_state(int cpu); +extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); +extern void set_default_offline_state(int cpu); +extern int start_secondary(void); +extern void start_secondary_resume(void); +#endif diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/pci_dlpar.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/pci_dlpar.c --- linux-2.6.32/arch/powerpc/platforms/pseries/pci_dlpar.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/pci_dlpar.c 2015-09-10 10:08:00.000000000 -0700 @@ -66,6 +66,17 @@ pcibios_find_pci_bus(struct device_node EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); /** + * pcibios_release_device - release PCI device + * @dev: PCI device + * + * The function is called before releasing the indicated PCI device. + */ +void pcibios_release_device(struct pci_dev *dev) +{ + eeh_remove_device(dev); +} + +/** * pcibios_remove_pci_devices - remove all devices under this bus * * Remove all of the PCI devices under this bus both from the @@ -85,7 +96,7 @@ void pcibios_remove_pci_devices(struct p list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { pr_debug(" * Removing %s...\n", pci_name(dev)); eeh_remove_bus_device(dev); - pci_remove_bus_device(dev); + pci_stop_and_remove_bus_device(dev); } } EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/phyp_dump.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/phyp_dump.c --- linux-2.6.32/arch/powerpc/platforms/pseries/phyp_dump.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/phyp_dump.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,512 +0,0 @@ -/* - * Hypervisor-assisted dump - * - * Linas Vepstas, Manish Ahuja 2008 - * Copyright 2008 IBM Corp. 
- * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* Variables, used to communicate data between early boot and late boot */ -static struct phyp_dump phyp_dump_vars; -struct phyp_dump *phyp_dump_info = &phyp_dump_vars; - -static int ibm_configure_kernel_dump; -/* ------------------------------------------------- */ -/* RTAS interfaces to declare the dump regions */ - -struct dump_section { - u32 dump_flags; - u16 source_type; - u16 error_flags; - u64 source_address; - u64 source_length; - u64 length_copied; - u64 destination_address; -}; - -struct phyp_dump_header { - u32 version; - u16 num_of_sections; - u16 status; - - u32 first_offset_section; - u32 dump_disk_section; - u64 block_num_dd; - u64 num_of_blocks_dd; - u32 offset_dd; - u32 maxtime_to_auto; - /* No dump disk path string used */ - - struct dump_section cpu_data; - struct dump_section hpte_data; - struct dump_section kernel_data; -}; - -/* The dump header *must be* in low memory, so .bss it */ -static struct phyp_dump_header phdr; - -#define NUM_DUMP_SECTIONS 3 -#define DUMP_HEADER_VERSION 0x1 -#define DUMP_REQUEST_FLAG 0x1 -#define DUMP_SOURCE_CPU 0x0001 -#define DUMP_SOURCE_HPTE 0x0002 -#define DUMP_SOURCE_RMO 0x0011 -#define DUMP_ERROR_FLAG 0x2000 -#define DUMP_TRIGGERED 0x4000 -#define DUMP_PERFORMED 0x8000 - - -/** - * init_dump_header() - initialize the header declaring a dump - * Returns: length of dump save area. - * - * When the hypervisor saves crashed state, it needs to put - * it somewhere. The dump header tells the hypervisor where - * the data can be saved. 
- */ -static unsigned long init_dump_header(struct phyp_dump_header *ph) -{ - unsigned long addr_offset = 0; - - /* Set up the dump header */ - ph->version = DUMP_HEADER_VERSION; - ph->num_of_sections = NUM_DUMP_SECTIONS; - ph->status = 0; - - ph->first_offset_section = - (u32)offsetof(struct phyp_dump_header, cpu_data); - ph->dump_disk_section = 0; - ph->block_num_dd = 0; - ph->num_of_blocks_dd = 0; - ph->offset_dd = 0; - - ph->maxtime_to_auto = 0; /* disabled */ - - /* The first two sections are mandatory */ - ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG; - ph->cpu_data.source_type = DUMP_SOURCE_CPU; - ph->cpu_data.source_address = 0; - ph->cpu_data.source_length = phyp_dump_info->cpu_state_size; - ph->cpu_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->cpu_state_size; - - ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG; - ph->hpte_data.source_type = DUMP_SOURCE_HPTE; - ph->hpte_data.source_address = 0; - ph->hpte_data.source_length = phyp_dump_info->hpte_region_size; - ph->hpte_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->hpte_region_size; - - /* This section describes the low kernel region */ - ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG; - ph->kernel_data.source_type = DUMP_SOURCE_RMO; - ph->kernel_data.source_address = PHYP_DUMP_RMR_START; - ph->kernel_data.source_length = PHYP_DUMP_RMR_END; - ph->kernel_data.destination_address = addr_offset; - addr_offset += ph->kernel_data.source_length; - - return addr_offset; -} - -static void print_dump_header(const struct phyp_dump_header *ph) -{ -#ifdef DEBUG - if (ph == NULL) - return; - - printk(KERN_INFO "dump header:\n"); - /* setup some ph->sections required */ - printk(KERN_INFO "version = %d\n", ph->version); - printk(KERN_INFO "Sections = %d\n", ph->num_of_sections); - printk(KERN_INFO "Status = 0x%x\n", ph->status); - - /* No ph->disk, so all should be set to 0 */ - printk(KERN_INFO "Offset to first section 0x%x\n", - ph->first_offset_section); - printk(KERN_INFO "dump disk sections should be zero\n"); - printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section); - printk(KERN_INFO "block num = %lld\n", ph->block_num_dd); - printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd); - printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd); - printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto); - - /*set cpu state and hpte states as well scratch pad area */ - printk(KERN_INFO " CPU AREA \n"); - printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags); - printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type); - printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags); - printk(KERN_INFO "cpu source_address =%llx\n", - ph->cpu_data.source_address); - printk(KERN_INFO "cpu source_length =%llx\n", - ph->cpu_data.source_length); - printk(KERN_INFO "cpu length_copied =%llx\n", - ph->cpu_data.length_copied); - - printk(KERN_INFO " HPTE AREA \n"); - printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags); - printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type); - printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags); - printk(KERN_INFO "HPTE source_address =%llx\n", - ph->hpte_data.source_address); - printk(KERN_INFO "HPTE source_length =%llx\n", - ph->hpte_data.source_length); - printk(KERN_INFO "HPTE length_copied =%llx\n", - ph->hpte_data.length_copied); - - printk(KERN_INFO " SRSD AREA \n"); - printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags); - printk(KERN_INFO 
"SRSD source_type =%d\n", ph->kernel_data.source_type); - printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags); - printk(KERN_INFO "SRSD source_address =%llx\n", - ph->kernel_data.source_address); - printk(KERN_INFO "SRSD source_length =%llx\n", - ph->kernel_data.source_length); - printk(KERN_INFO "SRSD length_copied =%llx\n", - ph->kernel_data.length_copied); -#endif -} - -static ssize_t show_phyp_dump_active(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - - /* create filesystem entry so kdump is phyp-dump aware */ - return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot); -} - -static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600, - show_phyp_dump_active, - NULL); - -static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - /* ToDo Invalidate kdump and free memory range. */ - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 1, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) on " - "register\n", rc); - print_dump_header(ph); - return; - } - - rc = sysfs_create_file(kernel_kobj, &pdl.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs" - " file (%d)\n", rc); -} - -static -void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 2, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) " - "on invalidate\n", rc); - print_dump_header(ph); - } -} - -/* ------------------------------------------------- */ -/** - * release_memory_range -- release memory previously lmb_reserved - * @start_pfn: starting physical frame number - * @nr_pages: number of pages to free. - * - * This routine will release memory that had been previously - * lmb_reserved in early boot. The released memory becomes - * available for genreal use. - */ -static void release_memory_range(unsigned long start_pfn, - unsigned long nr_pages) -{ - struct page *rpage; - unsigned long end_pfn; - long i; - - end_pfn = start_pfn + nr_pages; - - for (i = start_pfn; i <= end_pfn; i++) { - rpage = pfn_to_page(i); - if (PageReserved(rpage)) { - ClearPageReserved(rpage); - init_page_count(rpage); - __free_page(rpage); - totalram_pages++; - } - } -} - -/** - * track_freed_range -- Counts the range being freed. - * Once the counter goes to zero, it re-registers dump for - * future use. 
- */ -static void -track_freed_range(unsigned long addr, unsigned long length) -{ - static unsigned long scratch_area_size, reserved_area_size; - - if (addr < phyp_dump_info->init_reserve_start) - return; - - if ((addr >= phyp_dump_info->init_reserve_start) && - (addr <= phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size)) - reserved_area_size += length; - - if ((addr >= phyp_dump_info->reserved_scratch_addr) && - (addr <= phyp_dump_info->reserved_scratch_addr + - phyp_dump_info->reserved_scratch_size)) - scratch_area_size += length; - - if ((reserved_area_size == phyp_dump_info->init_reserve_size) && - (scratch_area_size == phyp_dump_info->reserved_scratch_size)) { - - invalidate_last_dump(&phdr, - phyp_dump_info->reserved_scratch_addr); - register_dump_area(&phdr, - phyp_dump_info->reserved_scratch_addr); - } -} - -/* ------------------------------------------------- */ -/** - * sysfs_release_region -- sysfs interface to release memory range. - * - * Usage: - * "echo > /sys/kernel/release_region" - * - * Example: - * "echo 0x40000000 0x10000000 > /sys/kernel/release_region" - * - * will release 256MB starting at 1GB. - */ -static ssize_t store_release_region(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long start_addr, length, end_addr; - unsigned long start_pfn, nr_pages; - ssize_t ret; - - ret = sscanf(buf, "%lx %lx", &start_addr, &length); - if (ret != 2) - return -EINVAL; - - track_freed_range(start_addr, length); - - /* Range-check - don't free any reserved memory that - * wasn't reserved for phyp-dump */ - if (start_addr < phyp_dump_info->init_reserve_start) - start_addr = phyp_dump_info->init_reserve_start; - - end_addr = phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size; - if (start_addr+length > end_addr) - length = end_addr - start_addr; - - /* Release the region of memory assed in by user */ - start_pfn = PFN_DOWN(start_addr); - nr_pages = PFN_DOWN(length); - release_memory_range(start_pfn, nr_pages); - - return count; -} - -static ssize_t show_release_region(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - u64 second_addr_range; - - /* total reserved size - start of scratch area */ - second_addr_range = phyp_dump_info->init_reserve_size - - phyp_dump_info->reserved_scratch_size; - return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:" - " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n", - phdr.cpu_data.destination_address, - phdr.cpu_data.length_copied, - phdr.hpte_data.destination_address, - phdr.hpte_data.length_copied, - phdr.kernel_data.destination_address, - phdr.kernel_data.length_copied, - phyp_dump_info->init_reserve_start, - second_addr_range); -} - -static struct kobj_attribute rr = __ATTR(release_region, 0600, - show_release_region, - store_release_region); - -static int __init phyp_dump_setup(void) -{ - struct device_node *rtas; - const struct phyp_dump_header *dump_header = NULL; - unsigned long dump_area_start; - unsigned long dump_area_length; - int header_len = 0; - int rc; - - /* If no memory was reserved in early boot, there is nothing to do */ - if (phyp_dump_info->init_reserve_size == 0) - return 0; - - /* Return if phyp dump not supported */ - if (!phyp_dump_info->phyp_dump_configured) - return -ENOSYS; - - /* Is there dump data waiting for us? If there isn't, - * then register a new dump area, and release all of - * the rest of the reserved ram. 
- * - * The /rtas/ibm,kernel-dump rtas node is present only - * if there is dump data waiting for us. - */ - rtas = of_find_node_by_path("/rtas"); - if (rtas) { - dump_header = of_get_property(rtas, "ibm,kernel-dump", - &header_len); - of_node_put(rtas); - } - - ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump"); - - print_dump_header(dump_header); - dump_area_length = init_dump_header(&phdr); - /* align down */ - dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK; - - if (dump_header == NULL) { - register_dump_area(&phdr, dump_area_start); - return 0; - } - - /* re-register the dump area, if old dump was invalid */ - if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) { - invalidate_last_dump(&phdr, dump_area_start); - register_dump_area(&phdr, dump_area_start); - return 0; - } - - if (dump_header) { - phyp_dump_info->reserved_scratch_addr = - dump_header->cpu_data.destination_address; - phyp_dump_info->reserved_scratch_size = - dump_header->cpu_data.source_length + - dump_header->hpte_data.source_length + - dump_header->kernel_data.source_length; - } - - /* Should we create a dump_subsys, analogous to s390/ipl.c ? */ - rc = sysfs_create_file(kernel_kobj, &rr.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n", - rc); - - /* ToDo: re-register the dump area, for next time. */ - return 0; -} -machine_subsys_initcall(pseries, phyp_dump_setup); - -int __init early_init_dt_scan_phyp_dump(unsigned long node, - const char *uname, int depth, void *data) -{ - const unsigned int *sizes; - - phyp_dump_info->phyp_dump_configured = 0; - phyp_dump_info->phyp_dump_is_active = 0; - - if (depth != 1 || strcmp(uname, "rtas") != 0) - return 0; - - if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL)) - phyp_dump_info->phyp_dump_configured++; - - if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL)) - phyp_dump_info->phyp_dump_is_active++; - - sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", - NULL); - if (!sizes) - return 0; - - if (sizes[0] == 1) - phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]); - - if (sizes[3] == 2) - phyp_dump_info->hpte_region_size = - *((unsigned long *)&sizes[4]); - return 1; -} - -/* Look for phyp_dump= cmdline option */ -static int __init early_phyp_dump_enabled(char *p) -{ - phyp_dump_info->phyp_dump_at_boot = 1; - - if (!p) - return 0; - - if (strncmp(p, "1", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 1; - else if (strncmp(p, "0", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 0; - - return 0; -} -early_param("phyp_dump", early_phyp_dump_enabled); - -/* Look for phyp_dump_reserve_size= cmdline option */ -static int __init early_phyp_dump_reserve_size(char *p) -{ - if (p) - phyp_dump_info->reserve_bootvar = memparse(p, &p); - - return 0; -} -early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/plpar_wrappers.h linux-2.6.32.ovz/arch/powerpc/platforms/pseries/plpar_wrappers.h --- linux-2.6.32/arch/powerpc/platforms/pseries/plpar_wrappers.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/plpar_wrappers.h 2015-09-10 10:06:45.000000000 -0700 @@ -4,16 +4,46 @@ #include #include +/* Get state of physical CPU from query_cpu_stopped */ +int smp_query_cpu_stopped(unsigned int pcpu); +#define QCSS_STOPPED 0 +#define QCSS_STOPPING 1 +#define QCSS_NOT_STOPPED 2 +#define QCSS_HARDWARE_ERROR -1 +#define QCSS_HARDWARE_BUSY -2 + static inline long 
poll_pending(void) { return plpar_hcall_norets(H_POLL_PENDING); } +static inline u8 get_cede_latency_hint(void) +{ + return get_lppaca()->gpr5_dword.fields.cede_latency_hint; +} + +static inline void set_cede_latency_hint(u8 latency_hint) +{ + get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint; +} + static inline long cede_processor(void) { return plpar_hcall_norets(H_CEDE); } +static inline long extended_cede_processor(unsigned long latency_hint) +{ + long rc; + u8 old_latency_hint = get_cede_latency_hint(); + + set_cede_latency_hint(latency_hint); + rc = cede_processor(); + set_cede_latency_hint(old_latency_hint); + + return rc; +} + static inline long vpa_call(unsigned long flags, unsigned long cpu, unsigned long vpa) { @@ -43,9 +73,9 @@ static inline long register_slb_shadow(u return vpa_call(0x3, cpu, vpa); } -static inline long unregister_dtl(unsigned long cpu, unsigned long vpa) +static inline long unregister_dtl(unsigned long cpu) { - return vpa_call(0x6, cpu, vpa); + return vpa_call(0x6, cpu, 0); } static inline long register_dtl(unsigned long cpu, unsigned long vpa) @@ -249,4 +279,10 @@ static inline long plpar_xirr(unsigned l return rc; } +#ifdef CONFIG_SUSPEND +int pseries_suspend_cpu(void); +#else +static inline int pseries_suspend_cpu(void) {return 0;} +#endif + #endif /* _PSERIES_PLPAR_WRAPPERS_H */ diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/pseries_energy.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/pseries_energy.c --- linux-2.6.32/arch/powerpc/platforms/pseries/pseries_energy.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/pseries_energy.c 2015-09-10 10:06:28.000000000 -0700 @@ -0,0 +1,358 @@ +/* + * POWER platform energy management driver + * Copyright (C) 2010 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This pseries platform device driver provides access to + * platform energy management capabilities. 
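+ *
+ * Illustrative use of the sysfs files this driver creates (the file
+ * names are set up by pseries_energy_init() below; the values shown
+ * are invented for the example):
+ *
+ *	# cat /sys/devices/system/cpu/pseries_activate_hint_list
+ *	4,5,6,7
+ *	# cat /sys/devices/system/cpu/cpu4/pseries_activate_hint
+ *	2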
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define MODULE_VERS "1.0" +#define MODULE_NAME "pseries_energy" + +/* New hcall number */ + +#define H_BEST_ENERGY 0x2F4 + +/* Driver flags */ + +static int sysfs_entries; + +static int nr_threads_per_core; + +/* Helper routines */ + +/* CPU to core index mapping */ + +#define cpu_core_index_of_thread(cpu) ((cpu) / nr_threads_per_core) +#define cpu_first_thread_of_core(core) ((core) * nr_threads_per_core) + +/* Get the threads-per-core from device tree */ + +static void get_threads_per_core(void) +{ + struct device_node *dn = NULL; + int len; + const int *intserv; + + nr_threads_per_core = 1; /* default */ + + dn = of_find_node_by_type(NULL, "cpu"); + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (intserv) + nr_threads_per_core = len / sizeof(int); + + of_node_put(dn); +} + +/* + * Routine to detect firmware support for hcall + * return 1 if H_BEST_ENERGY is supported + * else return 0 + */ + +static int check_for_h_best_energy(void) +{ + struct device_node *rtas = NULL; + const char *hypertas, *s; + int length; + int rc = 0; + + rtas = of_find_node_by_path("/rtas"); + if (!rtas) + return 0; + + hypertas = of_get_property(rtas, "ibm,hypertas-functions", &length); + if (!hypertas) { + of_node_put(rtas); + return 0; + } + + /* hypertas will have list of strings with hcall names */ + for (s = hypertas; s < hypertas + length; s += strlen(s) + 1) { + if (!strncmp("hcall-best-energy-1", s, 19)) { + rc = 1; /* Found the string */ + break; + } + } + of_node_put(rtas); + return rc; +} + +/* Helper Routines to convert between drc_index to cpu numbers */ + +static u32 cpu_to_drc_index(int cpu) +{ + struct device_node *dn = NULL; + const int *indexes; + int i; + int rc = 1; + u32 ret = 0; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + indexes = of_get_property(dn, "ibm,drc-indexes", NULL); + if (indexes == NULL) + goto err_of_node_put; + /* Convert logical cpu number to core number */ + i = cpu_core_index_of_thread(cpu); + /* + * The first element indexes[0] is the number of drc_indexes + * returned in the list. Hence i+1 will get the drc_index + * corresponding to core number i. + */ + WARN_ON(i > indexes[0]); + ret = indexes[i + 1]; + rc = 0; + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu); + return ret; +} + +static int drc_index_to_cpu(u32 drc_index) +{ + struct device_node *dn = NULL; + const int *indexes; + int i, cpu = 0; + int rc = 1; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + indexes = of_get_property(dn, "ibm,drc-indexes", NULL); + if (indexes == NULL) + goto err_of_node_put; + /* + * First element in the array is the number of drc_indexes + * returned. Search through the list to find the matching + * drc_index and get the core number + */ + for (i = 0; i < indexes[0]; i++) { + if (indexes[i + 1] == drc_index) + break; + } + /* Convert core number to logical cpu number */ + cpu = cpu_first_thread_of_core(i); + rc = 0; + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index); + return cpu; +} + +/* + * pseries hypervisor call H_BEST_ENERGY provides hints to OS on + * preferred logical cpus to activate or deactivate for optimized + * energy consumption. 
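+ *
+ * A minimal sketch of the two query modes as wrapped below (the
+ * argument layout matches get_best_energy_list() and
+ * get_best_energy_data(); the flag encodings are platform defined
+ * and taken on trust from the #defines that follow, with
+ * FLAGS_ACTIVATE or'd in when activation hints are wanted):
+ *
+ *	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+ *
+ *	list mode: (drc_index, value) pairs land in a caller-supplied
+ *	page and retbuf[0] holds the pair count:
+ *		rc = plpar_hcall9(H_BEST_ENERGY, retbuf, FLAGS_MODE1,
+ *				  0, __pa(buf_page), 0, 0, 0, 0, 0, 0);
+ *
+ *	single-cpu mode: the hint for one drc_index comes back in
+ *	retbuf[1] (the driver below reports its upper 32 bits):
+ *		rc = plpar_hcall9(H_BEST_ENERGY, retbuf, FLAGS_MODE2,
+ *				  cpu_to_drc_index(cpu), 0, 0, 0, 0, 0, 0, 0);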
+ */ + +#define FLAGS_MODE1 0x004E200000080E01 +#define FLAGS_MODE2 0x004E200000080401 +#define FLAGS_ACTIVATE 0x100 + +static ssize_t get_best_energy_list(char *page, int activate) +{ + int rc, cnt, i, cpu; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + u32 *buf_page; + char *s = page; + + buf_page = (u32 *) get_zeroed_page(GFP_KERNEL); + if (!buf_page) + return -ENOMEM; + + flags = FLAGS_MODE1; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page), + 0, 0, 0, 0, 0, 0); + if (rc != H_SUCCESS) { + free_page((unsigned long) buf_page); + return -EINVAL; + } + + cnt = retbuf[0]; + for (i = 0; i < cnt; i++) { + cpu = drc_index_to_cpu(buf_page[2*i+1]); + if ((cpu_online(cpu) && !activate) || + (!cpu_online(cpu) && activate)) + s += sprintf(s, "%d,", cpu); + } + if (s > page) { /* Something to show */ + s--; /* Suppress last comma */ + s += sprintf(s, "\n"); + } + + free_page((unsigned long) buf_page); + return s-page; +} + +static ssize_t get_best_energy_data(struct sys_device *dev, + char *page, int activate) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + + flags = FLAGS_MODE2; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, + cpu_to_drc_index(dev->id), + 0, 0, 0, 0, 0, 0, 0); + + if (rc != H_SUCCESS) + return -EINVAL; + + return sprintf(page, "%lu\n", retbuf[1] >> 32); +} + +/* Wrapper functions */ + +static ssize_t cpu_activate_hint_list_show(struct sysdev_class *class, + char *page) +{ + return get_best_energy_list(page, 1); +} + +static ssize_t cpu_deactivate_hint_list_show(struct sysdev_class *class, + char *page) +{ + return get_best_energy_list(page, 0); +} + +static ssize_t percpu_activate_hint_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 1); +} + +static ssize_t percpu_deactivate_hint_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 0); +} + +/* + * Create sysfs interface: + * /sys/devices/system/cpu/pseries_activate_hint_list + * /sys/devices/system/cpu/pseries_deactivate_hint_list + * Comma separated list of cpus to activate or deactivate + * /sys/devices/system/cpu/cpuN/pseries_activate_hint + * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint + * Per-cpu value of the hint + */ + +struct sysdev_class_attribute attr_cpu_activate_hint_list = + _SYSDEV_CLASS_ATTR(pseries_activate_hint_list, 0444, + cpu_activate_hint_list_show, NULL); + +struct sysdev_class_attribute attr_cpu_deactivate_hint_list = + _SYSDEV_CLASS_ATTR(pseries_deactivate_hint_list, 0444, + cpu_deactivate_hint_list_show, NULL); + +struct sysdev_attribute attr_percpu_activate_hint = + _SYSDEV_ATTR(pseries_activate_hint, 0444, + percpu_activate_hint_show, NULL); + +struct sysdev_attribute attr_percpu_deactivate_hint = + _SYSDEV_ATTR(pseries_deactivate_hint, 0444, + percpu_deactivate_hint_show, NULL); + +static int __init pseries_energy_init(void) +{ + int cpu, err; + struct sys_device *cpu_sys_dev; + + if (!check_for_h_best_energy()) { + printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n"); + return 0; + } + /* Create the sysfs files */ + err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_activate_hint_list.attr); + if (!err) + err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_deactivate_hint_list.attr); + + if (err) + return err; + + get_threads_per_core(); + + 
for_each_possible_cpu(cpu) { + cpu_sys_dev = get_cpu_sysdev(cpu); + err = sysfs_create_file(&cpu_sys_dev->kobj, + &attr_percpu_activate_hint.attr); + if (err) + break; + err = sysfs_create_file(&cpu_sys_dev->kobj, + &attr_percpu_deactivate_hint.attr); + if (err) + break; + } + + if (err) + return err; + + sysfs_entries = 1; /* Removed entries on cleanup */ + return 0; + +} + +static void __exit pseries_energy_cleanup(void) +{ + int cpu; + struct sys_device *cpu_sys_dev; + + if (!sysfs_entries) + return; + + /* Remove the sysfs files */ + sysfs_remove_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_activate_hint_list.attr); + + sysfs_remove_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_deactivate_hint_list.attr); + + for_each_possible_cpu(cpu) { + cpu_sys_dev = get_cpu_sysdev(cpu); + sysfs_remove_file(&cpu_sys_dev->kobj, + &attr_percpu_activate_hint.attr); + sysfs_remove_file(&cpu_sys_dev->kobj, + &attr_percpu_deactivate_hint.attr); + } +} + +module_init(pseries_energy_init); +module_exit(pseries_energy_cleanup); +MODULE_DESCRIPTION("Driver for pSeries platform energy management"); +MODULE_AUTHOR("Vaidyanathan Srinivasan"); +MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/ras.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/ras.c --- linux-2.6.32/arch/powerpc/platforms/pseries/ras.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/ras.c 2015-09-10 10:07:40.000000000 -0700 @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include @@ -152,26 +154,127 @@ static int __init init_ras_IRQ(void) return 0; } -__initcall(init_ras_IRQ); +subsys_initcall(init_ras_IRQ); -/* - * Handle power subsystem events (EPOW). - * - * Presently we just log the event has occurred. This should be fixed - * to examine the type of power failure and take appropriate action where - * the time horizon permits something useful to be done. 
- */ +#define EPOW_SHUTDOWN_NORMAL 1 +#define EPOW_SHUTDOWN_ON_UPS 2 +#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 +#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 + +static void handle_system_shutdown(char event_modifier) +{ + switch (event_modifier) { + case EPOW_SHUTDOWN_NORMAL: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(1); + break; + + case EPOW_SHUTDOWN_ON_UPS: + pr_emerg("Loss of power reported by firmware, system is " + "running on UPS/battery"); + break; + + case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: + pr_emerg("Loss of system critical functions reported by " + "firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(1); + break; + + case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: + pr_emerg("Ambient temperature too high reported by firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(1); + break; + + default: + pr_err("Unknown power/cooling shutdown event (modifier %d)", + event_modifier); + } +} + +struct epow_errorlog { + unsigned char sensor_value; + unsigned char event_modifier; + unsigned char extended_modifier; + unsigned char reserved; + unsigned char platform_reason; +}; + +#define EPOW_RESET 0 +#define EPOW_WARN_COOLING 1 +#define EPOW_WARN_POWER 2 +#define EPOW_SYSTEM_SHUTDOWN 3 +#define EPOW_SYSTEM_HALT 4 +#define EPOW_MAIN_ENCLOSURE 5 +#define EPOW_POWER_OFF 7 + +void rtas_parse_epow_errlog(struct rtas_error_log *log) +{ + struct pseries_errorlog *pseries_log; + struct epow_errorlog *epow_log; + char action_code; + char modifier; + + pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); + if (pseries_log == NULL) + return; + + epow_log = (struct epow_errorlog *)pseries_log->data; + action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ + modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ + + switch (action_code) { + case EPOW_RESET: + pr_err("Non critical power or cooling issue cleared"); + break; + + case EPOW_WARN_COOLING: + pr_err("Non critical cooling issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_WARN_POWER: + pr_err("Non critical power issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_SYSTEM_SHUTDOWN: + handle_system_shutdown(epow_log->event_modifier); + break; + + case EPOW_SYSTEM_HALT: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(1); + break; + + case EPOW_MAIN_ENCLOSURE: + case EPOW_POWER_OFF: + pr_emerg("Critical power/cooling issue reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); + break; + + default: + pr_err("Unknown power/cooling event (action code %d)", + action_code); + } +} + +/* Handle environmental and power warning (EPOW) interrupts. 
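+ *
+ * Worked example of the decode done in rtas_parse_epow_errlog()
+ * above (byte values invented for illustration): a log section with
+ * sensor_value = 0x23 masks to action_code 3 (EPOW_SYSTEM_SHUTDOWN);
+ * with event_modifier = 0x01 (EPOW_SHUTDOWN_NORMAL),
+ * handle_system_shutdown() then reports a firmware initiated power
+ * off and calls orderly_poweroff(1).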
*/ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) { - int status = 0xdeadbeef; - int state = 0; + int status; + int state; int critical; status = rtas_call(ras_get_sensor_state_token, 2, 2, &state, EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX); if (state > 3) - critical = 1; /* Time Critical */ + critical = 1; /* Time Critical */ else critical = 0; @@ -184,14 +287,10 @@ static irqreturn_t ras_epow_interrupt(in critical, __pa(&ras_log_buf), rtas_get_error_log_max()); - udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - - /* format and print the extended information */ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); + spin_unlock(&ras_log_buf_lock); return IRQ_HANDLED; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/reconfig.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/reconfig.c --- linux-2.6.32/arch/powerpc/platforms/pseries/reconfig.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/reconfig.c 2015-09-10 10:07:48.000000000 -0700 @@ -22,7 +22,7 @@ #include #include - +#include "../../kernel/cacheinfo.h" /* * Routines for "runtime" addition and removal of device tree nodes. @@ -96,17 +96,19 @@ static struct device_node *derive_parent return parent; } -static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); +BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); int pSeries_reconfig_notifier_register(struct notifier_block *nb) { return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb); } +EXPORT_SYMBOL_GPL(pSeries_reconfig_notifier_register); void pSeries_reconfig_notifier_unregister(struct notifier_block *nb) { blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb); } +EXPORT_SYMBOL_GPL(pSeries_reconfig_notifier_unregister); static int pSeries_reconfig_add_node(const char *path, struct property *proplist) { @@ -422,6 +424,7 @@ static int do_remove_property(char *buf, static int do_update_property(char *buf, size_t bufsize) { struct device_node *np; + struct pSeries_reconfig_prop_update upd_value; unsigned char *value; char *name, *end, *next_prop; int rc, length; @@ -450,6 +453,11 @@ static int do_update_property(char *buf, return -ENODEV; } + upd_value.node = np; + upd_value.property = newprop; + blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_UPDATE_PROPERTY, &upd_value); + rc = prom_update_property(np, newprop, oldprop); if (rc) return rc; @@ -484,6 +492,28 @@ static int do_update_property(char *buf, return 0; } +static int pSeries_reconfig_add_cache_list(void) +{ + int cpu; + + for_each_online_cpu(cpu) + cacheinfo_cpu_online(cpu); + + return 0; +} + +static int pSeries_reconfig_delete_cache_list(void) +{ + int cpu; + + for_each_online_cpu(cpu) + cacheinfo_cpu_offline(cpu); + + return 0; + +} + + /** * ofdt_write - perform operations on the Open Firmware device tree * @@ -532,6 +562,10 @@ static ssize_t ofdt_write(struct file *f rv = do_remove_property(tmp, count - (tmp - kbuf)); else if (!strcmp(kbuf, "update_property")) rv = do_update_property(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "start_update")) + rv = pSeries_reconfig_delete_cache_list(); + else if (!strcmp(kbuf, "end_update")) + rv = pSeries_reconfig_add_cache_list(); else rv = -EINVAL; out: diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/rtasd.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/rtasd.c --- 
linux-2.6.32/arch/powerpc/platforms/pseries/rtasd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/rtasd.c 2015-09-10 10:06:47.000000000 -0700 @@ -453,6 +453,13 @@ static void start_event_scan(void) event_scan_delay); } +/* Cancel the rtas event scan work */ +void rtas_cancel_event_scan(void) +{ + cancel_delayed_work_sync(&event_scan_work); +} +EXPORT_SYMBOL_GPL(rtas_cancel_event_scan); + static int __init rtas_init(void) { struct proc_dir_entry *entry; diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/setup.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/setup.c --- linux-2.6.32/arch/powerpc/platforms/pseries/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/setup.c 2015-09-10 10:07:23.000000000 -0700 @@ -274,8 +274,71 @@ static struct notifier_block pci_dn_reco .notifier_call = pci_dn_reconfig_notifier, }; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +/* + * Allocate space for the dispatch trace log for all possible cpus + * and register the buffers with the hypervisor. This is used for + * computing time stolen by the hypervisor. + */ +static int alloc_dispatch_logs(void) +{ + int cpu, ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + struct kmem_cache *dtl_cache; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, NULL); + if (!dtl_cache) { + pr_warn("Failed to create dispatch trace log buffer cache\n"); + pr_warn("Stolen time statistics will be unreliable\n"); + return 0; + } + + for_each_possible_cpu(cpu) { + pp = &paca[cpu]; + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); + if (!dtl) { + pr_warning("Failed to allocate dispatch trace log for cpu %d\n", + cpu); + pr_warning("Stolen time statistics will be unreliable\n"); + break; + } + + pp->dtl_ridx = 0; + pp->dispatch_log = dtl; + pp->dispatch_log_end = dtl + N_DISPATCH_LOG; + pp->dtl_curr = dtl; + } + + /* Register the DTL for the current (boot) cpu */ + dtl = get_paca()->dispatch_log; + get_paca()->dtl_ridx = 0; + get_paca()->dtl_curr = dtl; + get_paca()->lppaca_ptr->dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + ret = register_dtl(hard_smp_processor_id(), __pa(dtl)); + if (ret) + pr_warning("DTL registration failed for boot cpu %d (%d)\n", + smp_processor_id(), ret); + get_paca()->lppaca_ptr->dtl_enable_mask = 2; + + return 0; +} + +early_initcall(alloc_dispatch_logs); +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + static void __init pSeries_setup_arch(void) { + if (!strncmp(cur_cpu_spec->platform, "power5", 6)) + mark_hardware_unsupported("Power5 Processor"); + /* Discover PIC type and setup ppc_md accordingly */ pseries_discover_pic(); @@ -288,6 +351,9 @@ static void __init pSeries_setup_arch(vo fwnmi_init(); + /* By default, only probe PCI (can be overridden by rtas_pci) */ + pci_add_flags(PCI_PROBE_ONLY); + /* Find and initialize PCI host bridges */ init_pci_config_tokens(); find_and_init_phbs(); @@ -341,6 +407,16 @@ static int pseries_set_xdabr(unsigned lo #define CMO_CHARACTERISTICS_TOKEN 44 #define CMO_MAXLENGTH 1026 +void pSeries_coalesce_init(void) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data)) + powerpc_firmware_features |= FW_FEATURE_XCMO; + else + powerpc_firmware_features &= ~FW_FEATURE_XCMO; +} + /** * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, * handle 
that here. (Stolen from parse_system_parameter_string) @@ -410,6 +486,7 @@ void pSeries_cmo_feature_init(void) pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); powerpc_firmware_features |= FW_FEATURE_CMO; + pSeries_coalesce_init(); } else pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/smp.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/smp.c --- linux-2.6.32/arch/powerpc/platforms/pseries/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/smp.c 2015-09-10 10:06:19.000000000 -0700 @@ -48,6 +48,7 @@ #include "plpar_wrappers.h" #include "pseries.h" #include "xics.h" +#include "offline_states.h" /* @@ -56,6 +57,28 @@ */ static cpumask_t of_spin_map; +/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ +int smp_query_cpu_stopped(unsigned int pcpu) +{ + int cpu_status, status; + int qcss_tok = rtas_token("query-cpu-stopped-state"); + + if (qcss_tok == RTAS_UNKNOWN_SERVICE) { + printk(KERN_INFO "Firmware doesn't support " "query-cpu-stopped-state\n"); + return QCSS_HARDWARE_ERROR; + } + + status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu); + if (status != 0) { + printk(KERN_ERR + "RTAS query-cpu-stopped-state failed: %i\n", status); + return status; + } + + return cpu_status; +} + /** * smp_startup_cpu() - start the given cpu * @@ -81,9 +104,18 @@ static inline int __devinit smp_startup_ pcpu = get_hard_smp_processor_id(lcpu); + /* Check to see if the CPU is out of FW already, for kexec */ + if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED) { + cpu_set(lcpu, of_spin_map); + return 1; + } + /* Fixup atomic count: it exited inside IRQ handler. */ task_thread_info(paca[lcpu].__current)->preempt_count = 0; + if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) + goto out; + /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. @@ -98,6 +130,7 @@ static inline int __devinit smp_startup_ return 0; } +out: return 1; } @@ -111,12 +144,16 @@ static void __devinit smp_xics_setup_cpu vpa_init(cpu); cpu_clear(cpu, of_spin_map); + set_cpu_current_state(cpu, CPU_STATE_ONLINE); + set_default_offline_state(cpu); } #endif /* CONFIG_XICS */ static void __devinit smp_pSeries_kick_cpu(int nr) { + long rc; + unsigned long hcpuid; BUG_ON(nr < 0 || nr >= NR_CPUS); if (!smp_startup_cpu(nr)) @@ -128,6 +165,16 @@ static void __devinit smp_pSeries_kick_c * the processor will continue on to secondary_start */ paca[nr].cpu_start = 1; + + set_preferred_offline_state(nr, CPU_STATE_ONLINE); + + if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { + hcpuid = get_hard_smp_processor_id(nr); + rc = plpar_hcall_norets(H_PROD, hcpuid); + if (rc != H_SUCCESS) + printk(KERN_ERR "Error: Prod to wake up processor %d, Ret= %ld\n", nr, rc); + } } static int smp_pSeries_cpu_bootable(unsigned int nr) @@ -135,10 +182,13 @@ static int smp_pSeries_cpu_bootable(unsi /* Special case - we inhibit secondary thread startup * during boot if the user requests it. 
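	 *
	 * Example (made-up configuration): booting a 4-thread core with
	 * smt-enabled=2 on the kernel command line gives
	 * smt_enabled_at_boot == 2, so threads with cpu_thread_in_core()
	 * values 2 and 3 fail the check below and are held offline while
	 * system_state < SYSTEM_RUNNING; they can still be onlined once
	 * boot completes.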
*/ - if (system_state < SYSTEM_RUNNING && - cpu_has_feature(CPU_FTR_SMT) && - !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) - return 0; + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { + if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) + return 0; + if (smt_enabled_at_boot + && cpu_thread_in_core(nr) >= smt_enabled_at_boot) + return 0; + } return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/suspend.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/suspend.c --- linux-2.6.32/arch/powerpc/platforms/pseries/suspend.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/suspend.c 2015-09-10 10:07:39.000000000 -0700 @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2010 Brian King IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u64 stream_id; +static struct sys_device suspend_sysdev; +static DECLARE_COMPLETION(suspend_work); +static struct rtas_suspend_me_data suspend_data; +static int suspending; + +/** + * pseries_suspend_begin - First phase of hibernation + * + * Check to ensure we are in a valid state to hibernate + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_begin(suspend_state_t state) +{ + long vasi_state, rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + /* Make sure the state is valid */ + rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id); + + vasi_state = retbuf[0]; + + if (rc) { + pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc); + return rc; + } else if (vasi_state == H_VASI_ENABLED) { + return -EAGAIN; + } else if (vasi_state != H_VASI_SUSPENDING) { + pr_err("pseries_suspend_begin: vasi_state returned state %ld\n", + vasi_state); + return -EIO; + } + + return 0; +} + +/** + * pseries_suspend_cpu - Suspend a single CPU + * + * Makes the H_JOIN call to suspend the CPU + * + **/ +int pseries_suspend_cpu(void) +{ + if (suspending) + return rtas_suspend_cpu(&suspend_data); + return 0; +} + +/** + * pseries_suspend_enter - Final phase of hibernation + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_enter(suspend_state_t state) +{ + int rc = rtas_suspend_last_cpu(&suspend_data); + + smp_wmb(); /* Ensure done is ordered wrt suspend_data.error */ + suspending = 0; + suspend_data.done = 1; + + /* Ensure suspending and suspend_data.done is seen on all CPUs, + since they will be waking up shortly */ + mb(); + return rc; +} + +/** + * pseries_prepare_late - Prepare to suspend all other CPUs + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_prepare_late(void) +{ + suspending = 1; + atomic_set(&suspend_data.working, 0); + suspend_data.done = 0; + suspend_data.error = 0; + suspend_data.complete = 
&suspend_work; + INIT_COMPLETION(suspend_work); + + /* Ensure suspending and suspend_data.done is seen on all CPUs */ + mb(); + return 0; +} + +/** + * store_hibernate - Initiate partition hibernation + * @classdev: sysdev class struct + * @attr: class device attribute struct + * @buf: buffer + * @count: buffer size + * + * Write the stream ID received from the HMC to this file + * to trigger hibernating the partition + * + * Return value: + * number of bytes printed to buffer / other on failure + **/ +static ssize_t store_hibernate(struct sysdev_class *classdev, + const char *buf, size_t count) +{ + cpumask_var_t offline_mask; + int rc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + return -ENOMEM; + + stream_id = simple_strtoul(buf, NULL, 16); + + do { + rc = pseries_suspend_begin(PM_SUSPEND_MEM); + if (rc == -EAGAIN) + ssleep(1); + } while (rc == -EAGAIN); + + if (!rc) { + /* All present CPUs must be online */ + cpumask_andnot(offline_mask, cpu_present_mask, + cpu_online_mask); + rc = rtas_online_cpus_mask(offline_mask); + if (rc) { + pr_err("%s: Could not bring present CPUs online.\n", + __func__); + goto out; + } + + stop_topology_update(); + rc = pm_suspend(PM_SUSPEND_MEM); + start_topology_update(); + + /* Take down CPUs not online prior to suspend */ + if (!rtas_offline_cpus_mask(offline_mask)) + pr_warn("%s: Could not restore CPUs to offline " + "state.\n", __func__); + } + + stream_id = 0; + + if (!rc) + rc = count; +out: + free_cpumask_var(offline_mask); + return rc; +} + +static SYSDEV_CLASS_ATTR(hibernate, S_IWUSR, NULL, store_hibernate); + +static struct sysdev_class suspend_sysdev_class = { + .name = "power", +}; + +static struct platform_suspend_ops pseries_suspend_ops = { + .valid = suspend_valid_only_mem, + .begin = pseries_suspend_begin, + .prepare_late = pseries_prepare_late, + .enter = pseries_suspend_enter, +}; + +/** + * pseries_suspend_sysfs_register - Register with sysfs + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_sysfs_register(struct sys_device *sysdev) +{ + int rc; + + if ((rc = sysdev_class_register(&suspend_sysdev_class))) + return rc; + + sysdev->id = 0; + sysdev->cls = &suspend_sysdev_class; + + if ((rc = sysdev_class_create_file(&suspend_sysdev_class, &attr_hibernate))) + goto class_unregister; + + return 0; + +class_unregister: + sysdev_class_unregister(&suspend_sysdev_class); + return rc; +} + +/** + * pseries_suspend_init - initcall for pSeries suspend + * + * Return value: + * 0 on success / other on failure + **/ +static int __init pseries_suspend_init(void) +{ + int rc; + + if (!machine_is(pseries) || !firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + suspend_data.token = rtas_token("ibm,suspend-me"); + if (suspend_data.token == RTAS_UNKNOWN_SERVICE) + return 0; + + if ((rc = pseries_suspend_sysfs_register(&suspend_sysdev))) + return rc; + + suspend_set_ops(&pseries_suspend_ops); + return 0; +} + +__initcall(pseries_suspend_init); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/xics.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/xics.c --- linux-2.6.32/arch/powerpc/platforms/pseries/xics.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/xics.c 2015-09-10 10:07:49.000000000 -0700 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,12 @@ static struct irq_host *xics_host; */ #define IPI_PRIORITY 4 +/* The least favored priority */ +#define LOWEST_PRIORITY 
0xFF + +/* The number of priorities defined above */ +#define MAX_NUM_PRIORITIES 3 + static unsigned int default_server = 0xFF; static unsigned int default_distrib_server = 0; static unsigned int interrupt_server_size = 8; @@ -56,6 +63,12 @@ static int ibm_set_xive; static int ibm_int_on; static int ibm_int_off; +struct xics_cppr { + unsigned char stack[MAX_NUM_PRIORITIES]; + int index; +}; + +static DEFINE_PER_CPU(struct xics_cppr, xics_cppr); /* Direct hardware low level accessors */ @@ -150,18 +163,17 @@ static inline void lpar_qirr_info(int n_ /* Interface to generic irq subsystem */ #ifdef CONFIG_SMP -static int get_irq_server(unsigned int virq, unsigned int strict_check) +static int get_irq_server(unsigned int virq, cpumask_t cpumask, + unsigned int strict_check) { int server; /* For the moment only implement delivery to all cpus or one cpu */ - cpumask_t cpumask; cpumask_t tmp = CPU_MASK_NONE; - cpumask_copy(&cpumask, irq_desc[virq].affinity); if (!distribute_irqs) return default_server; - if (!cpus_equal(cpumask, CPU_MASK_ALL)) { + if (!cpus_subset(cpu_possible_map, cpumask)) { cpus_and(tmp, cpu_online_map, cpumask); server = first_cpu(tmp); @@ -179,7 +191,8 @@ static int get_irq_server(unsigned int v return default_server; } #else -static int get_irq_server(unsigned int virq, unsigned int strict_check) +static int get_irq_server(unsigned int virq, cpumask_t cpumask, + unsigned int strict_check) { return default_server; } @@ -198,7 +211,7 @@ static void xics_unmask_irq(unsigned int if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) return; - server = get_irq_server(virq, 0); + server = get_irq_server(virq, *(irq_to_desc(virq)->affinity), 0); call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server, DEFAULT_PRIORITY); @@ -284,6 +297,19 @@ static inline unsigned int xics_xirr_vec return xirr & 0x00ffffff; } +static void push_cppr(unsigned int vec) +{ + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + if (WARN_ON(os_cppr->index >= MAX_NUM_PRIORITIES - 1)) + return; + + if (vec == XICS_IPI) + os_cppr->stack[++os_cppr->index] = IPI_PRIORITY; + else + os_cppr->stack[++os_cppr->index] = DEFAULT_PRIORITY; +} + static unsigned int xics_get_irq_direct(void) { unsigned int xirr = direct_xirr_info_get(); @@ -294,8 +320,10 @@ static unsigned int xics_get_irq_direct( return NO_IRQ; irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) + if (likely(irq != NO_IRQ)) { + push_cppr(vec); return irq; + } /* We don't have a linux mapping, so have rtas mask it. */ xics_mask_unknown_vec(vec); @@ -315,8 +343,10 @@ static unsigned int xics_get_irq_lpar(vo return NO_IRQ; irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) + if (likely(irq != NO_IRQ)) { + push_cppr(vec); return irq; + } /* We don't have a linux mapping, so have RTAS mask it. 
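 *
 * On the cppr bookkeeping above: push_cppr() records the priority at
 * which each resolved interrupt was accepted, and the matching
 * xics_eoi_direct()/xics_eoi_lpar() below writes
 *
 *	(pop_cppr() << 24) | irq
 *
 * to the XIRR instead of a hard-coded 0xff, so the EOI of a nested
 * higher-priority interrupt restores the priority of the handler it
 * interrupted rather than fully unmasking the cpu.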
*/ xics_mask_unknown_vec(vec); @@ -326,12 +356,22 @@ static unsigned int xics_get_irq_lpar(vo return NO_IRQ; } +static unsigned char pop_cppr(void) +{ + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + if (WARN_ON(os_cppr->index < 1)) + return LOWEST_PRIORITY; + + return os_cppr->stack[--os_cppr->index]; +} + static void xics_eoi_direct(unsigned int virq) { unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - direct_xirr_info_set((0xff << 24) | irq); + direct_xirr_info_set((pop_cppr() << 24) | irq); } static void xics_eoi_lpar(unsigned int virq) @@ -339,7 +379,7 @@ static void xics_eoi_lpar(unsigned int v unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - lpar_xirr_info_set((0xff << 24) | irq); + lpar_xirr_info_set((pop_cppr() << 24) | irq); } static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) @@ -365,7 +405,7 @@ static int xics_set_affinity(unsigned in * For the moment only implement delivery to all cpus or one cpu. * Get current irq_server for the given irq */ - irq_server = get_irq_server(virq, 1); + irq_server = get_irq_server(virq, *cpumask, 1); if (irq_server == -1) { char cpulist[128]; cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask); @@ -508,8 +548,6 @@ void smp_xics_message_pass(int target, i static irqreturn_t xics_ipi_dispatch(int cpu) { - WARN_ON(cpu_is_offline(cpu)); - mb(); /* order mmio clearing qirr */ while (xics_ipi_message[cpu].value) { if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION, @@ -746,6 +784,16 @@ void __init xics_init_IRQ(void) static void xics_set_cpu_priority(unsigned char cppr) { + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + /* + * we only really want to set the priority when there's + * just one cppr value on the stack + */ + WARN_ON(os_cppr->index != 0); + + os_cppr->stack[0] = cppr; + if (firmware_has_feature(FW_FEATURE_LPAR)) lpar_cppr_info(cppr); else @@ -772,15 +820,21 @@ static void xics_set_cpu_giq(unsigned in void xics_setup_cpu(void) { - xics_set_cpu_priority(0xff); + xics_set_cpu_priority(LOWEST_PRIORITY); xics_set_cpu_giq(default_distrib_server, 1); } void xics_teardown_cpu(void) { + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); int cpu = smp_processor_id(); + /* + * we have to reset the cppr index to 0 because we're + * not going to return from the IPI + */ + os_cppr->index = 0; xics_set_cpu_priority(0); /* Clear any pending IPI request */ diff -uprN linux-2.6.32/arch/powerpc/sysdev/dart_iommu.c linux-2.6.32.ovz/arch/powerpc/sysdev/dart_iommu.c --- linux-2.6.32/arch/powerpc/sysdev/dart_iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/dart_iommu.c 2015-09-10 10:07:33.000000000 -0700 @@ -72,11 +72,16 @@ static int dart_is_u4; #define DBG(...) +static DEFINE_SPINLOCK(invalidate_lock); + static inline void dart_tlb_invalidate_all(void) { unsigned long l = 0; unsigned int reg, inv_bit; unsigned long limit; + unsigned long flags; + + spin_lock_irqsave(&invalidate_lock, flags); DBG("dart: flush\n"); @@ -109,12 +114,17 @@ retry: panic("DART: TLB did not flush after waiting a long " "time. Buggy U3 ?"); } + + spin_unlock_irqrestore(&invalidate_lock, flags); } static inline void dart_tlb_invalidate_one(unsigned long bus_rpn) { unsigned int reg; unsigned int l, limit; + unsigned long flags; + + spin_lock_irqsave(&invalidate_lock, flags); reg = DART_CNTL_U4_ENABLE | DART_CNTL_U4_IONE | (bus_rpn & DART_CNTL_U4_IONE_MASK); @@ -136,6 +146,8 @@ wait_more: panic("DART: TLB did not flush after waiting a long " "time. 
Buggy U4 ?"); } + + spin_unlock_irqrestore(&invalidate_lock, flags); } static void dart_flush(struct iommu_table *tbl) diff -uprN linux-2.6.32/arch/powerpc/sysdev/fsl_pci.c linux-2.6.32.ovz/arch/powerpc/sysdev/fsl_pci.c --- linux-2.6.32/arch/powerpc/sysdev/fsl_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/fsl_pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -330,7 +330,7 @@ int __init fsl_add_bridge(struct device_ printk(KERN_WARNING "Can't get bus-range for %s, assume" " bus 0\n", dev->full_name); - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); hose = pcibios_alloc_controller(dev); if (!hose) return -ENOMEM; @@ -392,8 +392,22 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEV DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641D, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8610, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1011E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1011, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1013E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1013, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1020E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1020, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1022E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1022, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2010E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2010, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020E, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4040E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4040, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080, quirk_fsl_pcie_header); #endif /* CONFIG_PPC_85xx || CONFIG_PPC_86xx */ #if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x) @@ -626,7 +640,7 @@ int __init mpc83xx_add_bridge(struct dev " bus 0\n", dev->full_name); } - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); hose = pcibios_alloc_controller(dev); if (!hose) return -ENOMEM; diff -uprN linux-2.6.32/arch/powerpc/sysdev/grackle.c linux-2.6.32.ovz/arch/powerpc/sysdev/grackle.c --- linux-2.6.32/arch/powerpc/sysdev/grackle.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/grackle.c 2015-09-10 10:07:23.000000000 -0700 @@ -57,7 +57,7 @@ void __init setup_grackle(struct pci_con { setup_indirect_pci(hose, 0xfec00000, 0xfee00000, 0); if (machine_is_compatible("PowerMac1,1")) - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); if (machine_is_compatible("AAPL,PowerBook1998")) grackle_set_loop_snoop(hose, 1); #if 0 /* Disabled for now, HW problems ??? 
*/ diff -uprN linux-2.6.32/arch/powerpc/sysdev/mpic.c linux-2.6.32.ovz/arch/powerpc/sysdev/mpic.c --- linux-2.6.32/arch/powerpc/sysdev/mpic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/mpic.c 2015-09-10 10:06:49.000000000 -0700 @@ -567,13 +567,11 @@ static void __init mpic_scan_ht_pics(str #endif /* CONFIG_MPIC_U3_HT_IRQS */ #ifdef CONFIG_SMP -static int irq_choose_cpu(unsigned int virt_irq) +static int irq_choose_cpu(const cpumask_t *mask) { - cpumask_t mask; int cpuid; - cpumask_copy(&mask, irq_desc[virt_irq].affinity); - if (cpus_equal(mask, CPU_MASK_ALL)) { + if (cpumask_equal(mask, cpu_all_mask)) { static int irq_rover; static DEFINE_SPINLOCK(irq_rover_lock); unsigned long flags; @@ -594,20 +592,15 @@ static int irq_choose_cpu(unsigned int v spin_unlock_irqrestore(&irq_rover_lock, flags); } else { - cpumask_t tmp; - - cpus_and(tmp, cpu_online_map, mask); - - if (cpus_empty(tmp)) + cpuid = cpumask_first_and(mask, cpu_online_mask); + if (cpuid >= nr_cpu_ids) goto do_round_robin; - - cpuid = first_cpu(tmp); } return get_hard_smp_processor_id(cpuid); } #else -static int irq_choose_cpu(unsigned int virt_irq) +static int irq_choose_cpu(const cpumask_t *mask) { return hard_smp_processor_id(); } @@ -816,7 +809,7 @@ int mpic_set_affinity(unsigned int irq, unsigned int src = mpic_irq_to_hw(irq); if (mpic->flags & MPIC_SINGLE_DEST_CPU) { - int cpuid = irq_choose_cpu(irq); + int cpuid = irq_choose_cpu(cpumask); mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), 1 << cpuid); } else { diff -uprN linux-2.6.32/arch/powerpc/sysdev/mv64x60_pci.c linux-2.6.32.ovz/arch/powerpc/sysdev/mv64x60_pci.c --- linux-2.6.32/arch/powerpc/sysdev/mv64x60_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/mv64x60_pci.c 2015-09-10 10:06:06.000000000 -0700 @@ -24,7 +24,7 @@ #define MV64X60_VAL_LEN_MAX 11 #define MV64X60_PCICFG_CPCI_HOTSWAP 0x68 -static ssize_t mv64x60_hs_reg_read(struct kobject *kobj, +static ssize_t mv64x60_hs_reg_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { @@ -45,7 +45,7 @@ static ssize_t mv64x60_hs_reg_read(struc return sprintf(buf, "0x%08x\n", v); } -static ssize_t mv64x60_hs_reg_write(struct kobject *kobj, +static ssize_t mv64x60_hs_reg_write(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { diff -uprN linux-2.6.32/arch/powerpc/sysdev/ppc4xx_pci.c linux-2.6.32.ovz/arch/powerpc/sysdev/ppc4xx_pci.c --- linux-2.6.32/arch/powerpc/sysdev/ppc4xx_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/ppc4xx_pci.c 2015-09-10 10:07:23.000000000 -0700 @@ -1839,7 +1839,7 @@ static int __init ppc4xx_pci_find_bridge { struct device_node *np; - ppc_pci_flags |= PPC_PCI_ENABLE_PROC_DOMAINS | PPC_PCI_COMPAT_DOMAIN_0; + pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); #ifdef CONFIG_PPC4xx_PCI_EXPRESS for_each_compatible_node(np, NULL, "ibm,plb-pciex") diff -uprN linux-2.6.32/arch/powerpc/xmon/xmon.c linux-2.6.32.ovz/arch/powerpc/xmon/xmon.c --- linux-2.6.32/arch/powerpc/xmon/xmon.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/xmon/xmon.c 2015-09-10 10:06:38.000000000 -0700 @@ -818,7 +818,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(); + show_mem(0); break; default: termch = cmd; @@ -1641,7 +1641,8 @@ static void super_regs(void) ptrLpPaca->saved_srr0, ptrLpPaca->saved_srr1); printf(" Saved Gpr3=%.16lx Saved Gpr4=%.16lx \n", 
ptrLpPaca->saved_gpr3, ptrLpPaca->saved_gpr4);
-	printf(" Saved Gpr5=%.16lx \n", ptrLpPaca->saved_gpr5);
+	printf(" Saved Gpr5=%.16lx \n",
+		ptrLpPaca->gpr5_dword.saved_gpr5);
 	}
 #endif
diff -uprN linux-2.6.32/arch/s390/boot/zfcpdump.c linux-2.6.32.ovz/arch/s390/boot/zfcpdump.c
--- linux-2.6.32/arch/s390/boot/zfcpdump.c	1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.32.ovz/arch/s390/boot/zfcpdump.c	2015-09-10 10:07:25.000000000 -0700
@@ -0,0 +1,1141 @@
+/*
+ * zfcpdump userspace tool
+ *
+ * This tool should be used in an initramfs together with a kernel with
+ * enabled CONFIG_ZFCPDUMP kernel build option. The tool is able to write
+ * standalone system dumps on SCSI disks.
+ *
+ * See Documentation/s390/zfcpdump.txt for more information!
+ *
+ * Copyright IBM Corp. 2003, 2007.
+ * Author(s): Michael Holzheu
+ */
+
+#define _GNU_SOURCE
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "zfcpdump.h"
+#ifdef GZIP_SUPPORT
+#include
+#endif
+
+static struct globals g;
+static char *module_list[] = {"zfcp", "sd_mod", "ext2", "ext3", "zcore_mod",
+			      NULL};
+
+/*
+ * parse one kernel parameter in the form keyword=value
+ */
+static int parse_parameter(char *parameter)
+{
+	char *token;
+	char *end_ptr;
+
+	token = strtok(parameter, "=");
+	if (token == NULL)
+		return 0;
+
+	if (strcmp(token, PARM_DIR) == 0) {
+		/* Dump Dir */
+		g.parm_dir = strtok(NULL, "=");
+		if (g.parm_dir == NULL) {
+			PRINT_WARN("No value for '%s' parameter specified\n",
+				   PARM_DIR);
+			PRINT_WARN("Using default: %s\n", PARM_DIR_DFLT);
+			g.parm_dir = PARM_DIR_DFLT;
+		}
+	} else if (strcmp(token, PARM_PART) == 0) {
+		/* Dump Partition */
+		g.parm_part = strtok(NULL, "=");
+		if (g.parm_part == NULL) {
+			PRINT_ERR("No value for '%s' parameter "
+				  "specified\n", PARM_PART);
+			return -1;
+		}
+	} else if (strcmp(token, PARM_MEM) == 0) {
+		/* Dump mem */
+		char *mem_str = strtok(NULL, "=");
+		if (mem_str == NULL) {
+			PRINT_ERR("No value for '%s' parameter "
+				  "specified\n", PARM_MEM);
+			return -1;
+		}
+		g.parm_mem = strtoll(mem_str, &end_ptr, 0);
+		if (*end_ptr != 0) {
+			PRINT_ERR("Invalid value for '%s' parameter "
+				  "specified\n", PARM_MEM);
+			return -1;
+		}
+	} else if (strcmp(token, PARM_COMP) == 0) {
+		/* Dump Compression */
+		g.parm_compress = strtok(NULL, "=");
+		if (g.parm_compress == NULL) {
+			PRINT_WARN("No value for '%s' parameter "
+				   "specified\n", PARM_COMP);
+			PRINT_WARN("Using default: %s\n", PARM_COMP_DFLT);
+			g.parm_compress = PARM_COMP_DFLT;
+		} else if ((strcmp(g.parm_compress, PARM_COMP_GZIP) != 0) &&
+			   (strcmp(g.parm_compress, PARM_COMP_NONE) != 0)) {
+			PRINT_WARN("Unknown dump compression '%s' "
+				   "specified!\n", g.parm_compress);
+			PRINT_WARN("Using default: %s\n", PARM_COMP_DFLT);
+			g.parm_compress = PARM_COMP_DFLT;
+		}
+	} else if (strcmp(token, PARM_DEBUG) == 0) {
+		/* Dump Debug */
+		char *s = strtok(NULL, "=");
+		if (s == NULL) {
+			PRINT_WARN("No value for '%s' parameter "
+				   "specified\n", PARM_DEBUG);
+			PRINT_WARN("Using default: %d\n", PARM_DEBUG_DFLT);
+		} else {
+			g.parm_debug = atoi(s);
+			if ((g.parm_debug < PARM_DEBUG_MIN) ||
+			    (g.parm_debug > PARM_DEBUG_MAX)) {
+				PRINT_WARN("Invalid value (%i) for %s "
+					"parameter specified (allowed range is "
+					"%i - %i)\n", g.parm_debug, PARM_DEBUG,
+					PARM_DEBUG_MIN, PARM_DEBUG_MAX);
+				PRINT_WARN("Using default: %i\n",
+					   PARM_DEBUG_DFLT);
+				g.parm_debug = PARM_DEBUG_DFLT;
+			}
+		}
+	} else if (strcmp(token, PARM_MODE) == 0) {
+		/* Dump Mode
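 *
 * Aside: the whole of parse_parameter() is the classic strtok()
 * keyword=value idiom. A minimal, self-contained sketch of it, with
 * made-up input, that compiles standalone:
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	static void parse_one(char *parameter)
 *	{
 *		char *key = strtok(parameter, "=");
 *		char *val = key ? strtok(NULL, "=") : NULL;
 *
 *		if (!key)
 *			return;
 *		if (!val) {	// "keyword=" with no value: take a default
 *			printf("%s: using default\n", key);
 *			return;
 *		}
 *		printf("%s -> %s\n", key, val);
 *	}
 *
 *	int main(void)
 *	{
 *		char a[] = "dump_dir=/dumps";	// writable: strtok modifies
 *		char b[] = "dump_compress=";
 *
 *		parse_one(a);
 *		parse_one(b);
 *		return 0;
 *	}
 *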
+		 */
+		char *s = strtok(NULL, "=");
+		if (s == NULL) {
+			PRINT_WARN("No value for '%s' parameter "
+				   "specified\n", PARM_MODE);
+			PRINT_WARN("Using default: %s\n", PARM_MODE_DFLT);
+		} else if (strcmp(s, PARM_MODE_INTERACT) == 0) {
+			g.parm_mode = PARM_MODE_INTERACT_NUM;
+		} else if (strcmp(s, PARM_MODE_AUTO) == 0) {
+			g.parm_mode = PARM_MODE_AUTO_NUM;
+		} else {
+			PRINT_WARN("Unknown dump mode: %s\n", s);
+			PRINT_WARN("Using default: %s\n", PARM_MODE_DFLT);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Get dump parameters from /proc/cmdline
+ * Return:      0 - ok
+ *         (!= 0) - error
+ */
+static int parse_parmline(void)
+{
+	int fh, i, count, token_cnt;
+	char *token;
+	char *parms[KERN_PARM_MAX];
+
+	/* setting defaults */
+
+	g.parm_compress = PARM_COMP_DFLT;
+	g.parm_dir = PARM_DIR_DFLT;
+	g.parm_part = PARM_PART_DFLT;
+	g.parm_debug = PARM_DEBUG_DFLT;
+	g.parm_mode = PARM_MODE_NUM_DFLT;
+	g.parm_mem = PARM_MEM_DFLT;
+
+	fh = open(PROC_CMDLINE, O_RDONLY);
+	if (fh == -1) {
+		PRINT_PERR("open %s failed\n", PROC_CMDLINE);
+		return -1;
+	}
+	count = read(fh, g.parmline, CMDLINE_MAX_LEN);
+	if (count == -1) {
+		PRINT_PERR("read %s failed\n", PROC_CMDLINE);
+		close(fh);
+		return -1;
+	}
+	g.parmline[count-1] = '\0'; /* remove \n */
+	token_cnt = 0;
+	token = strtok(g.parmline, " \t\n");
+	while (token != NULL) {
+		parms[token_cnt] = token;
+		token = strtok(NULL, " \t\n");
+		token_cnt++;
+		if (token_cnt >= KERN_PARM_MAX) {
+			PRINT_WARN("More than %i kernel parameters "
+				   "specified\n", KERN_PARM_MAX);
+			break;
+		}
+	}
+	for (i = 0; i < token_cnt; i++) {
+		if (parse_parameter(parms[i])) {
+			close(fh);
+			return -1;
+		}
+	}
+	PRINT_TRACE("dump dir : %s\n", g.parm_dir);
+	PRINT_TRACE("dump part : %s\n", g.parm_part);
+	PRINT_TRACE("dump comp : %s\n", g.parm_compress);
+	PRINT_TRACE("dump debug: %d\n", g.parm_debug);
+	PRINT_TRACE("dump mem: %llx\n", (unsigned long long) g.parm_mem);
+
+	if (g.parm_mode == PARM_MODE_AUTO_NUM)
+		PRINT_TRACE("dump mode : %s\n", PARM_MODE_AUTO);
+	if (g.parm_mode == PARM_MODE_INTERACT_NUM)
+		PRINT_TRACE("dump mode : %s\n", PARM_MODE_INTERACT);
+
+	sprintf(g.dump_dir, "%s/%s", DUMP_DIR, g.parm_dir);
+	close(fh);
+	return 0;
+}
+
+static int write_to_file(const char *file, const char *command)
+{
+	int fh;
+
+	PRINT_TRACE("Write: %s - %s\n", file, command);
+	fh = open(file, O_WRONLY);
+	if (fh == -1) {
+		PRINT_PERR("Could not open %s\n", file);
+		return -1;
+	}
+	if (write(fh, command, strlen(command)) == -1) {
+		PRINT_PERR("Write to %s failed\n", file);
+		close(fh);
+		return -1;
+	};
+	close(fh);
+	return 0;
+}
+
+static int read_file(const char *file, char *buf, int size)
+{
+	ssize_t count;
+	int fh;
+
+	PRINT_TRACE("Read: %s:\n", file);
+	fh = open(file, O_RDONLY);
+	if (fh == -1) {
+		PRINT_PERR("open %s failed\n", file);
+		return -1;
+	}
+	count = read(fh, buf, size - 1);
+	if (count < 0) {
+		PRINT_PERR("read %s failed\n", file);
+		close(fh);
+		return -1;
+	}
+	buf[count] = 0;
+	if (buf[strlen(buf) - 1] == '\n')
+		buf[strlen(buf) - 1] = 0; /* strip newline */
+	close(fh);
+	PRINT_TRACE("'%s'\n", buf);
+
+	return 0;
+}
+
+/*
+ * Get HSA size
+ */
+static __u64 get_hsa_size(void)
+{
+	char buf[128];
+
+	if (read_file(DEV_ZCORE_HSA, buf, sizeof(buf)))
+		return 0;
+	return strtoul(buf, NULL, 16);
+}
+
+/*
+ * Release HSA
+ */
+static void release_hsa(void)
+{
+	write_to_file(DEV_ZCORE_HSA, "0");
+}
+
+/*
+ * Enable the scsi disk for dumping
+ * Return:    0 - ok
+ *         != 0 - error
+ */
+static int enable_zfcp_device(void)
+{
+	char command[1024], file[1024];
+	struct stat s;
+
+	/* device
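 * bring-up: each step below just echoes a value into a zfcp sysfs
 * attribute via write_to_file(). A standalone equivalent of the first
 * step (the bus ID 0.0.1700 is a made-up example):
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		const char *attr =
 *			"/sys/bus/ccw/drivers/zfcp/0.0.1700/online";
 *		FILE *f = fopen(attr, "w");
 *
 *		if (!f) {
 *			perror(attr);
 *			return 1;
 *		}
 *		fputs("1\n", f);	// same as write_to_file(file, "1\n")
 *		return fclose(f) ? 1 : 0;
 *	}
 *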
*/ + if (read_file(IPL_DEVNO, g.dump_devno, sizeof(g.dump_devno))) + return -1; + sprintf(file, "/sys/bus/ccw/drivers/zfcp/%s/online", g.dump_devno); + if (write_to_file(file, "1\n")) + return -1; + + /* wwpn */ + if (read_file(IPL_WWPN, g.dump_wwpn, sizeof(g.dump_wwpn))) + return -1; + sprintf(file, "/sys/bus/ccw/drivers/zfcp/%s/port_add", g.dump_devno); + /* The port_add attribute has been removed in recent kernels */ + if (stat(file, &s) == 0) { + sprintf(command, "%s\n", g.dump_wwpn); + if (write_to_file(file, command)) + return -1; + } + + /* lun */ + if (read_file(IPL_LUN, g.dump_lun, sizeof(g.dump_lun))) + return -1; + sprintf(file, "/sys/bus/ccw/drivers/zfcp/%s/%s/unit_add", g.dump_devno, + g.dump_wwpn); + sprintf(command, "%s\n", g.dump_lun); + if (write_to_file(file, command)) + return -1; + + /* bootprog */ + read_file("/sys/firmware/ipl/bootprog", g.dump_bootprog, + sizeof(g.dump_bootprog)); + + return 0; +} + +/* + * Mount the dump device + * Return: 0 - ok + * != 0 - error + */ +static int mount_dump_device(void) +{ + int pid; + char dump_part[16]; + + PRINT_TRACE("e2fsck\n"); + sprintf(dump_part, "%s%i", DEV_SCSI, atoi(g.parm_part)); + + pid = fork(); + if (pid < 0) { + PRINT_PERR("fork failed\n"); + return -1; + } else if (pid == 0) { + execl("/bin/e2fsck", "e2fsck", dump_part, "-y", NULL); + execl("/sbin/e2fsck", "e2fsck", dump_part, "-y", NULL); + exit(1); + } else { + waitpid(pid, NULL, 0); + } + + PRINT_TRACE("mount\n"); + if (mount(dump_part, DUMP_DIR, "ext4", 0, NULL) == 0) + return 0; + if (mount(dump_part, DUMP_DIR, "ext3", 0, NULL) == 0) + return 0; + if (mount(dump_part, DUMP_DIR, "ext2", 0, NULL) != 0) { + PRINT_PERR("mount failed\n"); + return -1; + } + return 0; +} + +/* + * unmount the dump device + * Return: 0 - ok + * != 0 - error + */ +static int umount_dump_device(void) +{ + if (umount(DUMP_DIR) != 0) { + PRINT_PERR("umount failed\n"); + return -1; + } + return 0; +} + +/* + * Terminate the system dumper + */ +static void terminate(void) +{ + int fd; + + sleep(WAIT_TIME_END); /* give the messages time to be displayed */ + fd = open(DEV_ZCORE_REIPL, O_WRONLY, 0); + if (fd == -1) + goto no_reipl; + write(fd, REIPL, 1); + close(fd); +no_reipl: + reboot(LINUX_REBOOT_CMD_POWER_OFF); +} + +/* + * Signal handler for zfcp_dumper + */ +static __sighandler_t dump_sig_handler(int sig, siginfo_t *sip, void*p) +{ + PRINT_ERR("Got signal: %i\n", sig); + PRINT_ERR("Dump failed!\n"); + terminate(); + return 0; +} + +/* + * Setup the Signal handler for zfcp_dumper + * Return: 0 - ok + * !=0 - error + */ +static int init_sig(void) +{ + g.sigact.sa_flags = (SA_NODEFER | SA_SIGINFO | SA_RESETHAND); + g.sigact.sa_handler = (__sighandler_t)dump_sig_handler; + if (sigemptyset(&g.sigact.sa_mask) < 0) + return -1; + if (sigaction(SIGINT, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGTERM, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGPIPE, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGABRT, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGSEGV, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGBUS, &g.sigact, NULL) < 0) + return -1; + + return 0; +} + +/* + * Set memory management parameters: Ensure that dirty pages are written + * early enough! 
See "Documentation/filesystems/proc.txt" + * Return: 0 - ok + * !=0 - error + */ +static int tune_vm(void) +{ + char *sysctl_names[] = {"/proc/sys/vm/dirty_ratio", + "/proc/sys/vm/dirty_background_ratio", + "/proc/sys/vm/dirty_writeback_centisecs", + "/proc/sys/vm/dirty_expire_centisecs", + "/proc/sys/vm/vfs_cache_pressure", + "/proc/sys/vm/lowmem_reserve_ratio", + NULL}; + char *sysctl_values[] = {"2", "5", "50", "50", "500", "32", NULL}; + int i; + + i = 0; + while (sysctl_names[i]) { + if (write_to_file(sysctl_names[i], sysctl_values[i])) + return -1; + i++; + } + return 0; +} + +/* + * Get dump number of either new dump or dump to erase + * Parameter: dumpdir - dump directory (absolute path) + * mode - DUMP_FIRST: Find smallest dump number in directory + * - DUMP_LAST: Find highest dump number in directory + * Return: >= 0 - dump number + * -1 - no dump found in directory + * <-1 - error + */ +static int get_dump_num(const char *dumpdir, int mode) +{ + DIR *dir = NULL; + struct dirent *dir_ent; + int dump_found, rc; + + rc = 0; + dump_found = 0; + dir = opendir(dumpdir); + if (!dir) { + PRINT_PERR("Cannot evalute dump number\n"); + return -2; + } + + while ((dir_ent = readdir(dir))) { + int num; + if (sscanf(dir_ent->d_name, "dump.%ui", &num) == 1) { + char suffix[1024] = {}; + + /* + * check if we have something like dump.001 + * this is not treated as dump, since we do not allow + * leading zeros. + * Also files like dump.-1, dump.-10 are ignored. + */ + sscanf(dir_ent->d_name, "dump.%s", suffix); + if (suffix[0] == '-') + continue; + if ((suffix[0] == '0') && isdigit(suffix[1])) + continue; + if (!dump_found) { + dump_found = 1; + rc = num; + } else if (mode == DUMP_LAST) { + rc = MAX(num, rc); + } else if (mode == DUMP_FIRST) { + rc = MIN(num, rc); + } + } + } + if (!dump_found) + rc = NO_DUMP; + closedir(dir); + + return rc; +} + +/* + * Erase oldest dump in dump directory + * Return: 0 - ok + * !=0 - error + */ +static int erase_oldest_dump(void) +{ + int dump_nr; + char dname[1024] = {}; + char answ[1024] = {}; + + dump_nr = get_dump_num(g.dump_dir, DUMP_FIRST); + if (dump_nr < 0) { + PRINT_ERR("Internal error: dump number cannot be evaluated\n"); + return -1; + } + sprintf(dname, "dump.%i", dump_nr); + if (dump_nr == g.dump_nr) { + PRINT_ERR("Sorry, cannot delete any more dumps!\n"); + return -1; + } + if (g.parm_mode == PARM_MODE_AUTO_NUM) { + PRINT("Removing oldest dump: '%s'\n", dname); + } else { + while ((strcmp(answ, "y") != 0) && (strcmp(answ, "n") != 0)) { + PRINT("Remove oldest dump: '%s' (y/n)? ", dname); + scanf("%s", answ); + } + if (strcmp(answ, "n") == 0) + return -1; + } + sprintf(dname, "%s/dump.%i", g.dump_dir, dump_nr); + if (unlink(dname) == -1) { + PRINT_PERR("Could not remove dump\n"); + return -1; + } + sync(); + /* + * Wait some seconds in order to give ext3 time to discover that file + * has been removed. + */ + sleep(WAIT_TIME_ERASE); + PRINT("Dump removed!\n"); + return 0; +} + +/* + * write buffer to dump. 
+ * In case of ENOSPC try to remove oldest dump
+ * Parameter: fd    - filedescriptor of dump file
+ *            buf   - buffer to write
+ *            count - nr of bytes to write
+ *
+ * Return: size - written bytes
+ *         <0   - error
+ */
+static ssize_t dump_write(int fd, const void *buf, size_t count)
+{
+	ssize_t written, rc;
+
+	written = 0;
+	while (written != count) {
+		rc = write(fd, buf + written, count - written);
+		if ((rc == -1) && (errno == ENOSPC)) {
+			PRINT_ERR("No space left on device!\n");
+			/* Try to erase old dump */
+			if (erase_oldest_dump())
+				return -1;
+			continue;
+		} else if (rc == -1) {
+			/* Write failed somehow */
+			return -1;
+		}
+		written += rc;
+	}
+	return written;
+}
+
+#ifdef GZIP_SUPPORT
+/*
+ * Wrapper to gzip compress routine
+ * Parameter: old      - buffer to compress (in)
+ *            old_size - size of old buffer in bytes (in)
+ *            new      - buffer for compressed data (out)
+ *            new_size - size of 'new' buffer in bytes (in)
+ * Return: >=0 - Size of compressed buffer
+ *         < 0 - error
+ */
+static int compress_gzip(const unsigned char *old, __u32 old_size,
+			 unsigned char *new, __u32 new_size)
+{
+	int rc;
+	unsigned long len;
+
+	len = new_size;
+	rc = compress(new, &len, old, old_size);
+	switch (rc) {
+	case Z_OK:
+		return len;
+	case Z_MEM_ERROR:
+		PRINT_ERR("Z_MEM_ERROR (not enough memory)!\n");
+		return -1;
+	case Z_BUF_ERROR:
+		/* In this case the compressed output is bigger than
+		   the uncompressed */
+		return -1;
+	case Z_DATA_ERROR:
+		PRINT_ERR("Z_DATA_ERROR (input data corrupted)!\n");
+		return -1;
+	default:
+		PRINT_ERR("Z_UNKNOWN_ERROR (rc 0x%x unknown)!\n", rc);
+		return -1;
+	}
+}
+#endif
+
+/*
+ * Do nothing! - No compression
+ */
+static int compress_none(const unsigned char *old, __u32 old_size,
+			 unsigned char *new, __u32 new_size)
+{
+	return -1; /* "-1" indicates, that compression was not done */
+}
+
+/*
+ * Convert s390 standalone dump header to lkcd dump header
+ * Parameter: s390_dh - s390 dump header (in)
+ *            dh      - lkcd dump header (out)
+ */
+static void s390_to_lkcd_hdr(struct dump_hdr_s390 *s390_dh,
+			     struct dump_hdr_lkcd *dh)
+{
+	struct timeval h_time;
+
+	/* adjust todclock to 1970 */
+	__u64 tod = s390_dh->tod;
+	tod -= 0x8126d60e46000000LL - (0x3c26700LL * 1000000 * 4096);
+	tod >>= 12;
+	h_time.tv_sec = tod / 1000000;
+	h_time.tv_usec = tod % 1000000;
+
+	dh->memory_size = s390_dh->memory_size;
+	dh->memory_start = s390_dh->memory_start;
+	dh->memory_end = s390_dh->memory_end;
+	dh->num_dump_pages = s390_dh->num_pages;
+	dh->page_size = s390_dh->page_size;
+	dh->dump_level = s390_dh->dump_level;
+
+	sprintf(dh->panic_string, "zSeries-dump (CPUID = %16llx)",
+		(unsigned long long) s390_dh->cpu_id);
+
+	if (s390_dh->arch_id == DH_ARCH_ID_S390)
+		strcpy(dh->utsname_machine, "s390");
+	else if (s390_dh->arch_id == DH_ARCH_ID_S390X)
+		strcpy(dh->utsname_machine, "s390x");
+	else
+		strcpy(dh->utsname_machine, "");
+
+	strcpy(dh->utsname_sysname, "");
+	strcpy(dh->utsname_nodename, "");
+	strcpy(dh->utsname_release, "");
+	strcpy(dh->utsname_version, "");
+	strcpy(dh->utsname_domainname, "");
+
+	dh->magic_number = DUMP_MAGIC_NUMBER;
+	dh->version = DUMP_VERSION_NUMBER;
+	dh->header_size = sizeof(struct dump_hdr_lkcd);
+	dh->time.tv_sec = h_time.tv_sec;
+	dh->time.tv_usec = h_time.tv_usec;
+}
+
+/*
+ * Convert s390 standalone dump header to lkcd asm dump header
+ * Parameter: s390_dh - s390 dump header (in)
+ *            dh_asm  - lkcd asm dump header (out)
+ */
+static void s390_to_lkcd_hdr_asm(struct dump_hdr_s390 *s390_dh,
+				 struct dump_hdr_lkcd_asm *dh_asm)
+{
+	unsigned int i;
+
dh_asm->magic_number = DUMP_MAGIC_NUMBER_ASM; + dh_asm->version = 0; + dh_asm->hdr_size = sizeof(*dh_asm); + dh_asm->cpu_cnt = s390_dh->cpu_cnt; + dh_asm->real_cpu_cnt = s390_dh->real_cpu_cnt; + for (i = 0; i < dh_asm->cpu_cnt; i++) + dh_asm->lc_vec[i] = s390_dh->lc_vec[i]; +} + +/* + * Write progress information to screen + * Parameter: written - So many bytes have been written to the dump + * max - This is the whole memory to be written + */ +static void show_progress(unsigned long long written, unsigned long long max) +{ + int time; + struct timeval t; + double percent; + + gettimeofday(&t, NULL); + time = t.tv_sec; + if ((time < g.last_progress) && (written != max) && (written != 0)) + return; + g.last_progress = time + 10; + percent = ((double) written / (double) max) * 100.0; + PRINT(" %4lli MB of %4lli MB (%5.1f%% )\n", written >> 20, max >> 20, + percent); + fflush(stdout); +} + +/* + * create dump + * + * Return: 0 - ok + * !=0 - error + */ +static int create_dump(void) +{ + struct stat stat_buf; + struct dump_hdr_lkcd dh; + struct dump_hdr_lkcd_asm dh_asm; + struct dump_hdr_s390 s390_dh; + compress_fn_t compress_fn; + struct dump_page dp; + char buf[PAGE_SIZE], dpcpage[PAGE_SIZE]; + char dump_name[1024]; + __u64 mem_loc, mem_count, hsa_size; + __u32 buf_loc = 0, dp_size, dp_flags; + int size, fin, fout, fmap, rc = 0; + char c_info[CHUNK_INFO_SIZE]; + struct mem_chunk *chunk, *chunk_first = NULL, *chunk_prev = NULL; + char *end_ptr; + void *page_buf; + + if (stat(g.dump_dir, &stat_buf) < 0) { + PRINT_ERR("Specified dump dir '%s' not found!\n", g.dump_dir); + return -1; + } else if (!S_ISDIR(stat_buf.st_mode)) { + PRINT_ERR("Specified dump dir '%s' is not a directory!\n", + g.dump_dir); + return -1; + } + + /* Allocate buffer for writing */ + if (posix_memalign(&page_buf, PAGE_SIZE, DUMP_BUF_SIZE)) { + PRINT_ERR("Out of memory: Could not allocate dump buffer\n"); + return -1; + } + + /* initialize progress time */ + g.last_progress = 0; + + /* get dump number */ + g.dump_nr = get_dump_num(g.dump_dir, DUMP_LAST); + if (g.dump_nr == NO_DUMP) + g.dump_nr = 0; + else if (g.dump_nr >= 0) + g.dump_nr += 1; + else + return -1; + + /* Open the memory map file - only available with kernel 2.6.25 or + * higher. 
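 * Each record in that file is CHUNK_INFO_SIZE (34) bytes: a 16-digit hex
 * start address, a blank, a 16-digit hex length, and a trailing blank,
 * which is why the parser below anchors its strtoul() calls at offsets 17
 * and 0 and expects end_ptr at offsets 33 and 16. A self-contained sketch
 * of parsing one such record (sample values invented):
 *
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	int main(void)
 *	{
 *		const char rec[] = "0000000000000000 0000000020000000 ";
 *		char *end;
 *		unsigned long long addr, size;
 *
 *		addr = strtoull(rec, &end, 16);
 *		if (end != rec + 16 || *end != ' ')
 *			return 1;
 *		size = strtoull(rec + 17, &end, 16);
 *		if (end != rec + 33 || *end != ' ')
 *			return 1;
 *		printf("chunk: addr=0x%llx size=0x%llx\n", addr, size);
 *		return 0;
 *	}
 *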
If open fails, memory holes cannot be detected and only + * one single memory chunk is assumed */ + fmap = open(DEV_ZCORE_MAP, O_RDONLY, 0); + if (fmap == -1) { + chunk_first = calloc(1, sizeof(struct mem_chunk)); + if (chunk_first == NULL) { + PRINT_ERR("Could not allocate %d bytes of memory\n", + (int) sizeof(struct mem_chunk)); + return -1; + } + chunk_first->size = PARM_MEM_DFLT; + } else { + /* read information about memory chunks (start address and size) */ + do { + if (read(fmap, c_info, sizeof(c_info)) != sizeof(c_info)) { + PRINT_ERR("read() memory map file '%s' " + "failed!\n", DEV_ZCORE_MAP); + rc = -1; + goto failed_close_fmap; + } + chunk = calloc(1, sizeof(struct mem_chunk)); + if (chunk == NULL) { + PRINT_ERR("Could not allocate %d bytes of " + "memory\n", + (int) sizeof(struct mem_chunk)); + rc = -1; + goto failed_free_chunks; + } + chunk->size = strtoul(c_info + 17, &end_ptr, 16); + if (end_ptr != c_info + 33 || *end_ptr != ' ') { + PRINT_ERR("Invalid contents of memory map " + "file '%s'!\n", DEV_ZCORE_MAP); + rc = -1; + goto failed_free_chunks; + } + if (chunk->size == 0) + break; + chunk->addr = strtoul(c_info, &end_ptr, 16); + if (end_ptr != c_info + 16 || *end_ptr != ' ') { + PRINT_ERR("Invalid contents of memory map " + "file '%s'!\n", DEV_ZCORE_MAP); + rc = -1; + goto failed_free_chunks; + } + if (!chunk_first) + chunk_first = chunk; + else + chunk_prev->next = chunk; + chunk_prev = chunk; + } while (1); + } + + hsa_size = get_hsa_size(); + PRINT_TRACE("hsa size: %llx\n", (unsigned long long) hsa_size); + + /* try to open the source device */ + fin = open(DEV_ZCORE, O_RDONLY, 0); + if (fin == -1) { + PRINT_ERR("open() source device '%s' failed!\n", DEV_ZCORE); + rc = -1; + goto failed_free_chunks; + } + + /* make the new filename */ + sprintf(dump_name, "%s/dump.%d", g.dump_dir, g.dump_nr); + fout = open(dump_name, DUMP_FLAGS, DUMP_MODE); + if (fout == -1) { + PRINT_ERR("open() of dump file \"%s\" failed!\n", dump_name); + rc = -1; + goto failed_close_fin; + } + + PRINT("dump file: dump.%d\n", g.dump_nr); + memset(&dh, 0, sizeof(dh)); + + /* get the dump header */ + if (lseek(fin, 0, SEEK_SET) < 0) { + PRINT_ERR("Cannot lseek() to get the dump header from the " + "dump file!\n"); + rc = -1; + goto failed_close_fout; + } + if (read(fin, &s390_dh, sizeof(s390_dh)) != sizeof(s390_dh)) { + PRINT_ERR("Cannot read() dump header from dump file!\n"); + rc = -1; + goto failed_close_fout; + } + + s390_to_lkcd_hdr(&s390_dh, &dh); + if (s390_dh.version >= 5) + s390_to_lkcd_hdr_asm(&s390_dh, &dh_asm); + + if (strcmp(g.parm_compress, PARM_COMP_GZIP) == 0) { +#ifdef GZIP_SUPPORT + dh.dump_compress = DUMP_COMPRESS_GZIP; + compress_fn = compress_gzip; +#else + PRINT_WARN("No gzip support. 
Compression disabled!\n"); + dh.dump_compress = DUMP_COMPRESS_NONE; + compress_fn = compress_none; +#endif + } else { + dh.dump_compress = DUMP_COMPRESS_NONE; + compress_fn = compress_none; + } + + if (g.parm_mem < dh.memory_size) { + /* dump_mem parameter specified: Adjust memory size */ + dh.memory_size = g.parm_mem; + dh.memory_end = g.parm_mem; + dh.num_dump_pages = g.parm_mem / dh.page_size; + } + + memset(page_buf, 0, PAGE_SIZE); + memcpy(page_buf, &dh, sizeof(dh)); + memcpy(page_buf + sizeof(dh), &dh_asm, sizeof(dh_asm)); + if (lseek(fout, 0L, SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + if (dump_write(fout, page_buf, PAGE_SIZE) != PAGE_SIZE) { + PRINT_ERR("Error: Write dump header failed\n"); + rc = -1; + goto failed_close_fout; + } + if (lseek(fout, LKCD_HDR_SIZE, SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + + /* write dump */ + + chunk = chunk_first; + mem_loc = 0; + mem_count = 0; + if (lseek(fin, DUMP_HEADER_SZ_S390SA, SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + while (mem_loc < dh.memory_end) { + if (mem_loc >= chunk->addr + chunk->size) { + chunk = chunk->next; + mem_loc = chunk->addr; + if (lseek(fin, DUMP_HEADER_SZ_S390SA + mem_loc, + SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + } + if (hsa_size && mem_loc >= hsa_size) { + release_hsa(); + hsa_size = 0; + } + if (read(fin, buf, PAGE_SIZE) != PAGE_SIZE) { + if (errno == EFAULT) { + /* probably memory hole. Skip page */ + mem_loc += PAGE_SIZE; + continue; + } + PRINT_PERR("read error\n"); + rc = -1; + goto failed_close_fout; + } + memset(dpcpage, 0, PAGE_SIZE); + /* get the new compressed page size */ + + size = compress_fn((unsigned char *)buf, PAGE_SIZE, + (unsigned char *)dpcpage, PAGE_SIZE); + + /* if compression failed or compressed was ineffective, + * we write an uncompressed page */ + if (size < 0) { + dp_flags = DUMP_DH_RAW; + dp_size = PAGE_SIZE; + } else { + dp_flags = DUMP_DH_COMPRESSED; + dp_size = size; + } + dp.address = mem_loc; + dp.size = dp_size; + dp.flags = dp_flags; + memcpy(page_buf + buf_loc, &dp, sizeof(dp)); + buf_loc += sizeof(struct dump_page); + /* copy the page of memory */ + if (dp_flags & DUMP_DH_COMPRESSED) + /* copy the compressed page */ + memcpy(page_buf + buf_loc, dpcpage, dp_size); + else + /* copy directly from memory */ + memcpy(page_buf + buf_loc, buf, dp_size); + buf_loc += dp_size; + mem_loc += PAGE_SIZE; + mem_count += PAGE_SIZE; + if (buf_loc + PAGE_SIZE + sizeof(dp) > DUMP_BUF_SIZE) { + unsigned long rem = buf_loc % PAGE_SIZE; + long size = buf_loc - rem; + + if (dump_write(fout, page_buf, size) != size) { + PRINT_ERR("write error\n"); + rc = -1; + goto failed_close_fout; + } + memmove(page_buf, page_buf + size, rem); + buf_loc = rem; + } + show_progress(mem_count, dh.memory_size); + } + + /* write end marker */ + + dp.address = 0x0; + dp.size = 0x0; + dp.flags = DUMP_DH_END; + memcpy(page_buf + buf_loc, &dp, sizeof(dp)); + buf_loc += sizeof(dp); + dump_write(fout, page_buf, buf_loc + (PAGE_SIZE - buf_loc % PAGE_SIZE)); + if (ftruncate(fout, lseek(fout, 0, SEEK_CUR) - + (PAGE_SIZE - buf_loc % PAGE_SIZE)) == -1) { + PRINT_ERR("truncate error\n"); + rc = -1; + goto failed_close_fout; + } + dump_write(fout, &dp, sizeof(dp)); + +failed_close_fout: + close(fout); +failed_close_fin: + close(fin); +failed_free_chunks: + chunk = chunk_first; + while (chunk) { + chunk_prev = chunk; + chunk = 
chunk->next;
+		free(chunk_prev);
+	}
+failed_close_fmap:
+	close(fmap);
+	return rc;
+}
+
+/*
+ * Load a kernel module
+ */
+static void modprobe(const char *module)
+{
+	pid_t pid;
+
+	pid = fork();
+	if (pid < 0) {
+		PRINT_PERR("fork failed\n");
+		return;
+	} else if (pid == 0) {
+		execl("/bin/modprobe", "modprobe", module, "-q", NULL);
+		execl("/sbin/modprobe", "modprobe", module, "-q", NULL);
+		exit(1);
+	} else {
+		waitpid(pid, NULL, 0);
+	}
+}
+
+/*
+ * Load all required kernel modules
+ */
+static void load_modules(void)
+{
+	int i;
+
+	for (i = 0; module_list[i]; i++)
+		modprobe(module_list[i]);
+}
+
+/*
+ * main routine of the zfcp_dumper
+ */
+int main(int argc, char *argv[])
+{
+	char linux_version[128];
+
+#ifdef __s390x__
+	PRINT("Linux System Dumper starting\n");
+	PRINT("Version %s (64 bit)\n", ZFCPDUMP_VERSION);
+#else
+	PRINT("Linux System Dumper starting\n");
+	PRINT("Version %s (32 bit)\n", ZFCPDUMP_VERSION);
+#endif
+
+	if (init_sig()) {
+		PRINT_ERR("Init Signals failed!\n");
+		goto fail;
+	}
+	if (mount("proc", "/proc", "proc", 0, NULL)) {
+		if (errno != EBUSY) {
+			PRINT_PERR("Unable to mount proc\n");
+			goto fail;
+		}
+	}
+	read_file("/proc/version", linux_version, sizeof(linux_version));
+	PRINT("%s\n", linux_version);
+
+	if (mount("sysfs", "/sys", "sysfs", 0, NULL)) {
+		if (errno != EBUSY) {
+			PRINT_PERR("Unable to mount sysfs\n");
+			goto fail;
+		}
+	}
+	if (mount("debugfs", "/sys/kernel/debug", "debugfs", 0, NULL)) {
+		if (errno != EBUSY) {
+			PRINT_PERR("Unable to mount debugfs\n");
+			goto fail;
+		}
+	}
+	if (tune_vm()) {
+		PRINT_PERR("Unable to set VM settings\n");
+		goto fail;
+	}
+	if (parse_parmline()) {
+		PRINT_ERR("Could not parse parmline\n");
+		goto fail;
+	}
+	load_modules();
+	if (enable_zfcp_device()) {
+		PRINT_ERR("Could not enable dump device\n");
+		goto fail;
+	}
+	PRINT(" \n"); /* leading blank is needed so that the sclp console
+			 prints the newline */
+	PRINT("DUMP PARAMETERS:\n");
+	PRINT("================\n");
+	PRINT("devno : %s\n", g.dump_devno);
+	PRINT("wwpn : %s\n", g.dump_wwpn);
+	PRINT("lun : %s\n", g.dump_lun);
+	PRINT("conf : %s\n", g.dump_bootprog);
+	PRINT("partition: %s\n", g.parm_part);
+	PRINT("directory: %s\n", g.parm_dir);
+	PRINT("compress : %s\n", g.parm_compress);
+	PRINT(" \n");
+	PRINT("MOUNT DUMP PARTITION:\n");
+	PRINT("=====================\n");
+	sleep(WAIT_TIME_ONLINE);
+	if (mount_dump_device()) {
+		PRINT_ERR("Could not mount dump device\n");
+		goto fail;
+	}
+	PRINT("DONE.\n");
+	PRINT(" \n");
+	PRINT("DUMP PROCESS STARTED:\n");
+	PRINT("=====================\n");
+
+	if (create_dump())
+		goto fail_umount;
+
+	if (umount_dump_device()) {
+		PRINT_ERR("Could not umount dump device\n");
+		goto fail;
+	}
+	PRINT(" \n");
+	PRINT("DUMP 'dump.%i' COMPLETE\n", g.dump_nr);
+	fflush(stdout);
+	terminate();
+	return 0;
+
+fail_umount:
+	if (umount_dump_device()) {
+		PRINT_ERR("Could not umount dump device\n");
+		goto fail;
+	}
+fail:
+	PRINT("DUMP 'dump.%i' FAILED\n", g.dump_nr);
+	fflush(stdout);
+	terminate();
+	return 1;
+}
diff -uprN linux-2.6.32/arch/s390/boot/zfcpdump.h linux-2.6.32.ovz/arch/s390/boot/zfcpdump.h
--- linux-2.6.32/arch/s390/boot/zfcpdump.h	1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.32.ovz/arch/s390/boot/zfcpdump.h	2015-09-10 10:07:25.000000000 -0700
@@ -0,0 +1,251 @@
+/*
+ * zfcpdump userspace tool
+ *
+ * Copyright IBM Corp. 2003, 2007.
+ * Author(s): Michael Holzheu
+ */
+
+#ifndef _ZFCPDUMP_H
+#define _ZFCPDUMP_H
+
+#include
+#include
+#include
+
+#define ZFCPDUMP_VERSION "2.2"
+
+#define PRINT_TRACE(x...)
\ + do { \ + if (g.parm_debug >= 3) { \ + fprintf(stderr, "TRACE: "); \ + fprintf(stderr, ##x); \ + } \ + } while (0) + +#define PRINT_ERR(x...) \ + do { \ + fprintf(stderr, "ERROR: "); \ + fprintf(stderr, ##x); \ + } while (0) + +#define PRINT_WARN(x...) \ + do { \ + fprintf(stderr, "WARNING: "); \ + fprintf(stderr, ##x); \ + } while (0) + +#define PRINT_PERR(x...) \ + do { \ + fprintf(stderr, "ERROR: "); \ + fprintf(stderr, ##x); \ + perror(""); \ + } while (0) + +#define PRINT(x...) fprintf(stdout, ##x) +#define CMDLINE_MAX_LEN 1024 +#define KERN_PARM_MAX 100 + +#define DUMP_FLAGS (O_CREAT | O_RDWR | O_TRUNC | O_DIRECT) +#define DUMP_MODE (S_IRUSR | S_IWUSR | S_IRGRP) + +struct globals { + char *parm_compress; + char *parm_dir; + char *parm_part; + int parm_debug; + int parm_mode; + __u64 parm_mem; + char parmline[CMDLINE_MAX_LEN]; + char dump_dir[1024]; + int dump_nr; + int last_progress; + struct sigaction sigact; + char dump_devno[16]; + char dump_wwpn[32]; + char dump_lun[32]; + char dump_bootprog[32]; +}; + +#ifndef MIN +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#endif +#ifndef MAX +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#endif + +#define PROC_CMDLINE "/proc/cmdline" +#define PROC_MISC "/proc/misc" +#define DEV_ZCORE "/sys/kernel/debug/zcore/mem" +#define DEV_ZCORE_MAP "/sys/kernel/debug/zcore/memmap" +#define DEV_ZCORE_REIPL "/sys/kernel/debug/zcore/reipl" +#define DEV_ZCORE_HSA "/sys/kernel/debug/zcore/hsa" +#define REIPL "1" +#define DEV_SCSI "/dev/sda" +#define DUMP_DIR "/mnt" + +#define IPL_WWPN "/sys/firmware/ipl/wwpn" +#define IPL_DEVNO "/sys/firmware/ipl/device" +#define IPL_LUN "/sys/firmware/ipl/lun" + +#define PARM_DIR "dump_dir" +#define PARM_DIR_DFLT "/" + +#define PARM_PART "dump_part" +#define PARM_PART_DFLT "1" + +#define PARM_COMP "dump_compress" +#define PARM_COMP_GZIP "gzip" +#define PARM_COMP_NONE "none" +#define PARM_COMP_DFLT PARM_COMP_NONE + +#define PARM_MEM "dump_mem" +#ifdef __s390x__ +#define PARM_MEM_DFLT 0xffffffffffffffff +#else +#define PARM_MEM_DFLT 0xffffffff +#endif + +#define PARM_DEBUG "dump_debug" +#define PARM_DEBUG_DFLT 2 +#define PARM_DEBUG_MIN 1 +#define PARM_DEBUG_MAX 6 + +#define PARM_MODE "dump_mode" +#define PARM_MODE_INTERACT "interactive" +#define PARM_MODE_INTERACT_NUM 0 +#define PARM_MODE_AUTO "auto" +#define PARM_MODE_AUTO_NUM 1 +#define PARM_MODE_DFLT PARM_MODE_INTERACT +#define PARM_MODE_NUM_DFLT PARM_MODE_INTERACT_NUM + +#define DUMP_FIRST 0 +#define DUMP_LAST 1 +#define NO_DUMP -1 + +#define WAIT_TIME_ERASE 5 /* seconds */ +#define WAIT_TIME_END 3 /* seconds */ +#define WAIT_TIME_ONLINE 2 /* seconds */ + +#define UTS_LEN 65 + +#define PAGE_SIZE 4096 +#define DUMP_BUF_SIZE (80 * PAGE_SIZE) +#define LKCD_HDR_SIZE (64 * 1024) + +/* header definitions for dumps from s390 standalone dump tools */ +#define DUMP_MAGIC_S390SA 0xa8190173618f23fdULL /* s390sa magic number */ +#define DUMP_HEADER_SZ_S390SA 4096 + +/* standard header definitions */ +#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */ +#define DUMP_MAGIC_NUMBER_ASM 0x733339302d64756dULL +#define DUMP_VERSION_NUMBER 0x8 /* dump version number */ +#define DUMP_PANIC_LEN 0x100 /* dump panic string length */ + +/* dump compression options -- add as necessary */ +#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */ +#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */ + +/* dump header flags -- add as necessary */ +#define DUMP_DH_RAW 0x1 /* raw page (no compression) */ +#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */ +#define 
DUMP_DH_END 0x4 /* end marker on a full dump */ + +#define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */ + +/* + * This is the header dumped at the top of every valid crash dump. + */ +struct dump_hdr_lkcd { + __u64 magic_number; + __u32 version; + __u32 header_size; + __u32 dump_level; + __u32 page_size; + __u64 memory_size; + __u64 memory_start; + __u64 memory_end; + __u32 num_dump_pages; + char panic_string[DUMP_PANIC_LEN]; + struct { + __u64 tv_sec; + __u64 tv_usec; + } time; + char utsname_sysname[UTS_LEN]; + char utsname_nodename[UTS_LEN]; + char utsname_release[UTS_LEN]; + char utsname_version[UTS_LEN]; + char utsname_machine[UTS_LEN]; + char utsname_domainname[UTS_LEN]; + __u64 current_task; + __u32 dump_compress; + __u32 dump_flags; + __u32 dump_device; +} __attribute__((packed)); + +#define DH_ARCH_ID_S390X 2 +#define DH_ARCH_ID_S390 1 + +/* + * s390 LKCD asm header + */ +struct dump_hdr_lkcd_asm { + __u64 magic_number; + __u32 version; + __u32 hdr_size; + __u16 cpu_cnt; + __u16 real_cpu_cnt; + __u32 lc_vec[512]; +} __attribute__((packed)); + +/* + * This is the header used by zcore + */ +struct dump_hdr_s390 { + __u64 magic_number; + __u32 version; + __u32 header_size; + __u32 dump_level; + __u32 page_size; + __u64 memory_size; + __u64 memory_start; + __u64 memory_end; + __u32 num_pages; + __u32 pad; + __u64 tod; + __u64 cpu_id; + __u32 arch_id; + __u32 volnr; + __u32 build_arch; + __u64 rmem_size; + __u8 mvdump; + __u16 cpu_cnt; + __u16 real_cpu_cnt; + __u8 end_pad1[0x200-0x061]; + __u64 mvdump_sign; + __u64 mvdump_zipl_time; + __u8 end_pad2[0x800-0x210]; + __u32 lc_vec[512]; +} __attribute__((packed)); + +/* + * Header associated to each physical page of memory saved in the system + * crash dump. + */ +struct dump_page { + __u64 address; /* the address of this dump page */ + __u32 size; /* the size of this dump page */ + __u32 flags; /* flags (DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */ +} __attribute__((packed)); + +struct mem_chunk { + __u64 addr; /* the start address of this memory chunk */ + __u64 size; /* the length of this memory chunk */ + struct mem_chunk *next; /* pointer to next memory chunk */ +}; + +/* Compression function */ +typedef int (*compress_fn_t)(const unsigned char *old, __u32 old_size, + unsigned char *new, __u32 size); + +#endif /* _ZFCPDUMP_H */ diff -uprN linux-2.6.32/arch/s390/boot/zfcpdump_initramfs.txt linux-2.6.32.ovz/arch/s390/boot/zfcpdump_initramfs.txt --- linux-2.6.32/arch/s390/boot/zfcpdump_initramfs.txt 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/boot/zfcpdump_initramfs.txt 2015-09-10 10:05:36.000000000 -0700 @@ -0,0 +1,27 @@ +# +# initramfs definition for zfcpdump +# See also Documentation/s390/zfcpdump.txt +# + +dir /dev 755 0 0 +nod /dev/console 644 0 0 c 5 1 +nod /dev/null 644 0 0 c 1 3 +nod /dev/sda1 644 0 0 b 8 1 +nod /dev/sda2 644 0 0 b 8 2 +nod /dev/sda3 644 0 0 b 8 3 +nod /dev/sda4 644 0 0 b 8 4 +nod /dev/sda5 644 0 0 b 8 5 +nod /dev/sda6 644 0 0 b 8 6 +nod /dev/sda7 644 0 0 b 8 7 +nod /dev/sda8 644 0 0 b 8 8 +nod /dev/sda9 644 0 0 b 8 9 +nod /dev/sda10 644 0 0 b 8 10 +nod /dev/sda11 644 0 0 b 8 11 +nod /dev/sda12 644 0 0 b 8 12 +nod /dev/sda13 644 0 0 b 8 13 +nod /dev/sda14 644 0 0 b 8 14 +nod /dev/sda15 644 0 0 b 8 15 +file /init arch/s390/boot/zfcpdump 755 0 0 +dir /proc 755 0 0 +dir /sys 755 0 0 +dir /mnt 755 0 0 diff -uprN linux-2.6.32/arch/s390/crypto/aes_s390.c linux-2.6.32.ovz/arch/s390/crypto/aes_s390.c --- linux-2.6.32/arch/s390/crypto/aes_s390.c 2009-12-02 19:51:21.000000000 -0800 
+++ linux-2.6.32.ovz/arch/s390/crypto/aes_s390.c 2015-09-10 10:07:48.000000000 -0700 @@ -25,16 +25,18 @@ #include #include #include +#include #include "crypt_s390.h" #define AES_KEYLEN_128 1 #define AES_KEYLEN_192 2 #define AES_KEYLEN_256 4 -static char keylen_flag = 0; +static u8 *ctrblk; +static DEFINE_SPINLOCK(ctrblk_lock); +static char keylen_flag; struct s390_aes_ctx { - u8 iv[AES_BLOCK_SIZE]; u8 key[AES_MAX_KEY_SIZE]; long enc; long dec; @@ -45,6 +47,23 @@ struct s390_aes_ctx { } fallback; }; +struct pcc_param { + u8 key[32]; + u8 tweak[16]; + u8 block[16]; + u8 bit[16]; + u8 xts[16]; +}; + +struct s390_xts_ctx { + u8 key[32]; + u8 pcc_key[32]; + long enc; + long dec; + int key_len; + struct crypto_blkcipher *fallback; +}; + /* * Check if the key_len is supported by the HW. * Returns 0 if it is, a positive number if it is not and software fallback is @@ -307,7 +326,8 @@ static int ecb_aes_crypt(struct blkciphe u8 *in = walk->src.virt.addr; ret = crypt_s390_km(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + if (ret < 0 || ret != n) + return -EIO; nbytes &= AES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); @@ -423,29 +443,36 @@ static int cbc_aes_set_key(struct crypto return aes_set_key(tfm, in_key, key_len); } -static int cbc_aes_crypt(struct blkcipher_desc *desc, long func, void *param, +static int cbc_aes_crypt(struct blkcipher_desc *desc, long func, struct blkcipher_walk *walk) { + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); int ret = blkcipher_walk_virt(desc, walk); unsigned int nbytes = walk->nbytes; + struct { + u8 iv[AES_BLOCK_SIZE]; + u8 key[AES_MAX_KEY_SIZE]; + } param; if (!nbytes) goto out; - memcpy(param, walk->iv, AES_BLOCK_SIZE); + memcpy(param.iv, walk->iv, AES_BLOCK_SIZE); + memcpy(param.key, sctx->key, sctx->key_len); do { /* only use complete blocks */ unsigned int n = nbytes & ~(AES_BLOCK_SIZE - 1); u8 *out = walk->dst.virt.addr; u8 *in = walk->src.virt.addr; - ret = crypt_s390_kmc(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + ret = crypt_s390_kmc(func, ¶m, out, in, n); + if (ret < 0 || ret != n) + return -EIO; nbytes &= AES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); } while ((nbytes = walk->nbytes)); - memcpy(walk->iv, param, AES_BLOCK_SIZE); + memcpy(walk->iv, param.iv, AES_BLOCK_SIZE); out: return ret; @@ -462,7 +489,7 @@ static int cbc_aes_encrypt(struct blkcip return fallback_blk_enc(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, sctx->enc, sctx->iv, &walk); + return cbc_aes_crypt(desc, sctx->enc, &walk); } static int cbc_aes_decrypt(struct blkcipher_desc *desc, @@ -476,7 +503,7 @@ static int cbc_aes_decrypt(struct blkcip return fallback_blk_dec(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, sctx->dec, sctx->iv, &walk); + return cbc_aes_crypt(desc, sctx->dec, &walk); } static struct crypto_alg cbc_aes_alg = { @@ -504,15 +531,377 @@ static struct crypto_alg cbc_aes_alg = { } }; +static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int len) +{ + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + unsigned int ret; + + xts_ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; + xts_ctx->fallback->base.crt_flags |= (tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_blkcipher_setkey(xts_ctx->fallback, key, len); + if (ret) { + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= (xts_ctx->fallback->base.crt_flags & + 
CRYPTO_TFM_RES_MASK); + } + return ret; +} + +static int xts_fallback_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_blkcipher *tfm; + unsigned int ret; + + tfm = desc->tfm; + desc->tfm = xts_ctx->fallback; + + ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes); + + desc->tfm = tfm; + return ret; +} + +static int xts_fallback_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_blkcipher *tfm; + unsigned int ret; + + tfm = desc->tfm; + desc->tfm = xts_ctx->fallback; + + ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); + + desc->tfm = tfm; + return ret; +} + +static int xts_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + + switch (key_len) { + case 32: + xts_ctx->enc = KM_XTS_128_ENCRYPT; + xts_ctx->dec = KM_XTS_128_DECRYPT; + memcpy(xts_ctx->key + 16, in_key, 16); + memcpy(xts_ctx->pcc_key + 16, in_key + 16, 16); + break; + case 48: + xts_ctx->enc = 0; + xts_ctx->dec = 0; + xts_fallback_setkey(tfm, in_key, key_len); + break; + case 64: + xts_ctx->enc = KM_XTS_256_ENCRYPT; + xts_ctx->dec = KM_XTS_256_DECRYPT; + memcpy(xts_ctx->key, in_key, 32); + memcpy(xts_ctx->pcc_key, in_key + 32, 32); + break; + default: + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + xts_ctx->key_len = key_len; + return 0; +} + +static int xts_aes_crypt(struct blkcipher_desc *desc, long func, + struct s390_xts_ctx *xts_ctx, + struct blkcipher_walk *walk) +{ + unsigned int offset = (xts_ctx->key_len >> 1) & 0x10; + int ret = blkcipher_walk_virt(desc, walk); + unsigned int nbytes = walk->nbytes; + unsigned int n; + u8 *in, *out; + struct pcc_param pcc_param; + struct { + u8 key[32]; + u8 init[16]; + } xts_param; + + if (!nbytes) + goto out; + + memset(pcc_param.block, 0, sizeof(pcc_param.block)); + memset(pcc_param.bit, 0, sizeof(pcc_param.bit)); + memset(pcc_param.xts, 0, sizeof(pcc_param.xts)); + memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak)); + memcpy(pcc_param.key, xts_ctx->pcc_key, 32); + ret = crypt_s390_pcc(func, &pcc_param.key[offset]); + if (ret < 0) + return -EIO; + + memcpy(xts_param.key, xts_ctx->key, 32); + memcpy(xts_param.init, pcc_param.xts, 16); + do { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + + ret = crypt_s390_km(func, &xts_param.key[offset], out, in, n); + if (ret < 0 || ret != n) + return -EIO; + + nbytes &= AES_BLOCK_SIZE - 1; + ret = blkcipher_walk_done(desc, walk, nbytes); + } while ((nbytes = walk->nbytes)); +out: + return ret; +} + +static int xts_aes_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + if (unlikely(xts_ctx->key_len == 48)) + return xts_fallback_encrypt(desc, dst, src, nbytes); + + blkcipher_walk_init(&walk, dst, src, nbytes); + return xts_aes_crypt(desc, xts_ctx->enc, xts_ctx, &walk); +} + +static int xts_aes_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = 
crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + if (unlikely(xts_ctx->key_len == 48)) + return xts_fallback_decrypt(desc, dst, src, nbytes); + + blkcipher_walk_init(&walk, dst, src, nbytes); + return xts_aes_crypt(desc, xts_ctx->dec, xts_ctx, &walk); +} + +static int xts_fallback_init(struct crypto_tfm *tfm) +{ + const char *name = tfm->__crt_alg->cra_name; + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + + xts_ctx->fallback = crypto_alloc_blkcipher(name, 0, + CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + + if (IS_ERR(xts_ctx->fallback)) { + pr_err("Allocating XTS fallback algorithm %s failed\n", + name); + return PTR_ERR(xts_ctx->fallback); + } + return 0; +} + +static void xts_fallback_exit(struct crypto_tfm *tfm) +{ + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + + crypto_free_blkcipher(xts_ctx->fallback); + xts_ctx->fallback = NULL; +} + +static struct crypto_alg xts_aes_alg = { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-s390", + .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_NEED_FALLBACK, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_xts_ctx), + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(xts_aes_alg.cra_list), + .cra_init = xts_fallback_init, + .cra_exit = xts_fallback_exit, + .cra_u = { + .blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aes_set_key, + .encrypt = xts_aes_encrypt, + .decrypt = xts_aes_decrypt, + } + } +}; + +static int xts_aes_alg_reg; + +static int ctr_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + switch (key_len) { + case 16: + sctx->enc = KMCTR_AES_128_ENCRYPT; + sctx->dec = KMCTR_AES_128_DECRYPT; + break; + case 24: + sctx->enc = KMCTR_AES_192_ENCRYPT; + sctx->dec = KMCTR_AES_192_DECRYPT; + break; + case 32: + sctx->enc = KMCTR_AES_256_ENCRYPT; + sctx->dec = KMCTR_AES_256_DECRYPT; + break; + } + + return aes_set_key(tfm, in_key, key_len); +} + +static unsigned int __ctrblk_init(u8 *ctrptr, unsigned int nbytes) +{ + unsigned int i, n; + + /* only use complete blocks, max. PAGE_SIZE */ + n = (nbytes > PAGE_SIZE) ? 
PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); + for (i = AES_BLOCK_SIZE; i < n; i += AES_BLOCK_SIZE) { + memcpy(ctrptr + i, ctrptr + i - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(ctrptr + i, AES_BLOCK_SIZE); + } + return n; +} + +static int ctr_aes_crypt(struct blkcipher_desc *desc, long func, + struct s390_aes_ctx *sctx, struct blkcipher_walk *walk) +{ + int ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE); + unsigned int n, nbytes; + u8 buf[AES_BLOCK_SIZE], ctrbuf[AES_BLOCK_SIZE]; + u8 *out, *in, *ctrptr = ctrbuf; + + if (!walk->nbytes) + return ret; + + if (spin_trylock(&ctrblk_lock)) + ctrptr = ctrblk; + + memcpy(ctrptr, walk->iv, AES_BLOCK_SIZE); + while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + while (nbytes >= AES_BLOCK_SIZE) { + if (ctrptr == ctrblk) + n = __ctrblk_init(ctrptr, nbytes); + else + n = AES_BLOCK_SIZE; + ret = crypt_s390_kmctr(func, sctx->key, out, in, + n, ctrptr); + if (ret < 0 || ret != n) { + if (ctrptr == ctrblk) + spin_unlock(&ctrblk_lock); + return -EIO; + } + if (n > AES_BLOCK_SIZE) + memcpy(ctrptr, ctrptr + n - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(ctrptr, AES_BLOCK_SIZE); + out += n; + in += n; + nbytes -= n; + } + ret = blkcipher_walk_done(desc, walk, nbytes); + } + if (ctrptr == ctrblk) { + if (nbytes) + memcpy(ctrbuf, ctrptr, AES_BLOCK_SIZE); + else + memcpy(walk->iv, ctrptr, AES_BLOCK_SIZE); + spin_unlock(&ctrblk_lock); + } else { + if (!nbytes) + memcpy(walk->iv, ctrptr, AES_BLOCK_SIZE); + } + /* + * final block may be < AES_BLOCK_SIZE, copy only nbytes + */ + if (nbytes) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + ret = crypt_s390_kmctr(func, sctx->key, buf, in, + AES_BLOCK_SIZE, ctrbuf); + if (ret < 0 || ret != AES_BLOCK_SIZE) + return -EIO; + memcpy(out, buf, nbytes); + crypto_inc(ctrbuf, AES_BLOCK_SIZE); + ret = blkcipher_walk_done(desc, walk, 0); + memcpy(walk->iv, ctrbuf, AES_BLOCK_SIZE); + } + + return ret; +} + +static int ctr_aes_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ctr_aes_crypt(desc, sctx->enc, sctx, &walk); +} + +static int ctr_aes_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ctr_aes_crypt(desc, sctx->dec, sctx, &walk); +} + +static struct crypto_alg ctr_aes_alg = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-s390", + .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_aes_ctx), + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ctr_aes_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_aes_set_key, + .encrypt = ctr_aes_encrypt, + .decrypt = ctr_aes_decrypt, + } + } +}; + +static int ctr_aes_alg_reg; + static int __init aes_s390_init(void) { int ret; - if (crypt_s390_func_available(KM_AES_128_ENCRYPT)) + if (crypt_s390_func_available(KM_AES_128_ENCRYPT, CRYPT_S390_MSA)) keylen_flag |= AES_KEYLEN_128; - if 
(crypt_s390_func_available(KM_AES_192_ENCRYPT)) + if (crypt_s390_func_available(KM_AES_192_ENCRYPT, CRYPT_S390_MSA)) keylen_flag |= AES_KEYLEN_192; - if (crypt_s390_func_available(KM_AES_256_ENCRYPT)) + if (crypt_s390_func_available(KM_AES_256_ENCRYPT, CRYPT_S390_MSA)) keylen_flag |= AES_KEYLEN_256; if (!keylen_flag) @@ -535,9 +924,42 @@ static int __init aes_s390_init(void) if (ret) goto cbc_aes_err; + if (crypt_s390_func_available(KM_XTS_128_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KM_XTS_256_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) { + ret = crypto_register_alg(&xts_aes_alg); + if (ret) + goto xts_aes_err; + xts_aes_alg_reg = 1; + } + + if (crypt_s390_func_available(KMCTR_AES_128_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KMCTR_AES_192_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KMCTR_AES_256_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) { + ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + if (!ctrblk) { + ret = -ENOMEM; + goto ctr_aes_err; + } + ret = crypto_register_alg(&ctr_aes_alg); + if (ret) { + free_page((unsigned long) ctrblk); + goto ctr_aes_err; + } + ctr_aes_alg_reg = 1; + } + out: return ret; +ctr_aes_err: + crypto_unregister_alg(&xts_aes_alg); +xts_aes_err: + crypto_unregister_alg(&cbc_aes_alg); cbc_aes_err: crypto_unregister_alg(&ecb_aes_alg); ecb_aes_err: @@ -548,6 +970,12 @@ aes_err: static void __exit aes_s390_fini(void) { + if (ctr_aes_alg_reg) { + crypto_unregister_alg(&ctr_aes_alg); + free_page((unsigned long) ctrblk); + } + if (xts_aes_alg_reg) + crypto_unregister_alg(&xts_aes_alg); crypto_unregister_alg(&cbc_aes_alg); crypto_unregister_alg(&ecb_aes_alg); crypto_unregister_alg(&aes_alg); diff -uprN linux-2.6.32/arch/s390/crypto/crypt_s390.h linux-2.6.32.ovz/arch/s390/crypto/crypt_s390.h --- linux-2.6.32/arch/s390/crypto/crypt_s390.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/crypt_s390.h 2015-09-10 10:06:37.000000000 -0700 @@ -24,13 +24,18 @@ #define CRYPT_S390_PRIORITY 300 #define CRYPT_S390_COMPOSITE_PRIORITY 400 +#define CRYPT_S390_MSA 0x1 +#define CRYPT_S390_MSA3 0x2 +#define CRYPT_S390_MSA4 0x4 + /* s390 cryptographic operations */ enum crypt_s390_operations { CRYPT_S390_KM = 0x0100, CRYPT_S390_KMC = 0x0200, CRYPT_S390_KIMD = 0x0300, CRYPT_S390_KLMD = 0x0400, - CRYPT_S390_KMAC = 0x0500 + CRYPT_S390_KMAC = 0x0500, + CRYPT_S390_KMCTR = 0x0600 }; /* @@ -51,6 +56,10 @@ enum crypt_s390_km_func { KM_AES_192_DECRYPT = CRYPT_S390_KM | 0x13 | 0x80, KM_AES_256_ENCRYPT = CRYPT_S390_KM | 0x14, KM_AES_256_DECRYPT = CRYPT_S390_KM | 0x14 | 0x80, + KM_XTS_128_ENCRYPT = CRYPT_S390_KM | 0x32, + KM_XTS_128_DECRYPT = CRYPT_S390_KM | 0x32 | 0x80, + KM_XTS_256_ENCRYPT = CRYPT_S390_KM | 0x34, + KM_XTS_256_DECRYPT = CRYPT_S390_KM | 0x34 | 0x80, }; /* @@ -75,6 +84,26 @@ enum crypt_s390_kmc_func { }; /* + * function codes for KMCTR (CIPHER MESSAGE WITH COUNTER) + * instruction + */ +enum crypt_s390_kmctr_func { + KMCTR_QUERY = CRYPT_S390_KMCTR | 0x0, + KMCTR_DEA_ENCRYPT = CRYPT_S390_KMCTR | 0x1, + KMCTR_DEA_DECRYPT = CRYPT_S390_KMCTR | 0x1 | 0x80, + KMCTR_TDEA_128_ENCRYPT = CRYPT_S390_KMCTR | 0x2, + KMCTR_TDEA_128_DECRYPT = CRYPT_S390_KMCTR | 0x2 | 0x80, + KMCTR_TDEA_192_ENCRYPT = CRYPT_S390_KMCTR | 0x3, + KMCTR_TDEA_192_DECRYPT = CRYPT_S390_KMCTR | 0x3 | 0x80, + KMCTR_AES_128_ENCRYPT = CRYPT_S390_KMCTR | 0x12, + KMCTR_AES_128_DECRYPT = CRYPT_S390_KMCTR | 0x12 | 0x80, + KMCTR_AES_192_ENCRYPT = CRYPT_S390_KMCTR | 0x13, + KMCTR_AES_192_DECRYPT = 
CRYPT_S390_KMCTR | 0x13 | 0x80, + KMCTR_AES_256_ENCRYPT = CRYPT_S390_KMCTR | 0x14, + KMCTR_AES_256_DECRYPT = CRYPT_S390_KMCTR | 0x14 | 0x80, +}; + +/* * function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) * instruction */ @@ -83,6 +112,7 @@ enum crypt_s390_kimd_func { KIMD_SHA_1 = CRYPT_S390_KIMD | 1, KIMD_SHA_256 = CRYPT_S390_KIMD | 2, KIMD_SHA_512 = CRYPT_S390_KIMD | 3, + KIMD_GHASH = CRYPT_S390_KIMD | 65, }; /* @@ -284,6 +314,45 @@ static inline int crypt_s390_kmac(long f } /** + * crypt_s390_kmctr: + * @func: the function code passed to KMCTR; see crypt_s390_kmctr_func + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * @counter: address of counter value + * + * Executes the KMCTR (CIPHER MESSAGE WITH COUNTER) operation of the CPU. + * + * Returns -1 for failure, 0 for the query func, number of processed + * bytes for encryption/decryption funcs + */ +static inline int crypt_s390_kmctr(long func, void *param, u8 *dest, + const u8 *src, long src_len, u8 *counter) +{ + register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; + register void *__param asm("1") = param; + register const u8 *__src asm("2") = src; + register long __src_len asm("3") = src_len; + register u8 *__dest asm("4") = dest; + register u8 *__ctr asm("6") = counter; + int ret = -1; + + asm volatile( + "0: .insn rrf,0xb92d0000,%3,%1,%4,0 \n" /* KMCTR opcode */ + "1: brc 1,0b \n" /* handle partial completion */ + " la %0,0\n" + "2:\n" + EX_TABLE(0b,2b) EX_TABLE(1b,2b) + : "+d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest), + "+a" (__ctr) + : "d" (__func), "a" (__param) : "cc", "memory"); + if (ret < 0) + return ret; + return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; +} + +/** * crypt_s390_func_available: * @func: the function code of the specific function; 0 if op in general * @@ -291,13 +360,27 @@ static inline int crypt_s390_kmac(long f * * Returns 1 if func available; 0 if func or op in general not available */ -static inline int crypt_s390_func_available(int func) +static inline int crypt_s390_func_available(int func, + unsigned int facility_mask) { + unsigned long long facility_bits[2]; unsigned char status[16]; int ret; - /* check if CPACF facility (bit 17) is available */ - if (!(stfl() & 1ULL << (31 - 17))) + if (facility_mask & CRYPT_S390_MSA && + !(stfl() & 1ULL << (31 - 17))) + return 0; + + if (facility_mask & CRYPT_S390_MSA3 || + facility_mask & CRYPT_S390_MSA4) + if (stfle(facility_bits, 2) <= 0) + return 0; + + if (facility_mask & CRYPT_S390_MSA3 && + !(facility_bits[1] & (1ULL << (63 - (76 - 64))))) + return 0; + if (facility_mask & CRYPT_S390_MSA4 && + !(facility_bits[1] & (1ULL << (63 - (77 - 64))))) return 0; switch (func & CRYPT_S390_OP_MASK) { @@ -316,6 +399,10 @@ static inline int crypt_s390_func_availa case CRYPT_S390_KMAC: ret = crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0); break; + case CRYPT_S390_KMCTR: + ret = crypt_s390_kmctr(KMCTR_QUERY, &status, NULL, NULL, 0, + NULL); + break; default: return 0; } @@ -326,4 +413,31 @@ static inline int crypt_s390_func_availa return (status[func >> 3] & (0x80 >> (func & 7))) != 0; } +/** + * crypt_s390_pcc: + * @func: the function code passed to KM; see crypt_s390_km_func + * @param: address of parameter block; see POP for details on each func + * + * Executes the PCC (PERFORM CRYPTOGRAPHIC COMPUTATION) operation of the CPU. 
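Editor's note: the facility tests in the reworked crypt_s390_func_available() above mix two bit orderings that are easy to misread. STFL returns one 32-bit word in which the MSA facility (bit 17) is tested as 1ULL << (31 - 17), while STFLE fills 64-bit words numbered from the most significant bit, so MSA3 (facility 76) and MSA4 (facility 77) fall into word 1 and are tested as 1ULL << (63 - (76 - 64)) and 1ULL << (63 - (77 - 64)). A minimal user-space sketch of that indexing, with made-up facility data for illustration:

#include <stdint.h>
#include <stdio.h>

/* Test facility bit `nr` in an STFLE-style list: bit 0 is the MSB of
 * word 0, bit 64 is the MSB of word 1, and so on. */
static int facility_bit_set(const uint64_t *words, unsigned int nr)
{
	return (words[nr / 64] >> (63 - (nr % 64))) & 1;
}

int main(void)
{
	uint64_t words[2] = { 0, 0 };

	words[1] |= 1ULL << (63 - (76 - 64));	/* pretend MSA3 is present */
	words[1] |= 1ULL << (63 - (77 - 64));	/* pretend MSA4 is present */
	printf("MSA3: %d, MSA4: %d\n",
	       facility_bit_set(words, 76), facility_bit_set(words, 77));
	return 0;
}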
+ * + * Returns -1 for failure, 0 for success. + */ +static inline int crypt_s390_pcc(long func, void *param) +{ + register long __func asm("0") = func & 0x7f; /* encrypt or decrypt */ + register void *__param asm("1") = param; + int ret = -1; + + asm volatile( + "0: .insn rre,0xb92c0000,0,0 \n" /* PCC opcode */ + "1: brc 1,0b \n" /* handle partial completion */ + " la %0,0\n" + "2:\n" + EX_TABLE(0b,2b) EX_TABLE(1b,2b) + : "+d" (ret) + : "d" (__func), "a" (__param) : "cc", "memory"); + return ret; +} + + #endif /* _CRYPTO_ARCH_S390_CRYPT_S390_H */ diff -uprN linux-2.6.32/arch/s390/crypto/des_check_key.c linux-2.6.32.ovz/arch/s390/crypto/des_check_key.c --- linux-2.6.32/arch/s390/crypto/des_check_key.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/des_check_key.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,132 +0,0 @@ -/* - * Cryptographic API. - * - * Function for checking keys for the DES and Tripple DES Encryption - * algorithms. - * - * Originally released as descore by Dana L. How . - * Modified by Raimar Falke for the Linux-Kernel. - * Derived from Cryptoapi and Nettle implementations, adapted for in-place - * scatterlist interface. Changed LGPL to GPL per section 3 of the LGPL. - * - * s390 Version: - * Copyright IBM Corp. 2003 - * Author(s): Thomas Spatzier - * Jan Glauber (jan.glauber@de.ibm.com) - * - * Derived from "crypto/des.c" - * Copyright (c) 1992 Dana L. How. - * Copyright (c) Raimar Falke - * Copyright (c) Gisle Sflensminde - * Copyright (C) 2001 Niels Mvller. - * Copyright (c) 2002 James Morris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ -#include -#include -#include -#include -#include "crypto_des.h" - -#define ROR(d,c,o) ((d) = (d) >> (c) | (d) << (o)) - -static const u8 parity[] = { - 8,1,0,8,0,8,8,0,0,8,8,0,8,0,2,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,3, - 0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8, - 0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8, - 8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0, - 0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8, - 8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0, - 8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0, - 4,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,5,0,8,0,8,8,0,0,8,8,0,8,0,6,8, -}; - -/* - * RFC2451: Weak key checks SHOULD be performed. - */ -int -crypto_des_check_key(const u8 *key, unsigned int keylen, u32 *flags) -{ - u32 n, w; - - n = parity[key[0]]; n <<= 4; - n |= parity[key[1]]; n <<= 4; - n |= parity[key[2]]; n <<= 4; - n |= parity[key[3]]; n <<= 4; - n |= parity[key[4]]; n <<= 4; - n |= parity[key[5]]; n <<= 4; - n |= parity[key[6]]; n <<= 4; - n |= parity[key[7]]; - w = 0x88888888L; - - if ((*flags & CRYPTO_TFM_REQ_WEAK_KEY) - && !((n - (w >> 3)) & w)) { /* 1 in 10^10 keys passes this test */ - if (n < 0x41415151) { - if (n < 0x31312121) { - if (n < 0x14141515) { - /* 01 01 01 01 01 01 01 01 */ - if (n == 0x11111111) goto weak; - /* 01 1F 01 1F 01 0E 01 0E */ - if (n == 0x13131212) goto weak; - } else { - /* 01 E0 01 E0 01 F1 01 F1 */ - if (n == 0x14141515) goto weak; - /* 01 FE 01 FE 01 FE 01 FE */ - if (n == 0x16161616) goto weak; - } - } else { - if (n < 0x34342525) { - /* 1F 01 1F 01 0E 01 0E 01 */ - if (n == 0x31312121) goto weak; - /* 1F 1F 1F 1F 0E 0E 0E 0E (?) 
*/ - if (n == 0x33332222) goto weak; - } else { - /* 1F E0 1F E0 0E F1 0E F1 */ - if (n == 0x34342525) goto weak; - /* 1F FE 1F FE 0E FE 0E FE */ - if (n == 0x36362626) goto weak; - } - } - } else { - if (n < 0x61616161) { - if (n < 0x44445555) { - /* E0 01 E0 01 F1 01 F1 01 */ - if (n == 0x41415151) goto weak; - /* E0 1F E0 1F F1 0E F1 0E */ - if (n == 0x43435252) goto weak; - } else { - /* E0 E0 E0 E0 F1 F1 F1 F1 (?) */ - if (n == 0x44445555) goto weak; - /* E0 FE E0 FE F1 FE F1 FE */ - if (n == 0x46465656) goto weak; - } - } else { - if (n < 0x64646565) { - /* FE 01 FE 01 FE 01 FE 01 */ - if (n == 0x61616161) goto weak; - /* FE 1F FE 1F FE 0E FE 0E */ - if (n == 0x63636262) goto weak; - } else { - /* FE E0 FE E0 FE F1 FE F1 */ - if (n == 0x64646565) goto weak; - /* FE FE FE FE FE FE FE FE */ - if (n == 0x66666666) goto weak; - } - } - } - } - return 0; -weak: - *flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; -} - -EXPORT_SYMBOL(crypto_des_check_key); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Key Check function for DES & DES3 Cipher Algorithms"); diff -uprN linux-2.6.32/arch/s390/crypto/des_s390.c linux-2.6.32.ovz/arch/s390/crypto/des_s390.c --- linux-2.6.32/arch/s390/crypto/des_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/des_s390.c 2015-09-10 10:07:48.000000000 -0700 @@ -3,7 +3,7 @@ * * s390 implementation of the DES Cipher Algorithm. * - * Copyright IBM Corp. 2003,2007 + * Copyright IBM Corp. 2003,2011 * Author(s): Thomas Spatzier * Jan Glauber (jan.glauber@de.ibm.com) * @@ -14,63 +14,53 @@ * */ -#include #include #include +#include +#include +#include #include "crypt_s390.h" -#include "crypto_des.h" -#define DES_BLOCK_SIZE 8 -#define DES_KEY_SIZE 8 +#define DES3_KEY_SIZE (3 * DES_KEY_SIZE) -#define DES3_128_KEY_SIZE (2 * DES_KEY_SIZE) -#define DES3_128_BLOCK_SIZE DES_BLOCK_SIZE +static u8 *ctrblk; +static DEFINE_SPINLOCK(ctrblk_lock); -#define DES3_192_KEY_SIZE (3 * DES_KEY_SIZE) -#define DES3_192_BLOCK_SIZE DES_BLOCK_SIZE - -struct crypt_s390_des_ctx { +struct s390_des_ctx { u8 iv[DES_BLOCK_SIZE]; - u8 key[DES_KEY_SIZE]; -}; - -struct crypt_s390_des3_128_ctx { - u8 iv[DES_BLOCK_SIZE]; - u8 key[DES3_128_KEY_SIZE]; -}; - -struct crypt_s390_des3_192_ctx { - u8 iv[DES_BLOCK_SIZE]; - u8 key[DES3_192_KEY_SIZE]; + u8 key[DES3_KEY_SIZE]; }; static int des_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) + unsigned int key_len) { - struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; - int ret; + u32 tmp[DES_EXPKEY_WORDS]; - /* test if key is valid (not a weak key) */ - ret = crypto_des_check_key(key, keylen, flags); - if (ret == 0) - memcpy(dctx->key, key, keylen); - return ret; + /* check for weak keys */ + if (!des_ekey(tmp, key) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { + *flags |= CRYPTO_TFM_RES_WEAK_KEY; + return -EINVAL; + } + + memcpy(ctx->key, key, key_len); + return 0; } static void des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_DEA_ENCRYPT, dctx->key, out, in, DES_BLOCK_SIZE); + crypt_s390_km(KM_DEA_ENCRYPT, ctx->key, out, in, DES_BLOCK_SIZE); } static void des_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_DEA_DECRYPT, dctx->key, out, in, DES_BLOCK_SIZE); + 
crypt_s390_km(KM_DEA_DECRYPT, ctx->key, out, in, DES_BLOCK_SIZE); } static struct crypto_alg des_alg = { @@ -79,7 +69,7 @@ static struct crypto_alg des_alg = { .cra_priority = CRYPT_S390_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des_ctx), + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(des_alg.cra_list), .cra_u = { @@ -94,7 +84,7 @@ static struct crypto_alg des_alg = { }; static int ecb_desall_crypt(struct blkcipher_desc *desc, long func, - void *param, struct blkcipher_walk *walk) + u8 *key, struct blkcipher_walk *walk) { int ret = blkcipher_walk_virt(desc, walk); unsigned int nbytes; @@ -105,8 +95,9 @@ static int ecb_desall_crypt(struct blkci u8 *out = walk->dst.virt.addr; u8 *in = walk->src.virt.addr; - ret = crypt_s390_km(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + ret = crypt_s390_km(func, key, out, in, n); + if (ret < 0 || ret != n) + return -EIO; nbytes &= DES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); @@ -116,28 +107,35 @@ static int ecb_desall_crypt(struct blkci } static int cbc_desall_crypt(struct blkcipher_desc *desc, long func, - void *param, struct blkcipher_walk *walk) + struct blkcipher_walk *walk) { + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); int ret = blkcipher_walk_virt(desc, walk); unsigned int nbytes = walk->nbytes; + struct { + u8 iv[DES_BLOCK_SIZE]; + u8 key[DES3_KEY_SIZE]; + } param; if (!nbytes) goto out; - memcpy(param, walk->iv, DES_BLOCK_SIZE); + memcpy(param.iv, walk->iv, DES_BLOCK_SIZE); + memcpy(param.key, ctx->key, DES3_KEY_SIZE); do { /* only use complete blocks */ unsigned int n = nbytes & ~(DES_BLOCK_SIZE - 1); u8 *out = walk->dst.virt.addr; u8 *in = walk->src.virt.addr; - ret = crypt_s390_kmc(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + ret = crypt_s390_kmc(func, &param, out, in, n); + if (ret < 0 || ret != n) + return -EIO; nbytes &= DES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); } while ((nbytes = walk->nbytes)); - memcpy(walk->iv, param, DES_BLOCK_SIZE); + memcpy(walk->iv, param.iv, DES_BLOCK_SIZE); out: return ret; @@ -147,22 +145,22 @@ static int ecb_des_encrypt(struct blkcip struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_DEA_ENCRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_DEA_ENCRYPT, ctx->key, &walk); } static int ecb_des_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_DEA_DECRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_DEA_DECRYPT, ctx->key, &walk); } static struct crypto_alg ecb_des_alg = { @@ -171,7 +169,7 @@ static struct crypto_alg ecb_des_alg = { .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des_ctx), + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = 
LIST_HEAD_INIT(ecb_des_alg.cra_list), @@ -190,22 +188,20 @@ static int cbc_des_encrypt(struct blkcip struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_DEA_ENCRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_DEA_ENCRYPT, &walk); } static int cbc_des_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_DEA_DECRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_DEA_DECRYPT, &walk); } static struct crypto_alg cbc_des_alg = { @@ -214,7 +210,7 @@ static struct crypto_alg cbc_des_alg = { .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des_ctx), + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(cbc_des_alg.cra_list), @@ -237,327 +233,324 @@ static struct crypto_alg cbc_des_alg = { * complementation keys. Any weakness is obviated by the use of * multiple keys. * - * However, if the two independent 64-bit keys are equal, - * then the DES3 operation is simply the same as DES. - * Implementers MUST reject keys that exhibit this property. + * However, if the first two or last two independent 64-bit keys are + * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the + * same as DES. Implementers MUST reject keys that exhibit this + * property. 
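Editor's note: the memcmp() pair in the new des3_setkey() below encodes exactly the rule quoted above, in De Morgan form: the key is rejected when k1 == k2 or k2 == k3 (provided the tfm asked for weak-key checking). A standalone sketch of the same test, assuming only the 8-byte DES key size:

#include <stdio.h>
#include <string.h>

#define DES_KEY_SIZE 8

/* Nonzero if a 24-byte 3DES key degenerates to single DES, i.e.
 * k1 == k2 or k2 == k3 (the De Morgan form of the check in des3_setkey). */
static int des3_key_is_degenerate(const unsigned char *key)
{
	return !memcmp(key, key + DES_KEY_SIZE, DES_KEY_SIZE) ||
	       !memcmp(key + DES_KEY_SIZE, key + 2 * DES_KEY_SIZE,
		       DES_KEY_SIZE);
}

int main(void)
{
	unsigned char k[3 * DES_KEY_SIZE] = "aaaaaaaabbbbbbbbbbbbbbbb";

	/* k2 == k3 here, so the key must be rejected. */
	printf("degenerate: %d\n", des3_key_is_degenerate(k));
	return 0;
}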
* */ -static int des3_128_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) { - int i, ret; - struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm); - const u8 *temp_key = key; + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; - if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE)) && + if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && + memcmp(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], + DES_KEY_SIZE)) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { *flags |= CRYPTO_TFM_RES_WEAK_KEY; return -EINVAL; } - for (i = 0; i < 2; i++, temp_key += DES_KEY_SIZE) { - ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags); - if (ret < 0) - return ret; - } - memcpy(dctx->key, key, keylen); + memcpy(ctx->key, key, key_len); return 0; } -static void des3_128_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void des3_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src, - DES3_128_BLOCK_SIZE); + crypt_s390_km(KM_TDEA_192_ENCRYPT, ctx->key, dst, src, DES_BLOCK_SIZE); } -static void des3_128_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void des3_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src, - DES3_128_BLOCK_SIZE); + crypt_s390_km(KM_TDEA_192_DECRYPT, ctx->key, dst, src, DES_BLOCK_SIZE); } -static struct crypto_alg des3_128_alg = { - .cra_name = "des3_ede128", - .cra_driver_name = "des3_ede128-s390", +static struct crypto_alg des3_alg = { + .cra_name = "des3_ede", + .cra_driver_name = "des3_ede-s390", .cra_priority = CRYPT_S390_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES3_128_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_128_ctx), + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(des3_128_alg.cra_list), + .cra_list = LIST_HEAD_INIT(des3_alg.cra_list), .cra_u = { .cipher = { - .cia_min_keysize = DES3_128_KEY_SIZE, - .cia_max_keysize = DES3_128_KEY_SIZE, - .cia_setkey = des3_128_setkey, - .cia_encrypt = des3_128_encrypt, - .cia_decrypt = des3_128_decrypt, + .cia_min_keysize = DES3_KEY_SIZE, + .cia_max_keysize = DES3_KEY_SIZE, + .cia_setkey = des3_setkey, + .cia_encrypt = des3_encrypt, + .cia_decrypt = des3_decrypt, } } }; -static int ecb_des3_128_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_des3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_128_ENCRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_TDEA_192_ENCRYPT, ctx->key, &walk); } -static int ecb_des3_128_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_des3_decrypt(struct blkcipher_desc *desc, + struct scatterlist 
*dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_128_DECRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_TDEA_192_DECRYPT, ctx->key, &walk); } -static struct crypto_alg ecb_des3_128_alg = { - .cra_name = "ecb(des3_ede128)", - .cra_driver_name = "ecb-des3_ede128-s390", +static struct crypto_alg ecb_des3_alg = { + .cra_name = "ecb(des3_ede)", + .cra_driver_name = "ecb-des3_ede-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_128_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_128_ctx), + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT( - ecb_des3_128_alg.cra_list), + ecb_des3_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_128_KEY_SIZE, - .max_keysize = DES3_128_KEY_SIZE, - .setkey = des3_128_setkey, - .encrypt = ecb_des3_128_encrypt, - .decrypt = ecb_des3_128_decrypt, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .setkey = des3_setkey, + .encrypt = ecb_des3_encrypt, + .decrypt = ecb_des3_decrypt, } } }; -static int cbc_des3_128_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_des3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_128_ENCRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_TDEA_192_ENCRYPT, &walk); } -static int cbc_des3_128_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_des3_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_128_DECRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_TDEA_192_DECRYPT, &walk); } -static struct crypto_alg cbc_des3_128_alg = { - .cra_name = "cbc(des3_ede128)", - .cra_driver_name = "cbc-des3_ede128-s390", +static struct crypto_alg cbc_des3_alg = { + .cra_name = "cbc(des3_ede)", + .cra_driver_name = "cbc-des3_ede-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_128_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_128_ctx), + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT( - cbc_des3_128_alg.cra_list), + cbc_des3_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_128_KEY_SIZE, - .max_keysize = DES3_128_KEY_SIZE, - .ivsize = DES3_128_BLOCK_SIZE, - .setkey = des3_128_setkey, - .encrypt = cbc_des3_128_encrypt, - .decrypt = cbc_des3_128_decrypt, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey, + .encrypt = 
cbc_des3_encrypt, + .decrypt = cbc_des3_decrypt, } } }; -/* - * RFC2451: - * - * For DES-EDE3, there is no known need to reject weak or - * complementation keys. Any weakness is obviated by the use of - * multiple keys. - * - * However, if the first two or last two independent 64-bit keys are - * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the - * same as DES. Implementers MUST reject keys that exhibit this - * property. - * - */ -static int des3_192_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +static unsigned int __ctrblk_init(u8 *ctrptr, unsigned int nbytes) { - int i, ret; - struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm); - const u8 *temp_key = key; - u32 *flags = &tfm->crt_flags; + unsigned int i, n; - if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && - memcmp(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], - DES_KEY_SIZE)) && - (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { - *flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; + /* align to block size, max. PAGE_SIZE */ + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(DES_BLOCK_SIZE - 1); + for (i = DES_BLOCK_SIZE; i < n; i += DES_BLOCK_SIZE) { + memcpy(ctrptr + i, ctrptr + i - DES_BLOCK_SIZE, DES_BLOCK_SIZE); + crypto_inc(ctrptr + i, DES_BLOCK_SIZE); + } + return n; +} + +static int ctr_desall_crypt(struct blkcipher_desc *desc, long func, + struct s390_des_ctx *ctx, + struct blkcipher_walk *walk) +{ + int ret = blkcipher_walk_virt_block(desc, walk, DES_BLOCK_SIZE); + unsigned int n, nbytes; + u8 buf[DES_BLOCK_SIZE], ctrbuf[DES_BLOCK_SIZE]; + u8 *out, *in, *ctrptr = ctrbuf; + + if (!walk->nbytes) + return ret; + + if (spin_trylock(&ctrblk_lock)) + ctrptr = ctrblk; + + memcpy(ctrptr, walk->iv, DES_BLOCK_SIZE); + while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + while (nbytes >= DES_BLOCK_SIZE) { + if (ctrptr == ctrblk) + n = __ctrblk_init(ctrptr, nbytes); + else + n = DES_BLOCK_SIZE; + ret = crypt_s390_kmctr(func, ctx->key, out, in, + n, ctrptr); + if (ret < 0 || ret != n) { + if (ctrptr == ctrblk) + spin_unlock(&ctrblk_lock); + return -EIO; + } + if (n > DES_BLOCK_SIZE) + memcpy(ctrptr, ctrptr + n - DES_BLOCK_SIZE, + DES_BLOCK_SIZE); + crypto_inc(ctrptr, DES_BLOCK_SIZE); + out += n; + in += n; + nbytes -= n; + } + ret = blkcipher_walk_done(desc, walk, nbytes); } - for (i = 0; i < 3; i++, temp_key += DES_KEY_SIZE) { - ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags); - if (ret < 0) - return ret; + if (ctrptr == ctrblk) { + if (nbytes) + memcpy(ctrbuf, ctrptr, DES_BLOCK_SIZE); + else + memcpy(walk->iv, ctrptr, DES_BLOCK_SIZE); + spin_unlock(&ctrblk_lock); + } else { + if (!nbytes) + memcpy(walk->iv, ctrptr, DES_BLOCK_SIZE); + } + /* final block may be < DES_BLOCK_SIZE, copy only nbytes */ + if (nbytes) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + ret = crypt_s390_kmctr(func, ctx->key, buf, in, + DES_BLOCK_SIZE, ctrbuf); + if (ret < 0 || ret != DES_BLOCK_SIZE) + return -EIO; + memcpy(out, buf, nbytes); + crypto_inc(ctrbuf, DES_BLOCK_SIZE); + ret = blkcipher_walk_done(desc, walk, 0); + memcpy(walk->iv, ctrbuf, DES_BLOCK_SIZE); } - memcpy(dctx->key, key, keylen); - return 0; -} - -static void des3_192_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm); - - crypt_s390_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src, - DES3_192_BLOCK_SIZE); -} - -static void des3_192_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct 
crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm); - - crypt_s390_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src, - DES3_192_BLOCK_SIZE); + return ret; } -static struct crypto_alg des3_192_alg = { - .cra_name = "des3_ede", - .cra_driver_name = "des3_ede-s390", - .cra_priority = CRYPT_S390_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES3_192_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_192_ctx), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(des3_192_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = DES3_192_KEY_SIZE, - .cia_max_keysize = DES3_192_KEY_SIZE, - .cia_setkey = des3_192_setkey, - .cia_encrypt = des3_192_encrypt, - .cia_decrypt = des3_192_decrypt, - } - } -}; - -static int ecb_des3_192_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_192_ENCRYPT, sctx->key, &walk); + return ctr_desall_crypt(desc, KMCTR_DEA_ENCRYPT, ctx, &walk); } -static int ecb_des3_192_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_192_DECRYPT, sctx->key, &walk); + return ctr_desall_crypt(desc, KMCTR_DEA_DECRYPT, ctx, &walk); } -static struct crypto_alg ecb_des3_192_alg = { - .cra_name = "ecb(des3_ede)", - .cra_driver_name = "ecb-des3_ede-s390", +static struct crypto_alg ctr_des_alg = { + .cra_name = "ctr(des)", + .cra_driver_name = "ctr-des-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_192_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_192_ctx), + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT( - ecb_des3_192_alg.cra_list), + .cra_list = LIST_HEAD_INIT(ctr_des_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_192_KEY_SIZE, - .max_keysize = DES3_192_KEY_SIZE, - .setkey = des3_192_setkey, - .encrypt = ecb_des3_192_encrypt, - .decrypt = ecb_des3_192_decrypt, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des_setkey, + .encrypt = ctr_des_encrypt, + .decrypt = ctr_des_decrypt, } } }; -static int cbc_des3_192_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_192_ENCRYPT, 
sctx->iv, &walk); + return ctr_desall_crypt(desc, KMCTR_TDEA_192_ENCRYPT, ctx, &walk); } -static int cbc_des3_192_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des3_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_192_DECRYPT, sctx->iv, &walk); + return ctr_desall_crypt(desc, KMCTR_TDEA_192_DECRYPT, ctx, &walk); } -static struct crypto_alg cbc_des3_192_alg = { - .cra_name = "cbc(des3_ede)", - .cra_driver_name = "cbc-des3_ede-s390", +static struct crypto_alg ctr_des3_alg = { + .cra_name = "ctr(des3_ede)", + .cra_driver_name = "ctr-des3_ede-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_192_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_192_ctx), + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT( - cbc_des3_192_alg.cra_list), + .cra_list = LIST_HEAD_INIT(ctr_des3_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_192_KEY_SIZE, - .max_keysize = DES3_192_KEY_SIZE, - .ivsize = DES3_192_BLOCK_SIZE, - .setkey = des3_192_setkey, - .encrypt = cbc_des3_192_encrypt, - .decrypt = cbc_des3_192_decrypt, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey, + .encrypt = ctr_des3_encrypt, + .decrypt = ctr_des3_decrypt, } } }; -static int des_s390_init(void) +static int __init des_s390_init(void) { - int ret = 0; + int ret; - if (!crypt_s390_func_available(KM_DEA_ENCRYPT) || - !crypt_s390_func_available(KM_TDEA_128_ENCRYPT) || - !crypt_s390_func_available(KM_TDEA_192_ENCRYPT)) + if (!crypt_s390_func_available(KM_DEA_ENCRYPT, CRYPT_S390_MSA) || + !crypt_s390_func_available(KM_TDEA_192_ENCRYPT, CRYPT_S390_MSA)) return -EOPNOTSUPP; ret = crypto_register_alg(&des_alg); @@ -569,41 +562,46 @@ static int des_s390_init(void) ret = crypto_register_alg(&cbc_des_alg); if (ret) goto cbc_des_err; - - ret = crypto_register_alg(&des3_128_alg); + ret = crypto_register_alg(&des3_alg); if (ret) - goto des3_128_err; - ret = crypto_register_alg(&ecb_des3_128_alg); + goto des3_err; + ret = crypto_register_alg(&ecb_des3_alg); if (ret) - goto ecb_des3_128_err; - ret = crypto_register_alg(&cbc_des3_128_alg); + goto ecb_des3_err; + ret = crypto_register_alg(&cbc_des3_alg); if (ret) - goto cbc_des3_128_err; - - ret = crypto_register_alg(&des3_192_alg); - if (ret) - goto des3_192_err; - ret = crypto_register_alg(&ecb_des3_192_alg); - if (ret) - goto ecb_des3_192_err; - ret = crypto_register_alg(&cbc_des3_192_alg); - if (ret) - goto cbc_des3_192_err; + goto cbc_des3_err; + if (crypt_s390_func_available(KMCTR_DEA_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KMCTR_TDEA_192_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) { + ret = crypto_register_alg(&ctr_des_alg); + if (ret) + goto ctr_des_err; + ret = crypto_register_alg(&ctr_des3_alg); + if (ret) + goto ctr_des3_err; + ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + if (!ctrblk) { + ret = -ENOMEM; + goto ctr_mem_err; + } + } out: return ret; -cbc_des3_192_err: - crypto_unregister_alg(&ecb_des3_192_alg); 
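Editor's note: des_s390_init() here and the label ladder that follows use the standard kernel unwind idiom: every successful crypto_register_alg() gains a matching goto target, so a failure at step N falls through the labels and unregisters steps N-1 down to 1 in reverse order; the CTR algorithms and the shared ctrblk page are only set up once the KMCTR facility test passes. A user-space analog of the pattern (register_res(), unregister_res(), and the resource names are invented for this sketch):

#include <stdio.h>

/* Illustrative stand-ins for crypto_register_alg()/crypto_unregister_alg(). */
static int register_res(const char *name)
{
	printf("register %s\n", name);
	return 0;	/* a real implementation could fail here */
}

static void unregister_res(const char *name)
{
	printf("unregister %s\n", name);
}

static int init_all(void)
{
	int ret;

	ret = register_res("des");
	if (ret)
		goto out;
	ret = register_res("ecb(des)");
	if (ret)
		goto ecb_err;
	ret = register_res("cbc(des)");
	if (ret)
		goto cbc_err;
	return 0;	/* everything registered */
cbc_err:
	unregister_res("ecb(des)");	/* undo step 2 */
ecb_err:
	unregister_res("des");		/* undo step 1 */
out:
	return ret;
}

int main(void)
{
	return init_all();
}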
-ecb_des3_192_err: - crypto_unregister_alg(&des3_192_alg); -des3_192_err: - crypto_unregister_alg(&cbc_des3_128_alg); -cbc_des3_128_err: - crypto_unregister_alg(&ecb_des3_128_alg); -ecb_des3_128_err: - crypto_unregister_alg(&des3_128_alg); -des3_128_err: +ctr_mem_err: + crypto_unregister_alg(&ctr_des3_alg); +ctr_des3_err: + crypto_unregister_alg(&ctr_des_alg); +ctr_des_err: + crypto_unregister_alg(&cbc_des3_alg); +cbc_des3_err: + crypto_unregister_alg(&ecb_des3_alg); +ecb_des3_err: + crypto_unregister_alg(&des3_alg); +des3_err: crypto_unregister_alg(&cbc_des_alg); cbc_des_err: crypto_unregister_alg(&ecb_des_alg); @@ -613,21 +611,23 @@ des_err: goto out; } -static void __exit des_s390_fini(void) +static void __exit des_s390_exit(void) { - crypto_unregister_alg(&cbc_des3_192_alg); - crypto_unregister_alg(&ecb_des3_192_alg); - crypto_unregister_alg(&des3_192_alg); - crypto_unregister_alg(&cbc_des3_128_alg); - crypto_unregister_alg(&ecb_des3_128_alg); - crypto_unregister_alg(&des3_128_alg); + if (ctrblk) { + crypto_unregister_alg(&ctr_des_alg); + crypto_unregister_alg(&ctr_des3_alg); + free_page((unsigned long) ctrblk); + } + crypto_unregister_alg(&cbc_des3_alg); + crypto_unregister_alg(&ecb_des3_alg); + crypto_unregister_alg(&des3_alg); crypto_unregister_alg(&cbc_des_alg); crypto_unregister_alg(&ecb_des_alg); crypto_unregister_alg(&des_alg); } module_init(des_s390_init); -module_exit(des_s390_fini); +module_exit(des_s390_exit); MODULE_ALIAS("des"); MODULE_ALIAS("des3_ede"); diff -uprN linux-2.6.32/arch/s390/crypto/ghash_s390.c linux-2.6.32.ovz/arch/s390/crypto/ghash_s390.c --- linux-2.6.32/arch/s390/crypto/ghash_s390.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/ghash_s390.c 2015-09-10 10:07:48.000000000 -0700 @@ -0,0 +1,167 @@ +/* + * Cryptographic API. + * + * s390 implementation of the GHASH algorithm for GCM (Galois/Counter Mode). + * + * Copyright IBM Corp. 
2011 + * Author(s): Gerald Schaefer + */ + +#include +#include + +#include "crypt_s390.h" + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +struct ghash_ctx { + u8 icv[16]; + u8 key[16]; +}; + +struct ghash_desc_ctx { + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key, key, GHASH_BLOCK_SIZE); + memset(ctx->icv, 0, GHASH_BLOCK_SIZE); + + return 0; +} + +static int ghash_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + unsigned int n; + u8 *buf = dctx->buffer; + int ret; + + if (dctx->bytes) { + u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); + + n = min(srclen, dctx->bytes); + dctx->bytes -= n; + srclen -= n; + + memcpy(pos, src, n); + src += n; + + if (!dctx->bytes) { + ret = crypt_s390_kimd(KIMD_GHASH, ctx, buf, + GHASH_BLOCK_SIZE); + if (ret != GHASH_BLOCK_SIZE) + return -EIO; + } + } + + n = srclen & ~(GHASH_BLOCK_SIZE - 1); + if (n) { + ret = crypt_s390_kimd(KIMD_GHASH, ctx, src, n); + if (ret != n) + return -EIO; + src += n; + srclen -= n; + } + + if (srclen) { + dctx->bytes = GHASH_BLOCK_SIZE - srclen; + memcpy(buf, src, srclen); + } + + return 0; +} + +static int ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +{ + u8 *buf = dctx->buffer; + int ret; + + if (dctx->bytes) { + u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); + + memset(pos, 0, dctx->bytes); + + ret = crypt_s390_kimd(KIMD_GHASH, ctx, buf, GHASH_BLOCK_SIZE); + if (ret != GHASH_BLOCK_SIZE) + return -EIO; + } + + dctx->bytes = 0; + return 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + int ret; + + ret = ghash_flush(ctx, dctx); + if (!ret) + memcpy(dst, ctx->icv, GHASH_BLOCK_SIZE); + return ret; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-s390", + .cra_priority = CRYPT_S390_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list), + }, +}; + +static int __init ghash_mod_init(void) +{ + if (!crypt_s390_func_available(KIMD_GHASH, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) + return -EOPNOTSUPP; + + return crypto_register_shash(&ghash_alg); +} + +static void __exit ghash_mod_exit(void) +{ + crypto_unregister_shash(&ghash_alg); +} + +module_init(ghash_mod_init); +module_exit(ghash_mod_exit); + +MODULE_ALIAS("ghash"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GHASH Message Digest Algorithm, s390 implementation"); diff -uprN linux-2.6.32/arch/s390/crypto/Makefile linux-2.6.32.ovz/arch/s390/crypto/Makefile --- linux-2.6.32/arch/s390/crypto/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ 
linux-2.6.32.ovz/arch/s390/crypto/Makefile 2015-09-10 10:07:48.000000000 -0700 @@ -5,6 +5,7 @@ obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o des_check_key.o +obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_S390_PRNG) += prng.o +obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o diff -uprN linux-2.6.32/arch/s390/crypto/prng.c linux-2.6.32.ovz/arch/s390/crypto/prng.c --- linux-2.6.32/arch/s390/crypto/prng.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/prng.c 2015-09-10 10:06:37.000000000 -0700 @@ -77,7 +77,7 @@ static void prng_seed(int nbytes) /* Add the entropy */ while (nbytes >= 8) { - *((__u64 *)parm_block) ^= *((__u64 *)buf+i*8); + *((__u64 *)parm_block) ^= *((__u64 *)(buf+i)); prng_add_entropy(); i += 8; nbytes -= 8; @@ -166,7 +166,7 @@ static int __init prng_init(void) int ret; /* check if the CPU has a PRNG */ - if (!crypt_s390_func_available(KMC_PRNG)) + if (!crypt_s390_func_available(KMC_PRNG, CRYPT_S390_MSA)) return -EOPNOTSUPP; if (prng_chunk_size < 8) diff -uprN linux-2.6.32/arch/s390/crypto/sha1_s390.c linux-2.6.32.ovz/arch/s390/crypto/sha1_s390.c --- linux-2.6.32/arch/s390/crypto/sha1_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha1_s390.c 2015-09-10 10:06:37.000000000 -0700 @@ -90,7 +90,7 @@ static struct shash_alg alg = { static int __init sha1_s390_init(void) { - if (!crypt_s390_func_available(KIMD_SHA_1)) + if (!crypt_s390_func_available(KIMD_SHA_1, CRYPT_S390_MSA)) return -EOPNOTSUPP; return crypto_register_shash(&alg); } diff -uprN linux-2.6.32/arch/s390/crypto/sha256_s390.c linux-2.6.32.ovz/arch/s390/crypto/sha256_s390.c --- linux-2.6.32/arch/s390/crypto/sha256_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha256_s390.c 2015-09-10 10:06:37.000000000 -0700 @@ -86,7 +86,7 @@ static struct shash_alg alg = { static int sha256_s390_init(void) { - if (!crypt_s390_func_available(KIMD_SHA_256)) + if (!crypt_s390_func_available(KIMD_SHA_256, CRYPT_S390_MSA)) return -EOPNOTSUPP; return crypto_register_shash(&alg); diff -uprN linux-2.6.32/arch/s390/crypto/sha512_s390.c linux-2.6.32.ovz/arch/s390/crypto/sha512_s390.c --- linux-2.6.32/arch/s390/crypto/sha512_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha512_s390.c 2015-09-10 10:06:37.000000000 -0700 @@ -132,7 +132,7 @@ static int __init init(void) { int ret; - if (!crypt_s390_func_available(KIMD_SHA_512)) + if (!crypt_s390_func_available(KIMD_SHA_512, CRYPT_S390_MSA)) return -EOPNOTSUPP; if ((ret = crypto_register_shash(&sha512_alg)) < 0) goto out; diff -uprN linux-2.6.32/arch/s390/crypto/sha_common.c linux-2.6.32.ovz/arch/s390/crypto/sha_common.c --- linux-2.6.32/arch/s390/crypto/sha_common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha_common.c 2015-09-10 10:07:48.000000000 -0700 @@ -35,16 +35,19 @@ int s390_sha_update(struct shash_desc *d if (index) { memcpy(ctx->buf + index, data, bsize - index); ret = crypt_s390_kimd(ctx->func, ctx->state, ctx->buf, bsize); - BUG_ON(ret != bsize); + if (ret != bsize) + return -EIO; data += bsize - index; len -= bsize - index; + index = 0; } /* process as many blocks as possible */ if (len >= bsize) { ret = crypt_s390_kimd(ctx->func, ctx->state, data, len & ~(bsize - 1)); - BUG_ON(ret != (len & 
~(bsize - 1))); + if (ret != (len & ~(bsize - 1))) + return -EIO; data += ret; len -= ret; } @@ -86,7 +89,8 @@ int s390_sha_final(struct shash_desc *de memcpy(ctx->buf + end - 8, &bits, sizeof(bits)); ret = crypt_s390_kimd(ctx->func, ctx->state, ctx->buf, end); - BUG_ON(ret != end); + if (ret != end) + return -EIO; /* copy digest to out */ memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs_dbfs.c linux-2.6.32.ovz/arch/s390/hypfs/hypfs_dbfs.c --- linux-2.6.32/arch/s390/hypfs/hypfs_dbfs.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs_dbfs.c 2015-09-10 10:06:31.000000000 -0700 @@ -0,0 +1,115 @@ +/* + * Hypervisor filesystem for Linux on s390 - debugfs interface + * + * Copyright (C) IBM Corp. 2010 + * Author(s): Michael Holzheu + */ + +#include +#include "hypfs.h" + +static struct dentry *dbfs_dir; + +static struct hypfs_dbfs_data *hypfs_dbfs_data_alloc(struct hypfs_dbfs_file *f) +{ + struct hypfs_dbfs_data *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + kref_init(&data->kref); + data->dbfs_file = f; + return data; +} + +static void hypfs_dbfs_data_free(struct kref *kref) +{ + struct hypfs_dbfs_data *data; + + data = container_of(kref, struct hypfs_dbfs_data, kref); + data->dbfs_file->data_free(data->buf_free_ptr); + kfree(data); +} + +static void data_free_delayed(struct work_struct *work) +{ + struct hypfs_dbfs_data *data; + struct hypfs_dbfs_file *df; + + df = container_of(work, struct hypfs_dbfs_file, data_free_work.work); + mutex_lock(&df->lock); + data = df->data; + df->data = NULL; + mutex_unlock(&df->lock); + kref_put(&data->kref, hypfs_dbfs_data_free); +} + +static ssize_t dbfs_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct hypfs_dbfs_data *data; + struct hypfs_dbfs_file *df; + ssize_t rc; + + if (*ppos != 0) + return 0; + + df = file->f_path.dentry->d_inode->i_private; + mutex_lock(&df->lock); + if (!df->data) { + data = hypfs_dbfs_data_alloc(df); + if (!data) { + mutex_unlock(&df->lock); + return -ENOMEM; + } + rc = df->data_create(&data->buf, &data->buf_free_ptr, + &data->size); + if (rc) { + mutex_unlock(&df->lock); + kfree(data); + return rc; + } + df->data = data; + schedule_delayed_work(&df->data_free_work, HZ); + } + data = df->data; + kref_get(&data->kref); + mutex_unlock(&df->lock); + + rc = simple_read_from_buffer(buf, size, ppos, data->buf, data->size); + kref_put(&data->kref, hypfs_dbfs_data_free); + return rc; +} + +static const struct file_operations dbfs_ops = { + .read = dbfs_read, +}; + +int hypfs_dbfs_create_file(struct hypfs_dbfs_file *df) +{ + df->dentry = debugfs_create_file(df->name, 0400, dbfs_dir, df, + &dbfs_ops); + if (IS_ERR(df->dentry)) + return PTR_ERR(df->dentry); + mutex_init(&df->lock); + INIT_DELAYED_WORK(&df->data_free_work, data_free_delayed); + return 0; +} + +void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df) +{ + debugfs_remove(df->dentry); +} + +int hypfs_dbfs_init(void) +{ + dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); + if (IS_ERR(dbfs_dir)) + return PTR_ERR(dbfs_dir); + return 0; +} + +void hypfs_dbfs_exit(void) +{ + debugfs_remove(dbfs_dir); +} diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs_diag.c linux-2.6.32.ovz/arch/s390/hypfs/hypfs_diag.c --- linux-2.6.32/arch/s390/hypfs/hypfs_diag.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs_diag.c 2015-09-10 10:06:31.000000000 -0700 @@ -12,10 +12,10 @@ #include #include -#include #include #include 
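Editor's note on the hypfs_dbfs.c read path above: df->data is built lazily under df->lock, each reader takes a kref before the lock is dropped, and the delayed work item only detaches the cached pointer; the buffer itself is released by whichever kref_put() runs last, so a read racing the one-second expiry never touches freed memory. A minimal user-space sketch of that last-reference-frees idiom, with C11 atomics standing in for struct kref:

#include <stdatomic.h>
#include <stdlib.h>

struct buf {
	atomic_int refs;	/* stands in for struct kref */
	void *data;
};

static void buf_get(struct buf *b)
{
	atomic_fetch_add(&b->refs, 1);
}

static void buf_put(struct buf *b)
{
	/* The last reference frees the payload, like hypfs_dbfs_data_free(). */
	if (atomic_fetch_sub(&b->refs, 1) == 1) {
		free(b->data);
		free(b);
	}
}

int main(void)
{
	struct buf *b = malloc(sizeof(*b));

	atomic_init(&b->refs, 1);	/* the cache holds the first reference */
	b->data = malloc(64);
	buf_get(b);	/* a reader takes its own reference */
	buf_put(b);	/* ...and drops it when done */
	buf_put(b);	/* expiry path drops the cache reference, frees b */
	return 0;
}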
#include +#include #include #include "hypfs.h" @@ -23,6 +23,8 @@ #define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ #define TMP_SIZE 64 /* size of temporary buffers */ +#define DBFS_D204_HDR_VERSION 0 + /* diag 204 subcodes */ enum diag204_sc { SUBC_STIB4 = 4, @@ -48,6 +50,8 @@ static void *diag204_buf; /* 4K aligned static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */ static int diag204_buf_pages; /* number of pages for diag204 data */ +static struct dentry *dbfs_d204_file; + /* * DIAG 204 data structures and member access functions. * @@ -164,7 +168,7 @@ static inline void part_hdr__part_name(e LPAR_NAME_LEN); EBCASC(name, LPAR_NAME_LEN); name[LPAR_NAME_LEN] = 0; - strstrip(name); + strim(name); } struct cpu_info { @@ -365,18 +369,21 @@ static void diag204_free_buffer(void) } else { free_pages((unsigned long) diag204_buf, 0); } - diag204_buf_pages = 0; diag204_buf = NULL; } +static void *page_align_ptr(void *ptr) +{ + return (void *) PAGE_ALIGN((unsigned long) ptr); +} + static void *diag204_alloc_vbuf(int pages) { /* The buffer has to be page aligned! */ diag204_buf_vmalloc = vmalloc(PAGE_SIZE * (pages + 1)); if (!diag204_buf_vmalloc) return ERR_PTR(-ENOMEM); - diag204_buf = (void*)((unsigned long)diag204_buf_vmalloc - & ~0xfffUL) + 0x1000; + diag204_buf = page_align_ptr(diag204_buf_vmalloc); diag204_buf_pages = pages; return diag204_buf; } @@ -469,17 +476,26 @@ fail_alloc: return rc; } +static int diag204_do_store(void *buf, int pages) +{ + int rc; + + rc = diag204((unsigned long) diag204_store_sc | + (unsigned long) diag204_info_type, pages, buf); + return rc < 0 ? -ENOSYS : 0; +} + static void *diag204_store(void) { void *buf; - int pages; + int pages, rc; buf = diag204_get_buffer(diag204_info_type, &pages); if (IS_ERR(buf)) goto out; - if (diag204((unsigned long)diag204_store_sc | - (unsigned long)diag204_info_type, pages, buf) < 0) - return ERR_PTR(-ENOSYS); + rc = diag204_do_store(buf, pages); + if (rc) + return ERR_PTR(rc); out: return buf; } @@ -488,7 +504,7 @@ out: static int diag224(void *ptr) { - int rc = -ENOTSUPP; + int rc = -EOPNOTSUPP; asm volatile( " diag %1,%2,0x224\n" @@ -507,7 +523,7 @@ static int diag224_get_name_table(void) return -ENOMEM; if (diag224(diag224_cpu_names)) { kfree(diag224_cpu_names); - return -ENOTSUPP; + return -EOPNOTSUPP; } EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); return 0; @@ -523,10 +539,54 @@ static int diag224_idx2name(int index, c memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN), CPU_NAME_LEN); name[CPU_NAME_LEN] = 0; - strstrip(name); + strim(name); + return 0; +} + +struct dbfs_d204_hdr { + u64 len; /* Length of d204 buffer without header */ + u16 version; /* Version of header */ + u8 sc; /* Used subcode */ + char reserved[53]; +} __attribute__ ((packed)); + +struct dbfs_d204 { + struct dbfs_d204_hdr hdr; /* 64 byte header */ + char buf[]; /* d204 buffer */ +} __attribute__ ((packed)); + +static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) +{ + struct dbfs_d204 *d204; + int rc, buf_size; + void *base; + + buf_size = PAGE_SIZE * (diag204_buf_pages + 1) + sizeof(d204->hdr); + base = vmalloc(buf_size); + if (!base) + return -ENOMEM; + memset(base, 0, buf_size); + d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr); + rc = diag204_do_store(d204->buf, diag204_buf_pages); + if (rc) { + vfree(base); + return rc; + } + d204->hdr.version = DBFS_D204_HDR_VERSION; + d204->hdr.len = PAGE_SIZE * diag204_buf_pages; + d204->hdr.sc = 
diag204_store_sc; + *data = d204; + *data_free_ptr = base; + *size = d204->hdr.len + sizeof(struct dbfs_d204_hdr); return 0; } +static struct hypfs_dbfs_file dbfs_file_d204 = { + .name = "diag_204", + .data_create = dbfs_d204_create, + .data_free = vfree, +}; + __init int hypfs_diag_init(void) { int rc; @@ -535,19 +595,29 @@ __init int hypfs_diag_init(void) pr_err("The hardware system does not support hypfs\n"); return -ENODATA; } - rc = diag224_get_name_table(); - if (rc) { - diag204_free_buffer(); - pr_err("The hardware system does not provide all " - "functions required by hypfs\n"); + if (diag204_info_type == INFO_EXT) { + rc = hypfs_dbfs_create_file(&dbfs_file_d204); + if (rc) + return rc; } - return rc; + if (!MACHINE_IS_VM) { + rc = diag224_get_name_table(); + if (rc) { + pr_err("The hardware system does not provide all " + "functions required by hypfs\n"); + debugfs_remove(dbfs_d204_file); + return rc; + } + } + return 0; } void hypfs_diag_exit(void) { + debugfs_remove(dbfs_d204_file); diag224_delete_name_table(); diag204_free_buffer(); + hypfs_dbfs_remove_file(&dbfs_file_d204); } /* diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs.h linux-2.6.32.ovz/arch/s390/hypfs/hypfs.h --- linux-2.6.32/arch/s390/hypfs/hypfs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs.h 2015-09-10 10:06:31.000000000 -0700 @@ -11,6 +11,9 @@ #include #include +#include +#include +#include #define REG_FILE_MODE 0440 #define UPDATE_FILE_MODE 0220 @@ -34,6 +37,36 @@ extern int hypfs_diag_create_files(struc /* VM Hypervisor */ extern int hypfs_vm_init(void); +extern void hypfs_vm_exit(void); extern int hypfs_vm_create_files(struct super_block *sb, struct dentry *root); +/* debugfs interface */ +struct hypfs_dbfs_file; + +struct hypfs_dbfs_data { + void *buf; + void *buf_free_ptr; + size_t size; + struct hypfs_dbfs_file *dbfs_file; + struct kref kref; +}; + +struct hypfs_dbfs_file { + const char *name; + int (*data_create)(void **data, void **data_free_ptr, + size_t *size); + void (*data_free)(const void *buf_free_ptr); + + /* Private data for hypfs_dbfs.c */ + struct hypfs_dbfs_data *data; + struct delayed_work data_free_work; + struct mutex lock; + struct dentry *dentry; +}; + +extern int hypfs_dbfs_init(void); +extern void hypfs_dbfs_exit(void); +extern int hypfs_dbfs_create_file(struct hypfs_dbfs_file *df); +extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df); + #endif /* _HYPFS_H_ */ diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs_vm.c linux-2.6.32.ovz/arch/s390/hypfs/hypfs_vm.c --- linux-2.6.32/arch/s390/hypfs/hypfs_vm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs_vm.c 2015-09-10 10:06:31.000000000 -0700 @@ -10,9 +10,11 @@ #include #include #include +#include #include "hypfs.h" #define NAME_LEN 8 +#define DBFS_D2FC_HDR_VERSION 0 static char local_guest[] = " "; static char all_guests[] = "* "; @@ -76,28 +78,31 @@ static int diag2fc(int size, char* query return -residual_cnt; } -static struct diag2fc_data *diag2fc_store(char *query, int *count) +/* + * Allocate buffer for "query" and store diag 2fc at "offset" + */ +static void *diag2fc_store(char *query, unsigned int *count, int offset) { + void *data; int size; - struct diag2fc_data *data; do { size = diag2fc(0, query, NULL); if (size < 0) return ERR_PTR(-EACCES); - data = vmalloc(size); + data = vmalloc(size + offset); if (!data) return ERR_PTR(-ENOMEM); - if (diag2fc(size, query, data) == 0) + if (diag2fc(size, query, data + offset) == 0) break; vfree(data); } while (1); - *count = 
(size / sizeof(*data)); + *count = (size / sizeof(struct diag2fc_data)); return data; } -static void diag2fc_free(void *data) +static void diag2fc_free(const void *data) { vfree(data); } @@ -124,7 +129,7 @@ static int hpyfs_vm_create_guest(struct /* guest dir */ memcpy(guest_name, data->guest_name, NAME_LEN); EBCASC(guest_name, NAME_LEN); - strstrip(guest_name); + strim(guest_name); guest_dir = hypfs_mkdir(sb, systems_dir, guest_name); if (IS_ERR(guest_dir)) return PTR_ERR(guest_dir); @@ -168,9 +173,10 @@ int hypfs_vm_create_files(struct super_b { struct dentry *dir, *file; struct diag2fc_data *data; - int rc, i, count = 0; + unsigned int count = 0; + int rc, i; - data = diag2fc_store(guest_query, &count); + data = diag2fc_store(guest_query, &count, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -218,14 +224,60 @@ failed: return rc; } +struct dbfs_d2fc_hdr { + u64 len; /* Length of d2fc buffer without header */ + u16 version; /* Version of header */ + char tod_ext[16]; /* TOD clock for d2fc */ + u64 count; /* Number of VM guests in d2fc buffer */ + char reserved[30]; +} __attribute__ ((packed)); + +struct dbfs_d2fc { + struct dbfs_d2fc_hdr hdr; /* 64 byte header */ + char buf[]; /* d2fc buffer */ +} __attribute__ ((packed)); + +static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size) +{ + struct dbfs_d2fc *d2fc; + unsigned int count; + + d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr)); + if (IS_ERR(d2fc)) + return PTR_ERR(d2fc); + get_clock_ext(d2fc->hdr.tod_ext); + d2fc->hdr.len = count * sizeof(struct diag2fc_data); + d2fc->hdr.version = DBFS_D2FC_HDR_VERSION; + d2fc->hdr.count = count; + memset(&d2fc->hdr.reserved, 0, sizeof(d2fc->hdr.reserved)); + *data = d2fc; + *data_free_ptr = d2fc; + *size = d2fc->hdr.len + sizeof(struct dbfs_d2fc_hdr); + return 0; +} + +static struct hypfs_dbfs_file dbfs_file_2fc = { + .name = "diag_2fc", + .data_create = dbfs_diag2fc_create, + .data_free = diag2fc_free, +}; + int hypfs_vm_init(void) { + if (!MACHINE_IS_VM) + return 0; if (diag2fc(0, all_guests, NULL) > 0) guest_query = all_guests; else if (diag2fc(0, local_guest, NULL) > 0) guest_query = local_guest; else return -EACCES; + return hypfs_dbfs_create_file(&dbfs_file_2fc); +} - return 0; +void hypfs_vm_exit(void) +{ + if (!MACHINE_IS_VM) + return; + hypfs_dbfs_remove_file(&dbfs_file_2fc); } diff -uprN linux-2.6.32/arch/s390/hypfs/inode.c linux-2.6.32.ovz/arch/s390/hypfs/inode.c --- linux-2.6.32/arch/s390/hypfs/inode.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/inode.c 2015-09-10 10:06:31.000000000 -0700 @@ -14,8 +14,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -145,7 +145,7 @@ static int hypfs_open(struct inode *inod } mutex_unlock(&fs_info->lock); } - return 0; + return nonseekable_open(inode, filp); } static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov, @@ -288,46 +288,30 @@ static int hypfs_fill_super(struct super sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = HYPFS_MAGIC; sb->s_op = &hypfs_s_ops; - if (hypfs_parse_options(data, sb)) { - rc = -EINVAL; - goto err_alloc; - } + if (hypfs_parse_options(data, sb)) + return -EINVAL; root_inode = hypfs_make_inode(sb, S_IFDIR | 0755); - if (!root_inode) { - rc = -ENOMEM; - goto err_alloc; - } + if (!root_inode) + return -ENOMEM; root_inode->i_op = &simple_dir_inode_operations; root_inode->i_fop = &simple_dir_operations; - root_dentry = d_alloc_root(root_inode); + sb->s_root = root_dentry = d_alloc_root(root_inode); 
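Editor's note: the two debugfs blob headers above (dbfs_d204_hdr and dbfs_d2fc_hdr) are declared packed and documented as 64 bytes; for dbfs_d2fc_hdr the fields sum to 8 + 2 + 16 + 8 + 30 = 64 only because packing suppresses alignment padding, which would otherwise grow the struct to 72 on common 64-bit ABIs. A compile-time sketch of that invariant (the layout is copied from the patch; the assertion is added for illustration):

#include <stdint.h>
#include <stdio.h>

struct dbfs_d2fc_hdr {
	uint64_t len;		/* length of the d2fc buffer without header */
	uint16_t version;	/* header version */
	char tod_ext[16];	/* TOD clock for d2fc */
	uint64_t count;		/* number of VM guests in the buffer */
	char reserved[30];
} __attribute__((packed));

/* 8 + 2 + 16 + 8 + 30 = 64; without "packed", padding after `version`
 * and before `count` would grow this to 72 on common 64-bit ABIs. */
_Static_assert(sizeof(struct dbfs_d2fc_hdr) == 64, "header must be 64 bytes");

int main(void)
{
	printf("header size: %zu\n", sizeof(struct dbfs_d2fc_hdr));
	return 0;
}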
if (!root_dentry) { iput(root_inode); - rc = -ENOMEM; - goto err_alloc; + return -ENOMEM; } if (MACHINE_IS_VM) rc = hypfs_vm_create_files(sb, root_dentry); else rc = hypfs_diag_create_files(sb, root_dentry); if (rc) - goto err_tree; + return rc; sbi->update_file = hypfs_create_update_file(sb, root_dentry); - if (IS_ERR(sbi->update_file)) { - rc = PTR_ERR(sbi->update_file); - goto err_tree; - } + if (IS_ERR(sbi->update_file)) + return PTR_ERR(sbi->update_file); hypfs_update_update(sb); - sb->s_root = root_dentry; pr_info("Hypervisor filesystem mounted\n"); return 0; - -err_tree: - hypfs_delete_tree(root_dentry); - d_genocide(root_dentry); - dput(root_dentry); -err_alloc: - kfree(sbi); - return rc; } static int hypfs_get_super(struct file_system_type *fst, int flags, @@ -340,12 +324,12 @@ static void hypfs_kill_super(struct supe { struct hypfs_sb_info *sb_info = sb->s_fs_info; - if (sb->s_root) { + if (sb->s_root) hypfs_delete_tree(sb->s_root); + if (sb_info->update_file) hypfs_remove(sb_info->update_file); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; - } + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; kill_litter_super(sb); } @@ -484,20 +468,21 @@ static int __init hypfs_init(void) { int rc; - if (MACHINE_IS_VM) { - if (hypfs_vm_init()) - /* no diag 2fc, just exit */ - return -ENODATA; - } else { - if (hypfs_diag_init()) { - rc = -ENODATA; - goto fail_diag; - } + rc = hypfs_dbfs_init(); + if (rc) + return rc; + if (hypfs_diag_init()) { + rc = -ENODATA; + goto fail_dbfs_exit; + } + if (hypfs_vm_init()) { + rc = -ENODATA; + goto fail_hypfs_diag_exit; } s390_kobj = kobject_create_and_add("s390", hypervisor_kobj); if (!s390_kobj) { rc = -ENOMEM; - goto fail_sysfs; + goto fail_hypfs_vm_exit; } rc = register_filesystem(&hypfs_type); if (rc) @@ -506,18 +491,21 @@ static int __init hypfs_init(void) fail_filesystem: kobject_put(s390_kobj); -fail_sysfs: - if (!MACHINE_IS_VM) - hypfs_diag_exit(); -fail_diag: +fail_hypfs_vm_exit: + hypfs_vm_exit(); +fail_hypfs_diag_exit: + hypfs_diag_exit(); +fail_dbfs_exit: + hypfs_dbfs_exit(); pr_err("Initialization of hypfs failed with rc=%i\n", rc); return rc; } static void __exit hypfs_exit(void) { - if (!MACHINE_IS_VM) - hypfs_diag_exit(); + hypfs_diag_exit(); + hypfs_vm_exit(); + hypfs_dbfs_exit(); unregister_filesystem(&hypfs_type); kobject_put(s390_kobj); } diff -uprN linux-2.6.32/arch/s390/hypfs/Makefile linux-2.6.32.ovz/arch/s390/hypfs/Makefile --- linux-2.6.32/arch/s390/hypfs/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/Makefile 2015-09-10 10:06:31.000000000 -0700 @@ -4,4 +4,4 @@ obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o -s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o +s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o diff -uprN linux-2.6.32/arch/s390/include/asm/atomic.h linux-2.6.32.ovz/arch/s390/include/asm/atomic.h --- linux-2.6.32/arch/s390/include/asm/atomic.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/atomic.h 2015-09-10 10:05:44.000000000 -0700 @@ -21,7 +21,7 @@ #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) #define __CS_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + int old_val, new_val; \ asm volatile( \ " l %0,%2\n" \ "0: lr %1,%0\n" \ @@ -38,7 +38,7 @@ #else /* __GNUC__ */ #define __CS_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + int old_val, new_val; \ asm volatile( \ " l %0,0(%3)\n" \ "0: lr %1,%0\n" \ @@ -143,7 +143,7 @@ static inline int atomic_add_unless(atom #if 
__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) #define __CSG_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + long long old_val, new_val; \ asm volatile( \ " lg %0,%2\n" \ "0: lgr %1,%0\n" \ @@ -160,7 +160,7 @@ static inline int atomic_add_unless(atom #else /* __GNUC__ */ #define __CSG_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + long long old_val, new_val; \ asm volatile( \ " lg %0,0(%3)\n" \ "0: lgr %1,%0\n" \ diff -uprN linux-2.6.32/arch/s390/include/asm/bitops.h linux-2.6.32.ovz/arch/s390/include/asm/bitops.h --- linux-2.6.32/arch/s390/include/asm/bitops.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/bitops.h 2015-09-10 10:07:34.000000000 -0700 @@ -879,6 +879,33 @@ static inline int ext2_find_next_bit(voi #include +/* Compat macros for RHEL6 */ +#define find_next_zero_bit_le(addr, size, offset) \ + generic_find_next_zero_le_bit(addr, size, offset) +#define find_next_bit_le(addr, size, offset) \ + generic_find_next_le_bit(addr, size, offset) +#define __set_bit_le(nr, addr) \ + __set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), addr) +#define __clear_bit_le(nr, addr) \ + __clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), addr) + +#define test_bit_le(nr, addr) \ + test_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define set_bit_le(nr, addr) \ + set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define clear_bit_le(nr, addr) \ + clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) + +#define test_and_set_bit_le(nr, addr) \ + test_and_set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define test_and_clear_bit_le(nr, addr) \ + test_and_clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) + +#define __test_and_set_bit_le(nr, addr) \ + __test_and_set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define __test_and_clear_bit_le(nr, addr) \ + __test_and_clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) + #endif /* __KERNEL__ */ #endif /* _S390_BITOPS_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/bug.h linux-2.6.32.ovz/arch/s390/include/asm/bug.h --- linux-2.6.32/arch/s390/include/asm/bug.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/bug.h 2015-09-10 10:06:13.000000000 -0700 @@ -52,14 +52,18 @@ for (;;); \ } while (0) +#define __WARN_TAINT(taint) do { \ + __EMIT_BUG(BUGFLAG_TAINT(taint)); \ +} while (0) + #define WARN_ON(x) ({ \ int __ret_warn_on = !!(x); \ if (__builtin_constant_p(__ret_warn_on)) { \ if (__ret_warn_on) \ - __EMIT_BUG(BUGFLAG_WARNING); \ + __WARN(); \ } else { \ if (unlikely(__ret_warn_on)) \ - __EMIT_BUG(BUGFLAG_WARNING); \ + __WARN(); \ } \ unlikely(__ret_warn_on); \ }) diff -uprN linux-2.6.32/arch/s390/include/asm/ccwdev.h linux-2.6.32.ovz/arch/s390/include/asm/ccwdev.h --- linux-2.6.32/arch/s390/include/asm/ccwdev.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ccwdev.h 2015-09-10 10:07:52.000000000 -0700 @@ -17,6 +17,9 @@ struct irb; struct ccw1; struct ccw_dev_id; +/* from asm/schid.h */ +struct subchannel_id; + /* simplified initializers for struct ccw_device: * CCW_DEVICE and CCW_DEVICE_DEVTYPE initialize one * entry in your MODULE_DEVICE_TABLE and set the match_flag correctly */ @@ -91,6 +94,24 @@ struct ccw_device { void (*handler) (struct ccw_device *, unsigned long, struct irb *); }; +/* + * Possible events used by the path_event notifier. + */ +#define PE_NONE 0x0 +#define PE_PATH_GONE 0x1 /* A path is no longer available. 
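
The RHEL6 compat macros in the bitops.h hunk above all translate a little-endian bit number with nr ^ (__BITOPS_WORDSIZE - 8). On 64-bit that XORs with 56, which preserves the bit-within-byte offset (the low three bits) and complements the byte index within the word, so byte 0 maps to byte 7 and so on. A stand-alone sketch of the arithmetic, assuming 64-bit words as on s390x:

#include <stdio.h>

#define WORDSIZE 64 /* __BITOPS_WORDSIZE on s390x */

/* nr ^ (WORDSIZE - 8) keeps the low three bits (bit within byte) and
 * complements the byte-in-word index. */
static unsigned long le_to_native(unsigned long nr)
{
        return nr ^ (WORDSIZE - 8);
}

int main(void)
{
        unsigned long nr;

        for (nr = 0; nr < 9; nr++)
                printf("le bit %2lu -> native bit %2lu\n", nr, le_to_native(nr));
        /* e.g. le bit 5 -> native bit 61, le bit 8 -> native bit 48 */
        return 0;
}
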
*/ +#define PE_PATH_AVAILABLE 0x2 /* A path has become available and + was successfully verified. */ +#define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had + to be established again. */ + +/* + * Possible CIO actions triggered by the unit check handler. + */ +enum uc_todo { + UC_TODO_RETRY, + UC_TODO_RETRY_ON_NEW_PATH, + UC_TODO_STOP +}; /** * struct ccw driver - device driver for channel attached devices @@ -101,12 +122,14 @@ struct ccw_device { * @set_online: called when setting device online * @set_offline: called when setting device offline * @notify: notify driver of device state changes + * @path_event: notify driver of channel path events * @shutdown: called at device shutdown * @prepare: prepare for pm state transition * @complete: undo work done in @prepare * @freeze: callback for freezing during hibernation snapshotting * @thaw: undo work done in @freeze * @restore: callback for restoring after hibernation + * @uc_handler: callback for unit check handler * @driver: embedded device driver structure * @name: device driver name */ @@ -118,12 +141,14 @@ struct ccw_driver { int (*set_online) (struct ccw_device *); int (*set_offline) (struct ccw_device *); int (*notify) (struct ccw_device *, int); + void (*path_event) (struct ccw_device *, int *); void (*shutdown) (struct ccw_device *); int (*prepare) (struct ccw_device *); void (*complete) (struct ccw_device *); int (*freeze)(struct ccw_device *); int (*thaw) (struct ccw_device *); int (*restore)(struct ccw_device *); + enum uc_todo (*uc_handler) (struct ccw_device *, struct irb *); struct device_driver driver; char *name; }; @@ -142,6 +167,8 @@ struct ccw1; extern int ccw_device_set_options_mask(struct ccw_device *, unsigned long); extern int ccw_device_set_options(struct ccw_device *, unsigned long); extern void ccw_device_clear_options(struct ccw_device *, unsigned long); +int ccw_device_is_pathgroup(struct ccw_device *cdev); +int ccw_device_is_multipath(struct ccw_device *cdev); /* Allow for i/o completion notification after primary interrupt status. */ #define CCWDEV_EARLY_NOTIFICATION 0x0001 @@ -151,6 +178,8 @@ extern void ccw_device_clear_options(str #define CCWDEV_DO_PATHGROUP 0x0004 /* Allow forced onlining of boxed devices. */ #define CCWDEV_ALLOW_FORCE 0x0008 +/* Try to use multipath mode. 
*/ +#define CCWDEV_DO_MULTIPATH 0x0010 extern int ccw_device_start(struct ccw_device *, struct ccw1 *, unsigned long, __u8, unsigned long); @@ -178,6 +207,8 @@ int ccw_device_tm_start_timeout(struct c unsigned long, u8, int); int ccw_device_tm_intrg(struct ccw_device *cdev); +int ccw_device_get_mdc(struct ccw_device *cdev, u8 mask); + extern int ccw_device_set_online(struct ccw_device *cdev); extern int ccw_device_set_offline(struct ccw_device *cdev); @@ -194,8 +225,12 @@ extern void ccw_device_get_id(struct ccw extern struct ccw_device *ccw_device_probe_console(void); extern int ccw_device_force_console(void); +int ccw_device_siosl(struct ccw_device *); + // FIXME: these have to go extern int _ccw_device_get_subchannel_number(struct ccw_device *); -extern void *ccw_device_get_chp_desc(struct ccw_device *, int); +extern void ccw_device_get_schid(struct ccw_device *, struct subchannel_id *); + +struct channel_path_desc *ccw_device_get_chp_desc(struct ccw_device *, int); #endif /* _S390_CCWDEV_H_ */ diff -uprN linux-2.6.32/arch/s390/include/asm/ccwgroup.h linux-2.6.32.ovz/arch/s390/include/asm/ccwgroup.h --- linux-2.6.32/arch/s390/include/asm/ccwgroup.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ccwgroup.h 2015-09-10 10:07:25.000000000 -0700 @@ -71,6 +71,9 @@ int ccwgroup_create_from_string(struct d struct ccw_driver *cdrv, int num_devices, const char *buf); +extern int ccwgroup_set_online(struct ccwgroup_device *gdev); +extern int ccwgroup_set_offline(struct ccwgroup_device *gdev); + extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev); extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev); diff -uprN linux-2.6.32/arch/s390/include/asm/chpid.h linux-2.6.32.ovz/arch/s390/include/asm/chpid.h --- linux-2.6.32/arch/s390/include/asm/chpid.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/chpid.h 2015-09-10 10:07:52.000000000 -0700 @@ -23,6 +23,17 @@ struct chp_id { #ifdef __KERNEL__ #include +struct channel_path_desc { + u8 flags; + u8 lsn; + u8 desc; + u8 chpid; + u8 swla; + u8 zeroes; + u8 chla; + u8 chpp; +} __packed; + static inline void chp_id_init(struct chp_id *chpid) { memset(chpid, 0, sizeof(struct chp_id)); diff -uprN linux-2.6.32/arch/s390/include/asm/chsc.h linux-2.6.32.ovz/arch/s390/include/asm/chsc.h --- linux-2.6.32/arch/s390/include/asm/chsc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/chsc.h 2015-09-10 10:07:15.000000000 -0700 @@ -125,32 +125,4 @@ struct chsc_cpd_info { #define CHSC_INFO_CPD _IOWR(CHSC_IOCTL_MAGIC, 0x87, struct chsc_cpd_info) #define CHSC_INFO_DCAL _IOWR(CHSC_IOCTL_MAGIC, 0x88, struct chsc_dcal) -#ifdef __KERNEL__ - -struct css_general_char { - u64 : 12; - u32 dynio : 1; /* bit 12 */ - u32 : 28; - u32 aif : 1; /* bit 41 */ - u32 : 3; - u32 mcss : 1; /* bit 45 */ - u32 fcs : 1; /* bit 46 */ - u32 : 1; - u32 ext_mb : 1; /* bit 48 */ - u32 : 7; - u32 aif_tdd : 1; /* bit 56 */ - u32 : 1; - u32 qebsm : 1; /* bit 58 */ - u32 : 8; - u32 aif_osa : 1; /* bit 67 */ - u32 : 14; - u32 cib : 1; /* bit 82 */ - u32 : 5; - u32 fcx : 1; /* bit 88 */ - u32 : 7; -}__attribute__((packed)); - -extern struct css_general_char css_general_characteristics; - -#endif /* __KERNEL__ */ #endif diff -uprN linux-2.6.32/arch/s390/include/asm/cio.h linux-2.6.32.ovz/arch/s390/include/asm/cio.h --- linux-2.6.32/arch/s390/include/asm/cio.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/cio.h 2015-09-10 10:07:15.000000000 -0700 @@ -85,6 +85,18 @@ struct 
erw { } __attribute__ ((packed)); /** + * struct erw_eadm - EADM Subchannel extended report word + * @b: aob error + * @r: arsb error + */ +struct erw_eadm { + __u32 : 16; + __u32 b : 1; + __u32 r : 1; + __u32 : 14; +} __packed; + +/** * struct sublog - subchannel logout area * @res0: reserved * @esf: extended status flags @@ -175,9 +187,22 @@ struct esw3 { } __attribute__ ((packed)); /** + * struct esw_eadm - EADM Subchannel Extended Status Word (ESW) + * @sublog: subchannel logout + * @erw: extended report word + */ +struct esw_eadm { + __u32 sublog; + struct erw_eadm erw; + __u32 : 32; + __u32 : 32; + __u32 : 32; +} __packed; + +/** * struct irb - interruption response block * @scsw: subchannel status word - * @esw: extened status word, 4 formats + * @esw: extened status word * @ecw: extended control word * * The irb that is handed to the device driver when an interrupt occurs. For @@ -196,6 +221,7 @@ struct irb { struct esw1 esw1; struct esw2 esw2; struct esw3 esw3; + struct esw_eadm eadm; } esw; __u8 ecw[32]; } __attribute__ ((packed,aligned(4))); diff -uprN linux-2.6.32/arch/s390/include/asm/compat.h linux-2.6.32.ovz/arch/s390/include/asm/compat.h --- linux-2.6.32/arch/s390/include/asm/compat.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/compat.h 2015-09-10 10:06:37.000000000 -0700 @@ -168,7 +168,7 @@ static inline compat_uptr_t ptr_to_compa static inline int is_compat_task(void) { - return test_thread_flag(TIF_31BIT); + return is_32bit_task(); } #else @@ -180,7 +180,7 @@ static inline int is_compat_task(void) #endif -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { unsigned long stack; diff -uprN linux-2.6.32/arch/s390/include/asm/cpu_mf.h linux-2.6.32.ovz/arch/s390/include/asm/cpu_mf.h --- linux-2.6.32/arch/s390/include/asm/cpu_mf.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/cpu_mf.h 2015-09-10 10:07:25.000000000 -0700 @@ -0,0 +1,111 @@ +/* + * CPU-measurement facilities + * + * Copyright IBM Corp. 2012 + * Author(s): Hendrik Brueckner + * Jan Glauber + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + */ +#ifndef _ASM_S390_CPU_MF_H +#define _ASM_S390_CPU_MF_H + +#define CPU_MF_INT_CF_CACA (1 << 7) /* counter auth. change alert */ +#define CPU_MF_INT_CF_LCDA (1 << 6) /* loss of counter data alert */ +#define CPU_MF_INT_RI_HALTED (1 << 5) /* run-time instr. halted */ +#define CPU_MF_INT_RI_BUF_FULL (1 << 4) /* run-time instr. 
program + buffer full */ + +#define CPU_MF_INT_CF_MASK (CPU_MF_INT_CF_CACA|CPU_MF_INT_CF_LCDA) +#define CPU_MF_INT_RI_MASK (CPU_MF_INT_RI_HALTED|CPU_MF_INT_RI_BUF_FULL) + +/* CPU measurement facility support */ +static inline int cpum_cf_avail(void) +{ + unsigned long long facility_bits[2]; + + if (!MACHINE_HAS_SPP) + return 0; + + if (stfle(facility_bits, 2) <= 0) + return 0; + if (facility_bits[1] & (1ULL << 60)) + return 1; + else + return 0; +} + +static inline int cpum_sf_avail(void) +{ + unsigned long long facility_bits[2]; + + if (!MACHINE_HAS_SPP) + return 0; + + if (stfle(facility_bits, 2) <= 0) + return 0; + if (facility_bits[1] & (1ULL << 59)) + return 1; + else + return 0; +} + + +struct cpumf_ctr_info { + u16 cfvn; + u16 auth_ctl; + u16 enable_ctl; + u16 act_ctl; + u16 max_cpu; + u16 csvn; + u16 max_cg; + u16 reserved1; + u32 reserved2[12]; +} __packed; + +/* Query counter information */ +static inline int qctri(struct cpumf_ctr_info *info) +{ + int rc = -EINVAL; + + asm volatile ( + "0: .insn s,0xb28e0000,%1\n" + "1: lhi %0,0\n" + "2:\n" + EX_TABLE(1b, 2b) + : "+d" (rc), "=Q" (*info)); + return rc; +} + +/* Load CPU-counter-set controls */ +static inline int lcctl(u64 ctl) +{ + int cc; + + asm volatile ( + " .insn s,0xb2840000,%1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) : "m" (ctl) : "cc"); + return cc; +} + +/* Extract CPU counter */ +static inline int ecctr(u64 ctr, u64 *val) +{ + register u64 content asm("4") = 0; + int cc; + + asm volatile ( + " .insn rre,0xb2e40000,%0,%2\n" + " ipm %1\n" + " srl %1,28\n" + : "=d" (content), "=d" (cc) : "d" (ctr) : "cc"); + if (!cc) + *val = content; + return cc; +} + +#endif /* _ASM_S390_CPU_MF_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/cputime.h linux-2.6.32.ovz/arch/s390/include/asm/cputime.h --- linux-2.6.32/arch/s390/include/asm/cputime.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/cputime.h 2015-09-10 10:06:55.000000000 -0700 @@ -88,6 +88,23 @@ msecs_to_cputime(const unsigned int m) } /* + * Convert cputime to microseconds and back. + */ +static inline unsigned int +cputime_to_usecs(const cputime_t cputime) +{ + return cputime_div(cputime, 4096); +} + +static inline cputime_t +usecs_to_cputime(const unsigned int m) +{ + return (cputime_t) m * 4096; +} + +#define usecs_to_cputime64(m) usecs_to_cputime(m) + +/* * Convert cputime to milliseconds and back. 
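
The cputime_to_usecs()/usecs_to_cputime() helpers added above rely on the s390 CPU-timer format, in which one microsecond corresponds to 4096 (2^12) timer units. A self-contained round-trip sketch of the same arithmetic:

#include <stdio.h>

typedef unsigned long long cputime_t;

/* One microsecond is 4096 CPU-timer units on s390. */
static unsigned int cputime_to_usecs(cputime_t ct)
{
        return ct / 4096;
}

static cputime_t usecs_to_cputime(unsigned int us)
{
        return (cputime_t)us * 4096;
}

int main(void)
{
        printf("%u\n", cputime_to_usecs(usecs_to_cputime(1500))); /* 1500 */
        return 0;
}
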
*/ static inline unsigned int @@ -183,6 +200,7 @@ struct s390_idle_data { unsigned long long idle_count; unsigned long long idle_enter; unsigned long long idle_time; + int nohz_delay; }; DECLARE_PER_CPU(struct s390_idle_data, s390_idle); @@ -198,4 +216,11 @@ static inline void s390_idle_check(void) vtime_start_cpu(); } +static inline int s390_nohz_delay(int cpu) +{ + return per_cpu(s390_idle, cpu).nohz_delay != 0; +} + +#define arch_needs_cpu(cpu) s390_nohz_delay(cpu) + #endif /* _S390_CPUTIME_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/crash.h linux-2.6.32.ovz/arch/s390/include/asm/crash.h --- linux-2.6.32/arch/s390/include/asm/crash.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/crash.h 2015-09-10 10:07:26.000000000 -0700 @@ -0,0 +1,61 @@ +#ifndef _S390_CRASH_H +#define _S390_CRASH_H + +#ifdef __KERNEL__ + +#include +#include + + +/* + * For swapped prefix pages get bounce buffer using xlate_dev_mem_ptr() + */ +static inline void *map_virtual(u64 offset, struct page **pp) +{ + struct page *page; + unsigned long pfn; + void *vaddr; + + vaddr = xlate_dev_mem_ptr(offset); + pfn = ((unsigned long) vaddr) >> PAGE_SHIFT; + if ((unsigned long) vaddr != offset) + page = pfn_to_page(pfn); + else + page = NULL; + + if (!page_is_ram(pfn)) { + printk(KERN_INFO + "crash memory driver: !page_is_ram(pfn: %lx)\n", pfn); + return NULL; + } + + if (!pfn_valid(pfn)) { + printk(KERN_INFO + "crash memory driver: invalid pfn: %lx )\n", pfn); + return NULL; + } + + *pp = page; + return vaddr; +} + +/* + * Free bounce buffer if necessary + */ +static inline void unmap_virtual(struct page *page) +{ + void *vaddr; + + if (page) { + /* + * Because for bounce buffers vaddr will never be 0 + * unxlate_dev_mem_ptr() will always free the bounce buffer. 
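
The map_virtual()/unmap_virtual() pair in the new asm/crash.h above wraps xlate_dev_mem_ptr() so that swapped prefix pages are read through a bounce buffer. A hypothetical kernel-side caller, assuming this patched tree's asm/crash.h (read_phys() and its error handling are illustrative, not part of the patch), would pair them like this:

/* Hypothetical sketch: copy from a physical offset via the bounce
 * buffer helpers above. */
static ssize_t read_phys(u64 offset, char *buf, size_t count)
{
        struct page *page = NULL;       /* stays NULL if no bounce buffer */
        void *vaddr;

        vaddr = map_virtual(offset, &page);
        if (!vaddr)
                return -EFAULT;
        memcpy(buf, vaddr, count);
        unmap_virtual(page);            /* frees the bounce buffer, if any */
        return count;
}
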
+ */ + vaddr = (void *)(page_to_pfn(page) << PAGE_SHIFT); + unxlate_dev_mem_ptr(0, vaddr); + } +} + +#endif /* __KERNEL__ */ + +#endif /* _S390_CRASH_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/crw.h linux-2.6.32.ovz/arch/s390/include/asm/crw.h --- linux-2.6.32/arch/s390/include/asm/crw.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/crw.h 2015-09-10 10:06:31.000000000 -0700 @@ -32,6 +32,7 @@ typedef void (*crw_handler_t)(struct crw extern int crw_register_handler(int rsc, crw_handler_t handler); extern void crw_unregister_handler(int rsc); extern void crw_handle_channel_report(void); +void crw_wait_for_channel_report(void); #define NR_RSCS 16 diff -uprN linux-2.6.32/arch/s390/include/asm/css_chars.h linux-2.6.32.ovz/arch/s390/include/asm/css_chars.h --- linux-2.6.32/arch/s390/include/asm/css_chars.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/css_chars.h 2015-09-10 10:07:15.000000000 -0700 @@ -0,0 +1,39 @@ +#ifndef _ASM_CSS_CHARS_H +#define _ASM_CSS_CHARS_H + +#include + +#ifdef __KERNEL__ + +struct css_general_char { + u64 : 12; + u32 dynio : 1; /* bit 12 */ + u32 : 4; + u32 eadm : 1; /* bit 17 */ + u32 : 23; + u32 aif : 1; /* bit 41 */ + u32 : 3; + u32 mcss : 1; /* bit 45 */ + u32 fcs : 1; /* bit 46 */ + u32 : 1; + u32 ext_mb : 1; /* bit 48 */ + u32 : 7; + u32 aif_tdd : 1; /* bit 56 */ + u32 : 1; + u32 qebsm : 1; /* bit 58 */ + u32 : 8; + u32 aif_osa : 1; /* bit 67 */ + u32 : 12; + u32 eadm_rf : 1; /* bit 80 */ + u32 : 1; + u32 cib : 1; /* bit 82 */ + u32 : 5; + u32 fcx : 1; /* bit 88 */ + u32 : 19; + u32 alt_ssi : 1; /* bit 108 */ +} __packed; + +extern struct css_general_char css_general_characteristics; + +#endif /* __KERNEL__ */ +#endif diff -uprN linux-2.6.32/arch/s390/include/asm/dasd.h linux-2.6.32.ovz/arch/s390/include/asm/dasd.h --- linux-2.6.32/arch/s390/include/asm/dasd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/dasd.h 2015-09-10 10:06:37.000000000 -0700 @@ -73,6 +73,7 @@ typedef struct dasd_information2_t { * 0x02: use diag discipline (diag) * 0x04: set the device initially online (internal use only) * 0x08: enable ERP related logging + * 0x20: give access to raw eckd data */ #define DASD_FEATURE_DEFAULT 0x00 #define DASD_FEATURE_READONLY 0x01 @@ -80,6 +81,8 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_INITIAL_ONLINE 0x04 #define DASD_FEATURE_ERPLOG 0x08 #define DASD_FEATURE_FAILFAST 0x10 +#define DASD_FEATURE_FAILONSLCK 0x20 +#define DASD_FEATURE_USERAW 0x40 #define DASD_PARTN_BITS 2 @@ -217,6 +220,25 @@ typedef struct dasd_symmio_parms { int rssd_result_len; } __attribute__ ((packed)) dasd_symmio_parms_t; +/* + * Data returned by Sense Path Group ID (SNID) + */ +struct dasd_snid_data { + struct { + __u8 group:2; + __u8 reserve:2; + __u8 mode:1; + __u8 res:3; + } __attribute__ ((packed)) path_state; + __u8 pgid[11]; +} __attribute__ ((packed)); + +struct dasd_snid_ioctl_data { + struct dasd_snid_data data; + __u8 path_mask; +} __attribute__ ((packed)); + + /******************************************************************************** * SECTION: Definition of IOCTLs * @@ -261,25 +283,10 @@ typedef struct dasd_symmio_parms { /* Set Attributes (cache operations) */ #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) +/* Get Sense Path Group ID (SNID) data */ +#define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) + #define BIODASDSYMMIO _IOWR(DASD_IOCTL_LETTER, 240, dasd_symmio_parms_t) #endif /* DASD_H */ -/* - * 
Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff -uprN linux-2.6.32/arch/s390/include/asm/debug.h linux-2.6.32.ovz/arch/s390/include/asm/debug.h --- linux-2.6.32/arch/s390/include/asm/debug.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/debug.h 2015-09-10 10:06:59.000000000 -0700 @@ -131,6 +131,7 @@ void debug_unregister(debug_info_t* id); void debug_set_level(debug_info_t* id, int new_level); +void debug_set_critical(void); void debug_stop_all(void); static inline debug_entry_t* diff -uprN linux-2.6.32/arch/s390/include/asm/diag.h linux-2.6.32.ovz/arch/s390/include/asm/diag.h --- linux-2.6.32/arch/s390/include/asm/diag.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/diag.h 2015-09-10 10:06:49.000000000 -0700 @@ -9,9 +9,22 @@ #define _ASM_S390_DIAG_H /* - * Diagnose 10: Release pages + * Diagnose 10: Release page range */ -extern void diag10(unsigned long addr); +static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) +{ + unsigned long start_addr, end_addr; + + start_addr = start_pfn << PAGE_SHIFT; + end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT; + + asm volatile( + "0: diag %0,%1,0x10\n" + "1:\n" + EX_TABLE(0b, 1b) + EX_TABLE(1b, 1b) + : : "a" (start_addr), "a" (end_addr)); +} /* * Diagnose 14: Input spool file manipulation diff -uprN linux-2.6.32/arch/s390/include/asm/eadm.h linux-2.6.32.ovz/arch/s390/include/asm/eadm.h --- linux-2.6.32/arch/s390/include/asm/eadm.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/eadm.h 2015-09-10 10:07:25.000000000 -0700 @@ -0,0 +1,130 @@ +#ifndef _ASM_S390_EADM_H +#define _ASM_S390_EADM_H + +#include +#include +#include + +struct arqb { + u64 data; + u16 fmt:4; + u16:12; + u16 cmd_code; + u16:16; + u16 msb_count; + u32 reserved[12]; +} __packed; + +#define ARQB_CMD_MOVE 1 + +struct arsb { + u16 fmt:4; + u32:28; + u8 ef; + u8:8; + u8 ecbi; + u8:8; + u8 fvf; + u16:16; + u8 eqc; + u32:32; + u64 fail_msb; + u64 fail_aidaw; + u64 fail_ms; + u64 fail_scm; + u32 reserved[4]; +} __packed; + +#define EQC_WR_PROHIBIT 22 + +struct msb { + u8 fmt:4; + u8 oc:4; + u8 flags; + u16:12; + u16 bs:4; + u32 blk_count; + u64 data_addr; + u64 scm_addr; + u64:64; +} __packed; + +struct aidaw { + u8 flags; + u32 :24; + u32 :32; + u64 data_addr; +} __packed; + +#define MSB_OC_CLEAR 0 +#define MSB_OC_READ 1 +#define MSB_OC_WRITE 2 +#define MSB_OC_RELEASE 3 + +#define MSB_FLAG_BNM 0x80 +#define MSB_FLAG_IDA 0x40 + +#define MSB_BS_4K 0 +#define MSB_BS_1M 1 + +#define AOB_NR_MSB 124 + +struct aob { + struct arqb request; + struct arsb response; + struct msb msb[AOB_NR_MSB]; +} __packed __aligned(PAGE_SIZE); + +struct aob_rq_header { + struct scm_device *scmdev; + char data[0]; +}; + +struct scm_device { + u64 address; + u64 size; + unsigned int nr_max_block; + struct device dev; + spinlock_t lock; + struct { + unsigned int persistence:4; + unsigned int oper_state:4; + unsigned int data_state:4; + unsigned int rank:4; + unsigned int 
release:1; + unsigned int res_id:8; + } __packed attrs; +}; + +#define OP_STATE_GOOD 1 +#define OP_STATE_TEMP_ERR 2 +#define OP_STATE_PERM_ERR 3 + +enum scm_event {SCM_CHANGE, SCM_AVAIL}; + +struct scm_driver { + struct device_driver drv; + int (*probe) (struct scm_device *scmdev); + int (*remove) (struct scm_device *scmdev); + void (*notify) (struct scm_device *scmdev, enum scm_event event); + void (*handler) (struct scm_device *scmdev, void *data, int error); +}; + +int scm_driver_register(struct scm_driver *scmdrv); +void scm_driver_unregister(struct scm_driver *scmdrv); + +int scm_start_aob(struct aob *aob); +void scm_irq_handler(struct aob *aob, int error); + +struct eadm_ops { + int (*eadm_start) (struct aob *aob); + struct module *owner; +}; + +int scm_get_ref(void); +void scm_put_ref(void); + +void register_eadm_ops(struct eadm_ops *ops); +void unregister_eadm_ops(struct eadm_ops *ops); + +#endif /* _ASM_S390_EADM_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/elf.h linux-2.6.32.ovz/arch/s390/include/asm/elf.h --- linux-2.6.32/arch/s390/include/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/elf.h 2015-09-10 10:07:19.000000000 -0700 @@ -103,6 +103,7 @@ #define HWCAP_S390_HPAGE 128 #define HWCAP_S390_ETF3EH 256 #define HWCAP_S390_HIGH_GPRS 512 +#define HWCAP_S390_TE 1024 /* * These are used to set parameters in the core dumps. @@ -162,7 +163,9 @@ extern unsigned int vdso_enabled; use of this is to invoke "./ld.so someprog" to test out a new version of the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. */ -#define ELF_ET_DYN_BASE (STACK_TOP / 3 * 2) + +extern unsigned long randomize_et_dyn(unsigned long base); +#define ELF_ET_DYN_BASE (randomize_et_dyn(STACK_TOP / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. 
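
The elf.h hunk here replaces the fixed ELF_ET_DYN_BASE with a randomize_et_dyn() hook and, a few lines further on, adds STACK_RND_MASK 0x7ff and arch_randomize_brk(), enabling address-space randomization on 64-bit s390. The generic ELF loader masks a random value with STACK_RND_MASK and shifts it by PAGE_SHIFT, so 0x7ff allows up to 0x7ff000 bytes (just under 8 MiB) of stack offset. A simplified user-space sketch of that consumption (randomize_stack_top() here is a stand-in for the fs/binfmt_elf.c logic, and rand() stands in for the kernel's entropy source):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT 12
#define STACK_RND_MASK 0x7ffUL  /* s390x value from the hunk below */

/* Mask a random value, scale it to pages, pull the stack top down. */
static unsigned long randomize_stack_top(unsigned long stack_top)
{
        unsigned long rnd = (unsigned long)rand() & STACK_RND_MASK;

        return stack_top - (rnd << PAGE_SHIFT);
}

int main(void)
{
        printf("max offset: %#lx bytes\n", STACK_RND_MASK << PAGE_SHIFT);
        printf("sample top: %#lx\n", randomize_stack_top(0x80000000UL));
        return 0;
}
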
*/ @@ -207,6 +210,8 @@ do { \ current->mm->context.noexec == 0; \ }) +#define STACK_RND_MASK 0x7ffUL + #define ARCH_DLINFO \ do { \ if (vdso_enabled) \ @@ -219,4 +224,7 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 int arch_setup_additional_pages(struct linux_binprm *, int); +extern unsigned long arch_randomize_brk(struct mm_struct *mm); +#define arch_randomize_brk arch_randomize_brk + #endif diff -uprN linux-2.6.32/arch/s390/include/asm/hardirq.h linux-2.6.32.ovz/arch/s390/include/asm/hardirq.h --- linux-2.6.32/arch/s390/include/asm/hardirq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/hardirq.h 2015-09-10 10:06:30.000000000 -0700 @@ -15,7 +15,6 @@ #include #include #include -#include #include #define local_softirq_pending() (S390_lowcore.softirq_pending) diff -uprN linux-2.6.32/arch/s390/include/asm/hugetlb.h linux-2.6.32.ovz/arch/s390/include/asm/hugetlb.h --- linux-2.6.32/arch/s390/include/asm/hugetlb.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/hugetlb.h 2015-09-10 10:07:15.000000000 -0700 @@ -92,15 +92,6 @@ static inline pte_t huge_ptep_get(pte_t return pte; } -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = huge_ptep_get(ptep); - - pmd_clear((pmd_t *) ptep); - return pte; -} - static inline void __pmd_csp(pmd_t *pmdp) { register unsigned long reg2 asm("2") = pmd_val(*pmdp); @@ -153,6 +144,15 @@ static inline void huge_ptep_invalidate( return; } +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = huge_ptep_get(ptep); + + huge_ptep_invalidate(mm, addr, ptep); + return pte; +} + #define huge_ptep_set_access_flags(__vma, __addr, __ptep, __entry, __dirty) \ ({ \ int __changed = !pte_same(huge_ptep_get(__ptep), __entry); \ @@ -167,9 +167,7 @@ static inline void huge_ptep_invalidate( ({ \ pte_t __pte = huge_ptep_get(__ptep); \ if (pte_write(__pte)) { \ - if (atomic_read(&(__mm)->mm_users) > 1 || \ - (__mm) != current->active_mm) \ - huge_ptep_invalidate(__mm, __addr, __ptep); \ + huge_ptep_invalidate(__mm, __addr, __ptep); \ set_huge_pte_at(__mm, __addr, __ptep, \ huge_pte_wrprotect(__pte)); \ } \ diff -uprN linux-2.6.32/arch/s390/include/asm/io.h linux-2.6.32.ovz/arch/s390/include/asm/io.h --- linux-2.6.32/arch/s390/include/asm/io.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/io.h 2015-09-10 10:07:26.000000000 -0700 @@ -38,11 +38,8 @@ static inline void * phys_to_virt(unsign return (void *) address; } -/* - * Convert a physical pointer to a virtual kernel pointer for /dev/mem - * access - */ -#define xlate_dev_mem_ptr(p) __va(p) +void *xlate_dev_mem_ptr(unsigned long phys); +void unxlate_dev_mem_ptr(unsigned long phys, void *addr); /* * Convert a virtual cached pointer to an uncached pointer diff -uprN linux-2.6.32/arch/s390/include/asm/ipl.h linux-2.6.32.ovz/arch/s390/include/asm/ipl.h --- linux-2.6.32/arch/s390/include/asm/ipl.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ipl.h 2015-09-10 10:06:59.000000000 -0700 @@ -88,6 +88,8 @@ extern u32 ipl_flags; extern u32 dump_prefix_page; extern unsigned int zfcpdump_prefix_array[]; +extern unsigned long long elfcorehdr_size; + extern void do_reipl(void); extern void do_halt(void); extern void do_poff(void); @@ -167,5 +169,7 @@ enum diag308_rc { }; extern int diag308(unsigned long subcode, void *addr); +extern void store_status(void); +extern void 
lgr_info_log(void); #endif /* _ASM_S390_IPL_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/isc.h linux-2.6.32.ovz/arch/s390/include/asm/isc.h --- linux-2.6.32/arch/s390/include/asm/isc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/isc.h 2015-09-10 10:07:15.000000000 -0700 @@ -14,6 +14,7 @@ /* Regular I/O interrupts. */ #define IO_SCH_ISC 3 /* regular I/O subchannels */ #define CONSOLE_ISC 1 /* console I/O subchannel */ +#define EADM_SCH_ISC 4 /* EADM subchannels */ #define CHSC_SCH_ISC 7 /* CHSC subchannels */ /* Adapter interrupts. */ #define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ diff -uprN linux-2.6.32/arch/s390/include/asm/Kbuild linux-2.6.32.ovz/arch/s390/include/asm/Kbuild --- linux-2.6.32/arch/s390/include/asm/Kbuild 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/Kbuild 2015-09-10 10:07:43.000000000 -0700 @@ -9,6 +9,8 @@ header-y += vtoc.h header-y += zcrypt.h header-y += chsc.h +generic-y += trace_clock.h + unifdef-y += cmb.h unifdef-y += debug.h unifdef-y += chpid.h diff -uprN linux-2.6.32/arch/s390/include/asm/kexec.h linux-2.6.32.ovz/arch/s390/include/asm/kexec.h --- linux-2.6.32/arch/s390/include/asm/kexec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kexec.h 2015-09-10 10:06:54.000000000 -0700 @@ -30,14 +30,41 @@ /* Not more than 2GB */ #define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31) +/* Maximum address we can use for the crash control pages */ +#define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL) + /* Allocate one page for the pdp and the second for the code */ #define KEXEC_CONTROL_PAGE_SIZE 4096 +/* Alignment of crashkernel memory */ +#define KEXEC_CRASH_MEM_ALIGN HPAGE_SIZE + /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 +/* + * Size for s390x ELF notes per CPU + * + * Seven notes plus zero note at the end: prstatus, fpregset, timer, + * tod_cmp, tod_reg, control regs, and prefix + */ +#define KEXEC_NOTE_BYTES \ + (ALIGN(sizeof(struct elf_note), 4) * 8 + \ + ALIGN(sizeof("CORE"), 4) * 7 + \ + ALIGN(sizeof(struct elf_prstatus), 4) + \ + ALIGN(sizeof(elf_fpregset_t), 4) + \ + ALIGN(sizeof(u64), 4) + \ + ALIGN(sizeof(u64), 4) + \ + ALIGN(sizeof(u32), 4) + \ + ALIGN(sizeof(u64) * 16, 4) + \ + ALIGN(sizeof(u32), 4) \ + ) + /* Provide a dummy definition to avoid build failures. 
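
KEXEC_NOTE_BYTES above budgets per-CPU space for seven s390x notes (prstatus, fpregset, timer, TOD comparator, TOD register, control registers, prefix) plus a terminating zero note: eight 4-byte-aligned note headers, seven "CORE" name strings, and the seven payloads. A user-space sketch of the same sum, with assumed stand-in sizes for elf_prstatus and elf_fpregset_t since the exact values come from the ABI headers:

#include <stdio.h>

#define ALIGN4(x) (((x) + 3UL) & ~3UL)

struct elf_note { unsigned int namesz, descsz, type; }; /* 12 bytes */

int main(void)
{
        /* Assumed stand-ins; the real sizes are ABI-dependent. */
        unsigned long prstatus = 336;
        unsigned long fpregset = 136;
        unsigned long bytes =
                ALIGN4(sizeof(struct elf_note)) * 8 +   /* 7 notes + terminator */
                ALIGN4(sizeof("CORE")) * 7 +            /* note names */
                ALIGN4(prstatus) +                      /* prstatus */
                ALIGN4(fpregset) +                      /* fpregset */
                ALIGN4(8) + ALIGN4(8) + ALIGN4(4) +     /* timer, tod_cmp, tod_reg */
                ALIGN4(8 * 16) +                        /* 16 control registers */
                ALIGN4(4);                              /* prefix */

        printf("per-CPU note bytes: %lu\n", bytes);
        return 0;
}
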
*/ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } +#define KEXEC_AUTO_RESERVED_SIZE (1ULL<<27) /* 128M */ +#define KEXEC_AUTO_THRESHOLD (1ULL<<32) /* 4G */ + #endif /*_S390_KEXEC_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/kvm.h linux-2.6.32.ovz/arch/s390/include/asm/kvm.h --- linux-2.6.32/arch/s390/include/asm/kvm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kvm.h 2015-09-10 10:05:42.000000000 -0700 @@ -1,6 +1,5 @@ #ifndef __LINUX_KVM_S390_H #define __LINUX_KVM_S390_H - /* * asm-s390/kvm.h - KVM s390 specific structures and definitions * @@ -15,6 +14,8 @@ */ #include +#define __KVM_S390 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ diff -uprN linux-2.6.32/arch/s390/include/asm/kvm_host.h linux-2.6.32.ovz/arch/s390/include/asm/kvm_host.h --- linux-2.6.32/arch/s390/include/asm/kvm_host.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kvm_host.h 2015-09-10 10:07:34.000000000 -0700 @@ -144,6 +144,7 @@ struct kvm_vcpu_stat { u32 instruction_sigp_prefix; u32 instruction_sigp_restart; u32 diagnose_44; + u32 diagnose_9c; }; struct kvm_s390_io_info { diff -uprN linux-2.6.32/arch/s390/include/asm/kvm_para.h linux-2.6.32.ovz/arch/s390/include/asm/kvm_para.h --- linux-2.6.32/arch/s390/include/asm/kvm_para.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kvm_para.h 2015-09-10 10:07:13.000000000 -0700 @@ -149,6 +149,11 @@ static inline unsigned int kvm_arch_para return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif #endif /* __S390_KVM_PARA_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/local64.h linux-2.6.32.ovz/arch/s390/include/asm/local64.h --- linux-2.6.32/arch/s390/include/asm/local64.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/local64.h 2015-09-10 10:06:30.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/s390/include/asm/lowcore.h linux-2.6.32.ovz/arch/s390/include/asm/lowcore.h --- linux-2.6.32/arch/s390/include/asm/lowcore.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/lowcore.h 2015-09-10 10:07:19.000000000 -0700 @@ -69,6 +69,7 @@ #define __LC_INT_CLOCK 0x02c8 #define __LC_MACHINE_FLAGS 0x02d8 #define __LC_FTRACE_FUNC 0x02dc +#define __LC_CURRENT_PID 0x02f0 #define __LC_IRB 0x0300 #define __LC_PFAULT_INTPARM 0x0080 #define __LC_CPU_TIMER_SAVE_AREA 0x00d8 @@ -116,6 +117,9 @@ #define __LC_VDSO_PER_CPU 0x0350 #define __LC_MACHINE_FLAGS 0x0358 #define __LC_FTRACE_FUNC 0x0360 +#define __LC_SIE_HOOK 0x0368 +#define __LC_CMF_HPP 0x0370 +#define __LC_CURRENT_PID 0x0378 #define __LC_IRB 0x0380 #define __LC_PASTE 0x03c0 #define __LC_PFAULT_INTPARM 0x11b8 @@ -143,6 +147,7 @@ void system_call(void); void pgm_check_handler(void); void mcck_int_handler(void); void io_int_handler(void); +void psw_restart_int_handler(void); struct save_area_s390 { u32 ext_save; @@ -293,7 +298,8 @@ struct _lowcore __u64 clock_comparator; /* 0x02d0 */ __u32 machine_flags; /* 0x02d8 */ __u32 ftrace_func; /* 0x02dc */ - __u8 pad_0x02f0[0x0300-0x02f0]; /* 0x02f0 */ + __u32 current_pid; /* 0x02f0 */ + __u8 pad_0x02f4[0x0300-0x02f4]; /* 0x02f4 */ /* Interrupt response block */ __u8 irb[64]; /* 0x0300 */ @@ -307,9 +313,10 @@ struct _lowcore */ __u32 ipib; /* 0x0e00 */ __u32 ipib_checksum; /* 0x0e04 */ + __u32 vmcore_info; /* 0x0e08 */ /* Align to the top 1k of prefix area */ - __u8 
pad_0x0e08[0x1000-0x0e08]; /* 0x0e08 */ + __u8 pad_0x0e0c[0x1000-0x0e0c]; /* 0x0e0c */ #else /* !__s390x__ */ /* 0x0000 - 0x01ff: defined by architecture */ __u32 ccw1[2]; /* 0x0000 */ @@ -399,7 +406,9 @@ struct _lowcore __u64 vdso_per_cpu_data; /* 0x0350 */ __u64 machine_flags; /* 0x0358 */ __u64 ftrace_func; /* 0x0360 */ - __u8 pad_0x0368[0x0380-0x0368]; /* 0x0368 */ + __u64 sie_hook; /* 0x0368 */ + __u64 cmf_hpp; /* 0x0370 */ + __u64 current_pid; /* 0x0378 */ /* Interrupt response block. */ __u8 irb[64]; /* 0x0380 */ @@ -416,7 +425,8 @@ struct _lowcore */ __u64 ipib; /* 0x0e00 */ __u32 ipib_checksum; /* 0x0e08 */ - __u8 pad_0x0e0c[0x11b8-0x0e0c]; /* 0x0e0c */ + __u64 vmcore_info; /* 0x0e0c */ + __u8 pad_0x0e14[0x11b8-0x0e14]; /* 0x0e14 */ /* 64 bit extparam used for pfault/diag 250: defined by architecture */ __u64 ext_params2; /* 0x11B8 */ @@ -436,9 +446,13 @@ struct _lowcore __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ __u64 cregs_save_area[16]; /* 0x1380 */ + __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */ + + /* Transaction abort diagnostic block */ + __u8 pgm_tdb[256]; /* 0x1800 */ /* align to the top of the prefix area */ - __u8 pad_0x1400[0x2000-0x1400]; /* 0x1400 */ + __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ #endif /* !__s390x__ */ } __attribute__((packed)); /* End structure*/ diff -uprN linux-2.6.32/arch/s390/include/asm/mman.h linux-2.6.32.ovz/arch/s390/include/asm/mman.h --- linux-2.6.32/arch/s390/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mman.h 2015-09-10 10:07:23.000000000 -0700 @@ -12,8 +12,8 @@ #include #if defined(__KERNEL__) && !defined(__ASSEMBLY__) && defined(CONFIG_64BIT) -int s390_mmap_check(unsigned long addr, unsigned long len); -#define arch_mmap_check(addr,len,flags) s390_mmap_check(addr,len) +int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags); +#define arch_mmap_check(addr, len, flags) s390_mmap_check(addr, len, flags) #endif #endif /* __S390_MMAN_H__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/mmu_context.h linux-2.6.32.ovz/arch/s390/include/asm/mmu_context.h --- linux-2.6.32/arch/s390/include/asm/mmu_context.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mmu_context.h 2015-09-10 10:06:59.000000000 -0700 @@ -11,11 +11,13 @@ #include #include -#include +#include static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + atomic_set(&mm->context.attach_count, 0); + mm->context.flush_mm = 0; mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS; #ifdef CONFIG_64BIT mm->context.asce_bits |= _ASCE_TYPE_REGION3; @@ -36,7 +38,7 @@ static inline int init_new_context(struc mm->context.has_pgste = 1; mm->context.alloc_pgste = 1; } else { - mm->context.noexec = s390_noexec; + mm->context.noexec = (user_mode == SECONDARY_SPACE_MODE); mm->context.has_pgste = 0; mm->context.alloc_pgste = 0; } @@ -58,7 +60,7 @@ static inline void update_mm(struct mm_s pgd_t *pgd = mm->pgd; S390_lowcore.user_asce = mm->context.asce_bits | __pa(pgd); - if (switch_amode) { + if (user_mode != HOME_SPACE_MODE) { /* Load primary space page table origin. */ pgd = mm->context.noexec ? 
get_shadow_table(pgd) : pgd; S390_lowcore.user_exec_asce = mm->context.asce_bits | __pa(pgd); @@ -76,6 +78,12 @@ static inline void switch_mm(struct mm_s { cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); update_mm(next, tsk); + atomic_dec(&prev->context.attach_count); + WARN_ON(atomic_read(&prev->context.attach_count) < 0); + atomic_inc(&next->context.attach_count); + /* Check for TLBs not flushed yet */ + if (next->context.flush_mm) + __tlb_flush_mm(next); } #define enter_lazy_tlb(mm,tsk) do { } while (0) @@ -87,4 +95,15 @@ static inline void activate_mm(struct mm switch_mm(prev, next, current); } +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + if (oldmm->context.asce_limit < mm->context.asce_limit) + crst_table_downgrade(mm, oldmm->context.asce_limit); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + #endif /* __S390_MMU_CONTEXT_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/mmu.h linux-2.6.32.ovz/arch/s390/include/asm/mmu.h --- linux-2.6.32/arch/s390/include/asm/mmu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mmu.h 2015-09-10 10:06:14.000000000 -0700 @@ -2,6 +2,8 @@ #define __MMU_H typedef struct { + atomic_t attach_count; + unsigned int flush_mm; spinlock_t list_lock; struct list_head crst_list; struct list_head pgtable_list; diff -uprN linux-2.6.32/arch/s390/include/asm/module.h linux-2.6.32.ovz/arch/s390/include/asm/module.h --- linux-2.6.32/arch/s390/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/module.h 2015-09-10 10:05:36.000000000 -0700 @@ -29,14 +29,17 @@ struct mod_arch_specific }; #ifdef __s390x__ +#define MODULES_ARE_ELF64 #define ElfW(x) Elf64_ ## x #define ELFW(x) ELF64_ ## x #else +#define MODULES_ARE_ELF32 #define ElfW(x) Elf32_ ## x #define ELFW(x) ELF32_ ## x #endif #define Elf_Addr ElfW(Addr) +#define Elf_Rel ElfW(Rel) #define Elf_Rela ElfW(Rela) #define Elf_Shdr ElfW(Shdr) #define Elf_Sym ElfW(Sym) diff -uprN linux-2.6.32/arch/s390/include/asm/mutex.h linux-2.6.32.ovz/arch/s390/include/asm/mutex.h --- linux-2.6.32/arch/s390/include/asm/mutex.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mutex.h 2015-09-10 10:06:49.000000000 -0700 @@ -7,3 +7,5 @@ */ #include + +#define arch_mutex_cpu_relax() barrier() diff -uprN linux-2.6.32/arch/s390/include/asm/page.h linux-2.6.32.ovz/arch/s390/include/asm/page.h --- linux-2.6.32/arch/s390/include/asm/page.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/page.h 2015-09-10 10:06:31.000000000 -0700 @@ -129,6 +129,11 @@ struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); +static inline int devmem_is_allowed(unsigned long pfn) +{ + return 0; +} + #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE diff -uprN linux-2.6.32/arch/s390/include/asm/perf_event.h linux-2.6.32.ovz/arch/s390/include/asm/perf_event.h --- linux-2.6.32/arch/s390/include/asm/perf_event.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/perf_event.h 2015-09-10 10:07:25.000000000 -0700 @@ -1,10 +1,16 @@ /* * Performance event support - s390 specific definitions. * - * Copyright 2009 Martin Schwidefsky, IBM Corporation. + * Copyright IBM Corp. 
2009, 2012 + * Author(s): Martin Schwidefsky + * Hendrik Brueckner */ -static inline void set_perf_event_pending(void) {} -static inline void clear_perf_event_pending(void) {} +#include -#define PERF_EVENT_INDEX_OFFSET 0 +/* CPU-measurement counter facility */ +#define PERF_CPUM_CF_MAX_CTR 256 + +/* Per-CPU flags for PMU states */ +#define PMU_F_RESERVED 0x1000 +#define PMU_F_ENABLED 0x2000 diff -uprN linux-2.6.32/arch/s390/include/asm/pgalloc.h linux-2.6.32.ovz/arch/s390/include/asm/pgalloc.h --- linux-2.6.32/arch/s390/include/asm/pgalloc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/pgalloc.h 2015-09-10 10:05:38.000000000 -0700 @@ -143,7 +143,8 @@ static inline pgd_t *pgd_alloc(struct mm spin_lock_init(&mm->context.list_lock); INIT_LIST_HEAD(&mm->context.crst_list); INIT_LIST_HEAD(&mm->context.pgtable_list); - return (pgd_t *) crst_table_alloc(mm, s390_noexec); + return (pgd_t *) + crst_table_alloc(mm, user_mode == SECONDARY_SPACE_MODE); } #define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd) diff -uprN linux-2.6.32/arch/s390/include/asm/pgtable.h linux-2.6.32.ovz/arch/s390/include/asm/pgtable.h --- linux-2.6.32/arch/s390/include/asm/pgtable.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/pgtable.h 2015-09-10 10:07:48.000000000 -0700 @@ -105,35 +105,18 @@ extern char empty_zero_page[PAGE_SIZE]; #ifndef __ASSEMBLY__ /* * The vmalloc area will always be on the topmost area of the kernel - * mapping. We reserve 96MB (31bit) / 1GB (64bit) for vmalloc, + * mapping. We reserve 96MB (31bit) / 128GB (64bit) for vmalloc, * which should be enough for any sane case. * By putting vmalloc at the top, we maximise the gap between physical * memory and vmalloc to catch misplaced memory accesses. As a side * effect, this also makes sure that 64 bit module code cannot be used * as system call address. */ - extern unsigned long VMALLOC_START; +extern unsigned long VMALLOC_END; +extern struct page *vmemmap; -#ifndef __s390x__ -#define VMALLOC_SIZE (96UL << 20) -#define VMALLOC_END 0x7e000000UL -#define VMEM_MAP_END 0x80000000UL -#else /* __s390x__ */ -#define VMALLOC_SIZE (1UL << 30) -#define VMALLOC_END 0x3e040000000UL -#define VMEM_MAP_END 0x40000000000UL -#endif /* __s390x__ */ - -/* - * VMEM_MAX_PHYS is the highest physical address that can be added to the 1:1 - * mapping. This needs to be calculated at compile time since the size of the - * VMEM_MAP is static but the size of struct page can change. 
- */ -#define VMEM_MAX_PAGES ((VMEM_MAP_END - VMALLOC_END) / sizeof(struct page)) -#define VMEM_MAX_PFN min(VMALLOC_START >> PAGE_SHIFT, VMEM_MAX_PAGES) -#define VMEM_MAX_PHYS ((VMEM_MAX_PFN << PAGE_SHIFT) & ~((16 << 20) - 1)) -#define vmemmap ((struct page *) VMALLOC_END) +#define VMEM_MAX_PHYS ((unsigned long) vmemmap) /* * A 31 bit pagetable entry of S390 has following format: @@ -878,7 +861,8 @@ static inline void ptep_invalidate(struc #define ptep_get_and_clear(__mm, __address, __ptep) \ ({ \ pte_t __pte = *(__ptep); \ - if (atomic_read(&(__mm)->mm_users) > 1 || \ + (__mm)->context.flush_mm = 1; \ + if (atomic_read(&(__mm)->context.attach_count) > 1 || \ (__mm) != current->active_mm) \ ptep_invalidate(__mm, __address, __ptep); \ else \ @@ -921,7 +905,8 @@ static inline pte_t ptep_get_and_clear_f ({ \ pte_t __pte = *(__ptep); \ if (pte_write(__pte)) { \ - if (atomic_read(&(__mm)->mm_users) > 1 || \ + (__mm)->context.flush_mm = 1; \ + if (atomic_read(&(__mm)->context.attach_count) > 1 || \ (__mm) != current->active_mm) \ ptep_invalidate(__mm, __addr, __ptep); \ set_pte_at(__mm, __addr, __ptep, pte_wrprotect(__pte)); \ @@ -1117,6 +1102,10 @@ static inline pte_t mk_swap_pte(unsigned ((pte_t) { ((((__off) & 0x7f) << 1) + (((__off) >> 7) << 12)) \ | _PAGE_TYPE_FILE }) +/* TODO: s390 cannot support io_remap_pfn_range... */ +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + #endif /* !__ASSEMBLY__ */ #define kern_addr_valid(addr) (1) diff -uprN linux-2.6.32/arch/s390/include/asm/processor.h linux-2.6.32.ovz/arch/s390/include/asm/processor.h --- linux-2.6.32/arch/s390/include/asm/processor.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/processor.h 2015-09-10 10:07:40.000000000 -0700 @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef __KERNEL__ /* @@ -86,8 +87,19 @@ struct thread_struct { unsigned long ieee_instruction_pointer; /* pfault_wait is used to block the process on a pfault event */ unsigned long pfault_wait; +#ifndef __GENKSYMS__ + /* cpu runtime instrumentation */ + struct runtime_instr_cb *ri_cb; + int ri_signum; + unsigned long per_flags; /* Flags to control debug behavior */ +#ifdef CONFIG_64BIT + unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ +#endif +#endif }; +#define PER_FLAG_NO_TE 1UL /* Flag to disable transactions. */ + typedef struct thread_struct thread_struct; /* @@ -150,11 +162,6 @@ extern int kernel_thread(int (*fn)(void */ extern unsigned long thread_saved_pc(struct task_struct *t); -/* - * Print register of task into buffer. Used in fs/proc/array.c. 
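
The reworked ptep_get_and_clear()/ptep_set_wrprotect() above switch the flush decision from mm_users to the new context.attach_count and record deferred work in context.flush_mm, which switch_mm() (in the mmu_context.h hunk earlier) settles on the next attach. The following is a toy user-space model of that bookkeeping, deliberately simplified and assumption-laden rather than a transcription of the kernel flow:

#include <stdbool.h>
#include <stdio.h>

struct mm {
        int attach_count;  /* CPUs currently attached to this mm */
        bool flush_mm;     /* a TLB flush is still owed */
};

/* Clearing a PTE: flush right away only if another CPU may hold the
 * translation; otherwise just remember that a flush is pending. */
static void pte_clear(struct mm *mm, bool mm_is_current)
{
        mm->flush_mm = true;
        if (mm->attach_count > 1 || !mm_is_current) {
                puts("flush now: translation may be live elsewhere");
                mm->flush_mm = false;
        } else {
                puts("defer: sole local user, flush on next attach");
        }
}

/* Attaching the mm on a context switch settles any deferred flush. */
static void attach(struct mm *mm)
{
        mm->attach_count++;
        if (mm->flush_mm) {
                puts("attach: performing deferred flush");
                mm->flush_mm = false;
        }
}

int main(void)
{
        struct mm mm = { .attach_count = 1, .flush_mm = false };

        pte_clear(&mm, true);  /* deferred */
        attach(&mm);           /* flush happens here */
        return 0;
}
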
- */ -extern void task_show_regs(struct seq_file *m, struct task_struct *task); - extern void show_code(struct pt_regs *regs); unsigned long get_wchan(struct task_struct *p); diff -uprN linux-2.6.32/arch/s390/include/asm/ptrace.h linux-2.6.32.ovz/arch/s390/include/asm/ptrace.h --- linux-2.6.32/arch/s390/include/asm/ptrace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ptrace.h 2015-09-10 10:07:39.000000000 -0700 @@ -201,6 +201,9 @@ typedef union typedef struct { __u32 fpc; +#ifndef __GENKSYMS__ + __u32 pad; +#endif freg_t fprs[NUM_FPRS]; } s390_fp_regs; @@ -208,7 +211,6 @@ typedef struct #define FPC_FLAGS_MASK 0x00F80000 #define FPC_DXC_MASK 0x0000FF00 #define FPC_RM_MASK 0x00000003 -#define FPC_VALID_MASK 0xF8F8FF03 /* this typedef defines how a Program Status Word looks like */ typedef struct @@ -236,6 +238,7 @@ typedef struct #define PSW_MASK_ASC 0x0000C000UL #define PSW_MASK_CC 0x00003000UL #define PSW_MASK_PM 0x00000F00UL +#define PSW_MASK_RI 0x00000000UL #define PSW_ADDR_AMODE 0x80000000UL #define PSW_ADDR_INSN 0x7FFFFFFFUL @@ -261,6 +264,7 @@ typedef struct #define PSW_MASK_ASC 0x0000C00000000000UL #define PSW_MASK_CC 0x0000300000000000UL #define PSW_MASK_PM 0x00000F0000000000UL +#define PSW_MASK_RI 0x0000008000000000UL #define PSW_ADDR_AMODE 0x0000000000000000UL #define PSW_ADDR_INSN 0xFFFFFFFFFFFFFFFFUL @@ -290,7 +294,7 @@ extern long psw_user32_bits; is the condition code and the program mask bits. */ #define PSW_MASK_MERGE(CURRENT,NEW) \ (((CURRENT) & ~(PSW_MASK_CC|PSW_MASK_PM)) | \ - ((NEW) & (PSW_MASK_CC|PSW_MASK_PM))) + ((NEW) & (PSW_MASK_CC|PSW_MASK_PM|PSW_MASK_RI))) /* * The s390_regs structure is used to define the elf_gregset_t. @@ -341,7 +345,7 @@ typedef struct unsigned long cr[NUM_CR_WORDS]; } per_cr_words; -#define PER_EM_MASK 0xE8000000UL +#define PER_EM_MASK 0xEB000000UL typedef struct { @@ -357,9 +361,17 @@ typedef struct unsigned em_storage_alteration : 1; unsigned em_gpr_alt_unused : 1; unsigned em_store_real_address : 1; +#ifdef __GENKSYMS__ unsigned : 3; unsigned branch_addr_ctl : 1; unsigned : 1; +#else + unsigned : 1; + unsigned em_transaction_end : 1; + unsigned em_nullification : 1; + unsigned branch_addr_ctl : 1; + unsigned suspension_ctl : 1; +#endif unsigned storage_alt_space_ctl : 1; unsigned : 21; unsigned long starting_addr; @@ -436,6 +448,9 @@ typedef struct #define PTRACE_PEEKDATA_AREA 0x5003 #define PTRACE_POKETEXT_AREA 0x5004 #define PTRACE_POKEDATA_AREA 0x5005 +#define PTRACE_GET_LAST_BREAK 0x5006 +#define PTRACE_ENABLE_TE 0x5009 +#define PTRACE_DISABLE_TE 0x5010 /* * PT_PROT definition is loosely based on hppa bsd definition in @@ -496,8 +511,13 @@ extern void user_disable_single_step(str #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) -#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) + +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->gprs[2]; +} + extern void show_regs(struct pt_regs * regs); #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/qdio.h linux-2.6.32.ovz/arch/s390/include/asm/qdio.h --- linux-2.6.32/arch/s390/include/asm/qdio.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/qdio.h 2015-09-10 10:07:44.000000000 -0700 @@ -13,7 +13,8 @@ #include #include -#define QDIO_MAX_QUEUES_PER_IRQ 32 +/* only use 4 queues 
to save some cachelines */ +#define QDIO_MAX_QUEUES_PER_IRQ 4 #define QDIO_MAX_BUFFERS_PER_Q 128 #define QDIO_MAX_BUFFERS_MASK (QDIO_MAX_BUFFERS_PER_Q - 1) #define QDIO_MAX_ELEMENTS_PER_BUFFER 16 @@ -45,6 +46,8 @@ struct qdesfmt0 { u32 : 16; } __attribute__ ((packed)); +#define QDR_AC_MULTI_BUFFER_ENABLE 0x01 + /** * struct qdr - queue description record (QDR) * @qfmt: queue format @@ -83,6 +86,7 @@ struct qdr { #define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40 #define QIB_RFLAGS_ENABLE_QEBSM 0x80 +#define QIB_RFLAGS_ENABLE_DATA_DIV 0x02 /** * struct qib - queue information block (QIB) @@ -120,6 +124,25 @@ struct slibe { u64 parms; }; + +struct qaob { + u64 res0[6]; + u8 res1; + u8 res2; + u8 res3; + u8 aorc; + u8 flags; + u16 cbtbs; + u8 sb_count; + u64 sba[QDIO_MAX_ELEMENTS_PER_BUFFER]; + u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER]; + u64 user0; + u64 res4[2]; + u64 user1; + u64 user2; +} __attribute__ ((packed, aligned(256))); + + /** * struct slib - storage list information block (SLIB) * @nsliba: next SLIB address (if any) @@ -137,110 +160,47 @@ struct slib { struct slibe slibe[QDIO_MAX_BUFFERS_PER_Q]; } __attribute__ ((packed, aligned(2048))); -/** - * struct sbal_flags - storage block address list flags - * @last: last entry - * @cont: contiguous storage - * @frag: fragmentation - */ -struct sbal_flags { - u8 : 1; - u8 last : 1; - u8 cont : 1; - u8 : 1; - u8 frag : 2; - u8 : 2; -} __attribute__ ((packed)); - -#define SBAL_FLAGS_FIRST_FRAG 0x04000000UL -#define SBAL_FLAGS_MIDDLE_FRAG 0x08000000UL -#define SBAL_FLAGS_LAST_FRAG 0x0c000000UL -#define SBAL_FLAGS_LAST_ENTRY 0x40000000UL -#define SBAL_FLAGS_CONTIGUOUS 0x20000000UL +#define SBAL_EFLAGS_LAST_ENTRY 0x40 +#define SBAL_EFLAGS_CONTIGUOUS 0x20 +#define SBAL_EFLAGS_FIRST_FRAG 0x04 +#define SBAL_EFLAGS_MIDDLE_FRAG 0x08 +#define SBAL_EFLAGS_LAST_FRAG 0x0c +#define SBAL_EFLAGS_MASK 0x6f -#define SBAL_FLAGS0_DATA_CONTINUATION 0x20UL +#define SBAL_SFLAGS0_PCI_REQ 0x40 +#define SBAL_SFLAGS0_DATA_CONTINUATION 0x20 /* Awesome OpenFCP extensions */ -#define SBAL_FLAGS0_TYPE_STATUS 0x00UL -#define SBAL_FLAGS0_TYPE_WRITE 0x08UL -#define SBAL_FLAGS0_TYPE_READ 0x10UL -#define SBAL_FLAGS0_TYPE_WRITE_READ 0x18UL -#define SBAL_FLAGS0_MORE_SBALS 0x04UL -#define SBAL_FLAGS0_COMMAND 0x02UL -#define SBAL_FLAGS0_LAST_SBAL 0x00UL -#define SBAL_FLAGS0_ONLY_SBAL SBAL_FLAGS0_COMMAND -#define SBAL_FLAGS0_MIDDLE_SBAL SBAL_FLAGS0_MORE_SBALS -#define SBAL_FLAGS0_FIRST_SBAL SBAL_FLAGS0_MORE_SBALS | SBAL_FLAGS0_COMMAND -#define SBAL_FLAGS0_PCI 0x40 - -/** - * struct sbal_sbalf_0 - sbal flags for sbale 0 - * @pci: PCI indicator - * @cont: data continuation - * @sbtype: storage-block type (FCP) - */ -struct sbal_sbalf_0 { - u8 : 1; - u8 pci : 1; - u8 cont : 1; - u8 sbtype : 2; - u8 : 3; -} __attribute__ ((packed)); - -/** - * struct sbal_sbalf_1 - sbal flags for sbale 1 - * @key: storage key - */ -struct sbal_sbalf_1 { - u8 : 4; - u8 key : 4; -} __attribute__ ((packed)); - -/** - * struct sbal_sbalf_14 - sbal flags for sbale 14 - * @erridx: error index - */ -struct sbal_sbalf_14 { - u8 : 4; - u8 erridx : 4; -} __attribute__ ((packed)); - -/** - * struct sbal_sbalf_15 - sbal flags for sbale 15 - * @reason: reason for error state - */ -struct sbal_sbalf_15 { - u8 reason; -} __attribute__ ((packed)); - -/** - * union sbal_sbalf - storage block address list flags - * @i0: sbalf0 - * @i1: sbalf1 - * @i14: sbalf14 - * @i15: sblaf15 - * @value: raw value - */ -union sbal_sbalf { - struct sbal_sbalf_0 i0; - struct sbal_sbalf_1 i1; - struct sbal_sbalf_14 i14; - struct sbal_sbalf_15 
i15; - u8 value; -}; +#define SBAL_SFLAGS0_TYPE_STATUS 0x00 +#define SBAL_SFLAGS0_TYPE_WRITE 0x08 +#define SBAL_SFLAGS0_TYPE_READ 0x10 +#define SBAL_SFLAGS0_TYPE_WRITE_READ 0x18 +#define SBAL_SFLAGS0_MORE_SBALS 0x04 +#define SBAL_SFLAGS0_COMMAND 0x02 +#define SBAL_SFLAGS0_LAST_SBAL 0x00 +#define SBAL_SFLAGS0_ONLY_SBAL SBAL_SFLAGS0_COMMAND +#define SBAL_SFLAGS0_MIDDLE_SBAL SBAL_SFLAGS0_MORE_SBALS +#define SBAL_SFLAGS0_FIRST_SBAL (SBAL_SFLAGS0_MORE_SBALS | SBAL_SFLAGS0_COMMAND) /** * struct qdio_buffer_element - SBAL entry - * @flags: flags + * @eflags: SBAL entry flags + * @scount: SBAL count + * @sflags: whole SBAL flags * @length: length * @addr: address */ struct qdio_buffer_element { - u32 flags; + u8 eflags; + /* private: */ + u8 res1; + /* public: */ + u8 scount; + u8 sflags; u32 length; #ifdef CONFIG_32BIT /* private: */ - void *reserved; + void *res2; /* public: */ #endif void *addr; @@ -283,6 +243,42 @@ struct slsb { u8 val[QDIO_MAX_BUFFERS_PER_Q]; } __attribute__ ((packed, aligned(256))); +/** + * struct qdio_outbuf_state - SBAL related asynchronous operation information + * (for communication with upper layer programs) + * (only required for use with completion queues) + * @flags: flags indicating state of buffer + * @aob: pointer to QAOB used for the particular SBAL + * @user: pointer to upper layer program's state information related to SBAL + * (stored in user1 data of QAOB) + */ +struct qdio_outbuf_state { + u8 flags; + struct qaob *aob; + void *user; +}; + +#define QDIO_OUTBUF_STATE_FLAG_NONE 0x00 +#define QDIO_OUTBUF_STATE_FLAG_PENDING 0x01 + +#define CHSC_AC1_INITIATE_INPUTQ 0x80 + +/* qdio adapter-characteristics-1 flag */ +#define AC1_SIGA_INPUT_NEEDED 0x40 /* process input queues */ +#define AC1_SIGA_OUTPUT_NEEDED 0x20 /* process output queues */ +#define AC1_SIGA_SYNC_NEEDED 0x10 /* ask hypervisor to sync */ +#define AC1_AUTOMATIC_SYNC_ON_THININT 0x08 /* set by hypervisor */ +#define AC1_AUTOMATIC_SYNC_ON_OUT_PCI 0x04 /* set by hypervisor */ +#define AC1_SC_QEBSM_AVAILABLE 0x02 /* available for subchannel */ +#define AC1_SC_QEBSM_ENABLED 0x01 /* enabled for subchannel */ + +#define CHSC_AC3_FORMAT2_CQ_AVAILABLE 0x8000 + +#define CHSC_AC2_MULTI_BUFFER_AVAILABLE 0x0080 +#define CHSC_AC2_MULTI_BUFFER_ENABLED 0x0040 +#define CHSC_AC2_DATA_DIV_AVAILABLE 0x0010 +#define CHSC_AC2_DATA_DIV_ENABLED 0x0002 + struct qdio_ssqd_desc { u8 flags; u8:8; @@ -301,8 +297,7 @@ struct qdio_ssqd_desc { u64 sch_token; u8 mro; u8 mri; - u8:8; - u8 sbalic; + u16 qdioac3; u16:16; u8:8; u8 mmwc; @@ -336,33 +331,41 @@ typedef void qdio_handler_t(struct ccw_d * @adapter_name: name for the adapter * @qib_param_field_format: format for qib_parm_field * @qib_param_field: pointer to 128 bytes or NULL, if no param field + * @qib_rflags: rflags to set * @input_slib_elements: pointer to no_input_qs * 128 words of data or NULL * @output_slib_elements: pointer to no_output_qs * 128 words of data or NULL * @no_input_qs: number of input queues * @no_output_qs: number of output queues * @input_handler: handler to be called for input queues * @output_handler: handler to be called for output queues + * @queue_start_poll_array: polling handlers (one per input queue or NULL) * @int_parm: interruption parameter * @flags: initialization flags * @input_sbal_addr_array: address of no_input_qs * 128 pointers * @output_sbal_addr_array: address of no_output_qs * 128 pointers + * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL) */ struct qdio_initialize { struct ccw_device *cdev; unsigned char 
q_format; + unsigned char qdr_ac; unsigned char adapter_name[8]; unsigned int qib_param_field_format; unsigned char *qib_param_field; + unsigned char qib_rflags; unsigned long *input_slib_elements; unsigned long *output_slib_elements; unsigned int no_input_qs; unsigned int no_output_qs; qdio_handler_t *input_handler; qdio_handler_t *output_handler; + void (**queue_start_poll_array) (struct ccw_device *, int, + unsigned long); unsigned long int_parm; unsigned long flags; void **input_sbal_addr_array; void **output_sbal_addr_array; + struct qdio_outbuf_state *output_sbal_state_array; }; #define QDIO_STATE_INACTIVE 0x00000002 /* after qdio_cleanup */ @@ -374,14 +377,16 @@ struct qdio_initialize { #define QDIO_FLAG_SYNC_OUTPUT 0x02 #define QDIO_FLAG_PCI_OUT 0x10 -extern int qdio_initialize(struct qdio_initialize *); extern int qdio_allocate(struct qdio_initialize *); extern int qdio_establish(struct qdio_initialize *); extern int qdio_activate(struct ccw_device *); +extern void qdio_release_aob(struct qaob *); extern int do_QDIO(struct ccw_device *cdev, unsigned int callflags, int q_nr, unsigned int bufnr, unsigned int count); -extern int qdio_cleanup(struct ccw_device*, int); +extern int qdio_start_irq(struct ccw_device *, int); +extern int qdio_stop_irq(struct ccw_device *, int); +extern int qdio_get_next_buffers(struct ccw_device *, int, int *, int *); extern int qdio_shutdown(struct ccw_device*, int); extern int qdio_free(struct ccw_device *); extern int qdio_get_ssqd_desc(struct ccw_device *dev, struct qdio_ssqd_desc*); diff -uprN linux-2.6.32/arch/s390/include/asm/qeth.h linux-2.6.32.ovz/arch/s390/include/asm/qeth.h --- linux-2.6.32/arch/s390/include/asm/qeth.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/qeth.h 2015-09-10 10:06:49.000000000 -0700 @@ -28,39 +28,70 @@ struct qeth_arp_cache_entry { __u8 reserved2[32]; } __attribute__ ((packed)); +enum qeth_arp_ipaddrtype { + QETHARP_IP_ADDR_V4 = 1, + QETHARP_IP_ADDR_V6 = 2, +}; +struct qeth_arp_entrytype { + __u8 mac; + __u8 ip; +} __attribute__((packed)); + +#define QETH_QARP_MEDIASPECIFIC_BYTES 32 +#define QETH_QARP_MACADDRTYPE_BYTES 1 struct qeth_arp_qi_entry7 { - __u8 media_specific[32]; - __u8 macaddr_type; - __u8 ipaddr_type; + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; __u8 macaddr[6]; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry7_ipv6 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[16]; +} __attribute__((packed)); + struct qeth_arp_qi_entry7_short { - __u8 macaddr_type; - __u8 ipaddr_type; + struct qeth_arp_entrytype type; __u8 macaddr[6]; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry7_short_ipv6 { + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[16]; +} __attribute__((packed)); + struct qeth_arp_qi_entry5 { - __u8 media_specific[32]; - __u8 macaddr_type; - __u8 ipaddr_type; + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry5_ipv6 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 ipaddr[16]; +} __attribute__((packed)); + struct qeth_arp_qi_entry5_short { - __u8 macaddr_type; - __u8 ipaddr_type; + struct qeth_arp_entrytype type; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry5_short_ipv6 { + struct qeth_arp_entrytype type; + __u8 ipaddr[16]; +} 
__attribute__((packed)); /* * can be set by user if no "media specific information" is wanted * -> saves a lot of space in user space buffer */ #define QETH_QARP_STRIP_ENTRIES 0x8000 +#define QETH_QARP_WITH_IPV6 0x4000 #define QETH_QARP_REQUEST_MASK 0x00ff /* data sent to user space as result of query arp ioctl */ diff -uprN linux-2.6.32/arch/s390/include/asm/reset.h linux-2.6.32.ovz/arch/s390/include/asm/reset.h --- linux-2.6.32/arch/s390/include/asm/reset.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/reset.h 2015-09-10 10:06:49.000000000 -0700 @@ -17,5 +17,5 @@ struct reset_call { extern void register_reset_call(struct reset_call *reset); extern void unregister_reset_call(struct reset_call *reset); -extern void s390_reset_system(void); +extern void s390_reset_system(void (*func)(void *), void *data); #endif /* _ASM_S390_RESET_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/runtime_instr.h linux-2.6.32.ovz/arch/s390/include/asm/runtime_instr.h --- linux-2.6.32/arch/s390/include/asm/runtime_instr.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/runtime_instr.h 2015-09-10 10:07:19.000000000 -0700 @@ -0,0 +1,98 @@ +#ifndef _RUNTIME_INSTR_H +#define _RUNTIME_INSTR_H + +#define S390_RUNTIME_INSTR_START 0x1 +#define S390_RUNTIME_INSTR_STOP 0x2 + +struct runtime_instr_cb { + __u64 buf_current; + __u64 buf_origin; + __u64 buf_limit; + + __u32 valid : 1; + __u32 pstate : 1; + __u32 pstate_set_buf : 1; + __u32 home_space : 1; + __u32 altered : 1; + __u32 : 3; + __u32 pstate_sample : 1; + __u32 sstate_sample : 1; + __u32 pstate_collect : 1; + __u32 sstate_collect : 1; + __u32 : 1; + __u32 halted_int : 1; + __u32 int_requested : 1; + __u32 buffer_full_int : 1; + __u32 key : 4; + __u32 : 9; + __u32 rgs : 3; + + __u32 mode : 4; + __u32 next : 1; + __u32 mae : 1; + __u32 : 2; + __u32 call_type_br : 1; + __u32 return_type_br : 1; + __u32 other_type_br : 1; + __u32 bc_other_type : 1; + __u32 emit : 1; + __u32 tx_abort : 1; + __u32 : 2; + __u32 bp_xn : 1; + __u32 bp_xt : 1; + __u32 bp_ti : 1; + __u32 bp_ni : 1; + __u32 suppr_y : 1; + __u32 suppr_z : 1; + + __u32 dc_miss_extra : 1; + __u32 lat_lev_ignore : 1; + __u32 ic_lat_lev : 4; + __u32 dc_lat_lev : 4; + + __u64 reserved1; + __u64 scaling_factor; + __u64 rsic; + __u64 reserved2; +} __packed __aligned(8); + +extern struct runtime_instr_cb runtime_instr_empty_cb; + +static inline void load_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000060,0,0,%0" /* LRIC */ + : : "Q" (*cb)); +} + +static inline void store_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000061,0,0,%0" /* STRIC */ + : "=Q" (*cb) : : "cc"); +} + +static inline void save_ri_cb(struct runtime_instr_cb *cb_prev) +{ +#ifdef CONFIG_64BIT + if (cb_prev) + store_runtime_instr_cb(cb_prev); +#endif +} + +static inline void restore_ri_cb(struct runtime_instr_cb *cb_next, + struct runtime_instr_cb *cb_prev) +{ +#ifdef CONFIG_64BIT + if (cb_next) + load_runtime_instr_cb(cb_next); + else if (cb_prev) + load_runtime_instr_cb(&runtime_instr_empty_cb); +#endif +} + +#ifdef CONFIG_64BIT +extern void exit_thread_runtime_instr(void); +#else +static inline void exit_thread_runtime_instr(void) { } +#endif + +#endif /* _RUNTIME_INSTR_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/s390_ext.h linux-2.6.32.ovz/arch/s390/include/asm/s390_ext.h --- linux-2.6.32/arch/s390/include/asm/s390_ext.h 2009-12-02 19:51:21.000000000 -0800 +++ 
linux-2.6.32.ovz/arch/s390/include/asm/s390_ext.h 2015-09-10 10:07:19.000000000 -0700 @@ -29,4 +29,7 @@ int unregister_external_interrupt(__u16 int unregister_early_external_interrupt(__u16 code, ext_int_handler_t handler, ext_int_info_t *info); +void measurement_alert_subclass_register(void); +void measurement_alert_subclass_unregister(void); + #endif diff -uprN linux-2.6.32/arch/s390/include/asm/scsw.h linux-2.6.32.ovz/arch/s390/include/asm/scsw.h --- linux-2.6.32/arch/s390/include/asm/scsw.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/scsw.h 2015-09-10 10:07:15.000000000 -0700 @@ -1,7 +1,7 @@ /* * Helper functions for scsw access. * - * Copyright IBM Corp. 2008,2009 + * Copyright IBM Corp. 2008, 2012 * Author(s): Peter Oberparleiter */ @@ -9,7 +9,7 @@ #define _ASM_S390_SCSW_H_ #include -#include +#include #include /** @@ -100,14 +100,46 @@ struct tm_scsw { } __attribute__ ((packed)); /** + * struct eadm_scsw - subchannel status word for eadm subchannels + * @key: subchannel key + * @eswf: esw format + * @cc: deferred condition code + * @ectl: extended control + * @fctl: function control + * @actl: activity control + * @stctl: status control + * @aob: AOB address + * @dstat: device status + * @cstat: subchannel status + */ +struct eadm_scsw { + u32 key:4; + u32:1; + u32 eswf:1; + u32 cc:2; + u32:6; + u32 ectl:1; + u32:2; + u32 fctl:3; + u32 actl:7; + u32 stctl:5; + u32 aob; + u32 dstat:8; + u32 cstat:8; + u32:16; +} __packed; + +/** * union scsw - subchannel status word * @cmd: command-mode SCSW * @tm: transport-mode SCSW + * @eadm: eadm SCSW */ union scsw { struct cmd_scsw cmd; struct tm_scsw tm; -} __attribute__ ((packed)); + struct eadm_scsw eadm; +} __packed; #define SCSW_FCTL_CLEAR_FUNC 0x1 #define SCSW_FCTL_HALT_FUNC 0x2 diff -uprN linux-2.6.32/arch/s390/include/asm/setup.h linux-2.6.32.ovz/arch/s390/include/asm/setup.h --- linux-2.6.32/arch/s390/include/asm/setup.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/setup.h 2015-09-10 10:07:19.000000000 -0700 @@ -2,7 +2,7 @@ * include/asm-s390/setup.h * * S390 version - * Copyright IBM Corp. 1999,2006 + * Copyright IBM Corp. 
1999,2010 */ #ifndef _ASM_S390_SETUP_H @@ -26,15 +26,21 @@ #define IPL_DEVICE (*(unsigned long *) (0x10404)) #define INITRD_START (*(unsigned long *) (0x1040C)) #define INITRD_SIZE (*(unsigned long *) (0x10414)) +#define OLDMEM_BASE (*(unsigned long *) (0x1041C)) +#define OLDMEM_SIZE (*(unsigned long *) (0x10424)) #else /* __s390x__ */ #define IPL_DEVICE (*(unsigned long *) (0x10400)) #define INITRD_START (*(unsigned long *) (0x10408)) #define INITRD_SIZE (*(unsigned long *) (0x10410)) +#define OLDMEM_BASE (*(unsigned long *) (0x10418)) +#define OLDMEM_SIZE (*(unsigned long *) (0x10420)) #endif /* __s390x__ */ #define COMMAND_LINE ((char *) (0x10480)) #define CHUNK_READ_WRITE 0 #define CHUNK_READ_ONLY 1 +#define CHUNK_OLDMEM 4 +#define CHUNK_CRASHK 5 struct mem_chunk { unsigned long addr; @@ -48,18 +54,15 @@ extern int memory_end_set; extern unsigned long memory_end; void detect_memory_layout(struct mem_chunk chunk[]); +void create_mem_hole(struct mem_chunk memory_chunk[], unsigned long addr, + unsigned long size, int type); -#ifdef CONFIG_S390_SWITCH_AMODE -extern unsigned int switch_amode; -#else -#define switch_amode (0) -#endif - -#ifdef CONFIG_S390_EXEC_PROTECT -extern unsigned int s390_noexec; -#else -#define s390_noexec (0) -#endif +#define PRIMARY_SPACE_MODE 0 +#define ACCESS_REGISTER_MODE 1 +#define SECONDARY_SPACE_MODE 2 +#define HOME_SPACE_MODE 3 + +extern unsigned int user_mode; /* * Machine features detected in head.S @@ -76,6 +79,8 @@ extern unsigned int s390_noexec; #define MACHINE_FLAG_KVM (1UL << 9) #define MACHINE_FLAG_HPAGE (1UL << 10) #define MACHINE_FLAG_PFMF (1UL << 11) +#define MACHINE_FLAG_SPP (1UL << 13) +#define MACHINE_FLAG_TE (1UL << 14) #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -90,6 +95,8 @@ extern unsigned int s390_noexec; #define MACHINE_HAS_MVCOS (0) #define MACHINE_HAS_HPAGE (0) #define MACHINE_HAS_PFMF (0) +#define MACHINE_HAS_SPP (0) +#define MACHINE_HAS_TE (0) #else /* __s390x__ */ #define MACHINE_HAS_IEEE (1) #define MACHINE_HAS_CSP (1) @@ -99,9 +106,12 @@ extern unsigned int s390_noexec; #define MACHINE_HAS_MVCOS (S390_lowcore.machine_flags & MACHINE_FLAG_MVCOS) #define MACHINE_HAS_HPAGE (S390_lowcore.machine_flags & MACHINE_FLAG_HPAGE) #define MACHINE_HAS_PFMF (S390_lowcore.machine_flags & MACHINE_FLAG_PFMF) +#define MACHINE_HAS_SPP (S390_lowcore.machine_flags & MACHINE_FLAG_SPP) +#define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #endif /* __s390x__ */ #define ZFCPDUMP_HSA_SIZE (32UL<<20) +#define ZFCPDUMP_HSA_SIZE_MAX (64UL<<20) /* * Console mode. 
Override with conmode= @@ -130,10 +140,14 @@ extern char kernel_nss_name[]; #define IPL_DEVICE 0x10404 #define INITRD_START 0x1040C #define INITRD_SIZE 0x10414 +#define OLDMEM_BASE 0x1041C +#define OLDMEM_SIZE 0x10424 #else /* __s390x__ */ #define IPL_DEVICE 0x10400 #define INITRD_START 0x10408 #define INITRD_SIZE 0x10410 +#define OLDMEM_BASE 0x10418 +#define OLDMEM_SIZE 0x10420 #endif /* __s390x__ */ #define COMMAND_LINE 0x10480 diff -uprN linux-2.6.32/arch/s390/include/asm/sigcontext.h linux-2.6.32.ovz/arch/s390/include/asm/sigcontext.h --- linux-2.6.32/arch/s390/include/asm/sigcontext.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sigcontext.h 2015-09-10 10:07:39.000000000 -0700 @@ -51,6 +51,9 @@ typedef struct typedef struct { unsigned int fpc; +#ifndef __GENKSYMS__ + unsigned int pad; +#endif double fprs[__NUM_FPRS]; } _s390_fp_regs; diff -uprN linux-2.6.32/arch/s390/include/asm/sigp.h linux-2.6.32.ovz/arch/s390/include/asm/sigp.h --- linux-2.6.32/arch/s390/include/asm/sigp.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sigp.h 2015-09-10 10:06:49.000000000 -0700 @@ -15,11 +15,19 @@ #ifndef __SIGP__ #define __SIGP__ -#include -#include +#include /* get real cpu address from logical cpu number */ -extern volatile int __cpu_logical_map[]; +extern int __cpu_logical_map[]; + +static inline int cpu_logical_map(int cpu) +{ +#ifdef CONFIG_SMP + return __cpu_logical_map[cpu]; +#else + return stap(); +#endif +} typedef enum { @@ -62,6 +70,7 @@ typedef enum ec_schedule=0, ec_call_function, ec_call_function_single, + ec_stop_cpu, ec_bit_last } ec_bit_sig; @@ -79,7 +88,7 @@ signal_processor(__u16 cpu_addr, sigp_or " ipm %0\n" " srl %0,28\n" : "=d" (ccode) - : "d" (reg1), "d" (__cpu_logical_map[cpu_addr]), + : "d" (reg1), "d" (cpu_logical_map(cpu_addr)), "a" (order_code) : "cc" , "memory"); return ccode; } @@ -98,7 +107,7 @@ signal_processor_p(__u32 parameter, __u1 " ipm %0\n" " srl %0,28\n" : "=d" (ccode) - : "d" (reg1), "d" (__cpu_logical_map[cpu_addr]), + : "d" (reg1), "d" (cpu_logical_map(cpu_addr)), "a" (order_code) : "cc" , "memory"); return ccode; } @@ -118,10 +127,28 @@ signal_processor_ps(__u32 *statusptr, __ " ipm %0\n" " srl %0,28\n" : "=d" (ccode), "+d" (reg1) - : "d" (__cpu_logical_map[cpu_addr]), "a" (order_code) + : "d" (cpu_logical_map(cpu_addr)), "a" (order_code) : "cc" , "memory"); *statusptr = reg1; return ccode; } +/* + * Signal processor + */ +static inline int raw_sigp(u16 cpu, int order) +{ + register unsigned long reg1 asm ("1") = 0; + int ccode; + + asm volatile( + " sigp %1,%2,0(%3)\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (ccode) + : "d" (reg1), "d" (cpu), + "a" (order) : "cc" , "memory"); + return ccode; +} + #endif /* __SIGP__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/smp.h linux-2.6.32.ovz/arch/s390/include/asm/smp.h --- linux-2.6.32/arch/s390/include/asm/smp.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/smp.h 2015-09-10 10:07:52.000000000 -0700 @@ -35,6 +35,7 @@ typedef struct extern void machine_restart_smp(char *); extern void machine_halt_smp(void); extern void machine_power_off_smp(void); +extern void smp_restart_with_online_cpu(void); #define NO_PROC_ID 0xFF /* No processor magic marker */ @@ -51,7 +52,6 @@ extern void machine_power_off_smp(void); #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */ #define raw_smp_processor_id() (S390_lowcore.cpu_nr) -#define cpu_logical_map(cpu) (cpu) extern int __cpu_disable (void); extern void __cpu_die (unsigned 
int cpu); diff -uprN linux-2.6.32/arch/s390/include/asm/socket.h linux-2.6.32.ovz/arch/s390/include/asm/socket.h --- linux-2.6.32/arch/s390/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/socket.h 2015-09-10 10:07:41.000000000 -0700 @@ -30,7 +30,7 @@ #define SO_PRIORITY 12 #define SO_LINGER 13 #define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ +#define SO_REUSEPORT 15 #define SO_PASSCRED 16 #define SO_PEERCRED 17 #define SO_RCVLOWAT 18 @@ -68,4 +68,9 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 + +#define SO_BUSY_POLL 46 +#define SO_BPF_EXTENSIONS 48 + #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/sparsemem.h linux-2.6.32.ovz/arch/s390/include/asm/sparsemem.h --- linux-2.6.32/arch/s390/include/asm/sparsemem.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sparsemem.h 2015-09-10 10:07:32.000000000 -0700 @@ -4,8 +4,13 @@ #ifdef CONFIG_64BIT #define SECTION_SIZE_BITS 28 +#ifdef __GENKSYMS__ #define MAX_PHYSADDR_BITS 42 #define MAX_PHYSMEM_BITS 42 +#else +#define MAX_PHYSADDR_BITS 46 +#define MAX_PHYSMEM_BITS 46 +#endif #else diff -uprN linux-2.6.32/arch/s390/include/asm/statfs.h linux-2.6.32.ovz/arch/s390/include/asm/statfs.h --- linux-2.6.32/arch/s390/include/asm/statfs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/statfs.h 2015-09-10 10:06:38.000000000 -0700 @@ -33,7 +33,8 @@ struct statfs { __kernel_fsid_t f_fsid; int f_namelen; int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; struct statfs64 { @@ -47,7 +48,8 @@ struct statfs64 { __kernel_fsid_t f_fsid; int f_namelen; int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; struct compat_statfs64 { @@ -61,7 +63,8 @@ struct compat_statfs64 { __kernel_fsid_t f_fsid; __u32 f_namelen; __u32 f_frsize; - __u32 f_spare[5]; + __u32 f_flags; + __u32 f_spare[4]; }; #endif /* __s390x__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/sysinfo.h linux-2.6.32.ovz/arch/s390/include/asm/sysinfo.h --- linux-2.6.32/arch/s390/include/asm/sysinfo.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sysinfo.h 2015-09-10 10:06:31.000000000 -0700 @@ -15,7 +15,10 @@ #define __ASM_S390_SYSINFO_H struct sysinfo_1_1_1 { - char reserved_0[32]; + unsigned short :16; + unsigned char ccr; + unsigned char cai; + char reserved_0[28]; char manufacturer[16]; char type[4]; char reserved_1[12]; diff -uprN linux-2.6.32/arch/s390/include/asm/system.h linux-2.6.32.ovz/arch/s390/include/asm/system.h --- linux-2.6.32/arch/s390/include/asm/system.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/system.h 2015-09-10 10:07:48.000000000 -0700 @@ -20,59 +20,96 @@ struct task_struct; extern struct task_struct *__switch_to(void *, void *); +extern void update_per_regs(struct task_struct *task); -static inline void save_fp_regs(s390_fp_regs *fpregs) +static inline int test_fp_ctl(u32 fpc) { + u32 orig_fpc; + int rc; + + if (!MACHINE_HAS_IEEE) + return 0; + asm volatile( - " std 0,8(%1)\n" - " std 2,24(%1)\n" - " std 4,40(%1)\n" - " std 6,56(%1)" - : "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory"); + " efpc %1\n" + " sfpc %2\n" + "0: sfpc %1\n" + " la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc), "=d" (orig_fpc) + : "d" (fpc), "0" (-EINVAL)); + return rc; +} + +static inline void save_fp_ctl(u32 *fpc) +{ if (!MACHINE_HAS_IEEE) return; + asm volatile( - " stfpc 0(%1)\n" - " std 1,16(%1)\n" - " std 3,32(%1)\n" 
- " std 5,48(%1)\n" - " std 7,64(%1)\n" - " std 8,72(%1)\n" - " std 9,80(%1)\n" - " std 10,88(%1)\n" - " std 11,96(%1)\n" - " std 12,104(%1)\n" - " std 13,112(%1)\n" - " std 14,120(%1)\n" - " std 15,128(%1)\n" - : "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory"); + " stfpc %0\n" + : "+Q" (*fpc)); } -static inline void restore_fp_regs(s390_fp_regs *fpregs) +static inline int restore_fp_ctl(u32 *fpc) { + int rc; + + if (!MACHINE_HAS_IEEE) + return 0; + asm volatile( - " ld 0,8(%0)\n" - " ld 2,24(%0)\n" - " ld 4,40(%0)\n" - " ld 6,56(%0)" - : : "a" (fpregs), "m" (*fpregs)); + " lfpc %1\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc) : "Q" (*fpc), "0" (-EINVAL)); + return rc; +} + +static inline void save_fp_regs(freg_t *fprs) +{ + asm volatile("std 0,%0" : "=Q" (fprs[0])); + asm volatile("std 2,%0" : "=Q" (fprs[2])); + asm volatile("std 4,%0" : "=Q" (fprs[4])); + asm volatile("std 6,%0" : "=Q" (fprs[6])); if (!MACHINE_HAS_IEEE) return; - asm volatile( - " lfpc 0(%0)\n" - " ld 1,16(%0)\n" - " ld 3,32(%0)\n" - " ld 5,48(%0)\n" - " ld 7,64(%0)\n" - " ld 8,72(%0)\n" - " ld 9,80(%0)\n" - " ld 10,88(%0)\n" - " ld 11,96(%0)\n" - " ld 12,104(%0)\n" - " ld 13,112(%0)\n" - " ld 14,120(%0)\n" - " ld 15,128(%0)\n" - : : "a" (fpregs), "m" (*fpregs)); + asm volatile("std 1,%0" : "=Q" (fprs[1])); + asm volatile("std 3,%0" : "=Q" (fprs[3])); + asm volatile("std 5,%0" : "=Q" (fprs[5])); + asm volatile("std 7,%0" : "=Q" (fprs[7])); + asm volatile("std 8,%0" : "=Q" (fprs[8])); + asm volatile("std 9,%0" : "=Q" (fprs[9])); + asm volatile("std 10,%0" : "=Q" (fprs[10])); + asm volatile("std 11,%0" : "=Q" (fprs[11])); + asm volatile("std 12,%0" : "=Q" (fprs[12])); + asm volatile("std 13,%0" : "=Q" (fprs[13])); + asm volatile("std 14,%0" : "=Q" (fprs[14])); + asm volatile("std 15,%0" : "=Q" (fprs[15])); +} + +static inline void restore_fp_regs(freg_t *fprs) +{ + asm volatile("ld 0,%0" : : "Q" (fprs[0])); + asm volatile("ld 2,%0" : : "Q" (fprs[2])); + asm volatile("ld 4,%0" : : "Q" (fprs[4])); + asm volatile("ld 6,%0" : : "Q" (fprs[6])); + if (!MACHINE_HAS_IEEE) + return; + asm volatile("ld 1,%0" : : "Q" (fprs[1])); + asm volatile("ld 3,%0" : : "Q" (fprs[3])); + asm volatile("ld 5,%0" : : "Q" (fprs[5])); + asm volatile("ld 7,%0" : : "Q" (fprs[7])); + asm volatile("ld 8,%0" : : "Q" (fprs[8])); + asm volatile("ld 9,%0" : : "Q" (fprs[9])); + asm volatile("ld 10,%0" : : "Q" (fprs[10])); + asm volatile("ld 11,%0" : : "Q" (fprs[11])); + asm volatile("ld 12,%0" : : "Q" (fprs[12])); + asm volatile("ld 13,%0" : : "Q" (fprs[13])); + asm volatile("ld 14,%0" : : "Q" (fprs[14])); + asm volatile("ld 15,%0" : : "Q" (fprs[15])); } static inline void save_access_regs(unsigned int *acrs) @@ -88,10 +125,15 @@ static inline void restore_access_regs(u #define switch_to(prev,next,last) do { \ if (prev == next) \ break; \ - save_fp_regs(&prev->thread.fp_regs); \ - restore_fp_regs(&next->thread.fp_regs); \ + save_fp_ctl(&prev->thread.fp_regs.fpc); \ + save_fp_regs(prev->thread.fp_regs.fprs); \ + restore_fp_ctl(&next->thread.fp_regs.fpc); \ + restore_fp_regs(next->thread.fp_regs.fprs); \ save_access_regs(&prev->thread.acrs[0]); \ + save_ri_cb(prev->thread.ri_cb); \ restore_access_regs(&next->thread.acrs[0]); \ + restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ + update_per_regs(next); \ prev = __switch_to(prev,next); \ } while (0) @@ -110,6 +152,10 @@ extern void pfault_fini(void); #endif /* CONFIG_PFAULT */ extern void cmma_init(void); +extern int memcpy_real(void *, void *, size_t); +extern void 
copy_to_absolute_zero(void *dest, void *src, size_t count); +extern int copy_to_user_real(void __user *dest, void *src, size_t count); +extern int copy_from_user_real(void *dest, void __user *src, size_t count); #define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ @@ -456,11 +502,6 @@ extern void (*_machine_power_off)(void); #define arch_align_stack(x) (x) -#ifdef CONFIG_TRACE_IRQFLAGS -extern psw_t sysc_restore_trace_psw; -extern psw_t io_restore_trace_psw; -#endif - static inline int tprot(unsigned long addr) { int rc = -EFAULT; diff -uprN linux-2.6.32/arch/s390/include/asm/thread_info.h linux-2.6.32.ovz/arch/s390/include/asm/thread_info.h --- linux-2.6.32/arch/s390/include/asm/thread_info.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/thread_info.h 2015-09-10 10:06:49.000000000 -0700 @@ -31,6 +31,7 @@ #define ASYNC_SIZE (PAGE_SIZE << ASYNC_ORDER) #ifndef __ASSEMBLY__ +#include #include #include #include @@ -50,6 +51,10 @@ struct thread_info { struct restart_block restart_block; __u64 user_timer; __u64 system_timer; +#ifndef __GENKSYMS__ + unsigned long last_break; /* last breaking-event-address. */ + struct list_head list; /* pfault task list */ +#endif }; /* @@ -117,6 +122,12 @@ static inline struct thread_info *curren #define _TIF_31BIT (1<> 9; + * + * In order to avoid an overflow with the multiplication we can rewrite this. + * With a split todval == 2^32 * th + tl (th upper 32 bits, tl lower 32 bits) + * we end up with + * + * ns = ((2^32 * th + tl) * 125 ) >> 9; + * -> ns = (2^23 * th * 125) + ((tl * 125) >> 9); + * + */ +static inline unsigned long long tod_to_ns(unsigned long long todval) +{ + unsigned long long ns; + + ns = ((todval >> 32) << 23) * 125; + ns += ((todval & 0xffffffff) * 125) >> 9; + return ns; +} + #endif diff -uprN linux-2.6.32/arch/s390/include/asm/tlbflush.h linux-2.6.32.ovz/arch/s390/include/asm/tlbflush.h --- linux-2.6.32/arch/s390/include/asm/tlbflush.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/tlbflush.h 2015-09-10 10:07:26.000000000 -0700 @@ -73,8 +73,6 @@ static inline void __tlb_flush_idte(unsi static inline void __tlb_flush_mm(struct mm_struct * mm) { - if (unlikely(cpumask_empty(mm_cpumask(mm)))) - return; /* * If the machine has IDTE we prefer to do a per mm flush * on all cpus instead of doing a local flush if the mm @@ -94,8 +92,12 @@ static inline void __tlb_flush_mm(struct static inline void __tlb_flush_mm_cond(struct mm_struct * mm) { - if (atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm) + spin_lock(&mm->page_table_lock); + if (mm->context.flush_mm) { __tlb_flush_mm(mm); + mm->context.flush_mm = 0; + } + spin_unlock(&mm->page_table_lock); } /* diff -uprN linux-2.6.32/arch/s390/include/asm/tlb.h linux-2.6.32.ovz/arch/s390/include/asm/tlb.h --- linux-2.6.32/arch/s390/include/asm/tlb.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/tlb.h 2015-09-10 10:06:14.000000000 -0700 @@ -50,8 +50,7 @@ static inline struct mmu_gather *tlb_gat struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; - tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) || - (atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm); + tlb->fullmm = full_mm_flush; tlb->nr_ptes = 0; tlb->nr_pxds = TLB_NR_PTRS; if (tlb->fullmm) diff -uprN linux-2.6.32/arch/s390/include/asm/topology.h linux-2.6.32.ovz/arch/s390/include/asm/topology.h --- linux-2.6.32/arch/s390/include/asm/topology.h 2009-12-02 19:51:21.000000000 -0800 +++ 
linux-2.6.32.ovz/arch/s390/include/asm/topology.h 2015-09-10 10:06:31.000000000 -0700 @@ -3,13 +3,32 @@ #include -#define mc_capable() (1) - -const struct cpumask *cpu_coregroup_mask(unsigned int cpu); - +extern unsigned char cpu_core_id[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; +static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu) +{ + return &cpu_core_map[cpu]; +} + +#define topology_core_id(cpu) (cpu_core_id[cpu]) #define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) +#define mc_capable() (1) + +#ifdef CONFIG_SCHED_BOOK + +extern unsigned char cpu_book_id[NR_CPUS]; +extern cpumask_t cpu_book_map[NR_CPUS]; + +static inline const struct cpumask *cpu_book_mask(unsigned int cpu) +{ + return &cpu_book_map[cpu]; +} + +#define topology_book_id(cpu) (cpu_book_id[cpu]) +#define topology_book_cpumask(cpu) (&cpu_book_map[cpu]) + +#endif /* CONFIG_SCHED_BOOK */ int topology_set_cpu_management(int fc); void topology_schedule_update(void); @@ -28,7 +47,7 @@ static inline void s390_init_cpu_topolog }; #endif -#define SD_MC_INIT SD_CPU_INIT +#define SD_BOOK_INIT SD_CPU_INIT #include diff -uprN linux-2.6.32/arch/s390/include/asm/trace_clock.h linux-2.6.32.ovz/arch/s390/include/asm/trace_clock.h --- linux-2.6.32/arch/s390/include/asm/trace_clock.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/trace_clock.h 2015-09-10 10:07:32.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/s390/include/asm/types.h linux-2.6.32.ovz/arch/s390/include/asm/types.h --- linux-2.6.32/arch/s390/include/asm/types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/types.h 2015-09-10 10:06:57.000000000 -0700 @@ -31,12 +31,6 @@ typedef __signed__ long saddr_t; #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; -#ifdef __s390x__ -/* DMA addresses come in 32-bit and 64-bit flavours. 
*/ -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif #ifndef __s390x__ typedef union { diff -uprN linux-2.6.32/arch/s390/include/asm/uaccess.h linux-2.6.32.ovz/arch/s390/include/asm/uaccess.h --- linux-2.6.32/arch/s390/include/asm/uaccess.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/uaccess.h 2015-09-10 10:05:38.000000000 -0700 @@ -93,6 +93,8 @@ extern struct uaccess_ops uaccess_mvcos; extern struct uaccess_ops uaccess_mvcos_switch; extern struct uaccess_ops uaccess_pt; +extern int __handle_fault(unsigned long, unsigned long, int); + static inline int __put_user_fn(size_t size, void __user *ptr, void *x) { size = uaccess.copy_to_user_small(size, ptr, x); diff -uprN linux-2.6.32/arch/s390/include/asm/unistd.h linux-2.6.32.ovz/arch/s390/include/asm/unistd.h --- linux-2.6.32/arch/s390/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/unistd.h 2015-09-10 10:07:25.000000000 -0700 @@ -269,7 +269,17 @@ #define __NR_pwritev 329 #define __NR_rt_tgsigqueueinfo 330 #define __NR_perf_event_open 331 -#define NR_syscalls 332 +/* #define __NR_fanotify_init 332 */ +/* #define __NR_fanotify_mark 333 */ +/* #define __NR_prlimit64 334 */ +/* #define __NR_name_to_handle_at 335 */ +/* #define __NR_open_by_handle_at 336 */ +/* #define __NR_clock_adjtime 337 */ +#define __NR_syncfs 338 +/* hole due to s390_runtime_instr */ +#define __NR_s390_runtime_instr 342 +#define __NR_setns 343 +#define NR_syscalls 344 /* * There are some system calls that are not present on 64 bit, some @@ -376,6 +386,9 @@ #define __IGNORE_migrate_pages #define __IGNORE_move_pages +/* Ignore system calls that are also reachable via sys_socket */ +#define __IGNORE_recvmmsg + #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM diff -uprN linux-2.6.32/arch/s390/include/asm/vdso.h linux-2.6.32.ovz/arch/s390/include/asm/vdso.h --- linux-2.6.32/arch/s390/include/asm/vdso.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/vdso.h 2015-09-10 10:05:58.000000000 -0700 @@ -7,7 +7,7 @@ #define VDSO32_LBASE 0 #define VDSO64_LBASE 0 -#define VDSO_VERSION_STRING LINUX_2.6.26 +#define VDSO_VERSION_STRING LINUX_2.6.29 #ifndef __ASSEMBLY__ @@ -29,6 +29,7 @@ struct vdso_data { __u32 tz_minuteswest; /* Minutes west of Greenwich 0x30 */ __u32 tz_dsttime; /* Type of dst correction 0x34 */ __u32 ectg_available; + __u32 ntp_mult; /* NTP adjusted multiplier 0x3C */ }; struct vdso_per_cpu_data { diff -uprN linux-2.6.32/arch/s390/include/asm/zcrypt.h linux-2.6.32.ovz/arch/s390/include/asm/zcrypt.h --- linux-2.6.32/arch/s390/include/asm/zcrypt.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/zcrypt.h 2015-09-10 10:07:42.000000000 -0700 @@ -154,6 +154,67 @@ struct ica_xcRB { unsigned short priority_window; unsigned int status; } __attribute__((packed)); + +/** + * struct ep11_cprb - EP11 connectivity programming request block + * @cprb_len: CPRB header length [0x0020] + * @cprb_ver_id: CPRB version id.
[0x04] + * @pad_000: Alignment pad bytes + * @flags: Admin cmd [0x80] or functional cmd [0x00] + * @func_id: Function id / subtype [0x5434] + * @source_id: Source id [originator id] + * @target_id: Target id [usage/ctrl domain id] + * @ret_code: Return code + * @reserved1: Reserved + * @reserved2: Reserved + * @payload_len: Payload length + */ +struct ep11_cprb { + uint16_t cprb_len; + unsigned char cprb_ver_id; + unsigned char pad_000[2]; + unsigned char flags; + unsigned char func_id[2]; + uint32_t source_id; + uint32_t target_id; + uint32_t ret_code; + uint32_t reserved1; + uint32_t reserved2; + uint32_t payload_len; +} __attribute__((packed)); + +/** + * struct ep11_target_dev - EP11 target device list + * @ap_id: AP device id + * @dom_id: Usage domain id + */ +struct ep11_target_dev { + uint16_t ap_id; + uint16_t dom_id; +}; + +/** + * struct ep11_urb - EP11 user request block + * @targets_num: Number of target adapters + * @targets: Addr to target adapter list + * @weight: Level of request priority + * @req_no: Request id/number + * @req_len: Request length + * @req: Addr to request block + * @resp_len: Response length + * @resp: Addr to response block + */ +struct ep11_urb { + uint16_t targets_num; + uint64_t targets; + uint64_t weight; + uint64_t req_no; + uint64_t req_len; + uint64_t req; + uint64_t resp_len; + uint64_t resp; +} __attribute__((packed)); + #define AUTOSELECT ((unsigned int)0xFFFFFFFF) #define ZCRYPT_IOCTL_MAGIC 'z' @@ -183,6 +244,9 @@ struct ica_xcRB { * ZSECSENDCPRB * Send an arbitrary CPRB to a crypto card. * + * ZSENDEP11CPRB + * Send an arbitrary EP11 CPRB to an EP11 coprocessor crypto card. + * * Z90STAT_STATUS_MASK * Return an 64 element array of unsigned chars for the status of * all devices. @@ -256,6 +320,7 @@ struct ica_xcRB { #define ICARSAMODEXPO _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) #define ICARSACRT _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) #define ZSECSENDCPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) +#define ZSENDEP11CPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) /* New status calls */ #define Z90STAT_TOTALCOUNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x40, int) diff -uprN linux-2.6.32/arch/s390/Kconfig linux-2.6.32.ovz/arch/s390/Kconfig --- linux-2.6.32/arch/s390/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/Kconfig 2015-09-10 10:07:15.000000000 -0700 @@ -40,9 +40,6 @@ config ARCH_HAS_ILOG2_U64 config GENERIC_HWEIGHT def_bool y -config GENERIC_TIME - def_bool y - config GENERIC_TIME_VSYSCALL def_bool y @@ -60,6 +57,9 @@ config NO_IOMEM config NO_DMA def_bool y +config ARCH_DMA_ADDR_T_64BIT + def_bool 64BIT + config GENERIC_LOCKBREAK bool default y @@ -87,14 +87,16 @@ config S390 select HAVE_SYSCALL_TRACEPOINTS select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_DEFAULT_NO_SPIN_MUTEXES + select HAVE_ARCH_MUTEX_CPU_RELAX select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK select INIT_ALL_POSSIBLE + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select ARCH_HAVE_NMI_SAFE_CMPXCHG config SCHED_OMIT_FRAME_POINTER bool @@ -165,6 +167,13 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. +config SCHED_BOOK + bool "Book scheduler support" + depends on SMP + help + Book scheduler support improves the CPU scheduler's decision making + when dealing with machines that have several books. 
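For illustration only: the book level that CONFIG_SCHED_BOOK schedules across is the one exported by the cpu_book_map/cpu_book_mask helpers added to arch/s390/include/asm/topology.h earlier in this patch. A hypothetical in-kernel consumer (the function below is invented, not part of the patch) could count a CPU's online book siblings like this:

/*
 * Hypothetical sketch, assuming CONFIG_SCHED_BOOK=y and the
 * cpu_book_mask() helper introduced by this patch.
 */
#include <linux/cpumask.h>
#include <asm/topology.h>

static unsigned int cpus_in_same_book(unsigned int cpu)
{
	unsigned int sibling, n = 0;

	/* cpu_book_mask() returns &cpu_book_map[cpu]; count its online members */
	for_each_cpu(sibling, cpu_book_mask(cpu))
		if (cpu_online(sibling))
			n++;
	return n;
}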
+ config MATHEMU bool "IEEE FPU emulation" depends on MARCH_G5 @@ -192,23 +201,8 @@ config AUDIT_ARCH bool default y -config S390_SWITCH_AMODE - bool "Switch kernel/user addressing modes" - help - This option allows to switch the addressing modes of kernel and user - space. The kernel parameter switch_amode=on will enable this feature, - default is disabled. Enabling this (via kernel parameter) on machines - earlier than IBM System z9-109 EC/BC will reduce system performance. - - Note that this option will also be selected by selecting the execute - protection option below. Enabling the execute protection via the - noexec kernel parameter will also switch the addressing modes, - independent of the switch_amode kernel parameter. - - config S390_EXEC_PROTECT bool "Data execute protection" - select S390_SWITCH_AMODE help This option allows to enable a buffer overflow protection for user space programs and it also selects the addressing mode option above. @@ -390,6 +384,24 @@ config CHSC_SCH If unsure, say N. +config SCM_BUS + def_bool y + depends on 64BIT + prompt "SCM bus driver" + help + Bus driver for Storage Class Memory. + +config EADM_SCH + def_tristate m + prompt "Support for EADM subchannels" + depends on SCM_BUS + help + This driver allows usage of EADM subchannels. EADM subchannels act + as a communication vehicle for SCM increments. + + To compile this driver as a module, choose M here: the + module will be called eadm_sch. + comment "Misc" config IPL @@ -557,6 +569,30 @@ config KEXEC current kernel, and to start another kernel. It is like a reboot but is independent of hardware/microcode support. +config KEXEC_AUTO_RESERVE + bool "automatically reserve memory for kexec kernel" + depends on KEXEC + default y + ---help--- + Automatically reserve memory for a kexec kernel, so that you don't + need to specify numbers for the "crashkernel=X@Y" boot option, + instead you can use "crashkernel=auto". This option works for + systems that have at least 4 GiB memory. Then 128 MiB plus a small + amount of additional memory that grows linearly with the 1st kernel + memory size is reserved. When "crashkernel=auto" is used for + systems with less than 4 GiB memory, no crashkernel memory will be + reserved. + +config CRASH_DUMP + bool "kernel crash dumps" + depends on 64BIT + help + Generate crash dump after being started by kexec. + Crash dump kernels are loaded in the main kernel with kexec-tools + into a specially reserved region and then later executed after + a crash by kdump/kexec. + For more details see Documentation/kdump/kdump.txt + config ZFCPDUMP bool "zfcpdump support" select SMP @@ -618,6 +654,11 @@ source "arch/s390/Kconfig.debug" source "security/Kconfig" +config KEYS_COMPAT + bool + depends on COMPAT && KEYS + default y + source "crypto/Kconfig" source "lib/Kconfig" diff -uprN linux-2.6.32/arch/s390/Kconfig.debug linux-2.6.32.ovz/arch/s390/Kconfig.debug --- linux-2.6.32/arch/s390/Kconfig.debug 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/Kconfig.debug 2015-09-10 10:06:31.000000000 -0700 @@ -6,4 +6,17 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" +config STRICT_DEVMEM + def_bool y + prompt "Filter access to /dev/mem" + ---help--- + This option restricts access to /dev/mem. If this option is + disabled, you allow userspace access to all memory, including + kernel and userspace memory. Accidental memory access is likely + to be disastrous. + Memory access is required for experts who want to debug the kernel. + + If you are unsure, say Y. 
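For illustration only: the reservation described in the KEXEC_AUTO_RESERVE help above is requested on the kernel command line, which on s390 normally comes from the boot loader configuration. A hypothetical zipl.conf fragment (device names and sizes invented, not taken from this patch):

[linux]
	target = /boot
	image = /boot/image
	ramdisk = /boot/initrd
	parameters = "root=/dev/dasda1 crashkernel=auto"

Per the help text, "crashkernel=auto" reserves 128 MiB plus a small size-dependent amount on machines with at least 4 GiB of memory and reserves nothing below that; an explicit form such as "crashkernel=128M@256M" works independently of the memory size.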
+ + endmenu diff -uprN linux-2.6.32/arch/s390/kernel/asm-offsets.c linux-2.6.32.ovz/arch/s390/kernel/asm-offsets.c --- linux-2.6.32/arch/s390/kernel/asm-offsets.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/asm-offsets.c 2015-09-10 10:07:19.000000000 -0700 @@ -30,6 +30,7 @@ int main(void) DEFINE(__TI_precount, offsetof(struct thread_info, preempt_count)); DEFINE(__TI_user_timer, offsetof(struct thread_info, user_timer)); DEFINE(__TI_system_timer, offsetof(struct thread_info, system_timer)); + DEFINE(__TI_last_break, offsetof(struct thread_info, last_break)); BLANK(); DEFINE(__PT_ARGS, offsetof(struct pt_regs, args)); DEFINE(__PT_PSW, offsetof(struct pt_regs, psw)); @@ -52,6 +53,7 @@ int main(void) DEFINE(__VDSO_WTOM_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); DEFINE(__VDSO_TIMEZONE, offsetof(struct vdso_data, tz_minuteswest)); DEFINE(__VDSO_ECTG_OK, offsetof(struct vdso_data, ectg_available)); + DEFINE(__VDSO_NTP_MULT, offsetof(struct vdso_data, ntp_mult)); DEFINE(__VDSO_ECTG_BASE, offsetof(struct vdso_per_cpu_data, ectg_timer_base)); DEFINE(__VDSO_ECTG_USER, @@ -65,5 +67,9 @@ int main(void) DEFINE(__SIGP_RESTART, sigp_restart); DEFINE(__SIGP_SENSE, sigp_sense); DEFINE(__SIGP_INITIAL_CPU_RESET, sigp_initial_cpu_reset); + DEFINE(__LC_RST_OLD_PSW, offsetof(struct _lowcore, restart_old_psw)); + /* constants for transactional execution */ + DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb)); + DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb)); return 0; } diff -uprN linux-2.6.32/arch/s390/kernel/base.S linux-2.6.32.ovz/arch/s390/kernel/base.S --- linux-2.6.32/arch/s390/kernel/base.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/base.S 2015-09-10 10:06:56.000000000 -0700 @@ -75,6 +75,61 @@ s390_base_pgm_handler_fn: .quad 0 .previous +# +# Calls diag 308 subcode 1 and continues execution +# +# The following conditions must be ensured before calling this function: +# * Prefix register = 0 +# * Lowcore protection is disabled +# + .globl do_reset_diag308 +do_reset_diag308: + larl %r4,.Lctlregs # Save control registers + stctg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Floating point control register + stfpc 0(%r4) + larl %r4,.Lcontinue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,.Lrestart_psw # Setup restart PSW at absolute 0 + lghi %r3,0 + lg %r4,0(%r4) # Save PSW + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + diag %r1,%r1,0x308 +.Lrestart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,0x12 # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,.Lctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,.Lcontinue_psw # Restore PSW flags + lpswe 0(%r4) +.Lcontinue: + br %r14 +.align 16 +.Lrestart_psw: + .long 0x00080000,0x80000000 + .Lrestart_part2 + + .section .data..nosave,"aw",@progbits +.align 8 +.Lcontinue_psw: + .quad 0,.Lcontinue + .previous + + .section .bss +.align 8 +.Lctlregs: + .rept 16 + .quad 0 + .endr +.Lfpctl: + .long 0 + .previous + #else /* CONFIG_64BIT */ .globl s390_base_mcck_handler diff -uprN linux-2.6.32/arch/s390/kernel/compat_linux.c linux-2.6.32.ovz/arch/s390/kernel/compat_linux.c --- linux-2.6.32/arch/s390/kernel/compat_linux.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_linux.c 2015-09-10 10:07:24.000000000 -0700 @@ -31,14 +31,8 @@ #include #include 
#include -#include #include #include -#include -#include -#include -#include -#include #include #include #include @@ -446,14 +440,14 @@ asmlinkage long sys32_execve(char __user compat_uptr_t __user *envp) { struct pt_regs *regs = task_pt_regs(current); - char *filename; + struct filename *filename; long rc; filename = getname(name); rc = PTR_ERR(filename); if (IS_ERR(filename)) return rc; - rc = compat_do_execve(filename, argv, envp, regs); + rc = compat_do_execve(filename->name, argv, envp, regs); if (rc) goto out; current->thread.fp_regs.fpc=0; @@ -683,38 +677,6 @@ struct mmap_arg_struct_emu31 { u32 offset; }; -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long error = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&current->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - if (!IS_ERR((void *) error) && error + len >= 0x80000000ULL) { - /* Result is out of bounds. */ - do_munmap(current->mm, addr, len); - error = -ENOMEM; - } - up_write(&current->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - - asmlinkage unsigned long old32_mmap(struct mmap_arg_struct_emu31 __user *arg) { @@ -728,7 +690,8 @@ old32_mmap(struct mmap_arg_struct_emu31 if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } @@ -741,7 +704,7 @@ sys32_mmap2(struct mmap_arg_struct_emu31 if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } diff -uprN linux-2.6.32/arch/s390/kernel/compat_linux.h linux-2.6.32.ovz/arch/s390/kernel/compat_linux.h --- linux-2.6.32/arch/s390/kernel/compat_linux.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_linux.h 2015-09-10 10:07:39.000000000 -0700 @@ -4,10 +4,6 @@ #include #include #include -#include -#include -#include -#include /* Macro that masks the high order bit of a 32 bit pointer and converts it*/ /* to a 64 bit pointer */ @@ -106,6 +102,9 @@ typedef union typedef struct { unsigned int fpc; +#ifndef __GENKSYMS__ + unsigned int pad; +#endif freg_t32 fprs[__NUM_FPRS]; } _s390_fp_regs32; diff -uprN linux-2.6.32/arch/s390/kernel/compat_signal.c linux-2.6.32.ovz/arch/s390/kernel/compat_signal.c --- linux-2.6.32/arch/s390/kernel/compat_signal.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_signal.c 2015-09-10 10:07:39.000000000 -0700 @@ -297,51 +297,55 @@ sys32_sigaltstack(const stack_t32 __user static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) { - _s390_regs_common32 regs32; - int err, i; + _sigregs32 user_sregs; + int i; - regs32.psw.mask = PSW32_MASK_MERGE(psw32_user_bits, - (__u32)(regs->psw.mask >> 32)); - regs32.psw.addr = PSW32_ADDR_AMODE31 | (__u32) regs->psw.addr; + user_sregs.regs.psw.mask = PSW32_MASK_MERGE(psw32_user_bits, + (__u32)(regs->psw.mask >> 32)); + user_sregs.regs.psw.addr = PSW32_ADDR_AMODE31 | (__u32) regs->psw.addr; for (i = 0; i < NUM_GPRS; i++) - regs32.gprs[i] = (__u32) regs->gprs[i]; + user_sregs.regs.gprs[i] =
(__u32) regs->gprs[i]; save_access_regs(current->thread.acrs); - memcpy(regs32.acrs, current->thread.acrs, sizeof(regs32.acrs)); - err = __copy_to_user(&sregs->regs, &regs32, sizeof(regs32)); - if (err) - return err; - save_fp_regs(&current->thread.fp_regs); - /* s390_fp_regs and _s390_fp_regs32 are the same ! */ - return __copy_to_user(&sregs->fpregs, &current->thread.fp_regs, - sizeof(_s390_fp_regs32)); + memcpy(&user_sregs.regs.acrs, current->thread.acrs, + sizeof(user_sregs.regs.acrs)); + save_fp_ctl(&current->thread.fp_regs.fpc); + save_fp_regs(current->thread.fp_regs.fprs); + memcpy(&user_sregs.fpregs, &current->thread.fp_regs, + sizeof(user_sregs.fpregs)); + if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) + return -EFAULT; + return 0; } static int restore_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) { - _s390_regs_common32 regs32; - int err, i; + _sigregs32 user_sregs; + int i; /* Always make any pending restarted system call return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; - err = __copy_from_user(&regs32, &sregs->regs, sizeof(regs32)); - if (err) - return err; + if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) + return -EFAULT; + + /* Loading the floating-point-control word can fail. Do that first. */ + if (restore_fp_ctl(&user_sregs.fpregs.fpc)) + return -EINVAL; + + /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = PSW_MASK_MERGE(regs->psw.mask, - (__u64)regs32.psw.mask << 32); - regs->psw.addr = (__u64)(regs32.psw.addr & PSW32_ADDR_INSN); + (__u64) user_sregs.regs.psw.mask << 32); + regs->psw.addr = (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_INSN); for (i = 0; i < NUM_GPRS; i++) - regs->gprs[i] = (__u64) regs32.gprs[i]; + regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; - memcpy(current->thread.acrs, regs32.acrs, sizeof(current->thread.acrs)); + memcpy(&current->thread.acrs, user_sregs.regs.acrs, + sizeof(current->thread.acrs)); restore_access_regs(current->thread.acrs); - err = __copy_from_user(&current->thread.fp_regs, &sregs->fpregs, - sizeof(_s390_fp_regs32)); - current->thread.fp_regs.fpc &= FPC_VALID_MASK; - if (err) - return err; + memcpy(&current->thread.fp_regs, &user_sregs.fpregs, + sizeof(current->thread.fp_regs)); - restore_fp_regs(&current->thread.fp_regs); + restore_fp_regs(current->thread.fp_regs.fprs); regs->svcnr = 0; /* disable syscall checks */ return 0; } @@ -353,18 +357,18 @@ static int save_sigregs_gprs_high(struct for (i = 0; i < NUM_GPRS; i++) gprs_high[i] = regs->gprs[i] >> 32; - - return __copy_to_user(uregs, &gprs_high, sizeof(gprs_high)); + if (__copy_to_user(uregs, &gprs_high, sizeof(gprs_high))) + return -EFAULT; + return 0; } static int restore_sigregs_gprs_high(struct pt_regs *regs, __u32 __user *uregs) { __u32 gprs_high[NUM_GPRS]; - int err, i; + int i; - err = __copy_from_user(&gprs_high, uregs, sizeof(gprs_high)); - if (err) - return err; + if (__copy_from_user(&gprs_high, uregs, sizeof(gprs_high))) + return -EFAULT; for (i = 0; i < NUM_GPRS; i++) *(__u32 *)&regs->gprs[i] = gprs_high[i]; return 0; } diff -uprN linux-2.6.32/arch/s390/kernel/compat_wrapper.S linux-2.6.32.ovz/arch/s390/kernel/compat_wrapper.S --- linux-2.6.32/arch/s390/kernel/compat_wrapper.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_wrapper.S 2015-09-10 10:07:25.000000000 -0700 @@ -785,7 +785,7 @@ sys32_getresuid16_wrapper: sys32_poll_wrapper: llgtr %r2,%r2 # struct pollfd * llgfr %r3,%r3 # unsigned int - lgfr %r4,%r4 # long + lgfr %r4,%r4 # int jg sys_poll # branch to system call
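The comment fix above ("long" becomes "int") reflects the wrapper convention in this file: each 31-bit user register value must be widened with the instruction matching the argument's C type — llgtr for user space pointers (clears the high-order bit), llgfr for unsigned int, lgfr for signed int — and poll's timeout is now treated as a signed int. As a sketch of the pattern (this syscall and wrapper are invented, not part of the patch), a wrapper for long sys_foo(int fd, void __user *buf, unsigned int len) would read:

	.globl sys32_foo_wrapper
sys32_foo_wrapper:
	lgfr	%r2,%r2			# int (sign-extend)
	llgtr	%r3,%r3			# void __user * (zero-extend, clear high bit)
	llgfr	%r4,%r4			# unsigned int (zero-extend)
	jg	sys_foo			# branch to system call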
.globl compat_sys_nfsservctl_wrapper @@ -1855,3 +1855,20 @@ sys32_execve_wrapper: llgtr %r3,%r3 # compat_uptr_t * llgtr %r4,%r4 # compat_uptr_t * jg sys32_execve # branch to system call + + .globl sys_syncfs_wrapper +sys_syncfs_wrapper: + lgfr %r2,%r2 # int + jg sys_syncfs + + .globl sys_setns_wrapper +sys_setns_wrapper: + lgfr %r2,%r2 # int + lgfr %r3,%r3 # int + jg sys_setns + + .globl sys_s390_runtime_instr_wrapper +sys_s390_runtime_instr_wrapper: + lgfr %r2,%r2 # int + lgfr %r3,%r3 # int + jg sys_s390_runtime_instr diff -uprN linux-2.6.32/arch/s390/kernel/crash_dump.c linux-2.6.32.ovz/arch/s390/kernel/crash_dump.c --- linux-2.6.32/arch/s390/kernel/crash_dump.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/crash_dump.c 2015-09-10 10:08:00.000000000 -0700 @@ -0,0 +1,460 @@ +/* + * S390 kdump implementation + * + * Copyright IBM Corp. 2011 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) +#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) +#define PTR_DIFF(x, y) ((unsigned long)(((char *) (x)) - ((unsigned long) (y)))) + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +EXPORT_SYMBOL(elfcorehdr_addr); + +/* + * stores the size of elf header of crash image + */ +unsigned long long elfcorehdr_size; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + * + * Syntax: elfcorehdr=[size[KMG]@]offset[KMG] + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + if (*end == '@') { + elfcorehdr_size = elfcorehdr_addr; + elfcorehdr_addr = memparse(end + 1, &end); + } + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); + +/* + * Copy one page from "oldmem" + * + * For the kdump reserved memory this function performs a swap operation: + * - [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] is mapped to [0 - OLDMEM_SIZE].
+ * - [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + unsigned long src; + + if (!csize) + return 0; + + src = (pfn << PAGE_SHIFT) + offset; + if (src < OLDMEM_SIZE) + src += OLDMEM_BASE; + else if (src > OLDMEM_BASE && + src < OLDMEM_BASE + OLDMEM_SIZE) + src -= OLDMEM_BASE; + if (userbuf) + copy_to_user_real((void __user *) buf, (void *) src, csize); + else + memcpy_real(buf, (void *) src, csize); + return csize; +} + +/* + * Copy memory from old kernel + */ +static int copy_from_oldmem(void *dest, void *src, size_t count) +{ + unsigned long copied = 0; + int rc; + + if ((unsigned long) src < OLDMEM_SIZE) { + copied = min(count, OLDMEM_SIZE - (unsigned long) src); + rc = memcpy_real(dest, src + OLDMEM_BASE, copied); + if (rc) + return rc; + } + return memcpy_real(dest + copied, src + copied, count - copied); +} + +/* + * Alloc memory and panic in case of ENOMEM + */ +static void *kzalloc_panic(int len) +{ + void *rc; + + rc = kzalloc(len, GFP_KERNEL); + if (!rc) + panic("s390 kdump kzalloc (%d) failed", len); + return rc; +} + +/* + * Get memory layout and create hole for oldmem + */ +static struct mem_chunk *get_memory_layout(void) +{ + struct mem_chunk *chunk_array; + + chunk_array = kzalloc_panic(MEMORY_CHUNKS * sizeof(struct mem_chunk)); + detect_memory_layout(chunk_array); + create_mem_hole(chunk_array, OLDMEM_BASE, OLDMEM_SIZE, CHUNK_CRASHK); + return chunk_array; +} + +/* + * Initialize ELF note + */ +static void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len, + const char *name) +{ + Elf64_Nhdr *note; + u64 len; + + note = (Elf64_Nhdr *)buf; + note->n_namesz = strlen(name) + 1; + note->n_descsz = d_len; + note->n_type = type; + len = sizeof(Elf64_Nhdr); + + memcpy(buf + len, name, note->n_namesz); + len = roundup(len + note->n_namesz, 4); + + memcpy(buf + len, desc, note->n_descsz); + len = roundup(len + note->n_descsz, 4); + + return PTR_ADD(buf, len); +} + +/* + * Initialize prstatus note + */ +static void *nt_prstatus(void *ptr, struct save_area_s390x *sa) +{ + struct elf_prstatus nt_prstatus; + static int cpu_nr = 1; + + memset(&nt_prstatus, 0, sizeof(nt_prstatus)); + memcpy(&nt_prstatus.pr_reg.gprs, sa->gp_regs, sizeof(sa->gp_regs)); + memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); + memcpy(&nt_prstatus.pr_reg.acrs, sa->acc_regs, sizeof(sa->acc_regs)); + nt_prstatus.pr_pid = cpu_nr; + cpu_nr++; + + return nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus), + "CORE"); +} + +/* + * Initialize fpregset (floating point) note + */ +static void *nt_fpregset(void *ptr, struct save_area_s390x *sa) +{ + elf_fpregset_t nt_fpregset; + + memset(&nt_fpregset, 0, sizeof(nt_fpregset)); + memcpy(&nt_fpregset.fpc, &sa->fp_ctrl_reg, sizeof(sa->fp_ctrl_reg)); + memcpy(&nt_fpregset.fprs, &sa->fp_regs, sizeof(sa->fp_regs)); + + return nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset), + "CORE"); +} + +/* + * Initialize timer note + */ +static void *nt_s390_timer(void *ptr, struct save_area_s390x *sa) +{ + return nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer), + KEXEC_CORE_NOTE_NAME); +} + +/* + * Initialize TOD clock comparator note + */ +static void *nt_s390_tod_cmp(void *ptr, struct save_area_s390x *sa) +{ + return nt_init(ptr, NT_S390_TODCMP, &sa->clk_cmp, + sizeof(sa->clk_cmp), KEXEC_CORE_NOTE_NAME); +} + +/* + * Initialize TOD programmable register note + */ +static void *nt_s390_tod_preg(void *ptr, 
+
+/*
+ * Initialize TOD programmable register note
+ */
+static void *nt_s390_tod_preg(void *ptr, struct save_area_s390x *sa)
+{
+	return nt_init(ptr, NT_S390_TODPREG, &sa->tod_reg,
+		       sizeof(sa->tod_reg), KEXEC_CORE_NOTE_NAME);
+}
+
+/*
+ * Initialize control register note
+ */
+static void *nt_s390_ctrs(void *ptr, struct save_area_s390x *sa)
+{
+	return nt_init(ptr, NT_S390_CTRS, &sa->ctrl_regs,
+		       sizeof(sa->ctrl_regs), KEXEC_CORE_NOTE_NAME);
+}
+
+/*
+ * Initialize prefix register note
+ */
+static void *nt_s390_prefix(void *ptr, struct save_area_s390x *sa)
+{
+	return nt_init(ptr, NT_S390_PREFIX, &sa->pref_reg,
+		       sizeof(sa->pref_reg), KEXEC_CORE_NOTE_NAME);
+}
+
+/*
+ * Fill ELF notes for one CPU with save area registers
+ */
+void *fill_cpu_elf_notes(void *ptr, struct save_area_s390x *sa)
+{
+	ptr = nt_prstatus(ptr, sa);
+	ptr = nt_fpregset(ptr, sa);
+	ptr = nt_s390_timer(ptr, sa);
+	ptr = nt_s390_tod_cmp(ptr, sa);
+	ptr = nt_s390_tod_preg(ptr, sa);
+	ptr = nt_s390_ctrs(ptr, sa);
+	ptr = nt_s390_prefix(ptr, sa);
+	return ptr;
+}
+
+/*
+ * Initialize prpsinfo note (new kernel)
+ */
+static void *nt_prpsinfo(void *ptr)
+{
+	struct elf_prpsinfo prpsinfo;
+
+	memset(&prpsinfo, 0, sizeof(prpsinfo));
+	prpsinfo.pr_sname = 'R';
+	strcpy(prpsinfo.pr_fname, "vmlinux");
+	return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo),
+		       KEXEC_CORE_NOTE_NAME);
+}
+
+/*
+ * Initialize vmcoreinfo note (new kernel)
+ */
+static void *nt_vmcoreinfo(void *ptr)
+{
+	char nt_name[11], *vmcoreinfo;
+	Elf64_Nhdr note;
+	void *addr;
+
+	if (copy_from_oldmem(&addr, &S390_lowcore.vmcore_info, sizeof(addr)))
+		return ptr;
+	memset(nt_name, 0, sizeof(nt_name));
+	if (copy_from_oldmem(&note, addr, sizeof(note)))
+		return ptr;
+	if (copy_from_oldmem(nt_name, addr + sizeof(note), sizeof(nt_name) - 1))
+		return ptr;
+	if (strcmp(nt_name, "VMCOREINFO") != 0)
+		return ptr;
+	vmcoreinfo = kzalloc_panic(note.n_descsz + 1);
+	/*
+	 * The 12 byte note header plus the name "VMCOREINFO\0" padded
+	 * to 4 bytes put the note descriptor at offset 24.
+	 */
+	if (copy_from_oldmem(vmcoreinfo, addr + 24, note.n_descsz))
+		return ptr;
+	vmcoreinfo[note.n_descsz] = 0;
+	return nt_init(ptr, 0, vmcoreinfo, note.n_descsz, "VMCOREINFO");
+}
+
+/*
+ * Initialize ELF header (new kernel)
+ */
+static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt)
+{
+	memset(ehdr, 0, sizeof(*ehdr));
+	memcpy(ehdr->e_ident, ELFMAG, SELFMAG);
+	ehdr->e_ident[EI_CLASS] = ELFCLASS64;
+	ehdr->e_ident[EI_DATA] = ELFDATA2MSB;
+	ehdr->e_ident[EI_VERSION] = EV_CURRENT;
+	memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD);
+	ehdr->e_type = ET_CORE;
+	ehdr->e_machine = EM_S390;
+	ehdr->e_version = EV_CURRENT;
+	ehdr->e_phoff = sizeof(Elf64_Ehdr);
+	ehdr->e_ehsize = sizeof(Elf64_Ehdr);
+	ehdr->e_phentsize = sizeof(Elf64_Phdr);
+	ehdr->e_phnum = mem_chunk_cnt + 1;
+	return ehdr + 1;
+}
+
+/*
+ * Return CPU count for ELF header (new kernel)
+ */
+static int get_cpu_cnt(void)
+{
+	int i, cpus = 0;
+
+	for (i = 0; zfcpdump_save_areas[i]; i++) {
+		if (zfcpdump_save_areas[i]->s390x.pref_reg == 0)
+			continue;
+		cpus++;
+	}
+	return cpus;
+}
+
+/*
+ * Return memory chunk count for ELF header (new kernel)
+ */
+static int get_mem_chunk_cnt(void)
+{
+	struct mem_chunk *chunk_array, *mem_chunk;
+	int i, cnt = 0;
+
+	chunk_array = get_memory_layout();
+	for (i = 0; i < MEMORY_CHUNKS; i++) {
+		mem_chunk = &chunk_array[i];
+		if (chunk_array[i].type != CHUNK_READ_WRITE &&
+		    chunk_array[i].type != CHUNK_READ_ONLY)
+			continue;
+		if (mem_chunk->size == 0)
+			continue;
+		cnt++;
+	}
+	kfree(chunk_array);
+	return cnt;
+}
+
+/*
+ * Relocate pointer in order to allow vmcore code to access the data
+ */
+static inline unsigned long relocate(unsigned long addr)
+{
+	return OLDMEM_BASE + addr;
+}
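relocate() shifts every address in the new kernel's ELF header buffer by OLDMEM_BASE, so that when the vmcore layer later reads those file offsets through copy_oldmem_page(), the reverse swap lands back in the buffer the new kernel built. A minimal standalone model of that round trip (the OLDMEM_BASE/OLDMEM_SIZE values here are illustrative assumptions, not taken from the patch):

#include <assert.h>

/* Illustrative values only -- the real ones come from the crashkernel
 * reservation; this just models the swap done by copy_oldmem_page(). */
#define OLDMEM_BASE 0x10000000UL	/* 256 MiB */
#define OLDMEM_SIZE 0x08000000UL	/* 128 MiB */

/* Address translation performed by copy_oldmem_page() above. */
static unsigned long oldmem_src(unsigned long src)
{
	if (src < OLDMEM_SIZE)
		return src + OLDMEM_BASE;	/* old low memory was swapped up */
	if (src > OLDMEM_BASE && src < OLDMEM_BASE + OLDMEM_SIZE)
		return src - OLDMEM_BASE;	/* crashkernel area was swapped down */
	return src;				/* everything else is read in place */
}

int main(void)
{
	/* relocate(hdr) = OLDMEM_BASE + hdr: a vmcore read at that offset
	 * is swapped back to hdr, i.e. the header built by the new kernel. */
	unsigned long hdr = 0x2000;
	assert(oldmem_src(OLDMEM_BASE + hdr) == hdr);
	/* The old kernel's low memory is fetched from the reserved region. */
	assert(oldmem_src(0x1000) == OLDMEM_BASE + 0x1000);
	return 0;
}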
+
+/*
+ * Initialize ELF loads (new kernel)
+ */
+static int loads_init(Elf64_Phdr *phdr, u64 loads_offset)
+{
+	struct mem_chunk *chunk_array, *mem_chunk;
+	int i;
+
+	chunk_array = get_memory_layout();
+	for (i = 0; i < MEMORY_CHUNKS; i++) {
+		mem_chunk = &chunk_array[i];
+		if (mem_chunk->size == 0)
+			break;
+		if (chunk_array[i].type != CHUNK_READ_WRITE &&
+		    chunk_array[i].type != CHUNK_READ_ONLY)
+			continue;
+		phdr->p_filesz = mem_chunk->size;
+		phdr->p_type = PT_LOAD;
+		phdr->p_offset = mem_chunk->addr;
+		phdr->p_vaddr = mem_chunk->addr;
+		phdr->p_paddr = mem_chunk->addr;
+		phdr->p_memsz = mem_chunk->size;
+		phdr->p_flags = PF_R | PF_W | PF_X;
+		phdr->p_align = PAGE_SIZE;
+		phdr++;
+	}
+	kfree(chunk_array);
+	return i;
+}
+
+/*
+ * Initialize notes (new kernel)
+ */
+static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset)
+{
+	struct save_area_s390x *sa;
+	void *ptr_start = ptr;
+	int i;
+
+	ptr = nt_prpsinfo(ptr);
+
+	for (i = 0; zfcpdump_save_areas[i]; i++) {
+		sa = &zfcpdump_save_areas[i]->s390x;
+		if (sa->pref_reg == 0)
+			continue;
+		ptr = fill_cpu_elf_notes(ptr, sa);
+	}
+	ptr = nt_vmcoreinfo(ptr);
+	memset(phdr, 0, sizeof(*phdr));
+	phdr->p_type = PT_NOTE;
+	phdr->p_offset = relocate(notes_offset);
+	phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start);
+	phdr->p_memsz = phdr->p_filesz;
+	return ptr;
+}
+
+/*
+ * Create ELF core header (new kernel)
+ */
+static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz)
+{
+	Elf64_Phdr *phdr_notes, *phdr_loads;
+	int mem_chunk_cnt;
+	void *ptr, *hdr;
+	u32 alloc_size;
+	u64 hdr_off;
+
+	mem_chunk_cnt = get_mem_chunk_cnt();
+
+	alloc_size = 0x1000 + get_cpu_cnt() * 0x300 +
+		     mem_chunk_cnt * sizeof(Elf64_Phdr);
+	hdr = kzalloc_panic(alloc_size);
+	/* Init elf header */
+	ptr = ehdr_init(hdr, mem_chunk_cnt);
+	/* Init program headers */
+	phdr_notes = ptr;
+	ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr));
+	phdr_loads = ptr;
+	ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt);
+	/* Init notes */
+	hdr_off = PTR_DIFF(ptr, hdr);
+	ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off);
+	/* Init loads */
+	hdr_off = PTR_DIFF(ptr, hdr);
+	loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off);
+	*elfcorebuf_sz = hdr_off;
+	*elfcorebuf = (void *) relocate((unsigned long) hdr);
+	BUG_ON(*elfcorebuf_sz > alloc_size);
+}
+
+/*
+ * Create kdump ELF core header in new kernel, if it has not been passed via
+ * the "elfcorehdr" kernel parameter
+ */
+static int setup_kdump_elfcorehdr(void)
+{
+	size_t elfcorebuf_sz;
+	char *elfcorebuf;
+
+	if (!OLDMEM_BASE || is_kdump_kernel())
+		return -EINVAL;
+	s390_elf_corehdr_create(&elfcorebuf, &elfcorebuf_sz);
+	elfcorehdr_addr = (unsigned long long) elfcorebuf;
+	elfcorehdr_size = elfcorebuf_sz;
+	return 0;
+}
+
+subsys_initcall(setup_kdump_elfcorehdr);
diff -uprN linux-2.6.32/arch/s390/kernel/debug.c linux-2.6.32.ovz/arch/s390/kernel/debug.c
--- linux-2.6.32/arch/s390/kernel/debug.c	2009-12-02 19:51:21.000000000 -0800
+++ linux-2.6.32.ovz/arch/s390/kernel/debug.c	2015-09-10 10:06:59.000000000 -0700
@@ -2,8 +2,8 @@
  * arch/s390/kernel/debug.c
  *   S/390 debug facility
  *
- *    Copyright (C) 1999, 2000 IBM Deutschland Entwicklung GmbH,
- *    IBM Corporation
+ *    Copyright IBM Corp. 1999, 2012
+ *
  *    Author(s): Michael Holzheu (holzheu@de.ibm.com),
  *               Holge