From d0b2f91bede3bd5e3d24dd6803e56eee959c1797 Mon Sep 17 00:00:00 2001
From: André Fabian Silva Delgado
Date: Thu, 20 Oct 2016 00:10:27 -0300
Subject: Linux-libre 4.8.2-gnu

---
 arch/x86/kernel/acpi/boot.c | 20 +-
 arch/x86/kernel/acpi/cstate.c | 2 +-
 arch/x86/kernel/amd_gart_64.c | 21 +-
 arch/x86/kernel/amd_nb.c | 39 +--
 arch/x86/kernel/apb_timer.c | 29 +-
 arch/x86/kernel/apic/apic.c | 61 +++-
 arch/x86/kernel/apic/apic_flat_64.c | 4 +-
 arch/x86/kernel/apic/apic_noop.c | 2 -
 arch/x86/kernel/apic/apic_numachip.c | 2 -
 arch/x86/kernel/apic/bigsmp_32.c | 1 -
 arch/x86/kernel/apic/hw_nmi.c | 2 +-
 arch/x86/kernel/apic/io_apic.c | 29 +-
 arch/x86/kernel/apic/ipi.c | 1 -
 arch/x86/kernel/apic/probe_32.c | 5 +-
 arch/x86/kernel/apic/probe_64.c | 3 +-
 arch/x86/kernel/apic/vector.c | 25 +-
 arch/x86/kernel/apic/x2apic_cluster.c | 86 ++---
 arch/x86/kernel/apic/x2apic_phys.c | 1 -
 arch/x86/kernel/apic/x2apic_uv_x.c | 49 ++-
 arch/x86/kernel/asm-offsets.c | 4 +-
 arch/x86/kernel/cpu/common.c | 27 +-
 arch/x86/kernel/cpu/hypervisor.c | 3 +-
 arch/x86/kernel/cpu/intel.c | 14 +-
 arch/x86/kernel/cpu/match.c | 2 +-
 arch/x86/kernel/cpu/mcheck/mce-apei.c | 2 +-
 arch/x86/kernel/cpu/mcheck/mce.c | 6 +-
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +-
 arch/x86/kernel/cpu/microcode/amd.c | 49 ++-
 arch/x86/kernel/cpu/microcode/core.c | 7 +-
 arch/x86/kernel/cpu/microcode/intel.c | 261 ++++++++------
 arch/x86/kernel/cpu/microcode/intel_lib.c | 2 -
 arch/x86/kernel/cpu/mshyperv.c | 3 +-
 arch/x86/kernel/cpu/mtrr/cleanup.c | 1 -
 arch/x86/kernel/cpu/mtrr/generic.c | 2 +-
 arch/x86/kernel/cpu/mtrr/if.c | 1 -
 arch/x86/kernel/cpu/mtrr/main.c | 2 +-
 arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +-
 arch/x86/kernel/cpu/rdrand.c | 4 +-
 arch/x86/kernel/cpu/vmware.c | 3 +-
 arch/x86/kernel/crash.c | 2 +-
 arch/x86/kernel/dumpstack.c | 25 +-
 arch/x86/kernel/dumpstack_32.c | 6 +-
 arch/x86/kernel/dumpstack_64.c | 18 +-
 arch/x86/kernel/e820.c | 14 +-
 arch/x86/kernel/early-quirks.c | 404 +++++++++++-----------
 arch/x86/kernel/ebda.c | 114 ++++---
 arch/x86/kernel/espfix_64.c | 3 -
 arch/x86/kernel/fpu/core.c | 33 +-
 arch/x86/kernel/fpu/init.c | 34 +-
 arch/x86/kernel/fpu/regset.c | 52 ++-
 arch/x86/kernel/fpu/signal.c | 58 +++-
 arch/x86/kernel/fpu/xstate.c | 547 +++++++++++++++++++-----------
 arch/x86/kernel/head32.c | 2 -
 arch/x86/kernel/head64.c | 1 -
 arch/x86/kernel/head_64.S | 3 +-
 arch/x86/kernel/hpet.c | 72 ++--
 arch/x86/kernel/hw_breakpoint.c | 3 +-
 arch/x86/kernel/i386_ksyms_32.c | 5 +-
 arch/x86/kernel/i8253.c | 2 +-
 arch/x86/kernel/io_delay.c | 2 +-
 arch/x86/kernel/irq.c | 3 +-
 arch/x86/kernel/irq_32.c | 1 -
 arch/x86/kernel/irq_64.c | 1 -
 arch/x86/kernel/kdebugfs.c | 2 +-
 arch/x86/kernel/kvm.c | 4 +-
 arch/x86/kernel/kvmclock.c | 1 +
 arch/x86/kernel/mpparse.c | 1 -
 arch/x86/kernel/nmi.c | 1 +
 arch/x86/kernel/paravirt-spinlocks.c | 2 +-
 arch/x86/kernel/paravirt.c | 3 +-
 arch/x86/kernel/pci-calgary_64.c | 14 +-
 arch/x86/kernel/pci-dma.c | 4 +-
 arch/x86/kernel/pci-nommu.c | 4 +-
 arch/x86/kernel/pci-swiotlb.c | 6 +-
 arch/x86/kernel/platform-quirks.c | 4 +-
 arch/x86/kernel/pmem.c | 2 +-
 arch/x86/kernel/process.c | 5 +-
 arch/x86/kernel/process_32.c | 2 +-
 arch/x86/kernel/process_64.c | 15 +-
 arch/x86/kernel/ptrace.c | 19 +-
 arch/x86/kernel/pvclock.c | 17 +-
 arch/x86/kernel/reboot.c | 23 +-
 arch/x86/kernel/rtc.c | 3 +-
 arch/x86/kernel/setup.c | 35 +-
 arch/x86/kernel/setup_percpu.c | 3 +
 arch/x86/kernel/signal.c | 40 ++-
 arch/x86/kernel/signal_compat.c | 108 ++++++
 arch/x86/kernel/smpboot.c | 83 +++--
 arch/x86/kernel/stacktrace.c | 2 +-
arch/x86/kernel/tboot.c | 25 +- arch/x86/kernel/test_rodata.c | 5 - arch/x86/kernel/traps.c | 2 +- arch/x86/kernel/tsc.c | 111 ++++-- arch/x86/kernel/tsc_msr.c | 68 ++-- arch/x86/kernel/vm86_32.c | 5 +- arch/x86/kernel/x8664_ksyms_64.c | 6 +- arch/x86/kernel/x86_init.c | 3 +- 97 files changed, 1681 insertions(+), 1131 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9414f8458..fbd194444 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -161,13 +161,15 @@ static int __init acpi_parse_madt(struct acpi_table_header *table) /** * acpi_register_lapic - register a local apic and generates a logic cpu number * @id: local apic id to register + * @acpiid: ACPI id to register * @enabled: this cpu is enabled or not * * Returns the logic cpu number which maps to the local apic */ -static int acpi_register_lapic(int id, u8 enabled) +static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) { unsigned int ver = 0; + int cpu; if (id >= MAX_LOCAL_APIC) { printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); @@ -180,9 +182,13 @@ static int acpi_register_lapic(int id, u8 enabled) } if (boot_cpu_physical_apicid != -1U) - ver = apic_version[boot_cpu_physical_apicid]; + ver = boot_cpu_apic_version; + + cpu = generic_processor_info(id, ver); + if (cpu >= 0) + early_per_cpu(x86_cpu_to_acpiid, cpu) = acpiid; - return generic_processor_info(id, ver); + return cpu; } static int __init @@ -212,7 +218,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) if (!apic->apic_id_valid(apic_id) && enabled) printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); else - acpi_register_lapic(apic_id, enabled); + acpi_register_lapic(apic_id, processor->uid, enabled); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif @@ -240,6 +246,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, const unsigned long end) * when we use CPU hotplug. 
*/ acpi_register_lapic(processor->id, /* APIC ID */ + processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); return 0; @@ -258,6 +265,7 @@ acpi_parse_sapic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); acpi_register_lapic((processor->id << 8) | processor->eid,/* APIC ID */ + processor->processor_id, /* ACPI ID */ processor->lapic_flags & ACPI_MADT_ENABLED); return 0; @@ -714,7 +722,7 @@ int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu) { int cpu; - cpu = acpi_register_lapic(physid, ACPI_MADT_ENABLED); + cpu = acpi_register_lapic(physid, U32_MAX, ACPI_MADT_ENABLED); if (cpu < 0) { pr_info(PREFIX "Unable to map lapic to logical cpu number\n"); return cpu; diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 4b28159e0..bdfad6421 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -5,7 +5,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 8e3842fc8..63ff468a7 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -242,7 +241,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, static dma_addr_t gart_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long bus; phys_addr_t paddr = page_to_phys(page) + offset; @@ -264,7 +263,7 @@ static dma_addr_t gart_map_page(struct device *dev, struct page *page, */ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long iommu_page; int npages; @@ -286,7 +285,7 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, * Wrapper for pci_unmap_single working with scatterlists. */ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s; int i; @@ -294,7 +293,7 @@ static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, for_each_sg(sg, s, nents, i) { if (!s->dma_length || !s->length) break; - gart_unmap_page(dev, s->dma_address, s->dma_length, dir, NULL); + gart_unmap_page(dev, s->dma_address, s->dma_length, dir, 0); } } @@ -316,7 +315,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_addr) { if (i > 0) - gart_unmap_sg(dev, sg, i, dir, NULL); + gart_unmap_sg(dev, sg, i, dir, 0); nents = 0; sg[0].dma_length = 0; break; @@ -387,7 +386,7 @@ dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, * Merge chunks that have page aligned sizes into a continuous mapping. 
*/ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, - enum dma_data_direction dir, struct dma_attrs *attrs) + enum dma_data_direction dir, unsigned long attrs) { struct scatterlist *s, *ps, *start_sg, *sgmap; int need = 0, nextneed, i, out, start; @@ -457,7 +456,7 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, error: flush_gart(); - gart_unmap_sg(dev, sg, out, dir, NULL); + gart_unmap_sg(dev, sg, out, dir, 0); /* When it was forced or merged try again in a dumb way */ if (force_iommu || iommu_merge) { @@ -477,7 +476,7 @@ error: /* allocate and map a coherent mapping */ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, - gfp_t flag, struct dma_attrs *attrs) + gfp_t flag, unsigned long attrs) { dma_addr_t paddr; unsigned long align_mask; @@ -509,9 +508,9 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, /* free a coherent mapping */ static void gart_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) + dma_addr_t dma_addr, unsigned long attrs) { - gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); + gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, 0); dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index e991d5c8b..4fdf6230d 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -219,24 +219,22 @@ int amd_set_subcaches(int cpu, unsigned long mask) return 0; } -static int amd_cache_gart(void) +static void amd_cache_gart(void) { u16 i; - if (!amd_nb_has_feature(AMD_NB_GART)) - return 0; - - flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - amd_northbridges.flags &= ~AMD_NB_GART; - return -ENOMEM; - } + if (!amd_nb_has_feature(AMD_NB_GART)) + return; - for (i = 0; i != amd_nb_num(); i++) - pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, - &flush_words[i]); + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + pr_notice("Cannot initialize GART flush words, GART support disabled\n"); + return; + } - return 0; + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); } void amd_flush_garts(void) @@ -278,17 +276,10 @@ EXPORT_SYMBOL_GPL(amd_flush_garts); static __init int init_amd_nbs(void) { - int err = 0; + amd_cache_northbridges(); + amd_cache_gart(); - err = amd_cache_northbridges(); - - if (err < 0) - pr_notice("Cannot enumerate AMD northbridges\n"); - - if (amd_cache_gart() < 0) - pr_notice("Cannot initialize GART flush words, GART support disabled\n"); - - return err; + return 0; } /* This has to go after the PCI subsystem */ diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index cefacbad1..456316f6c 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -215,26 +215,18 @@ void apbt_setup_secondary_clock(void) * cpu timers during the offline process due to the ordering of notification. * the extra interrupt is harmless. 
*/ -static int apbt_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) +static int apbt_cpu_dead(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DEAD: - dw_apb_clockevent_pause(adev->timer); - if (system_state == SYSTEM_RUNNING) { - pr_debug("skipping APBT CPU %lu offline\n", cpu); - } else { - pr_debug("APBT clockevent for cpu %lu offline\n", cpu); - dw_apb_clockevent_stop(adev->timer); - } - break; - default: - pr_debug("APBT notified %lu, no action\n", action); + dw_apb_clockevent_pause(adev->timer); + if (system_state == SYSTEM_RUNNING) { + pr_debug("skipping APBT CPU %u offline\n", cpu); + } else { + pr_debug("APBT clockevent for cpu %u offline\n", cpu); + dw_apb_clockevent_stop(adev->timer); } - return NOTIFY_OK; + return 0; } static __init int apbt_late_init(void) @@ -242,9 +234,8 @@ static __init int apbt_late_init(void) if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT || !apb_timer_block_enabled) return 0; - /* This notifier should be called after workqueue is ready */ - hotcpu_notifier(apbt_cpuhp_notify, -20); - return 0; + return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "X86_APB_DEAD", NULL, + apbt_cpu_dead); } fs_initcall(apbt_late_init); #else diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b15e1c158..076c315cd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -64,6 +64,8 @@ unsigned disabled_cpus; unsigned int boot_cpu_physical_apicid = -1U; EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); +u8 boot_cpu_apic_version; + /* * The highest APIC ID seen during enumeration. */ @@ -92,8 +94,10 @@ static int apic_extnmi = APIC_EXTNMI_BSP; */ DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid, BAD_APICID); DEFINE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid, BAD_APICID); +DEFINE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid, U32_MAX); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); +EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_acpiid); #ifdef CONFIG_X86_32 @@ -145,7 +149,7 @@ static int force_enable_local_apic __initdata; */ static int __init parse_lapic(char *arg) { - if (config_enabled(CONFIG_X86_32) && !arg) + if (IS_ENABLED(CONFIG_X86_32) && !arg) force_enable_local_apic = 1; else if (arg && !strncmp(arg, "notscdeadline", 13)) setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); @@ -311,7 +315,7 @@ int lapic_get_maxlvt(void) /* Clock divisor */ #define APIC_DIVISOR 16 -#define TSC_DIVISOR 32 +#define TSC_DIVISOR 8 /* * This function sets up the local APIC timer, with a timeout of @@ -563,12 +567,36 @@ static void setup_APIC_timer(void) CLOCK_EVT_FEAT_DUMMY); levt->set_next_event = lapic_next_deadline; clockevents_config_and_register(levt, - (tsc_khz / TSC_DIVISOR) * 1000, + tsc_khz * (1000 / TSC_DIVISOR), 0xF, ~0UL); } else clockevents_register_device(levt); } +/* + * Install the updated TSC frequency from recalibration at the TSC + * deadline clockevent devices. + */ +static void __lapic_update_tsc_freq(void *info) +{ + struct clock_event_device *levt = this_cpu_ptr(&lapic_events); + + if (!this_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return; + + clockevents_update_freq(levt, tsc_khz * (1000 / TSC_DIVISOR)); +} + +void lapic_update_tsc_freq(void) +{ + /* + * The clockevent device's ->mult and ->shift can both be + * changed. 
In order to avoid races, schedule the frequency + * update code on each CPU. + */ + on_each_cpu(__lapic_update_tsc_freq, NULL, 0); +} + /* * In this functions we calibrate APIC bus clocks to the external timer. * @@ -1790,8 +1818,7 @@ void __init init_apic_mappings(void) * since smp_sanity_check is prepared for such a case * and disable smp mode */ - apic_version[new_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } @@ -1806,13 +1833,10 @@ void __init register_lapic_address(unsigned long address) } if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); - apic_version[boot_cpu_physical_apicid] = - GET_APIC_VERSION(apic_read(APIC_LVR)); + boot_cpu_apic_version = GET_APIC_VERSION(apic_read(APIC_LVR)); } } -int apic_version[MAX_LOCAL_APIC]; - /* * Local APIC interrupts */ @@ -2048,7 +2072,7 @@ int generic_processor_info(int apicid, int version) int thiscpu = max + disabled_cpus - 1; pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i almost" + "APIC: NR_CPUS/possible_cpus limit of %i almost" " reached. Keeping one slot for boot cpu." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); @@ -2060,14 +2084,13 @@ int generic_processor_info(int apicid, int version) int thiscpu = max + disabled_cpus; pr_warning( - "ACPI: NR_CPUS/possible_cpus limit of %i reached." + "APIC: NR_CPUS/possible_cpus limit of %i reached." " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); disabled_cpus++; return -EINVAL; } - num_processors++; if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed @@ -2088,12 +2111,15 @@ int generic_processor_info(int apicid, int version) if (topology_update_package_map(apicid, cpu) < 0) { int thiscpu = max + disabled_cpus; - pr_warning("ACPI: Package limit reached. Processor %d/0x%x ignored.\n", + pr_warning("APIC: Package limit reached. Processor %d/0x%x ignored.\n", thiscpu, apicid); + disabled_cpus++; return -ENOSPC; } + num_processors++; + /* * Validate version */ @@ -2102,11 +2128,10 @@ int generic_processor_info(int apicid, int version) cpu, apicid); version = 0x10; } - apic_version[apicid] = version; - if (version != apic_version[boot_cpu_physical_apicid]) { + if (version != boot_cpu_apic_version) { pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); + boot_cpu_apic_version, cpu, version); } physid_set(apicid, phys_cpu_present_map); @@ -2249,7 +2274,7 @@ int __init APIC_init_uniprocessor(void) * Complain if the BIOS pretends there is one. 
*/ if (!boot_cpu_has(X86_FEATURE_APIC) && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + APIC_INTEGRATED(boot_cpu_apic_version)) { pr_err("BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); return -1; diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 76f89e2b2..5b2ae106b 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -181,7 +181,6 @@ static struct apic apic_flat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFu << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, @@ -278,7 +277,6 @@ static struct apic apic_physflat = { .get_apic_id = flat_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFu << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index 13d19ed58..c05688b2d 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -11,7 +11,6 @@ #include #include -#include #include #include #include @@ -141,7 +140,6 @@ struct apic apic_noop = { .get_apic_id = noop_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index ab5c2c685..714d4fda0 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -269,7 +269,6 @@ static const struct apic apic_numachip1 __refconst = { .get_apic_id = numachip1_get_apic_id, .set_apic_id = numachip1_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, @@ -321,7 +320,6 @@ static const struct apic apic_numachip2 __refconst = { .get_apic_id = numachip2_get_apic_id, .set_apic_id = numachip2_set_apic_id, - .apic_id_mask = 0xffU << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index cf9bd896c..06dbaa458 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -171,7 +171,6 @@ static struct apic apic_bigsmp = { .get_apic_id = bigsmp_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0xFF << 24, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 7788ce643..f29501e1a 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_HARDLOCKUP_DETECTOR diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 446702ed9..48e6d84f1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include @@ -981,7 +981,7 @@ static int alloc_irq_from_domain(struct irq_domain *domain, int ioapic, u32 gsi, return __irq_domain_alloc_irqs(domain, irq, 1, ioapic_alloc_attr_node(info), - info, legacy); + info, legacy, NULL); } /* @@ -1014,7 +1014,8 @@ static int alloc_isa_irq_from_domain(struct irq_domain *domain, info->ioapic_pin)) return -ENOMEM; } else { - irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true); + irq = __irq_domain_alloc_irqs(domain, irq, 1, node, info, true, + NULL); if (irq >= 0) { 
irq_data = irq_domain_get_irq_data(domain, irq); data = irq_data->chip_data; @@ -1592,7 +1593,7 @@ void __init setup_ioapic_ids_from_mpc(void) * no meaning without the serial APIC bus. */ if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + || APIC_XAPIC(boot_cpu_apic_version)) return; setup_ioapic_ids_from_mpc_nocheck(); } @@ -2422,7 +2423,7 @@ static int io_apic_get_unique_id(int ioapic, int apic_id) static u8 io_apic_unique_id(int idx, u8 id) { if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - !APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + !APIC_XAPIC(boot_cpu_apic_version)) return io_apic_get_unique_id(idx, id); else return id; @@ -2567,29 +2568,25 @@ static struct resource * __init ioapic_setup_resources(void) unsigned long n; struct resource *res; char *mem; - int i, num = 0; + int i; - for_each_ioapic(i) - num++; - if (num == 0) + if (nr_ioapics == 0) return NULL; n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= num; + n *= nr_ioapics; mem = alloc_bootmem(n); res = (void *)mem; - mem += sizeof(struct resource) * num; + mem += sizeof(struct resource) * nr_ioapics; - num = 0; for_each_ioapic(i) { - res[num].name = mem; - res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; - ioapics[i].iomem_res = &res[num]; - num++; + ioapics[i].iomem_res = &res[i]; } ioapic_resources = res; diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c index 2a0f225af..3a205d4a1 100644 --- a/arch/x86/kernel/apic/ipi.c +++ b/arch/x86/kernel/apic/ipi.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index f316e34ab..563096267 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -8,7 +8,7 @@ */ #include #include -#include +#include #include #include #include @@ -101,7 +101,6 @@ static struct apic apic_default = { .get_apic_id = default_get_apic_id, .set_apic_id = NULL, - .apic_id_mask = 0x0F << 24, .cpu_mask_to_apicid_and = flat_cpu_mask_to_apicid_and, @@ -153,7 +152,7 @@ early_param("apic", parse_apic); void __init default_setup_apic_routing(void) { - int version = apic_version[boot_cpu_physical_apicid]; + int version = boot_cpu_apic_version; if (num_possible_cpus() > 8) { switch (boot_cpu_data.x86_vendor) { diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 1793dba7a..c303054b9 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -11,10 +11,9 @@ #include #include #include -#include +#include #include #include -#include #include #include diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a5e400afc..5d30c5e42 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -523,7 +523,7 @@ static int apic_set_affinity(struct irq_data *irq_data, struct apic_chip_data *data = irq_data->chip_data; int err, irq = irq_data->irq; - if (!config_enabled(CONFIG_SMP)) + if (!IS_ENABLED(CONFIG_SMP)) return -EPERM; if (!cpumask_intersects(dest, cpu_online_mask)) @@ -661,11 +661,28 @@ void irq_complete_move(struct irq_cfg *cfg) */ void irq_force_complete_move(struct irq_desc *desc) { - struct irq_data *irqdata = irq_desc_get_irq_data(desc); - struct apic_chip_data *data = apic_chip_data(irqdata); - struct irq_cfg *cfg = 
data ? &data->cfg : NULL; + struct irq_data *irqdata; + struct apic_chip_data *data; + struct irq_cfg *cfg; unsigned int cpu; + /* + * The function is called for all descriptors regardless of which + * irqdomain they belong to. For example if an IRQ is provided by + * an irq_chip as part of a GPIO driver, the chip data for that + * descriptor is specific to the irq_chip in question. + * + * Check first that the chip_data is what we expect + * (apic_chip_data) before touching it any further. + */ + irqdata = irq_domain_get_irq_data(x86_vector_domain, + irq_desc_get_irq(desc)); + if (!irqdata) + return; + + data = apic_chip_data(irqdata); + cfg = data ? &data->cfg : NULL; + if (!cfg) return; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index aca8b75c1..54f35d988 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -152,68 +152,53 @@ static void init_x2apic_ldr(void) } } - /* - * At CPU state changes, update the x2apic cluster sibling info. - */ -static int -update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) +/* + * At CPU state changes, update the x2apic cluster sibling info. + */ +static int x2apic_prepare_cpu(unsigned int cpu) { - unsigned int this_cpu = (unsigned long)hcpu; - unsigned int cpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu), - GFP_KERNEL)) { - err = -ENOMEM; - } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu), - GFP_KERNEL)) { - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - err = -ENOMEM; - } - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - for_each_online_cpu(cpu) { - if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) - continue; - cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); - cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); - } - free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); - free_cpumask_var(per_cpu(ipi_mask, this_cpu)); - break; + if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) + return -ENOMEM; + + if (!zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL)) { + free_cpumask_var(per_cpu(cpus_in_cluster, cpu)); + return -ENOMEM; } - return notifier_from_errno(err); + return 0; } -static struct notifier_block x2apic_cpu_notifier = { - .notifier_call = update_clusterinfo, -}; - -static int x2apic_init_cpu_notifier(void) +static int x2apic_dead_cpu(unsigned int this_cpu) { - int cpu = smp_processor_id(); - - zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL); - zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL); - - BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); + int cpu; - cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); - register_hotcpu_notifier(&x2apic_cpu_notifier); - return 1; + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + cpumask_clear_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_clear_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + return 0; } static int x2apic_cluster_probe(void) { - if (x2apic_mode) - return x2apic_init_cpu_notifier(); - else + int cpu = smp_processor_id(); + int ret; + + if (!x2apic_mode) + return 0; + + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", + x2apic_prepare_cpu, x2apic_dead_cpu); + if (ret < 0) { + 
pr_err("Failed to register X2APIC_PREPARE\n"); return 0; + } + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); + return 1; } static const struct cpumask *x2apic_cluster_target_cpus(void) @@ -270,7 +255,6 @@ static struct apic apic_x2apic_cluster = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = x2apic_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index a1242e2c1..4f13f54f1 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -126,7 +126,6 @@ static struct apic apic_x2apic_phys = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = x2apic_set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 29003154f..cb0673c1e 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include @@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) if (strncmp(oem_id, "SGI", 3) != 0) return 0; + if (numa_off) { + pr_err("UV: NUMA is off, disabling UV support\n"); + return 0; + } + /* Setup early hub type field in uv_hub_info for Node 0 */ uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; @@ -325,7 +330,7 @@ static __init void build_uv_gr_table(void) struct uv_gam_range_entry *gre = uv_gre_table; struct uv_gam_range_s *grt; unsigned long last_limit = 0, ram_limit = 0; - int bytes, i, sid, lsid = -1; + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; if (!gre) return; @@ -356,11 +361,12 @@ static __init void build_uv_gr_table(void) } sid = gre->sockid - _min_socket; if (lsid < sid) { /* new range */ - grt = &_gr_table[sid]; - grt->base = lsid; + grt = &_gr_table[indx]; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; lsid = sid; + lindx = indx++; continue; } if (lsid == sid && !ram_limit) { /* update range */ @@ -371,7 +377,7 @@ static __init void build_uv_gr_table(void) } if (!ram_limit) { /* non-contiguous ram range */ grt++; - grt->base = sid - 1; + grt->base = lindx; grt->nasid = gre->nasid; grt->limit = last_limit = gre->limit; continue; @@ -582,7 +588,6 @@ static struct apic __refdata apic_x2apic_uv_x = { .get_apic_id = x2apic_get_apic_id, .set_apic_id = set_apic_id, - .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid_and = uv_cpu_mask_to_apicid_and, @@ -919,7 +924,7 @@ static void uv_heartbeat(unsigned long ignored) uv_set_scir_bits(bits); /* enable next timer period */ - mod_timer_pinned(timer, jiffies + SCIR_CPU_HB_INTERVAL); + mod_timer(timer, jiffies + SCIR_CPU_HB_INTERVAL); } static void uv_heartbeat_enable(int cpu) @@ -928,7 +933,7 @@ static void uv_heartbeat_enable(int cpu) struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - setup_timer(timer, uv_heartbeat, cpu); + setup_pinned_timer(timer, uv_heartbeat, cpu); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; @@ -1156,19 +1161,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr) for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (!index) { pr_info("UV: GAM Range Table...\n"); - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", "Range", 
"", "Size", "Type", "NASID", - "SID", "PN", "PXM"); + "SID", "PN"); } pr_info( - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", index++, (unsigned long)lgre << UV_GAM_RANGE_SHFT, (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, ((unsigned long)(gre->limit - lgre)) >> (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ - gre->type, gre->nasid, gre->sockid, - gre->pnode, gre->pxm); + gre->type, gre->nasid, gre->sockid, gre->pnode); lgre = gre->limit; if (sock_min > gre->sockid) @@ -1287,7 +1291,7 @@ static void __init build_socket_tables(void) _pnode_to_socket[i] = SOCK_EMPTY; /* fill in pnode/node/addr conversion list values */ - pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); + pr_info("UV: GAM Building socket/pnode conversion tables\n"); for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { if (gre->type == UV_GAM_RANGE_TYPE_HOLE) continue; @@ -1295,20 +1299,18 @@ static void __init build_socket_tables(void) if (_socket_to_pnode[i] != SOCK_EMPTY) continue; /* duplicate */ _socket_to_pnode[i] = gre->pnode; - _socket_to_node[i] = gre->pxm; i = gre->pnode - minpnode; _pnode_to_socket[i] = gre->sockid; pr_info( - "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", + "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", gre->sockid, gre->type, gre->nasid, _socket_to_pnode[gre->sockid - minsock], - _socket_to_node[gre->sockid - minsock], _pnode_to_socket[gre->pnode - minpnode]); } - /* check socket -> node values */ + /* Set socket -> node values */ lnid = -1; for_each_present_cpu(cpu) { int nid = cpu_to_node(cpu); @@ -1319,14 +1321,9 @@ static void __init build_socket_tables(void) lnid = nid; apicid = per_cpu(x86_cpu_to_apicid, cpu); sockid = apicid >> uv_cpuid.socketid_shift; - i = sockid - minsock; - - if (nid != _socket_to_node[i]) { - pr_warn( - "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", - i, sockid, gre->type, _socket_to_node[i], nid); - _socket_to_node[i] = nid; - } + _socket_to_node[sockid - minsock] = nid; + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", + sockid, apicid, nid); } /* Setup physical blade to pnode translation from GAM Range Table */ diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 674134e9f..2bd5c6ff7 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -31,7 +31,9 @@ void common(void) { BLANK(); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); - OFFSET(TI_addr_limit, thread_info, addr_limit); + + BLANK(); + OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0fe6953f4..bcc9ccc22 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include @@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) - return; + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); - cpu_detect(c); - get_cpu_vendor(c); - get_cpu_cap(c); - - if (this_cpu->c_early_init) - this_cpu->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); - c->cpu_index = 0; - filter_cpuid_features(c, false); + c->cpu_index = 0; + filter_cpuid_features(c, false); - if (this_cpu->c_bsp_init) - 
this_cpu->c_bsp_init(c); + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + } setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); @@ -1452,7 +1451,7 @@ void cpu_init(void) struct task_struct *me; struct tss_struct *t; unsigned long v; - int cpu = stack_smp_processor_id(); + int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 73d391ae4..27e46658e 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -21,7 +21,8 @@ * */ -#include +#include +#include #include #include diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 6e2ffbebb..fcd484d2b 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -300,15 +301,14 @@ static void intel_workarounds(struct cpuinfo_x86 *c) } /* - * P4 Xeon errata 037 workaround. + * P4 Xeon erratum 037 workaround. * Hardware prefetcher may cause stale data to be loaded into the cache. */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { if (msr_set_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) - > 0) { + MSR_IA32_MISC_ENABLE_PREFETCH_DISABLE_BIT) > 0) { pr_info("CPU: C0 stepping P4 Xeon detected.\n"); - pr_info("CPU: Disabling hardware prefetching (Errata 037)\n"); + pr_info("CPU: Disabling hardware prefetching (Erratum 037)\n"); } } @@ -509,6 +509,10 @@ static void init_intel(struct cpuinfo_x86 *c) (c->x86_model == 29 || c->x86_model == 46 || c->x86_model == 47)) set_cpu_bug(c, X86_BUG_CLFLUSH_MONITOR); + if (c->x86 == 6 && boot_cpu_has(X86_FEATURE_MWAIT) && + ((c->x86_model == INTEL_FAM6_ATOM_GOLDMONT))) + set_cpu_bug(c, X86_BUG_MONITOR); + #ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index fbb5e9055..e42117d5f 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include /** diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 34c89a3e8..83f1a98d3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -46,7 +46,7 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err) return; mce_setup(&m); - m.bank = 1; + m.bank = -1; /* Fake a memory read error with unknown channel */ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 92e5e37d9..79d8ec849 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -425,7 +425,7 @@ static u64 mce_rdmsrl(u32 msr) } if (rdmsrl_safe(msr, &v)) { - WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr); /* * Return zero in case the access faulted. 
This should * not happen normally but can happen if the CPU does @@ -1309,7 +1309,7 @@ static void __restart_timer(struct timer_list *t, unsigned long interval) if (timer_pending(t)) { if (time_before(when, t->expires)) - mod_timer_pinned(t, when); + mod_timer(t, when); } else { t->expires = round_jiffies(when); add_timer_on(t, smp_processor_id()); @@ -1735,7 +1735,7 @@ static void __mcheck_cpu_init_timer(void) struct timer_list *t = this_cpu_ptr(&mce_timer); unsigned int cpu = smp_processor_id(); - setup_timer(t, mce_timer_fn, cpu); + setup_pinned_timer(t, mce_timer_fn, cpu); mce_start_timer(cpu, t); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 10b066165..7b7f3be78 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -93,7 +93,7 @@ const char * const amd_df_mcablock_names[] = { EXPORT_SYMBOL_GPL(amd_df_mcablock_names); static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); -static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ +static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ static void amd_threshold_interrupt(void); static void amd_deferred_error_interrupt(void); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index d4872a456..f1e551ff5 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -54,26 +54,27 @@ static LIST_HEAD(pcache); */ static u8 *container; static size_t container_size; +static bool ucode_builtin; static u32 ucode_new_rev; -u8 amd_ucode_patch[PATCH_MAX_SIZE]; +static u8 amd_ucode_patch[PATCH_MAX_SIZE]; static u16 this_equiv_id; static struct cpio_data ucode_cpio; -/* - * Microcode patch container file is prepended to the initrd in cpio format. - * See Documentation/x86/early-microcode.txt - */ -static __initdata char ucode_path[] = "/*(DEBLOBBED)*/"; - static struct cpio_data __init find_ucode_in_initrd(void) { - long offset = 0; +#ifdef CONFIG_BLK_DEV_INITRD char *path; void *start; size_t size; + /* + * Microcode patch container file is prepended to the initrd in cpio + * format. 
See Documentation/x86/early-microcode.txt + */ + static __initdata char ucode_path[] = "/*(DEBLOBBED)*/"; + #ifdef CONFIG_X86_32 struct boot_params *p; @@ -89,9 +90,12 @@ static struct cpio_data __init find_ucode_in_initrd(void) path = ucode_path; start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); size = boot_params.hdr.ramdisk_size; -#endif +#endif /* !CONFIG_X86_32 */ - return find_cpio_data(path, start, size, &offset); + return find_cpio_data(path, start, size, NULL); +#else + return (struct cpio_data){ NULL, 0, "" }; +#endif } static size_t compute_container_size(u8 *data, u32 total_size) @@ -278,22 +282,26 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { struct cpio_data cp; + bool *builtin; void **data; size_t *size; #ifdef CONFIG_X86_32 data = (void **)__pa_nodebug(&ucode_cpio.data); size = (size_t *)__pa_nodebug(&ucode_cpio.size); + builtin = (bool *)__pa_nodebug(&ucode_builtin); #else data = &ucode_cpio.data; size = &ucode_cpio.size; + builtin = &ucode_builtin; #endif - cp = find_ucode_in_initrd(); - if (!cp.data) { - if (!load_builtin_amd_microcode(&cp, family)) - return; - } + *builtin = load_builtin_amd_microcode(&cp, family); + if (!*builtin) + cp = find_ucode_in_initrd(); + + if (!(cp.data && cp.size)) + return; *data = cp.data; *size = cp.size; @@ -352,6 +360,7 @@ void load_ucode_amd_ap(void) unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; + u8 *cont = container; u32 rev, eax; u16 eq_id; @@ -368,8 +377,12 @@ void load_ucode_amd_ap(void) if (check_current_patch_level(&rev, false)) return; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + if (!ucode_builtin) + cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(container + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -431,6 +444,10 @@ int __init save_microcode_in_initrd_amd(void) else container = cont_va; + /* Add CONFIG_RANDOMIZE_MEMORY offset. */ + if (!ucode_builtin) + container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 12823b6eb..df04b2d03 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -60,7 +60,6 @@ static bool dis_ucode_ldr; static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -EXPORT_SYMBOL_GPL(ucode_cpu_info); /* * Operations that are run on a target cpu: @@ -182,17 +181,17 @@ static int __init save_microcode_in_initrd(void) switch (c->x86_vendor) { case X86_VENDOR_INTEL: if (c->x86 >= 6) - save_microcode_in_initrd_intel(); + return save_microcode_in_initrd_intel(); break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - save_microcode_in_initrd_amd(); + return save_microcode_in_initrd_amd(); break; default: break; } - return 0; + return -EINVAL; } void reload_early_microcode(void) diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 2ea7cc260..8c88845b9 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -40,9 +40,13 @@ #include /* - * Temporary microcode blobs pointers storage. We note here the pointers to - * microcode blobs we've got from whatever storage (detached initrd, builtin). 
- * Later on, we put those into final storage mc_saved_data.mc_saved. + * Temporary microcode blobs pointers storage. We note here during early load + * the pointers to microcode blobs we've got from whatever storage (detached + * initrd, builtin). Later on, we put those into final storage + * mc_saved_data.mc_saved. + * + * Important: those are offsets from the beginning of initrd or absolute + * addresses within the kernel image when built-in. */ static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; @@ -51,8 +55,15 @@ static struct mc_saved_data { struct microcode_intel **mc_saved; } mc_saved_data; +/* Microcode blobs within the initrd. 0 if builtin. */ +static struct ucode_blobs { + unsigned long start; + bool valid; +} blobs; + +/* Go through saved patches and find the one suitable for the current CPU. */ static enum ucode_state -load_microcode_early(struct microcode_intel **saved, +find_microcode_patch(struct microcode_intel **saved, unsigned int num_saved, struct ucode_cpu_info *uci) { struct microcode_intel *ucode_ptr, *new_mc = NULL; @@ -121,13 +132,13 @@ load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, if (!mcs->mc_saved) { copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); - return load_microcode_early(mc_saved_tmp, count, uci); + return find_microcode_patch(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 microcode_phys(mc_saved_tmp, mcs); - return load_microcode_early(mc_saved_tmp, count, uci); + return find_microcode_patch(mc_saved_tmp, count, uci); #else - return load_microcode_early(mcs->mc_saved, count, uci); + return find_microcode_patch(mcs->mc_saved, count, uci); #endif } } @@ -450,8 +461,6 @@ static void show_saved_mc(void) #endif } -#ifdef CONFIG_HOTPLUG_CPU -static DEFINE_MUTEX(x86_cpu_microcode_mutex); /* * Save this mc into mc_saved_data. So it will be loaded early when a CPU is * hot added or resumes. @@ -459,19 +468,18 @@ static DEFINE_MUTEX(x86_cpu_microcode_mutex); * Please make sure this mc should be a valid microcode patch before calling * this function. */ -int save_mc_for_early(u8 *mc) +static void save_mc_for_early(u8 *mc) { +#ifdef CONFIG_HOTPLUG_CPU + /* Synchronization during CPU hotplug. */ + static DEFINE_MUTEX(x86_cpu_microcode_mutex); + struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int mc_saved_count_init; unsigned int num_saved; struct microcode_intel **mc_saved; - int ret = 0; - int i; + int ret, i; - /* - * Hold hotplug lock so mc_saved_data is not accessed by a CPU in - * hotplug. 
- */ mutex_lock(&x86_cpu_microcode_mutex); mc_saved_count_init = mc_saved_data.num_saved; @@ -509,11 +517,8 @@ int save_mc_for_early(u8 *mc) out: mutex_unlock(&x86_cpu_microcode_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(save_mc_for_early); #endif +} static bool __init load_builtin_intel_microcode(struct cpio_data *cp) { @@ -532,37 +537,6 @@ static bool __init load_builtin_intel_microcode(struct cpio_data *cp) #endif } -static __initdata char ucode_name[] = "/*(DEBLOBBED)*/"; -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long start, unsigned long size, - struct ucode_cpu_info *uci) -{ - struct cpio_data cd; - long offset = 0; -#ifdef CONFIG_X86_32 - char *p = (char *)__pa_nodebug(ucode_name); -#else - char *p = ucode_name; -#endif - - cd.data = NULL; - cd.size = 0; - - /* try built-in microcode if no initrd */ - if (!size) { - if (!load_builtin_intel_microcode(&cd)) - return UCODE_ERROR; - } else { - cd = find_cpio_data(p, (void *)start, size, &offset); - if (!cd.data) - return UCODE_ERROR; - } - - return get_matching_model_microcode(start, cd.data, cd.size, - mcs, mc_ptrs, uci); -} - /* * Print ucode update info. */ @@ -680,38 +654,117 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) */ int __init save_microcode_in_initrd_intel(void) { - unsigned int count = mc_saved_data.num_saved; struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - int ret = 0; + unsigned int count = mc_saved_data.num_saved; + unsigned long offset = 0; + int ret; if (!count) - return ret; + return 0; + + /* + * We have found a valid initrd but it might've been relocated in the + * meantime so get its updated address. + */ + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid) + offset = initrd_start; - copy_ptrs(mc_saved, mc_tmp_ptrs, get_initrd_start(), count); + copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); - - show_saved_mc(); + else + show_saved_mc(); return ret; } +static __init enum ucode_state +__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp) +{ +#ifdef CONFIG_BLK_DEV_INITRD + static __initdata char ucode_name[] = "/*(DEBLOBBED)*/"; + char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name) + : ucode_name; +# ifdef CONFIG_X86_32 + unsigned long start = 0, size; + struct boot_params *params; + + params = (struct boot_params *)__pa_nodebug(&boot_params); + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. We cannot use initrd_start + * because it is not set that early yet. + */ + start = (size ? 
params->hdr.ramdisk_image : 0); + +# else /* CONFIG_X86_64 */ + unsigned long start = 0, size; + + size = (u64)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (u64)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + *cd = find_cpio_data(p, (void *)start, size, NULL); + if (cd->data) { + blbp->start = start; + blbp->valid = true; + + return UCODE_OK; + } else +#endif /* CONFIG_BLK_DEV_INITRD */ + return UCODE_ERROR; +} + +static __init enum ucode_state +scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, + struct ucode_cpu_info *uci, struct ucode_blobs *blbp) +{ + struct cpio_data cd = { NULL, 0, "" }; + enum ucode_state ret; + + /* try built-in microcode first */ + if (load_builtin_intel_microcode(&cd)) + /* + * Invalidate blobs as we might've gotten an initrd too, + * supplied by the boot loader, by mistake or simply forgotten + * there. That's fine, we ignore it since we've found builtin + * microcode already. + */ + blbp->valid = false; + else { + ret = __scan_microcode_initrd(&cd, blbp); + if (ret != UCODE_OK) + return ret; + } + + return get_matching_model_microcode(blbp->start, cd.data, cd.size, + mcs, mc_ptrs, uci); +} + static void __init _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long start, unsigned long size) + struct ucode_blobs *blbp) { struct ucode_cpu_info uci; enum ucode_state ret; collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, mc_ptrs, start, size, &uci); + ret = scan_microcode(mcs, mc_ptrs, &uci, blbp); if (ret != UCODE_OK) return; - ret = load_microcode(mcs, mc_ptrs, start, &uci); + ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci); if (ret != UCODE_OK) return; @@ -720,54 +773,60 @@ _load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, void __init load_ucode_intel_bsp(void) { - u64 start, size; -#ifdef CONFIG_X86_32 - struct boot_params *p; - - p = (struct boot_params *)__pa_nodebug(&boot_params); - size = p->hdr.ramdisk_size; + struct ucode_blobs *blobs_p; + struct mc_saved_data *mcs; + unsigned long *ptrs; - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ - start = (size ? p->hdr.ramdisk_image : 0); - - _load_ucode_intel_bsp((struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_tmp_ptrs), - start, size); +#ifdef CONFIG_X86_32 + mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs); + blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); #else - size = boot_params.hdr.ramdisk_size; - start = (size ? 
boot_params.hdr.ramdisk_image + PAGE_OFFSET : 0); - - _load_ucode_intel_bsp(&mc_saved_data, mc_tmp_ptrs, start, size); + mcs = &mc_saved_data; + ptrs = mc_tmp_ptrs; + blobs_p = &blobs; #endif + + _load_ucode_intel_bsp(mcs, ptrs, blobs_p); } void load_ucode_intel_ap(void) { - unsigned long *mcs_tmp_p; - struct mc_saved_data *mcs_p; + struct ucode_blobs *blobs_p; + unsigned long *ptrs, start = 0; + struct mc_saved_data *mcs; struct ucode_cpu_info uci; enum ucode_state ret; -#ifdef CONFIG_X86_32 - mcs_tmp_p = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); - mcs_p = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); +#ifdef CONFIG_X86_32 + mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); + ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); + blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); #else - mcs_tmp_p = mc_tmp_ptrs; - mcs_p = &mc_saved_data; + mcs = &mc_saved_data; + ptrs = mc_tmp_ptrs; + blobs_p = &blobs; #endif /* * If there is no valid ucode previously saved in memory, no need to * update ucode on this AP. */ - if (!mcs_p->num_saved) + if (!mcs->num_saved) return; + if (blobs_p->valid) { + start = blobs_p->start; + + /* + * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles + * physmem mapping too and there we have the initrd. + */ + start += PAGE_OFFSET - __PAGE_OFFSET_BASE; + } + collect_cpu_info_early(&uci); - ret = load_microcode(mcs_p, mcs_tmp_p, get_initrd_start_addr(), &uci); + ret = load_microcode(mcs, ptrs, start, &uci); if (ret != UCODE_OK) return; @@ -784,7 +843,7 @@ void reload_ucode_intel(void) collect_cpu_info_early(&uci); - ret = load_microcode_early(mc_saved_data.mc_saved, + ret = find_microcode_patch(mc_saved_data.mc_saved, mc_saved_data.num_saved, &uci); if (ret != UCODE_OK) return; @@ -794,6 +853,7 @@ void reload_ucode_intel(void) static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { + static struct cpu_signature prev; struct cpuinfo_x86 *c = &cpu_data(cpu_num); unsigned int val[2]; @@ -808,8 +868,13 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) } csig->rev = c->microcode; - pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", - cpu_num, csig->sig, csig->pf, csig->rev); + + /* No extra locking on prev, races are harmless. 
*/ + if (csig->sig != prev.sig || csig->pf != prev.pf || csig->rev != prev.rev) { + pr_info("sig=0x%x, pf=0x%x, revision=0x%x\n", + csig->sig, csig->pf, csig->rev); + prev = *csig; + } return 0; } @@ -838,6 +903,7 @@ static int apply_microcode_intel(int cpu) struct ucode_cpu_info *uci; struct cpuinfo_x86 *c; unsigned int val[2]; + static int prev_rev; /* We should bind the task to the CPU */ if (WARN_ON(raw_smp_processor_id() != cpu)) @@ -872,11 +938,14 @@ static int apply_microcode_intel(int cpu) return -1; } - pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n", - cpu, val[1], - mc->hdr.date & 0xffff, - mc->hdr.date >> 24, - (mc->hdr.date >> 16) & 0xff); + if (val[1] != prev_rev) { + pr_info("updated to revision 0x%x, date = %04x-%02x-%02x\n", + val[1], + mc->hdr.date & 0xffff, + mc->hdr.date >> 24, + (mc->hdr.date >> 16) & 0xff); + prev_rev = val[1]; + } c = &cpu_data(cpu); diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index 2ce1a7dc4..406cb6c0d 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -141,7 +141,6 @@ int microcode_sanity_check(void *mc, int print_err) } return 0; } -EXPORT_SYMBOL_GPL(microcode_sanity_check); /* * Returns 1 if update has been found, 0 otherwise. @@ -183,4 +182,3 @@ int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) return find_matching_signature(mc, csig, cpf); } -EXPORT_SYMBOL_GPL(has_newer_microcode); diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 10c11b4da..8f44c5a50 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 31e951ce6..3b442b64c 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -17,7 +17,6 @@ * License along with this library; if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 16e37a258..fdc55215d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -4,7 +4,7 @@ */ #define DEBUG -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index d76f13d6d..6d9b45549 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 7d393ecde..28f1b54b7 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 2e8caf03f..181eabeca 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -12,7 +12,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index f6f50c4ce..cfa97ff67 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -39,9 +39,9 @@ __setup("nordrand", x86_rdrand_setup); */ #define SANITY_CHECK_LOOPS 8 +#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { -#ifdef CONFIG_ARCH_RANDOM unsigned long tmp; int i; @@ -55,5 +55,5 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } -#endif } +#endif diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 8cac429b6..1ff0598d3 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -22,7 +22,8 @@ */ #include -#include +#include +#include #include #include #include diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 9ef978d69..9616cf769 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ef8017ca5..92e8f0a71 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -87,7 +87,7 @@ static inline int valid_stack_ptr(struct task_struct *task, else return 0; } - return p > t && p < t + THREAD_SIZE - size; + return p >= t && p < t + THREAD_SIZE - size; } unsigned long @@ -98,6 +98,14 @@ print_context_stack(struct task_struct *task, { struct stack_frame *frame = (struct stack_frame *)bp; + /* + * If we overflowed the stack into a guard page, jump back to the + * bottom of the usable stack. 
+ */ + if ((unsigned long)task_stack_page(task) - (unsigned long)stack < + PAGE_SIZE) + stack = (unsigned long *)task_stack_page(task); + while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { unsigned long addr; @@ -197,6 +205,11 @@ void show_stack(struct task_struct *task, unsigned long *sp) show_stack_log_lvl(task, NULL, sp, bp, ""); } +void show_stack_regs(struct pt_regs *regs) +{ + show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, ""); +} + static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; static int die_owner = -1; static unsigned int die_nest_count; @@ -226,6 +239,8 @@ unsigned long oops_begin(void) EXPORT_SYMBOL_GPL(oops_begin); NOKPROBE_SYMBOL(oops_begin); +void __noreturn rewind_stack_do_exit(int signr); + void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { if (regs && kexec_should_crash(current)) @@ -247,7 +262,13 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - do_exit(signr); + + /* + * We're not going to return, but we might be on an IST stack or + * have very little stack space left. Rewind the stack and kill + * the task. + */ + rewind_stack_do_exit(signr); } NOKPROBE_SYMBOL(oops_end); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index fef917e79..09675712e 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -96,7 +96,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, int i; if (sp == NULL) { - if (task) + if (regs) + sp = (unsigned long *)regs->sp; + else if (task) sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d558a8a49..9ee4520ce 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -264,7 +264,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, * back trace for this cpu: */ if (sp == NULL) { - if (task) + if (regs) + sp = (unsigned long *)regs->sp; + else if (task) sp = (unsigned long *)task->thread.sp; else sp = (unsigned long *)&sp; @@ -272,6 +274,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { + unsigned long word; + if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); @@ -281,12 +285,18 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; } + + if (probe_kernel_address(stack, word)) + break; + if ((i % STACKSLOTS_PER_LINE) == 0) { if (i != 0) pr_cont("\n"); - printk("%s %016lx", log_lvl, *stack++); + printk("%s %016lx", log_lvl, word); } else - pr_cont(" %016lx", *stack++); + pr_cont(" %016lx", word); + + stack++; touch_nmi_watchdog(); } preempt_enable(); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 621b501f8..8a90f1517 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -348,7 +348,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, * continue building up new bios map based on this * information */ - if (current_type != last_type || current_type == E820_PRAM) { + if (current_type != last_type) { if (last_type != 0) { 
new_bios[new_bios_entry].size = change_point[chgidx]->addr - last_addr; @@ -754,7 +754,7 @@ u64 __init early_reserve_e820(u64 size, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { int i; unsigned long last_pfn = 0; @@ -765,11 +765,7 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) unsigned long start_pfn; unsigned long end_pfn; - /* - * Persistent memory is accounted as ram for purposes of - * establishing max_pfn and mem_map. - */ - if (ei->type != E820_RAM && ei->type != E820_PRAM) + if (ei->type != type) continue; start_pfn = ei->addr >> PAGE_SHIFT; @@ -794,12 +790,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn) } unsigned long __init e820_end_of_ram_pfn(void) { - return e820_end_pfn(MAX_ARCH_PFN); + return e820_end_pfn(MAX_ARCH_PFN, E820_RAM); } unsigned long __init e820_end_of_low_ram_pfn(void) { - return e820_end_pfn(1UL << (32-PAGE_SHIFT)); + return e820_end_pfn(1UL << (32 - PAGE_SHIFT), E820_RAM); } static void early_panic(char *msg) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 57b71373b..de7501edb 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -237,36 +237,19 @@ static void __init intel_remapping_check(int num, int slot, int func) * despite the efforts of the "RAM buffer" approach, which simply rounds * memory boundaries up to 64M to try to catch space that may decode * as RAM and so is not suitable for MMIO. - * - * And yes, so far on current devices the base addr is always under 4G. */ -static u32 __init intel_stolen_base(int num, int slot, int func, size_t stolen_size) -{ - u32 base; - - /* - * For the PCI IDs in this quirk, the stolen base is always - * in 0x5c, aka the BDSM register (yes that's really what - * it's called). - */ - base = read_pci_config(num, slot, func, 0x5c); - base &= ~((1<<20) - 1); - - return base; -} #define KB(x) ((x) * 1024UL) #define MB(x) (KB (KB (x))) -#define GB(x) (MB (KB (x))) static size_t __init i830_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I830_ESMRAMC); - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; - if (tmp & I830_TSEG_SIZE_1M) + if (esmramc & I830_TSEG_SIZE_1M) return MB(1); else return KB(512); @@ -274,27 +257,26 @@ static size_t __init i830_tseg_size(void) static size_t __init i845_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I845_ESMRAMC); + u8 tseg_size = esmramc & I845_TSEG_SIZE_MASK; - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; - switch (tmp & I845_TSEG_SIZE_MASK) { - case I845_TSEG_SIZE_512K: - return KB(512); - case I845_TSEG_SIZE_1M: - return MB(1); + switch (tseg_size) { + case I845_TSEG_SIZE_512K: return KB(512); + case I845_TSEG_SIZE_1M: return MB(1); default: - WARN_ON(1); - return 0; + WARN(1, "Unknown ESMRAMC value: %x!\n", esmramc); } + return 0; } static size_t __init i85x_tseg_size(void) { - u8 tmp = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); + u8 esmramc = read_pci_config_byte(0, 0, 0, I85X_ESMRAMC); - if (!(tmp & TSEG_ENABLE)) + if (!(esmramc & TSEG_ENABLE)) return 0; return MB(1); @@ -314,285 +296,287 @@ static size_t __init i85x_mem_size(void) * On 830/845/85x the stolen memory base isn't available in any * register. 
We need to calculate it as TOM-TSEG_SIZE-stolen_size. */ -static u32 __init i830_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i830_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i830_mem_size() - i830_tseg_size() - stolen_size; + return (phys_addr_t)i830_mem_size() - i830_tseg_size() - stolen_size; } -static u32 __init i845_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i845_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i830_mem_size() - i845_tseg_size() - stolen_size; + return (phys_addr_t)i830_mem_size() - i845_tseg_size() - stolen_size; } -static u32 __init i85x_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i85x_stolen_base(int num, int slot, int func, + size_t stolen_size) { - return i85x_mem_size() - i85x_tseg_size() - stolen_size; + return (phys_addr_t)i85x_mem_size() - i85x_tseg_size() - stolen_size; } -static u32 __init i865_stolen_base(int num, int slot, int func, size_t stolen_size) +static phys_addr_t __init i865_stolen_base(int num, int slot, int func, + size_t stolen_size) { + u16 toud; + /* * FIXME is the graphics stolen memory region * always at TOUD? Ie. is it always the last * one to be allocated by the BIOS? */ - return read_pci_config_16(0, 0, 0, I865_TOUD) << 16; + toud = read_pci_config_16(0, 0, 0, I865_TOUD); + + return (phys_addr_t)toud << 16; +} + +static phys_addr_t __init gen3_stolen_base(int num, int slot, int func, + size_t stolen_size) +{ + u32 bsm; + + /* Almost universally we can find the Graphics Base of Stolen Memory + * at register BSM (0x5c) in the igfx configuration space. On a few + * (desktop) machines this is also mirrored in the bridge device at + * different locations, or in the MCHBAR. 
+ */ + bsm = read_pci_config(num, slot, func, INTEL_BSM); + + return (phys_addr_t)bsm & INTEL_BSM_MASK; } static size_t __init i830_stolen_size(int num, int slot, int func) { - size_t stolen_size; u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); - - switch (gmch_ctrl & I830_GMCH_GMS_MASK) { - case I830_GMCH_GMS_STOLEN_512: - stolen_size = KB(512); - break; - case I830_GMCH_GMS_STOLEN_1024: - stolen_size = MB(1); - break; - case I830_GMCH_GMS_STOLEN_8192: - stolen_size = MB(8); - break; - case I830_GMCH_GMS_LOCAL: - /* local memory isn't part of the normal address space */ - stolen_size = 0; - break; + gms = gmch_ctrl & I830_GMCH_GMS_MASK; + + switch (gms) { + case I830_GMCH_GMS_STOLEN_512: return KB(512); + case I830_GMCH_GMS_STOLEN_1024: return MB(1); + case I830_GMCH_GMS_STOLEN_8192: return MB(8); + /* local memory isn't part of the normal address space */ + case I830_GMCH_GMS_LOCAL: return 0; default: - return 0; + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); } - return stolen_size; + return 0; } static size_t __init gen3_stolen_size(int num, int slot, int func) { - size_t stolen_size; u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(0, 0, 0, I830_GMCH_CTRL); - - switch (gmch_ctrl & I855_GMCH_GMS_MASK) { - case I855_GMCH_GMS_STOLEN_1M: - stolen_size = MB(1); - break; - case I855_GMCH_GMS_STOLEN_4M: - stolen_size = MB(4); - break; - case I855_GMCH_GMS_STOLEN_8M: - stolen_size = MB(8); - break; - case I855_GMCH_GMS_STOLEN_16M: - stolen_size = MB(16); - break; - case I855_GMCH_GMS_STOLEN_32M: - stolen_size = MB(32); - break; - case I915_GMCH_GMS_STOLEN_48M: - stolen_size = MB(48); - break; - case I915_GMCH_GMS_STOLEN_64M: - stolen_size = MB(64); - break; - case G33_GMCH_GMS_STOLEN_128M: - stolen_size = MB(128); - break; - case G33_GMCH_GMS_STOLEN_256M: - stolen_size = MB(256); - break; - case INTEL_GMCH_GMS_STOLEN_96M: - stolen_size = MB(96); - break; - case INTEL_GMCH_GMS_STOLEN_160M: - stolen_size = MB(160); - break; - case INTEL_GMCH_GMS_STOLEN_224M: - stolen_size = MB(224); - break; - case INTEL_GMCH_GMS_STOLEN_352M: - stolen_size = MB(352); - break; + gms = gmch_ctrl & I855_GMCH_GMS_MASK; + + switch (gms) { + case I855_GMCH_GMS_STOLEN_1M: return MB(1); + case I855_GMCH_GMS_STOLEN_4M: return MB(4); + case I855_GMCH_GMS_STOLEN_8M: return MB(8); + case I855_GMCH_GMS_STOLEN_16M: return MB(16); + case I855_GMCH_GMS_STOLEN_32M: return MB(32); + case I915_GMCH_GMS_STOLEN_48M: return MB(48); + case I915_GMCH_GMS_STOLEN_64M: return MB(64); + case G33_GMCH_GMS_STOLEN_128M: return MB(128); + case G33_GMCH_GMS_STOLEN_256M: return MB(256); + case INTEL_GMCH_GMS_STOLEN_96M: return MB(96); + case INTEL_GMCH_GMS_STOLEN_160M:return MB(160); + case INTEL_GMCH_GMS_STOLEN_224M:return MB(224); + case INTEL_GMCH_GMS_STOLEN_352M:return MB(352); default: - stolen_size = 0; - break; + WARN(1, "Unknown GMCH_CTRL value: %x!\n", gmch_ctrl); } - return stolen_size; + return 0; } static size_t __init gen6_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; - gmch_ctrl &= SNB_GMCH_GMS_MASK; + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; - return gmch_ctrl << 25; /* 32 MB units */ + return (size_t)gms * MB(32); } static size_t __init gen8_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; - gmch_ctrl &= 
BDW_GMCH_GMS_MASK; - return gmch_ctrl << 25; /* 32 MB units */ + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; + + return (size_t)gms * MB(32); } static size_t __init chv_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= SNB_GMCH_GMS_SHIFT; - gmch_ctrl &= SNB_GMCH_GMS_MASK; + gms = (gmch_ctrl >> SNB_GMCH_GMS_SHIFT) & SNB_GMCH_GMS_MASK; /* * 0x0 to 0x10: 32MB increments starting at 0MB * 0x11 to 0x16: 4MB increments starting at 8MB * 0x17 to 0x1d: 4MB increments start at 36MB */ - if (gmch_ctrl < 0x11) - return gmch_ctrl << 25; - else if (gmch_ctrl < 0x17) - return (gmch_ctrl - 0x11 + 2) << 22; + if (gms < 0x11) + return (size_t)gms * MB(32); + else if (gms < 0x17) + return (size_t)(gms - 0x11 + 2) * MB(4); else - return (gmch_ctrl - 0x17 + 9) << 22; + return (size_t)(gms - 0x17 + 9) * MB(4); } -struct intel_stolen_funcs { - size_t (*size)(int num, int slot, int func); - u32 (*base)(int num, int slot, int func, size_t size); -}; - static size_t __init gen9_stolen_size(int num, int slot, int func) { u16 gmch_ctrl; + u16 gms; gmch_ctrl = read_pci_config_16(num, slot, func, SNB_GMCH_CTRL); - gmch_ctrl >>= BDW_GMCH_GMS_SHIFT; - gmch_ctrl &= BDW_GMCH_GMS_MASK; + gms = (gmch_ctrl >> BDW_GMCH_GMS_SHIFT) & BDW_GMCH_GMS_MASK; - if (gmch_ctrl < 0xf0) - return gmch_ctrl << 25; /* 32 MB units */ + /* 0x0 to 0xef: 32MB increments starting at 0MB */ + /* 0xf0 to 0xfe: 4MB increments starting at 4MB */ + if (gms < 0xf0) + return (size_t)gms * MB(32); else - /* 4MB increments starting at 0xf0 for 4MB */ - return (gmch_ctrl - 0xf0 + 1) << 22; + return (size_t)(gms - 0xf0 + 1) * MB(4); } -typedef size_t (*stolen_size_fn)(int num, int slot, int func); +struct intel_early_ops { + size_t (*stolen_size)(int num, int slot, int func); + phys_addr_t (*stolen_base)(int num, int slot, int func, size_t size); +}; -static const struct intel_stolen_funcs i830_stolen_funcs __initconst = { - .base = i830_stolen_base, - .size = i830_stolen_size, +static const struct intel_early_ops i830_early_ops __initconst = { + .stolen_base = i830_stolen_base, + .stolen_size = i830_stolen_size, }; -static const struct intel_stolen_funcs i845_stolen_funcs __initconst = { - .base = i845_stolen_base, - .size = i830_stolen_size, +static const struct intel_early_ops i845_early_ops __initconst = { + .stolen_base = i845_stolen_base, + .stolen_size = i830_stolen_size, }; -static const struct intel_stolen_funcs i85x_stolen_funcs __initconst = { - .base = i85x_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops i85x_early_ops __initconst = { + .stolen_base = i85x_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs i865_stolen_funcs __initconst = { - .base = i865_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops i865_early_ops __initconst = { + .stolen_base = i865_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen3_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen3_stolen_size, +static const struct intel_early_ops gen3_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen3_stolen_size, }; -static const struct intel_stolen_funcs gen6_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen6_stolen_size, +static const struct intel_early_ops gen6_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen6_stolen_size, 
}; -static const struct intel_stolen_funcs gen8_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen8_stolen_size, +static const struct intel_early_ops gen8_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen8_stolen_size, }; -static const struct intel_stolen_funcs gen9_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = gen9_stolen_size, +static const struct intel_early_ops gen9_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = gen9_stolen_size, }; -static const struct intel_stolen_funcs chv_stolen_funcs __initconst = { - .base = intel_stolen_base, - .size = chv_stolen_size, +static const struct intel_early_ops chv_early_ops __initconst = { + .stolen_base = gen3_stolen_base, + .stolen_size = chv_stolen_size, }; -static const struct pci_device_id intel_stolen_ids[] __initconst = { - INTEL_I830_IDS(&i830_stolen_funcs), - INTEL_I845G_IDS(&i845_stolen_funcs), - INTEL_I85X_IDS(&i85x_stolen_funcs), - INTEL_I865G_IDS(&i865_stolen_funcs), - INTEL_I915G_IDS(&gen3_stolen_funcs), - INTEL_I915GM_IDS(&gen3_stolen_funcs), - INTEL_I945G_IDS(&gen3_stolen_funcs), - INTEL_I945GM_IDS(&gen3_stolen_funcs), - INTEL_VLV_M_IDS(&gen6_stolen_funcs), - INTEL_VLV_D_IDS(&gen6_stolen_funcs), - INTEL_PINEVIEW_IDS(&gen3_stolen_funcs), - INTEL_I965G_IDS(&gen3_stolen_funcs), - INTEL_G33_IDS(&gen3_stolen_funcs), - INTEL_I965GM_IDS(&gen3_stolen_funcs), - INTEL_GM45_IDS(&gen3_stolen_funcs), - INTEL_G45_IDS(&gen3_stolen_funcs), - INTEL_IRONLAKE_D_IDS(&gen3_stolen_funcs), - INTEL_IRONLAKE_M_IDS(&gen3_stolen_funcs), - INTEL_SNB_D_IDS(&gen6_stolen_funcs), - INTEL_SNB_M_IDS(&gen6_stolen_funcs), - INTEL_IVB_M_IDS(&gen6_stolen_funcs), - INTEL_IVB_D_IDS(&gen6_stolen_funcs), - INTEL_HSW_D_IDS(&gen6_stolen_funcs), - INTEL_HSW_M_IDS(&gen6_stolen_funcs), - INTEL_BDW_M_IDS(&gen8_stolen_funcs), - INTEL_BDW_D_IDS(&gen8_stolen_funcs), - INTEL_CHV_IDS(&chv_stolen_funcs), - INTEL_SKL_IDS(&gen9_stolen_funcs), - INTEL_BXT_IDS(&gen9_stolen_funcs), - INTEL_KBL_IDS(&gen9_stolen_funcs), +static const struct pci_device_id intel_early_ids[] __initconst = { + INTEL_I830_IDS(&i830_early_ops), + INTEL_I845G_IDS(&i845_early_ops), + INTEL_I85X_IDS(&i85x_early_ops), + INTEL_I865G_IDS(&i865_early_ops), + INTEL_I915G_IDS(&gen3_early_ops), + INTEL_I915GM_IDS(&gen3_early_ops), + INTEL_I945G_IDS(&gen3_early_ops), + INTEL_I945GM_IDS(&gen3_early_ops), + INTEL_VLV_M_IDS(&gen6_early_ops), + INTEL_VLV_D_IDS(&gen6_early_ops), + INTEL_PINEVIEW_IDS(&gen3_early_ops), + INTEL_I965G_IDS(&gen3_early_ops), + INTEL_G33_IDS(&gen3_early_ops), + INTEL_I965GM_IDS(&gen3_early_ops), + INTEL_GM45_IDS(&gen3_early_ops), + INTEL_G45_IDS(&gen3_early_ops), + INTEL_IRONLAKE_D_IDS(&gen3_early_ops), + INTEL_IRONLAKE_M_IDS(&gen3_early_ops), + INTEL_SNB_D_IDS(&gen6_early_ops), + INTEL_SNB_M_IDS(&gen6_early_ops), + INTEL_IVB_M_IDS(&gen6_early_ops), + INTEL_IVB_D_IDS(&gen6_early_ops), + INTEL_HSW_D_IDS(&gen6_early_ops), + INTEL_HSW_M_IDS(&gen6_early_ops), + INTEL_BDW_M_IDS(&gen8_early_ops), + INTEL_BDW_D_IDS(&gen8_early_ops), + INTEL_CHV_IDS(&chv_early_ops), + INTEL_SKL_IDS(&gen9_early_ops), + INTEL_BXT_IDS(&gen9_early_ops), + INTEL_KBL_IDS(&gen9_early_ops), }; -static void __init intel_graphics_stolen(int num, int slot, int func) +static void __init +intel_graphics_stolen(int num, int slot, int func, + const struct intel_early_ops *early_ops) { + phys_addr_t base, end; size_t size; + + size = early_ops->stolen_size(num, slot, func); + base = early_ops->stolen_base(num, slot, func, size); + + if (!size 
|| !base) + return; + + end = base + size - 1; + printk(KERN_INFO "Reserving Intel graphics memory at %pa-%pa\n", + &base, &end); + + /* Mark this space as reserved */ + e820_add_region(base, size, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + +static void __init intel_graphics_quirks(int num, int slot, int func) +{ + const struct intel_early_ops *early_ops; + u16 device; int i; - u32 start; - u16 device, subvendor, subdevice; device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID); - subvendor = read_pci_config_16(num, slot, func, - PCI_SUBSYSTEM_VENDOR_ID); - subdevice = read_pci_config_16(num, slot, func, PCI_SUBSYSTEM_ID); - - for (i = 0; i < ARRAY_SIZE(intel_stolen_ids); i++) { - if (intel_stolen_ids[i].device == device) { - const struct intel_stolen_funcs *stolen_funcs = - (const struct intel_stolen_funcs *)intel_stolen_ids[i].driver_data; - size = stolen_funcs->size(num, slot, func); - start = stolen_funcs->base(num, slot, func, size); - if (size && start) { - printk(KERN_INFO "Reserving Intel graphics stolen memory at 0x%x-0x%x\n", - start, start + (u32)size - 1); - /* Mark this space as reserved */ - e820_add_region(start, size, E820_RESERVED); - sanitize_e820_map(e820.map, - ARRAY_SIZE(e820.map), - &e820.nr_map); - } - return; - } + + for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) { + kernel_ulong_t driver_data = intel_early_ids[i].driver_data; + + if (intel_early_ids[i].device != device) + continue; + + early_ops = (typeof(early_ops))driver_data; + + intel_graphics_stolen(num, slot, func, early_ops); + + return; } } @@ -690,7 +674,7 @@ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST, PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check }, { PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID, - QFLAG_APPLY_ONCE, intel_graphics_stolen }, + QFLAG_APPLY_ONCE, intel_graphics_quirks }, /* * HPET on the current version of the Baytrail platform has accuracy * problems: it will halt in deep idle state - so we disable it. diff --git a/arch/x86/kernel/ebda.c b/arch/x86/kernel/ebda.c index afe65dffe..4312f8ae7 100644 --- a/arch/x86/kernel/ebda.c +++ b/arch/x86/kernel/ebda.c @@ -6,66 +6,92 @@ #include /* + * This function reserves all conventional PC system BIOS related + * firmware memory areas (some of which are data, some of which + * are code), that must not be used by the kernel as available + * RAM. + * * The BIOS places the EBDA/XBDA at the top of conventional * memory, and usually decreases the reported amount of - * conventional memory (int 0x12) too. This also contains a - * workaround for Dell systems that neglect to reserve EBDA. - * The same workaround also avoids a problem with the AMD768MPX - * chipset: reserve a page before VGA to prevent PCI prefetch - * into it (errata #56). Usually the page is reserved anyways, - * unless you have no PS/2 mouse plugged in. + * conventional memory (int 0x12) too. + * + * This means that as a first approximation on most systems we can + * guess the reserved BIOS area by looking at the low BIOS RAM size + * value and assume that everything above that value (up to 1MB) is + * reserved. + * + * But life in firmware country is not that simple: + * + * - This code also contains a quirk for Dell systems that neglect + * to reserve the EBDA area in the 'RAM size' value ... + * + * - The same quirk also avoids a problem with the AMD768MPX + * chipset: reserve a page before VGA to prevent PCI prefetch + * into it (errata #56). 
(Usually the page is reserved anyways, + * unless you have no PS/2 mouse plugged in.) + * + * - Plus paravirt systems don't have a reliable value in the + * 'BIOS RAM size' pointer we can rely on, so we must quirk + * them too. + * + * Due to those various problems this function is deliberately + * very conservative and tries to err on the side of reserving + * too much, to not risk reserving too little. + * + * Losing a small amount of memory in the bottom megabyte is + * rarely a problem, as long as we have enough memory to install + * the SMP bootup trampoline which *must* be in this area. * - * This functions is deliberately very conservative. Losing - * memory in the bottom megabyte is rarely a problem, as long - * as we have enough memory to install the trampoline. Using - * memory that is in use by the BIOS or by some DMA device - * the BIOS didn't shut down *is* a big problem. + * Using memory that is in use by the BIOS or by some DMA device + * the BIOS didn't shut down *is* a big problem to the kernel, + * obviously. */ -#define BIOS_LOWMEM_KILOBYTES 0x413 -#define LOWMEM_CAP 0x9f000U /* Absolute maximum */ -#define INSANE_CUTOFF 0x20000U /* Less than this = insane */ +#define BIOS_RAM_SIZE_KB_PTR 0x413 -void __init reserve_ebda_region(void) +#define BIOS_START_MIN 0x20000U /* 128K, less than this is insane */ +#define BIOS_START_MAX 0x9f000U /* 640K, absolute maximum */ + +void __init reserve_bios_regions(void) { - unsigned int lowmem, ebda_addr; + unsigned int bios_start, ebda_start; /* - * To determine the position of the EBDA and the - * end of conventional memory, we need to look at - * the BIOS data area. In a paravirtual environment - * that area is absent. We'll just have to assume - * that the paravirt case can handle memory setup - * correctly, without our help. + * NOTE: In a paravirtual environment the BIOS reserved + * area is absent. We'll just have to assume that the + * paravirt case can handle memory setup correctly, + * without our help. */ - if (!x86_platform.legacy.ebda_search) + if (!x86_platform.legacy.reserve_bios_regions) return; - /* end of low (conventional) memory */ - lowmem = *(unsigned short *)__va(BIOS_LOWMEM_KILOBYTES); - lowmem <<= 10; - - /* start of EBDA area */ - ebda_addr = get_bios_ebda(); - /* - * Note: some old Dells seem to need 4k EBDA without - * reporting so, so just consider the memory above 0x9f000 - * to be off limits (bugzilla 2990). + * BIOS RAM size is encoded in kilobytes, convert it + * to bytes to get a first guess at where the BIOS + * firmware area starts: */ + bios_start = *(unsigned short *)__va(BIOS_RAM_SIZE_KB_PTR); + bios_start <<= 10; - /* If the EBDA address is below 128K, assume it is bogus */ - if (ebda_addr < INSANE_CUTOFF) - ebda_addr = LOWMEM_CAP; + /* + * If bios_start is less than 128K, assume it is bogus + * and bump it up to 640K. Similarly, if bios_start is above 640K, + * don't trust it. + */ + if (bios_start < BIOS_START_MIN || bios_start > BIOS_START_MAX) + bios_start = BIOS_START_MAX; - /* If lowmem is less than 128K, assume it is bogus */ - if (lowmem < INSANE_CUTOFF) - lowmem = LOWMEM_CAP; + /* Get the start address of the EBDA page: */ + ebda_start = get_bios_ebda(); - /* Use the lower of the lowmem and EBDA markers as the cutoff */ - lowmem = min(lowmem, ebda_addr); - lowmem = min(lowmem, LOWMEM_CAP); /* Absolute cap */ + /* + * If the EBDA start address is sane and is below the BIOS region, + * then also reserve everything from the EBDA start address up to + * the BIOS region. 
+ */ + if (ebda_start >= BIOS_START_MIN && ebda_start < bios_start) + bios_start = ebda_start; - /* reserve all memory between lowmem and the 1MB mark */ - memblock_reserve(lowmem, 0x100000 - lowmem); + /* Reserve all memory between bios_start and the 1MB mark: */ + memblock_reserve(bios_start, 0x100000 - bios_start); } diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 5587526c4..04f89caef 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -173,7 +173,6 @@ void init_espfix_ap(int cpu) struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); pmd_p = (pmd_t *)page_address(page); - SetPageTOI_Untracked(virt_to_page(pmd_p)); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) @@ -186,7 +185,6 @@ void init_espfix_ap(int cpu) struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); pte_p = (pte_t *)page_address(page); - SetPageTOI_Untracked(virt_to_page(pte_p)); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) @@ -195,7 +193,6 @@ void init_espfix_ap(int cpu) pte_p = pte_offset_kernel(&pmd, addr); stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); - SetPageTOI_Untracked(virt_to_page(stack_page)); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 97027545a..3fc03a09a 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -8,10 +8,14 @@ #include #include #include +#include #include #include +#define CREATE_TRACE_POINTS +#include + /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: @@ -192,6 +196,7 @@ void fpu__save(struct fpu *fpu) WARN_ON_FPU(fpu != ¤t->thread.fpu); preempt_disable(); + trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { if (use_eager_fpu()) @@ -200,6 +205,7 @@ void fpu__save(struct fpu *fpu) fpregs_deactivate(fpu); } } + trace_x86_fpu_after_save(fpu); preempt_enable(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -222,7 +228,14 @@ void fpstate_init(union fpregs_state *state) return; } - memset(state, 0, xstate_size); + memset(state, 0, fpu_kernel_xstate_size); + + /* + * XRSTORS requires that this bit is set in xcomp_bv, or + * it will #GP. Make sure it is replaced after the memset(). 
+ */ + if (static_cpu_has(X86_FEATURE_XSAVES)) + state->xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT; if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); @@ -247,7 +260,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * leak into the child task: */ if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -266,7 +279,8 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) */ preempt_disable(); if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, xstate_size); + memcpy(&src_fpu->state, &dst_fpu->state, + fpu_kernel_xstate_size); if (use_eager_fpu()) copy_kernel_to_fpregs(&src_fpu->state); @@ -275,6 +289,9 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) } preempt_enable(); + trace_x86_fpu_copy_src(src_fpu); + trace_x86_fpu_copy_dst(dst_fpu); + return 0; } @@ -288,7 +305,9 @@ void fpu__activate_curr(struct fpu *fpu) if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for the current task: */ fpu->fpstate_active = 1; } @@ -314,7 +333,9 @@ void fpu__activate_fpstate_read(struct fpu *fpu) } else { if (!fpu->fpstate_active) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for current and for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -347,7 +368,9 @@ void fpu__activate_fpstate_write(struct fpu *fpu) fpu->last_cpu = -1; } else { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); + trace_x86_fpu_activate_state(fpu); /* Safe to do for stopped child tasks: */ fpu->fpstate_active = 1; } @@ -432,9 +455,11 @@ void fpu__restore(struct fpu *fpu) /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ kernel_fpu_disable(); + trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); fpu->counter++; + trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } EXPORT_SYMBOL_GPL(fpu__restore); @@ -463,6 +488,8 @@ void fpu__drop(struct fpu *fpu) fpu->fpstate_active = 0; + trace_x86_fpu_dropped(fpu); + preempt_enable(); } diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index aacfd7a82..93982aebb 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -145,8 +145,8 @@ static void __init fpu__init_system_generic(void) * This is inherent to the XSAVE architecture which puts all state * components into a single, continuous memory block: */ -unsigned int xstate_size; -EXPORT_SYMBOL_GPL(xstate_size); +unsigned int fpu_kernel_xstate_size; +EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size); /* Get alignment of the TYPE. */ #define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test) @@ -178,7 +178,7 @@ static void __init fpu__init_task_struct_size(void) * Add back the dynamically-calculated register state * size. */ - task_size += xstate_size; + task_size += fpu_kernel_xstate_size; /* * We dynamically size 'struct fpu', so we require that @@ -195,7 +195,7 @@ static void __init fpu__init_task_struct_size(void) } /* - * Set up the xstate_size based on the legacy FPU context size. + * Set up the user and kernel xstate sizes based on the legacy FPU context size. * * We set this up first, and later it will be overwritten by * fpu__init_system_xstate() if the CPU knows about xstates. 
@@ -208,7 +208,7 @@ static void __init fpu__init_system_xstate_size_legacy(void) on_boot_cpu = 0; /* - * Note that xstate_size might be overwriten later during + * Note that xstate sizes might be overwritten later during * fpu__init_system_xstate(). */ @@ -219,27 +219,17 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ setup_clear_cpu_cap(X86_FEATURE_XSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - xstate_size = sizeof(struct swregs_state); + fpu_kernel_xstate_size = sizeof(struct swregs_state); } else { if (boot_cpu_has(X86_FEATURE_FXSR)) - xstate_size = sizeof(struct fxregs_state); + fpu_kernel_xstate_size = + sizeof(struct fxregs_state); else - xstate_size = sizeof(struct fregs_state); + fpu_kernel_xstate_size = + sizeof(struct fregs_state); } - /* - * Quirk: we don't yet handle the XSAVES* instructions - * correctly, as we don't correctly convert between - * standard and compacted format when interfacing - * with user-space - so disable it for now. - * - * The difference is small: with recent CPUs the - * compacted format is only marginally smaller than - * the standard FPU state format. - * - * ( This is easy to backport while we are fixing - * XSAVES* support. ) - */ - setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + fpu_user_xstate_size = fpu_kernel_xstate_size; } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 81422dfb1..c114b132d 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -4,6 +4,7 @@ #include #include #include +#include /* * The xstateregs_active() routine is the same as the regset_fpregs_active() routine, @@ -85,21 +86,26 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_read(fpu); - xsave = &fpu->state.xsave; - /* - * Copy the 48bytes defined by the software first into the xstate - * memory layout in the thread struct, so that we can copy the entire - * xstateregs to the user using one user_regset_copyout(). - */ - memcpy(&xsave->i387.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - /* - * Copy the xstate memory layout. - */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_read(fpu); + + if (using_compacted_format()) { + ret = copyout_from_xsaves(pos, count, kbuf, ubuf, xsave); + } else { + fpstate_sanitize_xstate(fpu); + /* + * Copy the 48 bytes defined by the software into the xsave + * area in the thread struct, so that we can copy the whole + * area to user using one user_regset_copyout(). + */ + memcpy(&xsave->i387.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. 
+ */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + } return ret; } @@ -114,11 +120,27 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (!boot_cpu_has(X86_FEATURE_XSAVE)) return -ENODEV; - fpu__activate_fpstate_write(fpu); + /* + * A whole standard-format XSAVE buffer is needed: + */ + if ((pos != 0) || (count < fpu_user_xstate_size)) + return -EFAULT; xsave = &fpu->state.xsave; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + fpu__activate_fpstate_write(fpu); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + ret = copyin_to_xsaves(kbuf, ubuf, xsave); + else + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* + * In case of failure, mark all states as init: + */ + if (ret) + fpstate_init(&fpu->state); + /* * mxcsr reserved bits must be masked to zero for security reasons. */ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 31c6a6050..a184c210e 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -8,8 +8,10 @@ #include #include #include +#include #include +#include static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32; @@ -31,7 +33,7 @@ static inline int check_for_xstate(struct fxregs_state __user *buf, /* Check for the first magic field and other error scenarios. */ if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || fx_sw->xstate_size < min_xstate_size || - fx_sw->xstate_size > xstate_size || + fx_sw->xstate_size > fpu_user_xstate_size || fx_sw->xstate_size > fx_sw->extended_size) return -1; @@ -88,7 +90,8 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) if (!use_xsave()) return err; - err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or @@ -125,7 +128,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) else err = copy_fregs_to_user((struct fregs_state __user *) buf); - if (unlikely(err) && __clear_user(buf, xstate_size)) + if (unlikely(err) && __clear_user(buf, fpu_user_xstate_size)) err = -EFAULT; return err; } @@ -156,8 +159,8 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!access_ok(VERIFY_WRITE, buf, size)) return -EACCES; @@ -167,7 +170,7 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpregs_active()) { + if (fpregs_active() || using_compacted_format()) { /* Save the live register state to the user directly. */ if (copy_fpregs_to_sigframe(buf_fx)) return -1; @@ -175,8 +178,19 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) if (ia32_fxstate) copy_fxregs_to_kernel(&tsk->thread.fpu); } else { + /* + * It is a *bug* if kernel uses compacted-format for xsave + * area and we copy it out directly to a signal frame. It + * should have been handled above by saving the registers + * directly. 
+ */ + if (boot_cpu_has(X86_FEATURE_XSAVES)) { + WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); + return -1; + } + fpstate_sanitize_xstate(&tsk->thread.fpu); - if (__copy_to_user(buf_fx, xsave, xstate_size)) + if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) return -1; } @@ -250,12 +264,12 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = xstate_size; + int state_size = fpu_kernel_xstate_size; u64 xfeatures = 0; int fx_only = 0; - ia32_fxstate &= (config_enabled(CONFIG_X86_32) || - config_enabled(CONFIG_IA32_EMULATION)); + ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || + IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { fpu__clear(fpu); @@ -282,6 +296,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ state_size = sizeof(struct fxregs_state); fx_only = 1; + trace_x86_fpu_xstate_check_failed(fpu); } else { state_size = fx_sw_user.xstate_size; xfeatures = fx_sw_user.xfeatures; @@ -308,9 +323,17 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) */ fpu__drop(fpu); - if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { + if (using_compacted_format()) { + err = copyin_to_xsaves(NULL, buf_fx, + &fpu->state.xsave); + } else { + err = __copy_from_user(&fpu->state.xsave, + buf_fx, state_size); + } + + if (err || __copy_from_user(&env, buf, sizeof(env))) { fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); @@ -341,7 +364,8 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) static inline int xstate_sigframe_size(void) { - return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; + return use_xsave() ? fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE : + fpu_user_xstate_size; } /* @@ -385,15 +409,15 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; + int size = fpu_user_xstate_size + FP_XSTATE_MAGIC2_SIZE; fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; - fx_sw_reserved.xstate_size = xstate_size; + fx_sw_reserved.xstate_size = fpu_user_xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION) || - config_enabled(CONFIG_X86_32)) { + if (IS_ENABLED(CONFIG_IA32_EMULATION) || + IS_ENABLED(CONFIG_X86_32)) { int fsave_header_size = sizeof(struct fregs_state); fx_sw_reserved_ia32 = fx_sw_reserved; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 4ea2a5948..01567aa87 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -43,6 +44,13 @@ static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask)*8]; +/* + * The XSAVE area of kernel can be in standard or compacted format; + * it is always in standard format for user mode. This is the user + * mode standard format size used for signal and ptrace frames. 
+ */ +unsigned int fpu_user_xstate_size; + /* * Clear all of the X86_FEATURE_* bits that are unavailable * when the CPU has no XSAVE support. @@ -105,6 +113,27 @@ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) } EXPORT_SYMBOL_GPL(cpu_has_xfeatures); +static int xfeature_is_supervisor(int xfeature_nr) +{ + /* + * We currently do not support supervisor states, but if + * we did, we could find out like this. + * + * SDM says: If state component 'i' is a user state component, + * ECX[0] return 0; if state component i is a supervisor + * state component, ECX[0] returns 1. + */ + u32 eax, ebx, ecx, edx; + + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + return !!(ecx & 1); +} + +static int xfeature_is_user(int xfeature_nr) +{ + return !xfeature_is_supervisor(xfeature_nr); +} + /* * When executing XSAVEOPT (or other optimized XSAVE instructions), if * a processor implementation detects that an FPU state component is still @@ -171,7 +200,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ while (xfeatures) { if (xfeatures & 0x1) { - int offset = xstate_offsets[feature_bit]; + int offset = xstate_comp_offsets[feature_bit]; int size = xstate_sizes[feature_bit]; memcpy((void *)fx + offset, @@ -192,6 +221,15 @@ void fpu__init_cpu_xstate(void) { if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; + /* + * Make it clear that XSAVES supervisor states are not yet + * implemented should anyone expect it to work by changing + * bits in XFEATURE_MASK_* macros and XCR0. + */ + WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), + "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + + xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); @@ -217,13 +255,29 @@ static void __init setup_xstate_features(void) /* start at the beginnning of the "extended state" */ unsigned int last_good_offset = offsetof(struct xregs_state, extended_state_area); + /* + * The FP xstates and SSE xstates are legacy states. They are always + * in the fixed offsets in the xsave area in either compacted form + * or standard form. + */ + xstate_offsets[0] = 0; + xstate_sizes[0] = offsetof(struct fxregs_state, xmm_space); + xstate_offsets[1] = xstate_sizes[0]; + xstate_sizes[1] = FIELD_SIZEOF(struct fxregs_state, xmm_space); for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { if (!xfeature_enabled(i)) continue; cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_offsets[i] = ebx; + + /* + * If an xfeature is supervisor state, the offset + * in EBX is invalid. We leave it to -1. + */ + if (xfeature_is_user(i)) + xstate_offsets[i] = ebx; + xstate_sizes[i] = eax; /* * In our xstate size checks, we assume that the @@ -233,8 +287,6 @@ static void __init setup_xstate_features(void) WARN_ONCE(last_good_offset > xstate_offsets[i], "x86/fpu: misordered xstate at %d\n", last_good_offset); last_good_offset = xstate_offsets[i]; - - printk(KERN_INFO "x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", i, ebx, i, eax); } } @@ -262,6 +314,33 @@ static void __init print_xstate_features(void) print_xstate_feature(XFEATURE_MASK_PKRU); } +/* + * This check is important because it is easy to get XSTATE_* + * confused with XSTATE_BIT_*. + */ +#define CHECK_XFEATURE(nr) do { \ + WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ + WARN_ON(nr >= XFEATURE_MAX); \ +} while (0) + +/* + * We could cache this like xstate_size[], but we only use + * it here, so it would be a waste of space. 
+ */ +static int xfeature_is_aligned(int xfeature_nr) +{ + u32 eax, ebx, ecx, edx; + + CHECK_XFEATURE(xfeature_nr); + cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); + /* + * The value returned by ECX[1] indicates the alignment + * of state component 'i' when the compacted format + * of the extended region of an XSAVE area is used: + */ + return !!(ecx & 2); +} + /* * This function sets up offsets and sizes of all extended states in * xsave area. This supports both standard format and compacted format @@ -299,10 +378,29 @@ static void __init setup_xstate_comp(void) else xstate_comp_sizes[i] = 0; - if (i > FIRST_EXTENDED_XFEATURE) + if (i > FIRST_EXTENDED_XFEATURE) { xstate_comp_offsets[i] = xstate_comp_offsets[i-1] + xstate_comp_sizes[i-1]; + if (xfeature_is_aligned(i)) + xstate_comp_offsets[i] = + ALIGN(xstate_comp_offsets[i], 64); + } + } +} + +/* + * Print out xstate component offsets and sizes + */ +static void __init print_xstate_offset_size(void) +{ + int i; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i)) + continue; + pr_info("x86/fpu: xstate_offset[%d]: %4d, xstate_sizes[%d]: %4d\n", + i, xstate_comp_offsets[i], i, xstate_sizes[i]); } } @@ -322,13 +420,11 @@ static void __init setup_init_fpu_buf(void) setup_xstate_features(); print_xstate_features(); - if (boot_cpu_has(X86_FEATURE_XSAVES)) { + if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = (u64)1 << 63 | xfeatures_mask; - init_fpstate.xsave.header.xfeatures = xfeatures_mask; - } /* - * Init all the features state with header_bv being 0x0 + * Init all the features state with header.xfeatures being 0x0 */ copy_kernel_to_xregs_booting(&init_fpstate.xsave); @@ -339,58 +435,19 @@ static void __init setup_init_fpu_buf(void) copy_xregs_to_kernel_booting(&init_fpstate.xsave); } -static int xfeature_is_supervisor(int xfeature_nr) -{ - /* - * We currently do not support supervisor states, but if - * we did, we could find out like this. - * - * SDM says: If state component i is a user state component, - * ECX[0] return 0; if state component i is a supervisor - * state component, ECX[0] returns 1. - u32 eax, ebx, ecx, edx; - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx; - return !!(ecx & 1); - */ - return 0; -} -/* -static int xfeature_is_user(int xfeature_nr) -{ - return !xfeature_is_supervisor(xfeature_nr); -} -*/ - -/* - * This check is important because it is easy to get XSTATE_* - * confused with XSTATE_BIT_*. - */ -#define CHECK_XFEATURE(nr) do { \ - WARN_ON(nr < FIRST_EXTENDED_XFEATURE); \ - WARN_ON(nr >= XFEATURE_MAX); \ -} while (0) - -/* - * We could cache this like xstate_size[], but we only use - * it here, so it would be a waste of space. - */ -static int xfeature_is_aligned(int xfeature_nr) +static int xfeature_uncompacted_offset(int xfeature_nr) { u32 eax, ebx, ecx, edx; - CHECK_XFEATURE(xfeature_nr); - cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); /* - * The value returned by ECX[1] indicates the alignment - * of state component i when the compacted format - * of the extended region of an XSAVE area is used + * Only XSAVES supports supervisor states and it uses compacted + * format. Checking a supervisor state's uncompacted offset is + * an error. 
*/ - return !!(ecx & 2); -} - -static int xfeature_uncompacted_offset(int xfeature_nr) -{ - u32 eax, ebx, ecx, edx; + if (XFEATURE_MASK_SUPERVISOR & (1 << xfeature_nr)) { + WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); + return -1; + } CHECK_XFEATURE(xfeature_nr); cpuid_count(XSTATE_CPUID, xfeature_nr, &eax, &ebx, &ecx, &edx); @@ -415,7 +472,7 @@ static int xfeature_size(int xfeature_nr) * that it is obvious which aspect of 'XSAVES' is being handled * by the calling code. */ -static int using_compacted_format(void) +int using_compacted_format(void) { return boot_cpu_has(X86_FEATURE_XSAVES); } @@ -530,11 +587,12 @@ static void do_extra_xstate_size_checks(void) */ paranoid_xstate_size += xfeature_size(i); } - XSTATE_WARN_ON(paranoid_xstate_size != xstate_size); + XSTATE_WARN_ON(paranoid_xstate_size != fpu_kernel_xstate_size); } + /* - * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0/xfeatures_mask. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer @@ -544,34 +602,33 @@ static void do_extra_xstate_size_checks(void) * Note that we do not currently set any bits on IA32_XSS so * 'XCR0 | IA32_XSS == XCR0' for now. */ -static unsigned int __init calculate_xstate_size(void) +static unsigned int __init get_xsaves_size(void) { unsigned int eax, ebx, ecx, edx; - unsigned int calculated_xstate_size; + /* + * - CPUID function 0DH, sub-function 1: + * EBX enumerates the size (in bytes) required by + * the XSAVES instruction for an XSAVE area + * containing all the state components + * corresponding to bits currently set in + * XCR0 | IA32_XSS. + */ + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + return ebx; +} - if (!boot_cpu_has(X86_FEATURE_XSAVES)) { - /* - * - CPUID function 0DH, sub-function 0: - * EBX enumerates the size (in bytes) required by - * the XSAVE instruction for an XSAVE area - * containing all the *user* state components - * corresponding to bits currently set in XCR0. - */ - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } else { - /* - * - CPUID function 0DH, sub-function 1: - * EBX enumerates the size (in bytes) required by - * the XSAVES instruction for an XSAVE area - * containing all the state components - * corresponding to bits currently set in - * XCR0 | IA32_XSS. - */ - cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); - calculated_xstate_size = ebx; - } - return calculated_xstate_size; +static unsigned int __init get_xsave_size(void) +{ + unsigned int eax, ebx, ecx, edx; + /* + * - CPUID function 0DH, sub-function 0: + * EBX enumerates the size (in bytes) required by + * the XSAVE instruction for an XSAVE area + * containing all the *user* state components + * corresponding to bits currently set in XCR0. 
+ */ + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + return ebx; } /* @@ -591,7 +648,15 @@ static bool is_supported_xstate_size(unsigned int test_xstate_size) static int init_xstate_size(void) { /* Recompute the context size for enabled features: */ - unsigned int possible_xstate_size = calculate_xstate_size(); + unsigned int possible_xstate_size; + unsigned int xsave_size; + + xsave_size = get_xsave_size(); + + if (boot_cpu_has(X86_FEATURE_XSAVES)) + possible_xstate_size = get_xsaves_size(); + else + possible_xstate_size = xsave_size; /* Ensure we have the space to store all enabled: */ if (!is_supported_xstate_size(possible_xstate_size)) @@ -601,8 +666,13 @@ static int init_xstate_size(void) * The size is OK, we are definitely going to use xsave, * make it known to the world that we need more space. */ - xstate_size = possible_xstate_size; + fpu_kernel_xstate_size = possible_xstate_size; do_extra_xstate_size_checks(); + + /* + * User space is always in standard format. + */ + fpu_user_xstate_size = xsave_size; return 0; } @@ -644,8 +714,13 @@ void __init fpu__init_system_xstate(void) xfeatures_mask = eax + ((u64)edx << 32); if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* + * This indicates that something really unexpected happened + * with the enumeration. Disable XSAVE and try to continue + * booting without it. This is too early to BUG(). + */ pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); - BUG(); + goto out_disable; } xfeatures_mask &= fpu__get_supported_xfeatures_mask(); @@ -653,21 +728,29 @@ void __init fpu__init_system_xstate(void) /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); err = init_xstate_size(); - if (err) { - /* something went wrong, boot without any XSAVE support */ - fpu__init_disable_system_xstate(); - return; - } + if (err) + goto out_disable; + + /* + * Update info used for ptrace frames; use standard-format size and no + * supervisor xstates: + */ + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); - update_regset_xstate_info(xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp(); + print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", xfeatures_mask, - xstate_size, + fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard"); + return; + +out_disable: + /* something went wrong, try to boot without any XSAVE support */ + fpu__init_disable_system_xstate(); } /* @@ -693,6 +776,11 @@ void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) { int feature_nr = fls64(xstate_feature_mask) - 1; + if (!xfeature_enabled(feature_nr)) { + WARN_ON_FPU(1); + return NULL; + } + return (void *)xsave + xstate_comp_offsets[feature_nr]; } /* @@ -778,155 +866,212 @@ const void *get_xsave_field_ptr(int xsave_state) return get_xsave_addr(&fpu->state.xsave, xsave_state); } +#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) +#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) /* - * Set xfeatures (aka XSTATE_BV) bit for a feature that we want - * to take out of its "init state". This will ensure that an - * XRSTOR actually restores the state. + * This will go out and modify PKRU register to set the access + * rights for @pkey to @init_val. 
*/ -static void fpu__xfeature_set_non_init(struct xregs_state *xsave, - int xstate_feature_mask) +int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, + unsigned long init_val) { - xsave->header.xfeatures |= xstate_feature_mask; + u32 old_pkru; + int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); + u32 new_pkru_bits = 0; + + /* + * This check implies XSAVE support. OSPKE only gets + * set if we enable XSAVE and we enable PKU in XCR0. + */ + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return -EINVAL; + /* + * For most XSAVE components, this would be an arduous task: + * brining fpstate up to date with fpregs, updating fpstate, + * then re-populating fpregs. But, for components that are + * never lazily managed, we can just access the fpregs + * directly. PKRU is never managed lazily, so we can just + * manipulate it directly. Make sure it stays that way. + */ + WARN_ON_ONCE(!use_eager_fpu()); + + /* Set the bits we need in PKRU: */ + if (init_val & PKEY_DISABLE_ACCESS) + new_pkru_bits |= PKRU_AD_BIT; + if (init_val & PKEY_DISABLE_WRITE) + new_pkru_bits |= PKRU_WD_BIT; + + /* Shift the bits in to the correct place in PKRU for pkey: */ + new_pkru_bits <<= pkey_shift; + + /* Get old PKRU and mask off any old bits in place: */ + old_pkru = read_pkru(); + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); + + /* Write old part along with new part: */ + write_pkru(old_pkru | new_pkru_bits); + + return 0; } /* - * This function is safe to call whether the FPU is in use or not. - * - * Note that this only works on the current task. - * - * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) - * @xsave_state_ptr: a pointer to a copy of the state that you would - * like written in to the current task's FPU xsave state. This pointer - * must not be located in the current tasks's xsave area. - * Output: - * address of the state in the xsave area or NULL if the state - * is not present or is in its 'init state'. + * This is similar to user_regset_copyout(), but will not add offset to + * the source data pointer or increment pos, count, kbuf, and ubuf. */ -static void fpu__xfeature_set_state(int xstate_feature_mask, - void *xstate_feature_src, size_t len) +static inline int xstate_copyout(unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf, + const void *data, const int start_pos, + const int end_pos) { - struct xregs_state *xsave = ¤t->thread.fpu.state.xsave; - struct fpu *fpu = ¤t->thread.fpu; - void *dst; + if ((count == 0) || (pos < start_pos)) + return 0; - if (!boot_cpu_has(X86_FEATURE_XSAVE)) { - WARN_ONCE(1, "%s() attempted with no xsave support", __func__); - return; + if (end_pos < 0 || pos < end_pos) { + unsigned int copy = (end_pos < 0 ? count : min(count, end_pos - pos)); + + if (kbuf) { + memcpy(kbuf + pos, data, copy); + } else { + if (__copy_to_user(ubuf + pos, data, copy)) + return -EFAULT; + } } + return 0; +} + +/* + * Convert from kernel XSAVES compacted format to standard format and copy + * to a ptrace buffer. It supports partial copy but pos always starts from + * zero. This is called from xstateregs_get() and there we check the CPU + * has XSAVES. 
+ */ +int copyout_from_xsaves(unsigned int pos, unsigned int count, void *kbuf, + void __user *ubuf, struct xregs_state *xsave) +{ + unsigned int offset, size; + int ret, i; + struct xstate_header header; /* - * Tell the FPU code that we need the FPU state to be in - * 'fpu' (not in the registers), and that we need it to - * be stable while we write to it. + * Currently copy_regset_to_user() starts from pos 0: */ - fpu__current_fpstate_write_begin(); + if (unlikely(pos != 0)) + return -EFAULT; /* - * This method *WILL* *NOT* work for compact-format - * buffers. If the 'xstate_feature_mask' is unset in - * xcomp_bv then we may need to move other feature state - * "up" in the buffer. + * The destination is a ptrace buffer; we put in only user xstates: */ - if (xsave->header.xcomp_bv & xstate_feature_mask) { - WARN_ON_ONCE(1); - goto out; - } - - /* find the location in the xsave buffer of the desired state */ - dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); + memset(&header, 0, sizeof(header)); + header.xfeatures = xsave->header.xfeatures; + header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; /* - * Make sure that the pointer being passed in did not - * come from the xsave buffer itself. + * Copy xregs_state->header: */ - WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); + offset = offsetof(struct xregs_state, header); + size = sizeof(header); - /* put the caller-provided data in the location */ - memcpy(dst, xstate_feature_src, len); + ret = xstate_copyout(offset, size, kbuf, ubuf, &header, 0, count); + + if (ret) + return ret; + + for (i = 0; i < XFEATURE_MAX; i++) { + /* + * Copy only in-use xstates: + */ + if ((header.xfeatures >> i) & 1) { + void *src = __raw_xsave_addr(xsave, 1 << i); + + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + ret = xstate_copyout(offset, size, kbuf, ubuf, src, 0, count); + + if (ret) + return ret; + + if (offset + size >= count) + break; + } + + } /* - * Mark the xfeature so that the CPU knows there is state - * in the buffer now. - */ - fpu__xfeature_set_non_init(xsave, xstate_feature_mask); -out: - /* - * We are done writing to the 'fpu'. Reenable preeption - * and (possibly) move the fpstate back in to the fpregs. + * Fill xsave->i387.sw_reserved value for ptrace frame: */ - fpu__current_fpstate_write_end(); -} + offset = offsetof(struct fxregs_state, sw_reserved); + size = sizeof(xstate_fx_sw_bytes); -#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) -#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) + ret = xstate_copyout(offset, size, kbuf, ubuf, xstate_fx_sw_bytes, 0, count); + + if (ret) + return ret; + + return 0; +} /* - * This will go out and modify the XSAVE buffer so that PKRU is - * set to a particular state for access to 'pkey'. - * - * PKRU state does affect kernel access to user memory. We do - * not modfiy PKRU *itself* here, only the XSAVE state that will - * be restored in to PKRU when we return back to userspace. + * Convert from a ptrace standard-format buffer to kernel XSAVES format + * and copy to the target thread. This is called from xstateregs_set() and + * there we check the CPU has XSAVES and a whole standard-sized buffer + * exists. 
*/ -int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, - unsigned long init_val) +int copyin_to_xsaves(const void *kbuf, const void __user *ubuf, + struct xregs_state *xsave) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; - struct pkru_state *old_pkru_state; - struct pkru_state new_pkru_state; - int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); - u32 new_pkru_bits = 0; + unsigned int offset, size; + int i; + u64 xfeatures; + u64 allowed_features; + + offset = offsetof(struct xregs_state, header); + size = sizeof(xfeatures); + + if (kbuf) { + memcpy(&xfeatures, kbuf + offset, size); + } else { + if (__copy_from_user(&xfeatures, ubuf + offset, size)) + return -EFAULT; + } /* - * This check implies XSAVE support. OSPKE only gets - * set if we enable XSAVE and we enable PKU in XCR0. + * Reject if the user sets any disabled or supervisor features: */ - if (!boot_cpu_has(X86_FEATURE_OSPKE)) + allowed_features = xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR; + + if (xfeatures & ~allowed_features) return -EINVAL; - /* Set the bits we need in PKRU */ - if (init_val & PKEY_DISABLE_ACCESS) - new_pkru_bits |= PKRU_AD_BIT; - if (init_val & PKEY_DISABLE_WRITE) - new_pkru_bits |= PKRU_WD_BIT; + for (i = 0; i < XFEATURE_MAX; i++) { + u64 mask = ((u64)1 << i); - /* Shift the bits in to the correct place in PKRU for pkey. */ - new_pkru_bits <<= pkey_shift; + if (xfeatures & mask) { + void *dst = __raw_xsave_addr(xsave, 1 << i); - /* Locate old copy of the state in the xsave buffer */ - old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); + offset = xstate_offsets[i]; + size = xstate_sizes[i]; + + if (kbuf) { + memcpy(dst, kbuf + offset, size); + } else { + if (__copy_from_user(dst, ubuf + offset, size)) + return -EFAULT; + } + } + } /* - * When state is not in the buffer, it is in the init - * state, set it manually. Otherwise, copy out the old - * state. + * The state that came in from userspace was user-state only. + * Mask all the user states out of 'xfeatures': */ - if (!old_pkru_state) - new_pkru_state.pkru = 0; - else - new_pkru_state.pkru = old_pkru_state->pkru; - - /* mask off any old bits in place */ - new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); - /* Set the newly-requested bits */ - new_pkru_state.pkru |= new_pkru_bits; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; /* - * We could theoretically live without zeroing pkru.pad. - * The current XSAVE feature state definition says that - * only bytes 0->3 are used. But we do not want to - * chance leaking kernel stack out to userspace in case a - * memcpy() of the whole xsave buffer was done. - * - * They're in the same cacheline anyway. 
+ * Add back in the features that came in from userspace: */ - new_pkru_state.pad = 0; - - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, - sizeof(new_pkru_state)); + xsave->header.xfeatures |= xfeatures; return 0; } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d784bb547..f16c55bfc 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void) /* Initialize 32bit specific setup functions */ x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; - - reserve_ebda_region(); } asmlinkage __visible void __init i386_start_kernel(void) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index b72fb0b71..54a2372f5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); x86_early_init_platform_quirks(); - reserve_ebda_region(); switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_INTEL_MID: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 5df831ef1..9f8efc9f0 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -38,7 +38,7 @@ #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) -L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET) +L4_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) @@ -299,6 +299,7 @@ ENTRY(secondary_startup_64) pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq +ENDPROC(secondary_startup_64) #include "verify_cpu.S" diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index f112af7aa..c6dfd801d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -710,31 +710,29 @@ static void hpet_work(struct work_struct *w) complete(&hpet_work->complete); } -static int hpet_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) +static int hpet_cpuhp_online(unsigned int cpu) { - unsigned long cpu = (unsigned long)hcpu; struct hpet_work_struct work; + + INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + destroy_delayed_work_on_stack(&work.work); + return 0; +} + +static int hpet_cpuhp_dead(unsigned int cpu) +{ struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work); - init_completion(&work.complete); - /* FIXME: add schedule_work_on() */ - schedule_delayed_work_on(cpu, &work.work, 0); - wait_for_completion(&work.complete); - destroy_delayed_work_on_stack(&work.work); - break; - case CPU_DEAD: - if (hdev) { - free_irq(hdev->irq, hdev); - hdev->flags &= ~HPET_DEV_USED; - per_cpu(cpu_hpet_dev, cpu) = NULL; - } - break; - } - return NOTIFY_OK; + if (!hdev) + return 0; + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + return 0; } #else @@ -750,11 +748,8 @@ static void hpet_reserve_msi_timers(struct hpet_data *hd) } #endif -static int hpet_cpuhp_notify(struct notifier_block *n, - unsigned long action, void *hcpu) -{ - return NOTIFY_OK; -} +#define hpet_cpuhp_online NULL +#define hpet_cpuhp_dead NULL #endif @@ -931,7 +926,7 @@ out_nohpet: */ static 
__init int hpet_late_init(void) { - int cpu; + int ret; if (boot_hpet_disable) return -ENODEV; @@ -961,16 +956,20 @@ static __init int hpet_late_init(void) if (boot_cpu_has(X86_FEATURE_ARAT)) return 0; - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) { - hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); - } - /* This notifier should be called after workqueue is ready */ - __hotcpu_notifier(hpet_cpuhp_notify, -20); - cpu_notifier_register_done(); - + ret = cpuhp_setup_state(CPUHP_AP_X86_HPET_ONLINE, "AP_X86_HPET_ONLINE", + hpet_cpuhp_online, NULL); + if (ret) + return ret; + ret = cpuhp_setup_state(CPUHP_X86_HPET_DEAD, "X86_HPET_DEAD", NULL, + hpet_cpuhp_dead); + if (ret) + goto err_cpuhp; return 0; + +err_cpuhp: + cpuhp_remove_state(CPUHP_AP_X86_HPET_ONLINE); + return ret; } fs_initcall(hpet_late_init); @@ -1020,7 +1019,6 @@ void hpet_disable(void) */ #include #include -#include #define DEFAULT_RTC_INT_FREQ 64 #define DEFAULT_RTC_SHIFT 6 @@ -1244,7 +1242,7 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) memset(&curr_time, 0, sizeof(struct rtc_time)); if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) - get_rtc_time(&curr_time); + mc146818_get_time(&curr_time); if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 2bcfb5f2b..8771766d4 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -36,13 +36,14 @@ #include #include #include -#include +#include #include #include #include #include #include +#include /* Per cpu debug control register value */ DEFINE_PER_CPU(unsigned long, cpu_dr7); diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa48..1f9b878ef 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,4 +1,5 @@ -#include +#include +#include #include #include @@ -42,3 +43,5 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index efb82f07b..6ebe00cb4 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -3,7 +3,7 @@ * */ #include -#include +#include #include #include diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index a979b5bd2..50c89e8a9 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -6,7 +6,7 @@ * outb_p/inb_p API uses. */ #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 61521dc19..9f669fdd2 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - - irq_stats(j)->irq_tlb_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index c627bf8d9..1f38d9a4d 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -8,7 +8,6 @@ * io_apic.c.) 
*/ -#include #include #include #include diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 206d0b90a..4a7903714 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index dc1404bf8..bdb83e431 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -8,7 +8,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index eea2a6f72..1726c4c12 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -21,7 +21,7 @@ */ #include -#include +#include #include #include #include @@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_info("kvm-stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfbd2..3692249a7 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -289,6 +289,7 @@ void __init kvmclock_init(void) put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 97340f2c4..068c4a929 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 04b132a76..bfe4d6c96 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 33ee3e0ef..1939a0269 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -3,7 +3,7 @@ * compiled in a FTRACE-compatible way. 
*/ #include -#include +#include #include #include diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 078c933f9..1acfd76e3 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -19,7 +19,8 @@ */ #include -#include +#include +#include #include #include #include diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 833b1d329..5d400ba13 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -340,7 +340,7 @@ static inline struct iommu_table *find_iommu_table(struct device *dev) static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems,enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -364,7 +364,7 @@ static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); struct scatterlist *s; @@ -396,7 +396,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, return nelems; error: - calgary_unmap_sg(dev, sg, nelems, dir, NULL); + calgary_unmap_sg(dev, sg, nelems, dir, 0); for_each_sg(sg, s, nelems, i) { sg->dma_address = DMA_ERROR_CODE; sg->dma_length = 0; @@ -407,7 +407,7 @@ error: static dma_addr_t calgary_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { void *vaddr = page_address(page) + offset; unsigned long uaddr; @@ -422,7 +422,7 @@ static dma_addr_t calgary_map_page(struct device *dev, struct page *page, static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; @@ -432,7 +432,7 @@ static void calgary_unmap_page(struct device *dev, dma_addr_t dma_addr, } static void* calgary_alloc_coherent(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) + dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { void *ret = NULL; dma_addr_t mapping; @@ -466,7 +466,7 @@ error: static void calgary_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned int npages; struct iommu_table *tbl = find_iommu_table(dev); diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 6ba014c61..d30c37750 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -77,7 +77,7 @@ void __init pci_iommu_alloc(void) } void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag, - struct dma_attrs *attrs) + unsigned long attrs) { unsigned long dma_mask; struct page *page; @@ -120,7 +120,7 @@ again: } void dma_generic_free_coherent(struct device *dev, size_t size, void *vaddr, - dma_addr_t dma_addr, struct dma_attrs *attrs) + dma_addr_t dma_addr, unsigned long attrs) { unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct page *page = virt_to_page(vaddr); diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index da15918d1..00e71ce39 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -28,7 +28,7 @@ 
check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) static dma_addr_t nommu_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { dma_addr_t bus = page_to_phys(page) + offset; WARN_ON(size == 0); @@ -55,7 +55,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page, */ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, enum dma_data_direction dir, - struct dma_attrs *attrs) + unsigned long attrs) { struct scatterlist *s; int i; diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 7c577a178..b47edb8f5 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -16,7 +16,7 @@ int swiotlb __read_mostly; void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, - struct dma_attrs *attrs) + unsigned long attrs) { void *vaddr; @@ -37,7 +37,7 @@ void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, void x86_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, - struct dma_attrs *attrs) + unsigned long attrs) { if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) swiotlb_free_coherent(dev, size, vaddr, dma_addr); diff --git a/arch/x86/kernel/platform-quirks.c b/arch/x86/kernel/platform-quirks.c index b2f8a33b3..24a50301f 100644 --- a/arch/x86/kernel/platform-quirks.c +++ b/arch/x86/kernel/platform-quirks.c @@ -7,12 +7,12 @@ void __init x86_early_init_platform_quirks(void) { x86_platform.legacy.rtc = 1; - x86_platform.legacy.ebda_search = 0; + x86_platform.legacy.reserve_bios_regions = 0; x86_platform.legacy.devices.pnpbios = 1; switch (boot_params.hdr.hardware_subarch) { case X86_SUBARCH_PC: - x86_platform.legacy.ebda_search = 1; + x86_platform.legacy.reserve_bios_regions = 1; break; case X86_SUBARCH_XEN: case X86_SUBARCH_LGUEST: diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c index 92f70147a..0c5315d32 100644 --- a/arch/x86/kernel/pmem.c +++ b/arch/x86/kernel/pmem.c @@ -3,7 +3,7 @@ * Copyright (c) 2015, Intel Corporation. 
*/ #include -#include +#include #include static int found(u64 start, u64 end, void *data) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 96becbbb5..62c0b0ea2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,7 +7,8 @@ #include #include #include -#include +#include +#include #include #include #include @@ -404,7 +405,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 9f9509175..d86be29c3 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6e789ca1f..a21068e49 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -110,12 +110,13 @@ void __show_regs(struct pt_regs *regs, int all) get_debugreg(d7, 7); /* Only print out debug registers if they are in their non-default state. */ - if ((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && - (d6 == DR6_RESERVED) && (d7 == 0x400)) - return; - - printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); - printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) && + (d6 == DR6_RESERVED) && (d7 == 0x400))) { + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", + d0, d1, d2); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", + d3, d6, d7); + } if (boot_cpu_has(X86_FEATURE_OSPKE)) printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru()); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 600edd225..a1606eadd 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return sp; prev_esp = (u32 *)(context); - if (prev_esp) - return (unsigned long)prev_esp; + if (*prev_esp) + return (unsigned long)*prev_esp; return (unsigned long)regs; } @@ -923,15 +923,18 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) case offsetof(struct user32, regs.orig_eax): /* - * A 32-bit debugger setting orig_eax means to restore - * the state of the task restarting a 32-bit syscall. - * Make sure we interpret the -ERESTART* codes correctly - * in case the task is not actually still sitting at the - * exit from a 32-bit syscall with TS_COMPAT still set. + * Warning: bizarre corner case fixup here. A 32-bit + * debugger setting orig_eax to -1 wants to disable + * syscall restart. Make sure that the syscall + * restart code sign-extends orig_ax. Also make sure + * we interpret the -ERESTART* codes correctly if + * loaded into regs->ax in case the task is not + * actually still sitting at the exit from a 32-bit + * syscall with TS_COMPAT still set. 
*/ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - task_thread_info(child)->status |= TS_COMPAT; + task_thread_info(child)->status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 06c58ce46..3599404e3 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -64,14 +64,9 @@ u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) u8 flags; do { - version = src->version; - /* Make the latest version visible */ - smp_rmb(); - + version = pvclock_read_begin(src); flags = src->flags; - /* Make sure that the version double-check is last. */ - smp_rmb(); - } while ((src->version & 1) || version != src->version); + } while (pvclock_read_retry(src, version)); return flags & valid_flags; } @@ -84,10 +79,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) u8 flags; do { - version = __pvclock_read_cycles(src, &ret, &flags); - /* Make sure that the version double-check is last. */ - smp_rmb(); - } while ((src->version & 1) || version != src->version); + version = pvclock_read_begin(src); + ret = __pvclock_read_cycles(src); + flags = src->flags; + } while (pvclock_read_retry(src, version)); if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) { src->flags &= ~PVCLOCK_GUEST_STOPPED; diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a9b31eb81..63bf27d97 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -1,6 +1,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include +#include #include #include #include @@ -54,6 +54,19 @@ bool port_cf9_safe = false; * Dell Inc. so their systems "just work". :-) */ +/* + * Some machines require the "reboot=a" commandline options + */ +static int __init set_acpi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_ACPI) { + reboot_type = BOOT_ACPI; + pr_info("%s series board detected. Selecting %s-method for reboots.\n", + d->ident, "ACPI"); + } + return 0; +} + /* * Some machines require the "reboot=b" or "reboot=k" commandline options, * this quirk makes that automatic. 
@@ -395,6 +408,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Dell XPS710"), }, }, + { /* Handle problems with rebooting on Dell Optiplex 7450 AIO */ + .callback = set_acpi_reboot, + .ident = "Dell OptiPlex 7450 AIO", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 7450 AIO"), + }, + }, /* Hewlett-Packard */ { /* Handle problems with rebooting on HP laptops */ diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index eceaa082e..79c6311cd 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #ifdef CONFIG_X86_32 @@ -47,7 +46,7 @@ int mach_set_rtc_mmss(const struct timespec *now) rtc_time_to_tm(nowtime, &tm); if (!rtc_valid_tm(&tm)) { - retval = set_rtc_time(&tm); + retval = mc146818_set_time(&tm); if (retval) printk(KERN_ERR "%s: RTC write failed with error %d\n", __func__, retval); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4e7b3991..98c9cd6f3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include @@ -113,6 +113,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -399,10 +400,6 @@ static void __init reserve_initrd(void) memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } -static void __init early_initrd_acpi_init(void) -{ - early_acpi_table_init((void *)initrd_start, initrd_end - initrd_start); -} #else static void __init early_reserve_initrd(void) { @@ -410,9 +407,6 @@ static void __init early_reserve_initrd(void) static void __init reserve_initrd(void) { } -static void __init early_initrd_acpi_init(void) -{ -} #endif /* CONFIG_BLK_DEV_INITRD */ static void __init parse_setup_data(void) @@ -1059,6 +1053,12 @@ void __init setup_arch(char **cmdline_p) max_possible_pfn = max_pfn; + /* + * Define random base addresses for memory sections after max_pfn is + * defined and before each memory section base is used. + */ + kernel_randomize_memory(); + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); @@ -1101,6 +1101,8 @@ void __init setup_arch(char **cmdline_p) efi_find_mirror(); } + reserve_bios_regions(); + /* * The EFI specification says that boot service code won't be called * after ExitBootServices(). This is, in fact, a lie. @@ -1129,7 +1131,13 @@ void __init setup_arch(char **cmdline_p) early_trap_pf_init(); - setup_real_mode(); + /* + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) + * with the current CR4 value. This may not be necessary, but + * auditing all the early-boot CR4 manipulation would be needed to + * rule it out. 
+ */ + mmu_cr4_features = __read_cr4_safe(); memblock_set_current_limit(get_max_mapped()); @@ -1146,7 +1154,7 @@ void __init setup_arch(char **cmdline_p) reserve_initrd(); - early_initrd_acpi_init(); + acpi_table_upgrade(); vsmp_init(); @@ -1178,13 +1186,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); - if (boot_cpu_data.cpuid_level >= 0) { - /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = __read_cr4(); - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - } - #ifdef CONFIG_X86_32 /* sync back kernel address range */ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e4fcb87ba..7a40e0683 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -236,6 +236,8 @@ void __init setup_per_cpu_areas(void) early_per_cpu_map(x86_cpu_to_apicid, cpu); per_cpu(x86_bios_cpu_apicid, cpu) = early_per_cpu_map(x86_bios_cpu_apicid, cpu); + per_cpu(x86_cpu_to_acpiid, cpu) = + early_per_cpu_map(x86_cpu_to_acpiid, cpu); #endif #ifdef CONFIG_X86_32 per_cpu(x86_cpu_to_logical_apicid, cpu) = @@ -271,6 +273,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_LOCAL_APIC early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; + early_per_cpu_ptr(x86_cpu_to_acpiid) = NULL; #endif #ifdef CONFIG_X86_32 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 22cc2f9f8..04cb3212d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -146,7 +146,7 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); - err |= fpu__restore_sig(buf, config_enabled(CONFIG_X86_32)); + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -245,14 +245,14 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, struct fpu *fpu = ¤t->thread.fpu; /* redzone */ - if (config_enabled(CONFIG_X86_64)) + if (IS_ENABLED(CONFIG_X86_64)) sp -= 128; /* This is the X/Open sanctioned signal stack switching. 
*/ if (ka->sa.sa_flags & SA_ONSTACK) { if (sas_ss_flags(sp) == 0) sp = current->sas_ss_sp + current->sas_ss_size; - } else if (config_enabled(CONFIG_X86_32) && + } else if (IS_ENABLED(CONFIG_X86_32) && !onsigstack && (regs->ss & 0xffff) != __USER_DS && !(ka->sa.sa_flags & SA_RESTORER) && @@ -262,7 +262,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } if (fpu->fpstate_active) { - sp = fpu__alloc_mathframe(sp, config_enabled(CONFIG_X86_32), + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), &buf_fx, &math_size); *fpstate = (void __user *)sp; } @@ -662,18 +662,18 @@ badframe: static inline int is_ia32_compat_frame(void) { - return config_enabled(CONFIG_IA32_EMULATION) && + return IS_ENABLED(CONFIG_IA32_EMULATION) && test_thread_flag(TIF_IA32); } static inline int is_ia32_frame(void) { - return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); + return IS_ENABLED(CONFIG_X86_32) || is_ia32_compat_frame(); } static inline int is_x32_frame(void) { - return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); + return IS_ENABLED(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); } static int @@ -760,8 +760,30 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) { -#ifdef CONFIG_X86_64 - if (in_ia32_syscall()) + /* + * This function is fundamentally broken as currently + * implemented. + * + * The idea is that we want to trigger a call to the + * restart_block() syscall and that we want in_ia32_syscall(), + * in_x32_syscall(), etc. to match whatever they were in the + * syscall being restarted. We assume that the syscall + * instruction at (regs->ip - 2) matches whatever syscall + * instruction we used to enter in the first place. + * + * The problem is that we can get here when ptrace pokes + * syscall-like values into regs even if we're not in a syscall + * at all. + * + * For now, we maintain historical behavior and guess based on + * stored state. We could do better by saving the actual + * syscall arch in restart_block or (with caveats on x32) by + * checking if regs->ip points to 'int $0x80'. The current + * behavior is incorrect if a tracer has a different bitness + * than the tracee. + */ +#ifdef CONFIG_IA32_EMULATION + if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index dc3c0b1c8..b44564bf8 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,11 +1,104 @@ #include #include +/* + * The compat_siginfo_t structure and handing code is very easy + * to break in several ways. It must always be updated when new + * updates are made to the main siginfo_t, and + * copy_siginfo_to_user32() must be updated when the + * (arch-independent) copy_siginfo_to_user() is updated. + * + * It is also easy to put a new member in the compat_siginfo_t + * which has implicit alignment which can move internal structure + * alignment around breaking the ABI. This can happen if you, + * for instance, put a plain 64-bit value in there. + */ +static inline void signal_compat_build_tests(void) +{ + int _sifields_offset = offsetof(compat_siginfo_t, _sifields); + + /* + * If adding a new si_code, there is probably new data in + * the siginfo. Make sure folks bumping the si_code + * limits also have to look at this code. 
Make sure any + * new fields are handled in copy_siginfo_to_user32()! + */ + BUILD_BUG_ON(NSIGILL != 8); + BUILD_BUG_ON(NSIGFPE != 8); + BUILD_BUG_ON(NSIGSEGV != 4); + BUILD_BUG_ON(NSIGBUS != 5); + BUILD_BUG_ON(NSIGTRAP != 4); + BUILD_BUG_ON(NSIGCHLD != 6); + BUILD_BUG_ON(NSIGSYS != 1); + + /* This is part of the ABI and can never change in size: */ + BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + /* + * The offsets of all the (unioned) si_fields are fixed + * in the ABI, of course. Make sure none of them ever + * move and are always at the beginning: + */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); +#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + + /* + * Ensure that the size of each si_field never changes. + * If it does, it is a sign that the + * copy_siginfo_to_user32() code below needs to updated + * along with the size in the CHECK_SI_SIZE(). + * + * We repeat this check for both the generic and compat + * siginfos. + * + * Note: it is OK for these to grow as long as the whole + * structure stays within the padding size (checked + * above). + */ +#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name)) +#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name)) + + CHECK_CSI_OFFSET(_kill); + CHECK_CSI_SIZE (_kill, 2*sizeof(int)); + CHECK_SI_SIZE (_kill, 2*sizeof(int)); + + CHECK_CSI_OFFSET(_timer); + CHECK_CSI_SIZE (_timer, 5*sizeof(int)); + CHECK_SI_SIZE (_timer, 6*sizeof(int)); + + CHECK_CSI_OFFSET(_rt); + CHECK_CSI_SIZE (_rt, 3*sizeof(int)); + CHECK_SI_SIZE (_rt, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld); + CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); + CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld_x32); + CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); + /* no _sigchld_x32 in the generic siginfo_t */ + + CHECK_CSI_OFFSET(_sigfault); + CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); + CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigpoll); + CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); + CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigsys); + CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); + CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + + /* any new si_fields should be added here */ +} + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) { int err = 0; bool ia32 = test_thread_flag(TIF_IA32); + signal_compat_build_tests(); + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; @@ -32,6 +125,21 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) &to->_sifields._pad[0]); switch (from->si_code >> 16) { case __SI_FAULT >> 16: + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || + from->si_code == BUS_MCEERR_AO)) + put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); + + if (from->si_signo == SIGSEGV) { + if (from->si_code == SEGV_BNDERR) { + compat_uptr_t lower = (unsigned long)&to->si_lower; + compat_uptr_t upper = (unsigned long)&to->si_upper; + put_user_ex(lower, &to->si_lower); + put_user_ex(upper, &to->si_upper); + } + if (from->si_code == SEGV_PKUERR) + put_user_ex(from->si_pkey, &to->si_pkey); + } break; case __SI_SYS >> 16: put_user_ex(from->si_syscall, &to->si_syscall); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fafe8b923..82b17373b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -43,7 +43,7 @@ #include 
#include -#include +#include #include #include #include @@ -100,10 +100,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); /* Logical package management. We might want to allocate that dynamically */ static int *physical_to_logical_pkg __read_mostly; static unsigned long *physical_package_map __read_mostly;; -static unsigned long *logical_package_map __read_mostly; static unsigned int max_physical_pkg_id __read_mostly; unsigned int __max_logical_packages __read_mostly; EXPORT_SYMBOL(__max_logical_packages); +static unsigned int logical_packages __read_mostly; +static bool logical_packages_frozen __read_mostly; + +/* Maximum number of SMT threads on any online core */ +int __max_smt_threads __read_mostly; static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { @@ -274,14 +278,14 @@ int topology_update_package_map(unsigned int apicid, unsigned int cpu) if (test_and_set_bit(pkg, physical_package_map)) goto found; - new = find_first_zero_bit(logical_package_map, __max_logical_packages); - if (new >= __max_logical_packages) { + if (logical_packages_frozen) { physical_to_logical_pkg[pkg] = -1; - pr_warn("APIC(%x) Package %u exceeds logical package map\n", + pr_warn("APIC(%x) Package %u exceeds logical package max\n", apicid, pkg); return -ENOSPC; } - set_bit(new, logical_package_map); + + new = logical_packages++; pr_info("APIC(%x) Converting physical %u to logical package %u\n", apicid, pkg, new); physical_to_logical_pkg[pkg] = new; @@ -338,6 +342,7 @@ static void __init smp_init_package_map(void) } __max_logical_packages = DIV_ROUND_UP(total_cpus, ncpus); + logical_packages = 0; /* * Possibly larger than what we need as the number of apic ids per @@ -349,10 +354,6 @@ static void __init smp_init_package_map(void) memset(physical_to_logical_pkg, 0xff, size); size = BITS_TO_LONGS(max_physical_pkg_id) * sizeof(unsigned long); physical_package_map = kzalloc(size, GFP_KERNEL); - size = BITS_TO_LONGS(__max_logical_packages) * sizeof(unsigned long); - logical_package_map = kzalloc(size, GFP_KERNEL); - - pr_info("Max logical packages: %u\n", __max_logical_packages); for_each_present_cpu(cpu) { unsigned int apicid = apic->cpu_present_to_apicid(cpu); @@ -366,6 +367,15 @@ static void __init smp_init_package_map(void) set_cpu_possible(cpu, false); set_cpu_present(cpu, false); } + + if (logical_packages > __max_logical_packages) { + pr_warn("Detected more packages (%u), then computed by BIOS data (%u).\n", + logical_packages, __max_logical_packages); + logical_packages_frozen = true; + __max_logical_packages = logical_packages; + } + + pr_info("Max logical packages: %u\n", __max_logical_packages); } void __init smp_store_boot_cpu_info(void) @@ -493,7 +503,7 @@ void set_cpu_sibling_map(int cpu) bool has_mp = has_smt || boot_cpu_data.x86_max_cores > 1; struct cpuinfo_x86 *c = &cpu_data(cpu); struct cpuinfo_x86 *o; - int i; + int i, threads; cpumask_set_cpu(cpu, cpu_sibling_setup_mask); @@ -550,6 +560,10 @@ void set_cpu_sibling_map(int cpu) if (match_die(c, o) && !topology_same_node(c, o)) primarily_use_numa_for_topology(); } + + threads = cpumask_weight(topology_sibling_cpumask(cpu)); + if (threads > __max_smt_threads) + __max_smt_threads = threads; } /* maps the cpu to the sched domain representing multi-core */ @@ -676,7 +690,7 @@ wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. 
*/ udelay(200); - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); @@ -703,7 +717,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); @@ -742,7 +756,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) * Determine this based on the APIC version. * If we don't have an integrated APIC, don't send the STARTUP IPIs. */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) + if (APIC_INTEGRATED(boot_cpu_apic_version)) num_starts = 2; else num_starts = 0; @@ -980,7 +994,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) /* * Be paranoid about clearing APIC errors. */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + if (APIC_INTEGRATED(boot_cpu_apic_version)) { apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -1235,7 +1249,7 @@ static int __init smp_sanity_check(unsigned max_cpus) /* * If we couldn't find a local APIC, then get out of here now! */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && + if (APIC_INTEGRATED(boot_cpu_apic_version) && !boot_cpu_has(X86_FEATURE_APIC)) { if (!disable_apic) { pr_err("BIOS bug, local APIC #%d not detected!...\n", @@ -1285,7 +1299,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); - current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); @@ -1393,9 +1406,21 @@ __init void prefill_possible_map(void) { int i, possible; - /* no processor from mptable or madt */ - if (!num_processors) - num_processors = 1; + /* No boot processor was found in mptable or ACPI MADT */ + if (!num_processors) { + int apicid = boot_cpu_physical_apicid; + int cpu = hard_smp_processor_id(); + + pr_warn("Boot CPU (id %d) not listed by BIOS\n", cpu); + + /* Make sure boot cpu is enumerated */ + if (apic->cpu_present_to_apicid(0) == BAD_APICID && + apic->apic_id_valid(apicid)) + generic_processor_info(apicid, boot_cpu_apic_version); + + if (!num_processors) + num_processors = 1; + } i = setup_max_cpus ?: 1; if (setup_possible_cpus == -1) { @@ -1441,6 +1466,21 @@ __init void prefill_possible_map(void) #ifdef CONFIG_HOTPLUG_CPU +/* Recompute SMT state for all CPUs on offline */ +static void recompute_smt_state(void) +{ + int max_threads, cpu; + + max_threads = 0; + for_each_online_cpu (cpu) { + int threads = cpumask_weight(topology_sibling_cpumask(cpu)); + + if (threads > max_threads) + max_threads = threads; + } + __max_smt_threads = max_threads; +} + static void remove_siblinginfo(int cpu) { int sibling; @@ -1465,6 +1505,7 @@ static void remove_siblinginfo(int cpu) c->phys_proc_id = 0; c->cpu_core_id = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); + recompute_smt_state(); } static void remove_cpu_from_maps(int cpu) @@ -1622,7 +1663,7 @@ static inline void mwait_play_dead(void) } } -static inline void hlt_play_dead(void) +void hlt_play_dead(void) { if (__this_cpu_read(cpu_info.x86) >= 4) wbinvd(); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 
9ee98eefc..4738f5e0f 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -5,7 +5,7 @@ */ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 9b0185fbe..654f6c66f 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -323,25 +323,16 @@ static int tboot_wait_for_aps(int num_aps) return !(atomic_read((atomic_t *)&tboot->num_in_wfs) == num_aps); } -static int tboot_cpu_callback(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int tboot_dying_cpu(unsigned int cpu) { - switch (action) { - case CPU_DYING: - atomic_inc(&ap_wfs_count); - if (num_online_cpus() == 1) - if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) - return NOTIFY_BAD; - break; + atomic_inc(&ap_wfs_count); + if (num_online_cpus() == 1) { + if (tboot_wait_for_aps(atomic_read(&ap_wfs_count))) + return -EBUSY; } - return NOTIFY_OK; + return 0; } -static struct notifier_block tboot_cpu_notifier = -{ - .notifier_call = tboot_cpu_callback, -}; - #ifdef CONFIG_DEBUG_FS #define TBOOT_LOG_UUID { 0x26, 0x25, 0x19, 0xc0, 0x30, 0x6b, 0xb4, 0x4d, \ @@ -417,8 +408,8 @@ static __init int tboot_late_init(void) tboot_create_trampoline(); atomic_set(&ap_wfs_count, 0); - register_hotcpu_notifier(&tboot_cpu_notifier); - + cpuhp_setup_state(CPUHP_AP_X86_TBOOT_DYING, "AP_X86_TBOOT_DYING", NULL, + tboot_dying_cpu); #ifdef CONFIG_DEBUG_FS debugfs_create_file("tboot_log", S_IRUSR, arch_debugfs_dir, NULL, &tboot_log_fops); diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index cb4a01b41..222e84e24 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -9,7 +9,6 @@ * as published by the Free Software Foundation; version 2 * of the License. 
*/ -#include #include #include #include @@ -74,7 +73,3 @@ int rodata_test(void) return 0; } - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Testcase for marking rodata as read-only"); -MODULE_AUTHOR("Arjan van de Ven "); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 00f03d82e..b70ca12dd 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index fa820b7de..78b9cb5a2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -23,6 +22,7 @@ #include #include #include +#include unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -201,10 +201,6 @@ static void cyc2ns_init(int cpu) c2n->head = c2n->data; c2n->tail = c2n->data; - - // Don't let TuxOnIce make data RO - a secondary CPU will cause a triple fault - // if it loads microcode, which then does a printk, which may end up invoking cycles_2_ns - SetPageTOI_Untracked(virt_to_page(c2n)); } static inline unsigned long long cycles_2_ns(unsigned long long cyc) @@ -244,7 +240,7 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) +static void set_cyc2ns_scale(unsigned long khz, int cpu) { unsigned long long tsc_now, ns_now; struct cyc2ns_data *data; @@ -253,7 +249,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) local_irq_save(flags); sched_clock_idle_sleep_event(); - if (!cpu_khz) + if (!khz) goto done; data = cyc2ns_write_begin(cpu); @@ -266,7 +262,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) * time function is continuous; see the comment near struct * cyc2ns_data. */ - clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, cpu_khz, + clocks_calc_mult_shift(&data->cyc2ns_mul, &data->cyc2ns_shift, khz, NSEC_PER_MSEC, 0); /* @@ -340,12 +336,6 @@ int check_tsc_unstable(void) } EXPORT_SYMBOL_GPL(check_tsc_unstable); -int check_tsc_disabled(void) -{ - return tsc_disabled; -} -EXPORT_SYMBOL_GPL(check_tsc_disabled); - #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { @@ -670,19 +660,77 @@ success: } /** - * native_calibrate_tsc - calibrate the tsc on boot + * native_calibrate_tsc + * Determine TSC frequency via CPUID, else return 0. 
*/ unsigned long native_calibrate_tsc(void) +{ + unsigned int eax_denominator, ebx_numerator, ecx_hz, edx; + unsigned int crystal_khz; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x15) + return 0; + + eax_denominator = ebx_numerator = ecx_hz = edx = 0; + + /* CPUID 15H TSC/Crystal ratio, plus optionally Crystal Hz */ + cpuid(0x15, &eax_denominator, &ebx_numerator, &ecx_hz, &edx); + + if (ebx_numerator == 0 || eax_denominator == 0) + return 0; + + crystal_khz = ecx_hz / 1000; + + if (crystal_khz == 0) { + switch (boot_cpu_data.x86_model) { + case 0x4E: /* SKL */ + case 0x5E: /* SKL */ + crystal_khz = 24000; /* 24.0 MHz */ + break; + case 0x5C: /* BXT */ + crystal_khz = 19200; /* 19.2 MHz */ + break; + } + } + + return crystal_khz * ebx_numerator / eax_denominator; +} + +static unsigned long cpu_khz_from_cpuid(void) +{ + unsigned int eax_base_mhz, ebx_max_mhz, ecx_bus_mhz, edx; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (boot_cpu_data.cpuid_level < 0x16) + return 0; + + eax_base_mhz = ebx_max_mhz = ecx_bus_mhz = edx = 0; + + cpuid(0x16, &eax_base_mhz, &ebx_max_mhz, &ecx_bus_mhz, &edx); + + return eax_base_mhz * 1000; +} + +/** + * native_calibrate_cpu - calibrate the cpu on boot + */ +unsigned long native_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; - /* Calibrate TSC using MSR for Intel Atom SoCs */ - local_irq_save(flags); - fast_calibrate = try_msr_calibrate_tsc(); - local_irq_restore(flags); + fast_calibrate = cpu_khz_from_cpuid(); + if (fast_calibrate) + return fast_calibrate; + + fast_calibrate = cpu_khz_from_msr(); if (fast_calibrate) return fast_calibrate; @@ -842,8 +890,12 @@ int recalibrate_cpu_khz(void) if (!boot_cpu_has(X86_FEATURE_TSC)) return -ENODEV; + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; cpu_data(0).loops_per_jiffy = cpufreq_scale(cpu_data(0).loops_per_jiffy, cpu_khz_old, cpu_khz); @@ -1198,6 +1250,9 @@ static void tsc_refine_calibration_work(struct work_struct *work) (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); + /* Inform the TSC deadline clockevent devices about the recalibration */ + lapic_update_tsc_freq(); + out: if (boot_cpu_has(X86_FEATURE_ART)) art_related_clocksource = &clocksource_tsc; @@ -1249,8 +1304,18 @@ void __init tsc_init(void) return; } + cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); - cpu_khz = tsc_khz; + + /* + * Trust non-zero tsc_khz as authorative, + * and use it to sanity check cpu_khz, + * which will be off if system timer is off. + */ + if (tsc_khz == 0) + tsc_khz = cpu_khz; + else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) + cpu_khz = tsc_khz; if (!tsc_khz) { mark_tsc_unstable("could not calculate TSC khz"); @@ -1270,7 +1335,7 @@ void __init tsc_init(void) */ for_each_possible_cpu(cpu) { cyc2ns_init(cpu); - set_cyc2ns_scale(cpu_khz, cpu); + set_cyc2ns_scale(tsc_khz, cpu); } if (tsc_disabled > 0) diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 9911a0620..0fe720d64 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,14 +1,5 @@ /* - * tsc_msr.c - MSR based TSC calibration on Intel Atom SoC platforms. 
- * - * TSC in Intel Atom SoC runs at a constant rate which can be figured - * by this formula: - * * - * See Intel 64 and IA-32 System Programming Guid section 16.12 and 30.11.5 - * for details. - * Especially some Intel Atom SoCs don't have PIT(i8254) or HPET, so MSR - * based calibration is the only option. - * + * tsc_msr.c - TSC frequency enumeration via MSR * * Copyright (C) 2013 Intel Corporation * Author: Bin Gao @@ -22,18 +13,10 @@ #include #include -/* CPU reference clock frequency: in KHz */ -#define FREQ_80 80000 -#define FREQ_83 83200 -#define FREQ_100 99840 -#define FREQ_133 133200 -#define FREQ_166 166400 - -#define MAX_NUM_FREQS 8 +#define MAX_NUM_FREQS 9 /* - * According to Intel 64 and IA-32 System Programming Guide, - * if MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be + * If MSR_PERF_STAT[31] is set, the maximum resolved bus ratio can be * read in MSR_PLATFORM_ID[12:8], otherwise in MSR_PERF_STAT[44:40]. * Unfortunately some Intel Atom SoCs aren't quite compliant to this, * so we need manually differentiate SoC families. This is what the @@ -48,17 +31,18 @@ struct freq_desc { static struct freq_desc freq_desc_tables[] = { /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, + { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, /* CLV+ */ - { 6, 0x35, 0, { 0, FREQ_133, 0, 0, 0, FREQ_100, 0, FREQ_83 } }, - /* TNG */ - { 6, 0x4a, 1, { 0, FREQ_100, FREQ_133, 0, 0, 0, 0, 0 } }, - /* VLV2 */ - { 6, 0x37, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, 0, 0, 0, 0 } }, - /* ANN */ - { 6, 0x5a, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_100, 0, 0, 0, 0 } }, - /* AIRMONT */ - { 6, 0x4c, 1, { FREQ_83, FREQ_100, FREQ_133, FREQ_166, FREQ_80, 0, 0, 0 } }, + { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, + /* TNG - Intel Atom processor Z3400 series */ + { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, + /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ + { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, + /* ANN - Intel Atom processor Z3500 series */ + { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, + /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ + { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, + 80000, 93300, 90000, 88900, 87500 } }, }; static int match_cpu(u8 family, u8 model) @@ -79,16 +63,20 @@ static int match_cpu(u8 family, u8 model) (freq_desc_tables[cpu_index].freqs[freq_id]) /* - * Do MSR calibration only for known/supported CPUs. + * MSR-based CPU/TSC frequency discovery for certain CPUs. * - * Returns the calibration value or 0 if MSR calibration failed. + * Set global "lapic_timer_frequency" to bus_clock_cycles/jiffy + * Return processor base frequency in KHz, or 0 on failure. 
*/ -unsigned long try_msr_calibrate_tsc(void) +unsigned long cpu_khz_from_msr(void) { u32 lo, hi, ratio, freq_id, freq; unsigned long res; int cpu_index; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return 0; + cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); if (cpu_index < 0) return 0; @@ -100,31 +88,17 @@ unsigned long try_msr_calibrate_tsc(void) rdmsr(MSR_IA32_PERF_STATUS, lo, hi); ratio = (hi >> 8) & 0x1f; } - pr_info("Maximum core-clock to bus-clock ratio: 0x%x\n", ratio); - - if (!ratio) - goto fail; /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); freq_id = lo & 0x7; freq = id_to_freq(cpu_index, freq_id); - pr_info("Resolved frequency ID: %u, frequency: %u KHz\n", - freq_id, freq); - if (!freq) - goto fail; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; - pr_info("TSC runs at %lu KHz\n", res); #ifdef CONFIG_X86_LOCAL_APIC lapic_timer_frequency = (freq * 1000) / HZ; - pr_info("lapic_timer_frequency = %d\n", lapic_timer_frequency); #endif return res; - -fail: - pr_warn("Fast TSC calibration using MSR failed\n"); - return 0; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 3dce1ca0a..01f30e56f 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -440,10 +440,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) static inline int is_revectored(int nr, struct revectored_struct *bitmap) { - __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" - :"=r" (nr) - :"m" (*bitmap), "r" (nr)); - return nr; + return test_bit(nr, bitmap->__map); } #define val_byte(val, n) (((__u8 *)&val)[n]) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index cd05942bc..95e49f6e4 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,7 +1,8 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ -#include +#include +#include #include #include @@ -44,6 +45,9 @@ EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index dad5fe963..76c5e5243 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,7 +5,7 @@ */ #include #include -#include +#include #include #include @@ -92,6 +92,7 @@ static void default_nmi_init(void) { }; static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { + .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, -- cgit v1.2.3-54-g00ecf
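
The arch_set_user_pkey_access() hunk in the fpu/xstate.c diff above allocates two bits per protection key in PKRU (an access-disable and a write-disable bit) and rewrites only the two bits belonging to the requested pkey. Below is a minimal user-space sketch of that bit arithmetic; the constants are mirrored from the kernel/uapi headers purely for illustration and the program is not part of the patch.

/*
 * Illustrative model of the PKRU update done by arch_set_user_pkey_access().
 * PKRU_BITS_PER_PKEY, PKRU_AD_BIT, PKRU_WD_BIT and the PKEY_DISABLE_* flags
 * are assumed to match the kernel definitions (2 bits per key).
 */
#include <stdint.h>
#include <stdio.h>

#define PKRU_BITS_PER_PKEY   2
#define PKRU_AD_BIT          0x1   /* access disable */
#define PKRU_WD_BIT          0x2   /* write disable */
#define PKEY_DISABLE_ACCESS  0x1
#define PKEY_DISABLE_WRITE   0x2

static uint32_t update_pkru(uint32_t old_pkru, int pkey, unsigned long init_val)
{
	int shift = pkey * PKRU_BITS_PER_PKEY;
	uint32_t new_bits = 0;

	if (init_val & PKEY_DISABLE_ACCESS)
		new_bits |= PKRU_AD_BIT;
	if (init_val & PKEY_DISABLE_WRITE)
		new_bits |= PKRU_WD_BIT;

	/* Clear the two bits belonging to this pkey, then set the new ones. */
	old_pkru &= ~((PKRU_AD_BIT | PKRU_WD_BIT) << shift);
	return old_pkru | (new_bits << shift);
}

int main(void)
{
	/* Disable writes (but not reads) for protection key 4. */
	uint32_t pkru = update_pkru(0, 4, PKEY_DISABLE_WRITE);

	printf("PKRU = 0x%08x\n", pkru);	/* expect 0x00000200 */
	return 0;
}

The kernel code performs the same masking on the live PKRU register (read_pkru()/write_pkru()) rather than on a plain variable, which is why the patch insists that PKRU is never lazily managed before touching it directly.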