Diffstat (limited to 'drivers/xen')
67 files changed, 26110 insertions, 0 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
new file mode 100644
index 000000000..7cd226da1
--- /dev/null
+++ b/drivers/xen/Kconfig
@@ -0,0 +1,283 @@
+menu "Xen driver support"
+	depends on XEN
+
+config XEN_BALLOON
+	bool "Xen memory balloon driver"
+	default y
+	help
+	  The balloon driver allows the Xen domain to request more memory from
+	  the system to expand the domain's memory allocation, or alternatively
+	  return unneeded memory to the system.
+
+config XEN_SELFBALLOONING
+	bool "Dynamically self-balloon kernel memory to target"
+	depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM
+	default n
+	help
+	  Self-ballooning dynamically balloons available kernel memory driven
+	  by the current usage of anonymous memory ("committed AS") and
+	  controlled by various sysfs-settable parameters. Configuring
+	  FRONTSWAP is highly recommended; if it is not configured, self-
+	  ballooning is disabled by default. If FRONTSWAP is configured,
+	  frontswap-selfshrinking is enabled by default but can be disabled
+	  with the 'tmem.selfshrink=0' kernel boot parameter; and self-ballooning
+	  is enabled by default but can be disabled with the 'tmem.selfballooning=0'
+	  kernel boot parameter. Note that systems without a sufficiently
+	  large swap device should not enable self-ballooning.
+
+config XEN_BALLOON_MEMORY_HOTPLUG
+	bool "Memory hotplug support for Xen balloon driver"
+	default n
+	depends on XEN_BALLOON && MEMORY_HOTPLUG
+	help
+	  Memory hotplug support for the Xen balloon driver allows expanding
+	  the memory available to the system above the limit declared at
+	  system startup. It is very useful on critical systems which require
+	  a long uptime without rebooting.
+
+	  Memory can be hotplugged in the following steps:
+
+	    1) dom0: xl mem-max <domU> <maxmem>
+	       where <maxmem> is >= requested memory size,
+
+	    2) dom0: xl mem-set <domU> <memory>
+	       where <memory> is the requested memory size; alternatively,
+	       memory can be added by writing the proper value to
+	       /sys/devices/system/xen_memory/xen_memory0/target or
+	       /sys/devices/system/xen_memory/xen_memory0/target_kb on domU,
+
+	    3) domU: for i in /sys/devices/system/memory/memory*/state; do \
+	         [ "`cat "$i"`" = offline ] && echo online > "$i"; done
+
+	  Memory can be onlined automatically on domU by adding the following
+	  line to the udev rules:
+
+	  SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'"
+
+	  In that case step 3 should be omitted.
+
+config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+	int "Hotplugged memory limit (in GiB) for a PV guest"
+	default 512 if X86_64
+	default 4 if X86_32
+	range 0 64 if X86_32
+	depends on XEN_HAVE_PVMMU
+	depends on XEN_BALLOON_MEMORY_HOTPLUG
+	help
+	  Maximum amount of memory (in GiB) that a PV guest can be
+	  expanded to when using memory hotplug.
+
+	  A PV guest can have more memory than this limit if it is
+	  started with a larger maximum.
+
+	  This value is used to allocate enough space in internal
+	  tables needed for physical memory administration.
+
+config XEN_SCRUB_PAGES
+	bool "Scrub pages before returning them to system"
+	depends on XEN_BALLOON
+	default y
+	help
+	  Scrub pages before returning them to the system for reuse by
+	  other domains. This makes sure that any confidential data
+	  is not accidentally visible to other domains. It is more
+	  secure, but slightly less efficient.
+	  If in doubt, say yes.
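As an aside on step 3 of the hotplug procedure above: the same onlining walk can be done from a small user-space helper instead of the shell loop. The sketch below is purely illustrative and not part of this patch; the sysfs layout and the "offline"/"online" state strings come from the help text, while the program structure and names are assumed. It must run as root inside the domU to be allowed to write the state files.

#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *base = "/sys/devices/system/memory";
	DIR *dir = opendir(base);
	struct dirent *de;
	char path[PATH_MAX], state[32];

	if (!dir) {
		perror(base);
		return 1;
	}

	while ((de = readdir(dir)) != NULL) {
		FILE *f;

		/* Only the memoryNN section directories carry a "state" file. */
		if (strncmp(de->d_name, "memory", 6) != 0)
			continue;

		snprintf(path, sizeof(path), "%s/%s/state", base, de->d_name);

		f = fopen(path, "r");
		if (!f)
			continue;
		if (fscanf(f, "%31s", state) != 1)
			state[0] = '\0';
		fclose(f);

		/* Mirror the shell loop: online only sections that are offline. */
		if (strcmp(state, "offline") == 0) {
			f = fopen(path, "w");
			if (f) {
				fputs("online", f);
				fclose(f);
			}
		}
	}

	closedir(dir);
	return 0;
}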
+
+config XEN_DEV_EVTCHN
+	tristate "Xen /dev/xen/evtchn device"
+	default y
+	help
+	  The evtchn driver allows a userspace process to trigger event
+	  channels and to receive notification of an event channel
+	  firing.
+	  If in doubt, say yes.
+
+config XEN_BACKEND
+	bool "Backend driver support"
+	depends on XEN_DOM0
+	default y
+	help
+	  Support for backend device drivers that provide I/O services
+	  to other virtual machines.
+
+config XENFS
+	tristate "Xen filesystem"
+	select XEN_PRIVCMD
+	default y
+	help
+	  The xen filesystem provides a way for domains to share
+	  information with each other and with the hypervisor.
+	  For example, by reading and writing the "xenbus" file, guests
+	  may pass arbitrary information to the initial domain.
+	  If in doubt, say yes.
+
+config XEN_COMPAT_XENFS
+	bool "Create compatibility mount point /proc/xen"
+	depends on XENFS
+	default y
+	help
+	  The old xenstore userspace tools expect to find "xenbus"
+	  under /proc/xen, but "xenbus" is now found at the root of the
+	  xenfs filesystem. Selecting this causes the kernel to create
+	  the compatibility mount point /proc/xen if it is running on
+	  a xen platform.
+	  If in doubt, say yes.
+
+config XEN_SYS_HYPERVISOR
+	bool "Create xen entries under /sys/hypervisor"
+	depends on SYSFS
+	select SYS_HYPERVISOR
+	default y
+	help
+	  Create entries under /sys/hypervisor describing the Xen
+	  hypervisor environment. When running native or in another
+	  virtual environment, /sys/hypervisor will still be present,
+	  but will have no xen contents.
+
+config XEN_XENBUS_FRONTEND
+	tristate
+
+config XEN_GNTDEV
+	tristate "userspace grant access device driver"
+	depends on XEN
+	default m
+	select MMU_NOTIFIER
+	help
+	  Allows userspace processes to use grants.
+
+config XEN_GRANT_DEV_ALLOC
+	tristate "User-space grant reference allocator driver"
+	depends on XEN
+	default m
+	help
+	  Allows userspace processes to create pages with access granted
+	  to other domains. This can be used to implement frontend drivers
+	  or as part of an inter-domain shared memory channel.
+
+config SWIOTLB_XEN
+	def_bool y
+	select SWIOTLB
+
+config XEN_TMEM
+	tristate
+	depends on !ARM && !ARM64
+	default m if (CLEANCACHE || FRONTSWAP)
+	help
+	  Shim to interface in-kernel Transcendent Memory hooks
+	  (e.g. cleancache and frontswap) to Xen tmem hypercalls.
+
+config XEN_PCIDEV_BACKEND
+	tristate "Xen PCI-device backend driver"
+	depends on PCI && X86 && XEN
+	depends on XEN_BACKEND
+	default m
+	help
+	  The PCI device backend driver allows the kernel to export arbitrary
+	  PCI devices to other guests. If you select this to be a module, you
+	  will need to make sure no other driver has bound to the device(s)
+	  you want to make visible to other guests.
+
+	  The parameter "passthrough" allows you to specify how you want the
+	  PCI devices to appear in the guest. You can choose the default (0),
+	  where the PCI topology starts at 00.00.0, or (1) for passthrough if
+	  you want the PCI device topology to appear the same as in the host.
+
+	  The "hide" parameter (only applicable if the backend driver is
+	  compiled into the kernel) allows you to bind the PCI devices to this
+	  module instead of to their default device drivers. The argument is
+	  the list of PCI BDFs:
+	  xen-pciback.hide=(03:00.0)(04:00.0)
+
+	  If in doubt, say m.
+
+config XEN_SCSI_BACKEND
+	tristate "XEN SCSI backend driver"
+	depends on XEN && XEN_BACKEND && TARGET_CORE
+	help
+	  The SCSI backend driver allows the kernel to export its SCSI devices
+	  to other guests via a high-performance shared-memory interface.
+	  Only needed for systems running as Xen driver domains (e.g. Dom0)
+	  and if guests need generic access to SCSI devices.
+
+config XEN_PRIVCMD
+	tristate
+	depends on XEN
+	default m
+
+config XEN_STUB
+	bool "Xen stub drivers"
+	depends on XEN && X86_64 && BROKEN
+	default n
+	help
+	  Allow the kernel to install stub drivers, to reserve space for Xen
+	  drivers (i.e. memory hotplug and cpu hotplug) and to block native
+	  drivers from loading, so that the real Xen drivers can be modular.
+
+	  To enable Xen features like cpu and memory hotplug, select Y here.
+
+config XEN_ACPI_HOTPLUG_MEMORY
+	tristate "Xen ACPI memory hotplug"
+	depends on XEN_DOM0 && XEN_STUB && ACPI
+	default n
+	help
+	  This is Xen ACPI memory hotplug.
+
+	  Currently Xen only supports ACPI memory hot-add. If you want
+	  to hot-add memory at runtime (the hot-added memory cannot be
+	  removed until the machine stops), select Y/M here, otherwise
+	  select N.
+
+config XEN_ACPI_HOTPLUG_CPU
+	tristate "Xen ACPI cpu hotplug"
+	depends on XEN_DOM0 && XEN_STUB && ACPI
+	select ACPI_CONTAINER
+	default n
+	help
+	  Xen ACPI cpu enumeration and hotplugging.
+
+	  For hotplugging, currently Xen only supports ACPI cpu hot-add.
+	  If you want to hot-add cpus at runtime (a hot-added cpu cannot
+	  be removed until the machine stops), select Y/M here.
+
+config XEN_ACPI_PROCESSOR
+	tristate "Xen ACPI processor"
+	depends on XEN && X86 && ACPI_PROCESSOR && CPU_FREQ
+	default m
+	help
+	  This ACPI processor uploads Power Management information to the Xen
+	  hypervisor.
+
+	  To do that the driver parses the Power Management data and uploads
+	  said information to the Xen hypervisor. Then the Xen hypervisor can
+	  select the proper Cx and Pxx states. It also registers itself as the
+	  SMM so that other drivers (such as the ACPI cpufreq scaling driver)
+	  will not load.
+
+	  To compile this driver as a module, choose M here: the module will
+	  be called xen_acpi_processor. If you do not know what to choose,
+	  select M here. If the CPUFREQ drivers are built in, select Y here.
+
+config XEN_MCE_LOG
+	bool "Xen platform mcelog"
+	depends on XEN_DOM0 && X86_64 && X86_MCE
+	default n
+	help
+	  Allow the kernel to fetch MCE errors from the Xen platform and
+	  convert them into the Linux mcelog format for the mcelog tools.
+
+config XEN_HAVE_PVMMU
+	bool
+
+config XEN_EFI
+	def_bool y
+	depends on X86_64 && EFI
+
+config XEN_AUTO_XLATE
+	def_bool y
+	depends on ARM || ARM64 || XEN_PVHVM
+	help
+	  Support for auto-translated physmap guests.
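As a side note on the "hide" parameter documented in the XEN_PCIDEV_BACKEND entry above: its argument is a list of parenthesised PCI BDFs such as (03:00.0)(04:00.0). The short user-space sketch below only illustrates how that textual format can be walked; it is not the in-kernel parser, and the helper name and output are made up for the example.

#include <stdio.h>

/* Walk a string in the xen-pciback.hide format, e.g. "(03:00.0)(04:00.0)",
 * and print each bus/device/function triple it contains. */
static int parse_hide_list(const char *s)
{
	unsigned int bus, dev, func;
	int consumed;

	while (*s) {
		if (sscanf(s, " (%x:%x.%x)%n", &bus, &dev, &func, &consumed) != 3)
			return -1;	/* malformed entry */
		printf("hide %02x:%02x.%x\n", bus, dev, func);
		s += consumed;
	}
	return 0;
}

int main(void)
{
	return parse_hide_list("(03:00.0)(04:00.0)");
}

Each parenthesised group is consumed in one sscanf() call, with %n recording how far the scan advanced so the loop can continue at the next entry.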
+ +config XEN_ACPI + def_bool y + depends on X86 && ACPI + +endmenu diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile new file mode 100644 index 000000000..e293bc507 --- /dev/null +++ b/drivers/xen/Makefile @@ -0,0 +1,44 @@ +ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),) +obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o +endif +obj-$(CONFIG_X86) += fallback.o +obj-y += grant-table.o features.o balloon.o manage.o preempt.o +obj-y += events/ +obj-y += xenbus/ + +nostackp := $(call cc-option, -fno-stack-protector) +CFLAGS_features.o := $(nostackp) + +CFLAGS_efi.o += -fshort-wchar + +dom0-$(CONFIG_PCI) += pci.o +dom0-$(CONFIG_USB_SUPPORT) += dbgp.o +dom0-$(CONFIG_XEN_ACPI) += acpi.o $(xen-pad-y) +xen-pad-$(CONFIG_X86) += xen-acpi-pad.o +dom0-$(CONFIG_X86) += pcpu.o +obj-$(CONFIG_XEN_DOM0) += $(dom0-y) +obj-$(CONFIG_BLOCK) += biomerge.o +obj-$(CONFIG_XEN_BALLOON) += xen-balloon.o +obj-$(CONFIG_XEN_SELFBALLOONING) += xen-selfballoon.o +obj-$(CONFIG_XEN_DEV_EVTCHN) += xen-evtchn.o +obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o +obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o +obj-$(CONFIG_XENFS) += xenfs/ +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o +obj-$(CONFIG_XEN_PVHVM) += platform-pci.o +obj-$(CONFIG_XEN_TMEM) += tmem.o +obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o +obj-$(CONFIG_XEN_MCE_LOG) += mcelog.o +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ +obj-$(CONFIG_XEN_PRIVCMD) += xen-privcmd.o +obj-$(CONFIG_XEN_STUB) += xen-stub.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_MEMORY) += xen-acpi-memhotplug.o +obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU) += xen-acpi-cpuhotplug.o +obj-$(CONFIG_XEN_ACPI_PROCESSOR) += xen-acpi-processor.o +obj-$(CONFIG_XEN_EFI) += efi.o +obj-$(CONFIG_XEN_SCSI_BACKEND) += xen-scsiback.o +obj-$(CONFIG_XEN_AUTO_XLATE) += xlate_mmu.o +xen-evtchn-y := evtchn.o +xen-gntdev-y := gntdev.o +xen-gntalloc-y := gntalloc.o +xen-privcmd-y := privcmd.o diff --git a/drivers/xen/acpi.c b/drivers/xen/acpi.c new file mode 100644 index 000000000..90307c0b6 --- /dev/null +++ b/drivers/xen/acpi.c @@ -0,0 +1,77 @@ +/****************************************************************************** + * acpi.c + * acpi file for domain 0 kernel + * + * Copyright (c) 2011 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * Copyright (c) 2011 Yu Ke ke.yu@intel.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static int xen_acpi_notify_hypervisor_state(u8 sleep_state, + u32 val_a, u32 val_b, + bool extended) +{ + unsigned int bits = extended ? 8 : 16; + + struct xen_platform_op op = { + .cmd = XENPF_enter_acpi_sleep, + .interface_version = XENPF_INTERFACE_VERSION, + .u.enter_acpi_sleep = { + .val_a = (u16)val_a, + .val_b = (u16)val_b, + .sleep_state = sleep_state, + .flags = extended ? XENPF_ACPI_SLEEP_EXTENDED : 0, + }, + }; + + if (WARN((val_a & (~0 << bits)) || (val_b & (~0 << bits)), + "Using more than %u bits of sleep control values %#x/%#x!" + "Email xen-devel@lists.xen.org - Thank you.\n", \ + bits, val_a, val_b)) + return -1; + + HYPERVISOR_dom0_op(&op); + return 1; +} + +int xen_acpi_notify_hypervisor_sleep(u8 sleep_state, + u32 pm1a_cnt, u32 pm1b_cnt) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, pm1a_cnt, + pm1b_cnt, false); +} + +int xen_acpi_notify_hypervisor_extended_sleep(u8 sleep_state, + u32 val_a, u32 val_b) +{ + return xen_acpi_notify_hypervisor_state(sleep_state, val_a, + val_b, true); +} diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c new file mode 100644 index 000000000..fd933695f --- /dev/null +++ b/drivers/xen/balloon.c @@ -0,0 +1,651 @@ +/****************************************************************************** + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper + * + * Memory hotplug support was written by Daniel Kiper. Work on + * it was sponsored by Google under Google Summer of Code 2010 + * program. Jeremy Fitzhardinge from Citrix was the mentor for + * this project. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/bootmem.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/mutex.h> +#include <linux/list.h> +#include <linux/gfp.h> +#include <linux/notifier.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> +#include <linux/percpu-defs.h> + +#include <asm/page.h> +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <xen/interface/memory.h> +#include <xen/balloon.h> +#include <xen/features.h> +#include <xen/page.h> + +/* + * balloon_process() state: + * + * BP_DONE: done or nothing to do, + * BP_EAGAIN: error, go to sleep, + * BP_ECANCELED: error, balloon operation canceled. + */ + +enum bp_state { + BP_DONE, + BP_EAGAIN, + BP_ECANCELED +}; + + +static DEFINE_MUTEX(balloon_mutex); + +struct balloon_stats balloon_stats; +EXPORT_SYMBOL_GPL(balloon_stats); + +/* We increase/decrease in batches which fit in a page */ +static xen_pfn_t frame_list[PAGE_SIZE / sizeof(unsigned long)]; + + +/* List of ballooned pages, threaded through the mem_map array. */ +static LIST_HEAD(ballooned_pages); + +/* Main work function, always executed in process context. */ +static void balloon_process(struct work_struct *work); +static DECLARE_DELAYED_WORK(balloon_worker, balloon_process); + +/* When ballooning out (allocating memory to return to Xen) we don't really + want the kernel to try too hard since that can trigger the oom killer. */ +#define GFP_BALLOON \ + (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC) + +static void scrub_page(struct page *page) +{ +#ifdef CONFIG_XEN_SCRUB_PAGES + clear_highpage(page); +#endif +} + +/* balloon_append: add the given page to the balloon. */ +static void __balloon_append(struct page *page) +{ + /* Lowmem is re-populated first, so highmem pages go at list tail. */ + if (PageHighMem(page)) { + list_add_tail(&page->lru, &ballooned_pages); + balloon_stats.balloon_high++; + } else { + list_add(&page->lru, &ballooned_pages); + balloon_stats.balloon_low++; + } +} + +static void balloon_append(struct page *page) +{ + __balloon_append(page); + adjust_managed_page_count(page, -1); +} + +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ +static struct page *balloon_retrieve(bool prefer_highmem) +{ + struct page *page; + + if (list_empty(&ballooned_pages)) + return NULL; + + if (prefer_highmem) + page = list_entry(ballooned_pages.prev, struct page, lru); + else + page = list_entry(ballooned_pages.next, struct page, lru); + list_del(&page->lru); + + if (PageHighMem(page)) + balloon_stats.balloon_high--; + else + balloon_stats.balloon_low--; + + adjust_managed_page_count(page, 1); + + return page; +} + +static struct page *balloon_next_page(struct page *page) +{ + struct list_head *next = page->lru.next; + if (next == &ballooned_pages) + return NULL; + return list_entry(next, struct page, lru); +} + +static enum bp_state update_schedule(enum bp_state state) +{ + if (state == BP_ECANCELED) + return BP_ECANCELED; + + if (state == BP_DONE) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_DONE; + } + + ++balloon_stats.retry_count; + + if (balloon_stats.max_retry_count != RETRY_UNLIMITED && + balloon_stats.retry_count > balloon_stats.max_retry_count) { + balloon_stats.schedule_delay = 1; + balloon_stats.retry_count = 1; + return BP_ECANCELED; + } + + balloon_stats.schedule_delay <<= 1; + + if (balloon_stats.schedule_delay > balloon_stats.max_schedule_delay) + balloon_stats.schedule_delay = balloon_stats.max_schedule_delay; + + return BP_EAGAIN; +} + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static long current_credit(void) +{ + return balloon_stats.target_pages - balloon_stats.current_pages - + balloon_stats.hotplug_pages; +} + +static bool balloon_is_inflated(void) +{ + if (balloon_stats.balloon_low || balloon_stats.balloon_high || + balloon_stats.balloon_hotplug) + return true; + else + return false; +} + +/* + * reserve_additional_memory() adds memory region of size >= credit above + * max_pfn. New region is section aligned and size is modified to be multiple + * of section size. Those features allow optimal use of address space and + * establish proper alignment when this function is called first time after + * boot (last section not fully populated at boot time contains unused memory + * pages with PG_reserved bit not set; online_pages_range() does not allow page + * onlining in whole range if first onlined page does not have PG_reserved + * bit set). Real size of added memory is established at page onlining stage. + */ + +static enum bp_state reserve_additional_memory(long credit) +{ + int nid, rc; + u64 hotplug_start_paddr; + unsigned long balloon_hotplug = credit; + + hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); + balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); + nid = memory_add_physaddr_to_nid(hotplug_start_paddr); + +#ifdef CONFIG_XEN_HAVE_PVMMU + /* + * add_memory() will build page tables for the new memory so + * the p2m must contain invalid entries so the correct + * non-present PTEs will be written. + * + * If a failure occurs, the original (identity) p2m entries + * are not restored since this region is now known not to + * conflict with any devices. 
+ */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + unsigned long pfn, i; + + pfn = PFN_DOWN(hotplug_start_paddr); + for (i = 0; i < balloon_hotplug; i++) { + if (!set_phys_to_machine(pfn + i, INVALID_P2M_ENTRY)) { + pr_warn("set_phys_to_machine() failed, no memory added\n"); + return BP_ECANCELED; + } + } + } +#endif + + rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); + + if (rc) { + pr_warn("Cannot add additional memory (%i)\n", rc); + return BP_ECANCELED; + } + + balloon_hotplug -= credit; + + balloon_stats.hotplug_pages += credit; + balloon_stats.balloon_hotplug = balloon_hotplug; + + return BP_DONE; +} + +static void xen_online_page(struct page *page) +{ + __online_page_set_limits(page); + + mutex_lock(&balloon_mutex); + + __balloon_append(page); + + if (balloon_stats.hotplug_pages) + --balloon_stats.hotplug_pages; + else + --balloon_stats.balloon_hotplug; + + mutex_unlock(&balloon_mutex); +} + +static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) +{ + if (val == MEM_ONLINE) + schedule_delayed_work(&balloon_worker, 0); + + return NOTIFY_OK; +} + +static struct notifier_block xen_memory_nb = { + .notifier_call = xen_memory_notifier, + .priority = 0 +}; +#else +static long current_credit(void) +{ + unsigned long target = balloon_stats.target_pages; + + target = min(target, + balloon_stats.current_pages + + balloon_stats.balloon_low + + balloon_stats.balloon_high); + + return target - balloon_stats.current_pages; +} + +static bool balloon_is_inflated(void) +{ + if (balloon_stats.balloon_low || balloon_stats.balloon_high) + return true; + else + return false; +} + +static enum bp_state reserve_additional_memory(long credit) +{ + balloon_stats.target_pages = balloon_stats.current_pages; + return BP_DONE; +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ + +static enum bp_state increase_reservation(unsigned long nr_pages) +{ + int rc; + unsigned long pfn, i; + struct page *page; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { + nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); + balloon_stats.hotplug_pages += nr_pages; + balloon_stats.balloon_hotplug -= nr_pages; + return BP_DONE; + } +#endif + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + page = list_first_entry_or_null(&ballooned_pages, struct page, lru); + for (i = 0; i < nr_pages; i++) { + if (!page) { + nr_pages = i; + break; + } + frame_list[i] = page_to_pfn(page); + page = balloon_next_page(page); + } + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; + rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation); + if (rc <= 0) + return BP_EAGAIN; + + for (i = 0; i < rc; i++) { + page = balloon_retrieve(false); + BUG_ON(page == NULL); + + pfn = page_to_pfn(page); + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + set_phys_to_machine(pfn, frame_list[i]); + + /* Link back into the page tables if not highmem. */ + if (!PageHighMem(page)) { + int ret; + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + mfn_pte(frame_list[i], PAGE_KERNEL), + 0); + BUG_ON(ret); + } + } +#endif + + /* Relinquish the page back to the allocator. 
*/ + __free_reserved_page(page); + } + + balloon_stats.current_pages += rc; + + return BP_DONE; +} + +static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) +{ + enum bp_state state = BP_DONE; + unsigned long pfn, i; + struct page *page; + int ret; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (balloon_stats.hotplug_pages) { + nr_pages = min(nr_pages, balloon_stats.hotplug_pages); + balloon_stats.hotplug_pages -= nr_pages; + balloon_stats.balloon_hotplug += nr_pages; + return BP_DONE; + } +#endif + + if (nr_pages > ARRAY_SIZE(frame_list)) + nr_pages = ARRAY_SIZE(frame_list); + + for (i = 0; i < nr_pages; i++) { + page = alloc_page(gfp); + if (page == NULL) { + nr_pages = i; + state = BP_EAGAIN; + break; + } + scrub_page(page); + + frame_list[i] = page_to_pfn(page); + } + + /* + * Ensure that ballooned highmem pages don't have kmaps. + * + * Do this before changing the p2m as kmap_flush_unused() + * reads PTEs to obtain pages (and hence needs the original + * p2m entry). + */ + kmap_flush_unused(); + + /* Update direct mapping, invalidate P2M, and add to balloon. */ + for (i = 0; i < nr_pages; i++) { + pfn = frame_list[i]; + frame_list[i] = pfn_to_mfn(pfn); + page = pfn_to_page(pfn); + +#ifdef CONFIG_XEN_HAVE_PVMMU + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + if (!PageHighMem(page)) { + ret = HYPERVISOR_update_va_mapping( + (unsigned long)__va(pfn << PAGE_SHIFT), + __pte_ma(0), 0); + BUG_ON(ret); + } + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + } +#endif + + balloon_append(page); + } + + flush_tlb_all(); + + set_xen_guest_handle(reservation.extent_start, frame_list); + reservation.nr_extents = nr_pages; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + BUG_ON(ret != nr_pages); + + balloon_stats.current_pages -= nr_pages; + + return state; +} + +/* + * We avoid multiple worker processes conflicting via the balloon mutex. + * We may of course race updates of the target counts (which are protected + * by the balloon lock), or with changes to the Xen hard limit, but we will + * recover from these in time. + */ +static void balloon_process(struct work_struct *work) +{ + enum bp_state state = BP_DONE; + long credit; + + mutex_lock(&balloon_mutex); + + do { + credit = current_credit(); + + if (credit > 0) { + if (balloon_is_inflated()) + state = increase_reservation(credit); + else + state = reserve_additional_memory(credit); + } + + if (credit < 0) + state = decrease_reservation(-credit, GFP_BALLOON); + + state = update_schedule(state); + +#ifndef CONFIG_PREEMPT + if (need_resched()) + schedule(); +#endif + } while (credit && state == BP_DONE); + + /* Schedule more work if there is some still to be done. */ + if (state == BP_EAGAIN) + schedule_delayed_work(&balloon_worker, balloon_stats.schedule_delay * HZ); + + mutex_unlock(&balloon_mutex); +} + +/* Resets the Xen limit, sets new target, and kicks off processing. */ +void balloon_set_new_target(unsigned long target) +{ + /* No need for lock. Not read-modify-write updates. 
*/ + balloon_stats.target_pages = target; + schedule_delayed_work(&balloon_worker, 0); +} +EXPORT_SYMBOL_GPL(balloon_set_new_target); + +/** + * alloc_xenballooned_pages - get pages that have been ballooned out + * @nr_pages: Number of pages to get + * @pages: pages returned + * @highmem: allow highmem pages + * @return 0 on success, error otherwise + */ +int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) +{ + int pgno = 0; + struct page *page; + mutex_lock(&balloon_mutex); + while (pgno < nr_pages) { + page = balloon_retrieve(highmem); + if (page && (highmem || !PageHighMem(page))) { + pages[pgno++] = page; + } else { + enum bp_state st; + if (page) + balloon_append(page); + st = decrease_reservation(nr_pages - pgno, + highmem ? GFP_HIGHUSER : GFP_USER); + if (st != BP_DONE) + goto out_undo; + } + } + mutex_unlock(&balloon_mutex); + return 0; + out_undo: + while (pgno) + balloon_append(pages[--pgno]); + /* Free the memory back to the kernel soon */ + schedule_delayed_work(&balloon_worker, 0); + mutex_unlock(&balloon_mutex); + return -ENOMEM; +} +EXPORT_SYMBOL(alloc_xenballooned_pages); + +/** + * free_xenballooned_pages - return pages retrieved with get_ballooned_pages + * @nr_pages: Number of pages + * @pages: pages to return + */ +void free_xenballooned_pages(int nr_pages, struct page **pages) +{ + int i; + + mutex_lock(&balloon_mutex); + + for (i = 0; i < nr_pages; i++) { + if (pages[i]) + balloon_append(pages[i]); + } + + /* The balloon may be too large now. Shrink it if needed. */ + if (current_credit()) + schedule_delayed_work(&balloon_worker, 0); + + mutex_unlock(&balloon_mutex); +} +EXPORT_SYMBOL(free_xenballooned_pages); + +static void __init balloon_add_region(unsigned long start_pfn, + unsigned long pages) +{ + unsigned long pfn, extra_pfn_end; + struct page *page; + + /* + * If the amount of usable memory has been limited (e.g., with + * the 'mem' command line parameter), don't add pages beyond + * this limit. + */ + extra_pfn_end = min(max_pfn, start_pfn + pages); + + for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { + page = pfn_to_page(pfn); + /* totalram_pages and totalhigh_pages do not + include the boot-time balloon extension, so + don't subtract from it. */ + __balloon_append(page); + } +} + +static int __init balloon_init(void) +{ + int i; + + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising balloon driver\n"); + + balloon_stats.current_pages = xen_pv_domain() + ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) + : get_num_physpages(); + balloon_stats.target_pages = balloon_stats.current_pages; + balloon_stats.balloon_low = 0; + balloon_stats.balloon_high = 0; + + balloon_stats.schedule_delay = 1; + balloon_stats.max_schedule_delay = 32; + balloon_stats.retry_count = 1; + balloon_stats.max_retry_count = RETRY_UNLIMITED; + +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + balloon_stats.hotplug_pages = 0; + balloon_stats.balloon_hotplug = 0; + + set_online_page_callback(&xen_online_page); + register_memory_notifier(&xen_memory_nb); +#endif + + /* + * Initialize the balloon with pages from the extra memory + * regions (see arch/x86/xen/setup.c). 
+ */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) + if (xen_extra_mem[i].size) + balloon_add_region(PFN_UP(xen_extra_mem[i].start), + PFN_DOWN(xen_extra_mem[i].size)); + + return 0; +} + +subsys_initcall(balloon_init); + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c new file mode 100644 index 000000000..0edb91c0d --- /dev/null +++ b/drivers/xen/biomerge.c @@ -0,0 +1,15 @@ +#include <linux/bio.h> +#include <linux/io.h> +#include <linux/export.h> +#include <xen/page.h> + +bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, + const struct bio_vec *vec2) +{ + unsigned long mfn1 = pfn_to_mfn(page_to_pfn(vec1->bv_page)); + unsigned long mfn2 = pfn_to_mfn(page_to_pfn(vec2->bv_page)); + + return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && + ((mfn1 == mfn2) || ((mfn1+1) == mfn2)); +} +EXPORT_SYMBOL(xen_biovec_phys_mergeable); diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c new file mode 100644 index 000000000..cc6513a17 --- /dev/null +++ b/drivers/xen/cpu_hotplug.c @@ -0,0 +1,114 @@ +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/notifier.h> + +#include <xen/xen.h> +#include <xen/xenbus.h> + +#include <asm/xen/hypervisor.h> +#include <asm/cpu.h> + +static void enable_hotplug_cpu(int cpu) +{ + if (!cpu_present(cpu)) + arch_register_cpu(cpu); + + set_cpu_present(cpu, true); +} + +static void disable_hotplug_cpu(int cpu) +{ + if (cpu_present(cpu)) + arch_unregister_cpu(cpu); + + set_cpu_present(cpu, false); +} + +static int vcpu_online(unsigned int cpu) +{ + int err; + char dir[16], state[16]; + + sprintf(dir, "cpu/%u", cpu); + err = xenbus_scanf(XBT_NIL, dir, "availability", "%15s", state); + if (err != 1) { + if (!xen_initial_domain()) + pr_err("Unable to read cpu state\n"); + return err; + } + + if (strcmp(state, "online") == 0) + return 1; + else if (strcmp(state, "offline") == 0) + return 0; + + pr_err("unknown state(%s) on CPU%d\n", state, cpu); + return -EINVAL; +} +static void vcpu_hotplug(unsigned int cpu) +{ + if (!cpu_possible(cpu)) + return; + + switch (vcpu_online(cpu)) { + case 1: + enable_hotplug_cpu(cpu); + break; + case 0: + (void)cpu_down(cpu); + disable_hotplug_cpu(cpu); + break; + default: + break; + } +} + +static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned int cpu; + char *cpustr; + const char *node = vec[XS_WATCH_PATH]; + + cpustr = strstr(node, "cpu/"); + if (cpustr != NULL) { + sscanf(cpustr, "cpu/%u", &cpu); + vcpu_hotplug(cpu); + } +} + +static int setup_cpu_watcher(struct notifier_block *notifier, + unsigned long event, void *data) +{ + int cpu; + static struct xenbus_watch cpu_watch = { + .node = "cpu", + .callback = handle_vcpu_hotplug_event}; + + (void)register_xenbus_watch(&cpu_watch); + + for_each_possible_cpu(cpu) { + if (vcpu_online(cpu) == 0) { + (void)cpu_down(cpu); + set_cpu_present(cpu, false); + } + } + + return NOTIFY_DONE; +} + +static int __init setup_vcpu_hotplug_event(void) +{ + static struct notifier_block xsn_cpu = { + .notifier_call = setup_cpu_watcher }; + + if (!xen_pv_domain()) + return -ENODEV; + + register_xenstore_notifier(&xsn_cpu); + + return 0; +} + +arch_initcall(setup_vcpu_hotplug_event); + diff --git a/drivers/xen/dbgp.c b/drivers/xen/dbgp.c new file mode 100644 index 000000000..8145a59fd --- /dev/null +++ b/drivers/xen/dbgp.c @@ -0,0 +1,50 @@ +#include <linux/pci.h> +#include <linux/usb.h> +#include <linux/usb/ehci_def.h> +#include <linux/usb/hcd.h> +#include <asm/xen/hypercall.h> 
+#include <xen/interface/physdev.h> +#include <xen/xen.h> + +static int xen_dbgp_op(struct usb_hcd *hcd, int op) +{ +#ifdef CONFIG_PCI + const struct device *ctrlr = hcd_to_bus(hcd)->controller; +#endif + struct physdev_dbgp_op dbgp; + + if (!xen_initial_domain()) + return 0; + + dbgp.op = op; + +#ifdef CONFIG_PCI + if (dev_is_pci(ctrlr)) { + const struct pci_dev *pdev = to_pci_dev(ctrlr); + + dbgp.u.pci.seg = pci_domain_nr(pdev->bus); + dbgp.u.pci.bus = pdev->bus->number; + dbgp.u.pci.devfn = pdev->devfn; + dbgp.bus = PHYSDEVOP_DBGP_BUS_PCI; + } else +#endif + dbgp.bus = PHYSDEVOP_DBGP_BUS_UNKNOWN; + + return HYPERVISOR_physdev_op(PHYSDEVOP_dbgp_op, &dbgp); +} + +int xen_dbgp_reset_prep(struct usb_hcd *hcd) +{ + return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_PREPARE); +} + +int xen_dbgp_external_startup(struct usb_hcd *hcd) +{ + return xen_dbgp_op(hcd, PHYSDEVOP_DBGP_RESET_DONE); +} + +#ifndef CONFIG_EARLY_PRINTK_DBGP +#include <linux/export.h> +EXPORT_SYMBOL_GPL(xen_dbgp_reset_prep); +EXPORT_SYMBOL_GPL(xen_dbgp_external_startup); +#endif diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c new file mode 100644 index 000000000..f745db270 --- /dev/null +++ b/drivers/xen/efi.c @@ -0,0 +1,371 @@ +/* + * EFI support for Xen. + * + * Copyright (C) 1999 VA Linux Systems + * Copyright (C) 1999 Walt Drummond <drummond@valinux.com> + * Copyright (C) 1999-2002 Hewlett-Packard Co. + * David Mosberger-Tang <davidm@hpl.hp.com> + * Stephane Eranian <eranian@hpl.hp.com> + * Copyright (C) 2005-2008 Intel Co. + * Fenghua Yu <fenghua.yu@intel.com> + * Bibo Mao <bibo.mao@intel.com> + * Chandramouli Narayanan <mouli@linux.intel.com> + * Huang Ying <ying.huang@intel.com> + * Copyright (C) 2011 Novell Co. + * Jan Beulich <JBeulich@suse.com> + * Copyright (C) 2011-2012 Oracle Co. 
+ * Liang Tang <liang.tang@oracle.com> + * Copyright (c) 2014 Oracle Co., Daniel Kiper + */ + +#include <linux/bug.h> +#include <linux/efi.h> +#include <linux/init.h> +#include <linux/string.h> + +#include <xen/interface/xen.h> +#include <xen/interface/platform.h> +#include <xen/xen.h> + +#include <asm/page.h> + +#include <asm/xen/hypercall.h> + +#define INIT_EFI_OP(name) \ + {.cmd = XENPF_efi_runtime_call, \ + .u.efi_runtime_call.function = XEN_EFI_##name, \ + .u.efi_runtime_call.misc = 0} + +#define efi_data(op) (op.u.efi_runtime_call) + +static efi_status_t xen_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) +{ + struct xen_platform_op op = INIT_EFI_OP(get_time); + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + if (tm) { + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.get_time.time)); + memcpy(tm, &efi_data(op).u.get_time.time, sizeof(*tm)); + } + + if (tc) { + tc->resolution = efi_data(op).u.get_time.resolution; + tc->accuracy = efi_data(op).u.get_time.accuracy; + tc->sets_to_zero = !!(efi_data(op).misc & + XEN_EFI_GET_TIME_SET_CLEARS_NS); + } + + return efi_data(op).status; +} + +static efi_status_t xen_efi_set_time(efi_time_t *tm) +{ + struct xen_platform_op op = INIT_EFI_OP(set_time); + + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.set_time)); + memcpy(&efi_data(op).u.set_time, tm, sizeof(*tm)); + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + struct xen_platform_op op = INIT_EFI_OP(get_wakeup_time); + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + if (tm) { + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.get_wakeup_time)); + memcpy(tm, &efi_data(op).u.get_wakeup_time, sizeof(*tm)); + } + + if (enabled) + *enabled = !!(efi_data(op).misc & XEN_EFI_GET_WAKEUP_TIME_ENABLED); + + if (pending) + *pending = !!(efi_data(op).misc & XEN_EFI_GET_WAKEUP_TIME_PENDING); + + return efi_data(op).status; +} + +static efi_status_t xen_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +{ + struct xen_platform_op op = INIT_EFI_OP(set_wakeup_time); + + BUILD_BUG_ON(sizeof(*tm) != sizeof(efi_data(op).u.set_wakeup_time)); + if (enabled) + efi_data(op).misc = XEN_EFI_SET_WAKEUP_TIME_ENABLE; + if (tm) + memcpy(&efi_data(op).u.set_wakeup_time, tm, sizeof(*tm)); + else + efi_data(op).misc |= XEN_EFI_SET_WAKEUP_TIME_ENABLE_ONLY; + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 *attr, + unsigned long *data_size, + void *data) +{ + struct xen_platform_op op = INIT_EFI_OP(get_variable); + + set_xen_guest_handle(efi_data(op).u.get_variable.name, name); + BUILD_BUG_ON(sizeof(*vendor) != + sizeof(efi_data(op).u.get_variable.vendor_guid)); + memcpy(&efi_data(op).u.get_variable.vendor_guid, vendor, sizeof(*vendor)); + efi_data(op).u.get_variable.size = *data_size; + set_xen_guest_handle(efi_data(op).u.get_variable.data, data); + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + *data_size = efi_data(op).u.get_variable.size; + if (attr) + *attr = efi_data(op).misc; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + struct xen_platform_op op = INIT_EFI_OP(get_next_variable_name); + + efi_data(op).u.get_next_variable_name.size = *name_size; + 
set_xen_guest_handle(efi_data(op).u.get_next_variable_name.name, name); + BUILD_BUG_ON(sizeof(*vendor) != + sizeof(efi_data(op).u.get_next_variable_name.vendor_guid)); + memcpy(&efi_data(op).u.get_next_variable_name.vendor_guid, vendor, + sizeof(*vendor)); + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + *name_size = efi_data(op).u.get_next_variable_name.size; + memcpy(vendor, &efi_data(op).u.get_next_variable_name.vendor_guid, + sizeof(*vendor)); + + return efi_data(op).status; +} + +static efi_status_t xen_efi_set_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 attr, + unsigned long data_size, + void *data) +{ + struct xen_platform_op op = INIT_EFI_OP(set_variable); + + set_xen_guest_handle(efi_data(op).u.set_variable.name, name); + efi_data(op).misc = attr; + BUILD_BUG_ON(sizeof(*vendor) != + sizeof(efi_data(op).u.set_variable.vendor_guid)); + memcpy(&efi_data(op).u.set_variable.vendor_guid, vendor, sizeof(*vendor)); + efi_data(op).u.set_variable.size = data_size; + set_xen_guest_handle(efi_data(op).u.set_variable.data, data); + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_query_variable_info(u32 attr, + u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) +{ + struct xen_platform_op op = INIT_EFI_OP(query_variable_info); + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + efi_data(op).u.query_variable_info.attr = attr; + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + *storage_space = efi_data(op).u.query_variable_info.max_store_size; + *remaining_space = efi_data(op).u.query_variable_info.remain_store_size; + *max_variable_size = efi_data(op).u.query_variable_info.max_size; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_get_next_high_mono_count(u32 *count) +{ + struct xen_platform_op op = INIT_EFI_OP(get_next_high_monotonic_count); + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + *count = efi_data(op).misc; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, + unsigned long count, + unsigned long sg_list) +{ + struct xen_platform_op op = INIT_EFI_OP(update_capsule); + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + set_xen_guest_handle(efi_data(op).u.update_capsule.capsule_header_array, + capsules); + efi_data(op).u.update_capsule.capsule_count = count; + efi_data(op).u.update_capsule.sg_list = sg_list; + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + return efi_data(op).status; +} + +static efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, + unsigned long count, + u64 *max_size, + int *reset_type) +{ + struct xen_platform_op op = INIT_EFI_OP(query_capsule_capabilities); + + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + set_xen_guest_handle(efi_data(op).u.query_capsule_capabilities.capsule_header_array, + capsules); + efi_data(op).u.query_capsule_capabilities.capsule_count = count; + + if (HYPERVISOR_dom0_op(&op) < 0) + return EFI_UNSUPPORTED; + + *max_size = efi_data(op).u.query_capsule_capabilities.max_capsule_size; + *reset_type = efi_data(op).u.query_capsule_capabilities.reset_type; + + return efi_data(op).status; +} + +static efi_char16_t vendor[100] __initdata; + +static efi_system_table_t efi_systab_xen __initdata = { + .hdr = { + .signature = EFI_SYSTEM_TABLE_SIGNATURE, + 
.revision = 0, /* Initialized later. */ + .headersize = 0, /* Ignored by Linux Kernel. */ + .crc32 = 0, /* Ignored by Linux Kernel. */ + .reserved = 0 + }, + .fw_vendor = EFI_INVALID_TABLE_ADDR, /* Initialized later. */ + .fw_revision = 0, /* Initialized later. */ + .con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_out = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .stderr = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .runtime = (efi_runtime_services_t *)EFI_INVALID_TABLE_ADDR, + /* Not used under Xen. */ + .boottime = (efi_boot_services_t *)EFI_INVALID_TABLE_ADDR, + /* Not used under Xen. */ + .nr_tables = 0, /* Initialized later. */ + .tables = EFI_INVALID_TABLE_ADDR /* Initialized later. */ +}; + +static const struct efi efi_xen __initconst = { + .systab = NULL, /* Initialized later. */ + .runtime_version = 0, /* Initialized later. */ + .mps = EFI_INVALID_TABLE_ADDR, + .acpi = EFI_INVALID_TABLE_ADDR, + .acpi20 = EFI_INVALID_TABLE_ADDR, + .smbios = EFI_INVALID_TABLE_ADDR, + .smbios3 = EFI_INVALID_TABLE_ADDR, + .sal_systab = EFI_INVALID_TABLE_ADDR, + .boot_info = EFI_INVALID_TABLE_ADDR, + .hcdp = EFI_INVALID_TABLE_ADDR, + .uga = EFI_INVALID_TABLE_ADDR, + .uv_systab = EFI_INVALID_TABLE_ADDR, + .fw_vendor = EFI_INVALID_TABLE_ADDR, + .runtime = EFI_INVALID_TABLE_ADDR, + .config_table = EFI_INVALID_TABLE_ADDR, + .get_time = xen_efi_get_time, + .set_time = xen_efi_set_time, + .get_wakeup_time = xen_efi_get_wakeup_time, + .set_wakeup_time = xen_efi_set_wakeup_time, + .get_variable = xen_efi_get_variable, + .get_next_variable = xen_efi_get_next_variable, + .set_variable = xen_efi_set_variable, + .query_variable_info = xen_efi_query_variable_info, + .update_capsule = xen_efi_update_capsule, + .query_capsule_caps = xen_efi_query_capsule_caps, + .get_next_high_mono_count = xen_efi_get_next_high_mono_count, + .reset_system = NULL, /* Functionality provided by Xen. */ + .set_virtual_address_map = NULL, /* Not used under Xen. */ + .memmap = NULL, /* Not used under Xen. */ + .flags = 0 /* Initialized later. */ +}; + +efi_system_table_t __init *xen_efi_probe(void) +{ + struct xen_platform_op op = { + .cmd = XENPF_firmware_info, + .u.firmware_info = { + .type = XEN_FW_EFI_INFO, + .index = XEN_FW_EFI_CONFIG_TABLE + } + }; + union xenpf_efi_info *info = &op.u.firmware_info.u.efi_info; + + if (!xen_initial_domain() || HYPERVISOR_dom0_op(&op) < 0) + return NULL; + + /* Here we know that Xen runs on EFI platform. 
*/ + + efi = efi_xen; + + efi_systab_xen.tables = info->cfg.addr; + efi_systab_xen.nr_tables = info->cfg.nent; + + op.cmd = XENPF_firmware_info; + op.u.firmware_info.type = XEN_FW_EFI_INFO; + op.u.firmware_info.index = XEN_FW_EFI_VENDOR; + info->vendor.bufsz = sizeof(vendor); + set_xen_guest_handle(info->vendor.name, vendor); + + if (HYPERVISOR_dom0_op(&op) == 0) { + efi_systab_xen.fw_vendor = __pa_symbol(vendor); + efi_systab_xen.fw_revision = info->vendor.revision; + } else + efi_systab_xen.fw_vendor = __pa_symbol(L"UNKNOWN"); + + op.cmd = XENPF_firmware_info; + op.u.firmware_info.type = XEN_FW_EFI_INFO; + op.u.firmware_info.index = XEN_FW_EFI_VERSION; + + if (HYPERVISOR_dom0_op(&op) == 0) + efi_systab_xen.hdr.revision = info->version; + + op.cmd = XENPF_firmware_info; + op.u.firmware_info.type = XEN_FW_EFI_INFO; + op.u.firmware_info.index = XEN_FW_EFI_RT_VERSION; + + if (HYPERVISOR_dom0_op(&op) == 0) + efi.runtime_version = info->version; + + return &efi_systab_xen; +} diff --git a/drivers/xen/events/Makefile b/drivers/xen/events/Makefile new file mode 100644 index 000000000..62be55cd9 --- /dev/null +++ b/drivers/xen/events/Makefile @@ -0,0 +1,5 @@ +obj-y += events.o + +events-y += events_base.o +events-y += events_2l.o +events-y += events_fifo.o diff --git a/drivers/xen/events/events_2l.c b/drivers/xen/events/events_2l.c new file mode 100644 index 000000000..7dd46312c --- /dev/null +++ b/drivers/xen/events/events_2l.c @@ -0,0 +1,375 @@ +/* + * Xen event channels (2-level ABI) + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +/* + * Note sizeof(xen_ulong_t) can be more than sizeof(unsigned long). Be + * careful to only use bitops which allow for this (e.g + * test_bit/find_first_bit and friends but not __ffs) and to pass + * BITS_PER_EVTCHN_WORD as the bitmask length. + */ +#define BITS_PER_EVTCHN_WORD (sizeof(xen_ulong_t)*8) +/* + * Make a bitmask (i.e. unsigned long *) of a xen_ulong_t + * array. Primarily to avoid long lines (hence the terse name). 
+ */ +#define BM(x) (unsigned long *)(x) +/* Find the first set bit in a evtchn mask */ +#define EVTCHN_FIRST_BIT(w) find_first_bit(BM(&(w)), BITS_PER_EVTCHN_WORD) + +static DEFINE_PER_CPU(xen_ulong_t [EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD], + cpu_evtchn_mask); + +static unsigned evtchn_2l_max_channels(void) +{ + return EVTCHN_2L_NR_CHANNELS; +} + +static void evtchn_2l_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + clear_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, info->cpu))); + set_bit(info->evtchn, BM(per_cpu(cpu_evtchn_mask, cpu))); +} + +static void evtchn_2l_clear_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, BM(&s->evtchn_pending[0])); +} + +static void evtchn_2l_set_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_is_pending(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_bit(port, BM(&s->evtchn_pending[0])); +} + +static bool evtchn_2l_test_and_set_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + return sync_test_and_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_mask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, BM(&s->evtchn_mask[0])); +} + +static void evtchn_2l_unmask(unsigned port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + int do_hypercall = 0, evtchn_pending = 0; + + BUG_ON(!irqs_disabled()); + + if (unlikely((cpu != cpu_from_evtchn(port)))) + do_hypercall = 1; + else { + /* + * Need to clear the mask before checking pending to + * avoid a race with an event becoming pending. + * + * EVTCHNOP_unmask will only trigger an upcall if the + * mask bit was set, so if a hypercall is needed + * remask the event. + */ + sync_clear_bit(port, BM(&s->evtchn_mask[0])); + evtchn_pending = sync_test_bit(port, BM(&s->evtchn_pending[0])); + + if (unlikely(evtchn_pending && xen_hvm_domain())) { + sync_set_bit(port, BM(&s->evtchn_mask[0])); + do_hypercall = 1; + } + } + + /* Slow path (hypercall) if this is a non-local port or if this is + * an hvm domain and an event is pending (hvm domains don't have + * their own implementation of irq_enable). */ + if (do_hypercall) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (evtchn_pending && + !sync_test_and_set_bit(port / BITS_PER_EVTCHN_WORD, + BM(&vcpu_info->evtchn_pending_sel))) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static DEFINE_PER_CPU(unsigned int, current_word_idx); +static DEFINE_PER_CPU(unsigned int, current_bit_idx); + +/* + * Mask out the i least significant bits of w + */ +#define MASK_LSBS(w, i) (w & ((~((xen_ulong_t)0UL)) << i)) + +static inline xen_ulong_t active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return sh->evtchn_pending[idx] & + per_cpu(cpu_evtchn_mask, cpu)[idx] & + ~sh->evtchn_mask[idx]; +} + +/* + * Search the CPU's pending events bitmasks. For each one found, map + * the event number to an irq, and feed it into do_IRQ() for handling. + * + * Xen uses a two-level bitmap to speed searching. 
The first level is + * a bitset of words which contain pending event bits. The second + * level is a bitset of pending events themselves. + */ +static void evtchn_2l_handle_events(unsigned cpu) +{ + int irq; + xen_ulong_t pending_words; + xen_ulong_t pending_bits; + int start_word_idx, start_bit_idx; + int word_idx, bit_idx; + int i; + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + + /* Timer interrupt has highest priority. */ + irq = irq_from_virq(cpu, VIRQ_TIMER); + if (irq != -1) { + unsigned int evtchn = evtchn_from_irq(irq); + word_idx = evtchn / BITS_PER_LONG; + bit_idx = evtchn % BITS_PER_LONG; + if (active_evtchns(cpu, s, word_idx) & (1ULL << bit_idx)) + generic_handle_irq(irq); + } + + /* + * Master flag must be cleared /before/ clearing + * selector flag. xchg_xen_ulong must contain an + * appropriate barrier. + */ + pending_words = xchg_xen_ulong(&vcpu_info->evtchn_pending_sel, 0); + + start_word_idx = __this_cpu_read(current_word_idx); + start_bit_idx = __this_cpu_read(current_bit_idx); + + word_idx = start_word_idx; + + for (i = 0; pending_words != 0; i++) { + xen_ulong_t words; + + words = MASK_LSBS(pending_words, word_idx); + + /* + * If we masked out all events, wrap to beginning. + */ + if (words == 0) { + word_idx = 0; + bit_idx = 0; + continue; + } + word_idx = EVTCHN_FIRST_BIT(words); + + pending_bits = active_evtchns(cpu, s, word_idx); + bit_idx = 0; /* usually scan entire word from start */ + /* + * We scan the starting word in two parts. + * + * 1st time: start in the middle, scanning the + * upper bits. + * + * 2nd time: scan the whole word (not just the + * parts skipped in the first pass) -- if an + * event in the previously scanned bits is + * pending again it would just be scanned on + * the next loop anyway. + */ + if (word_idx == start_word_idx) { + if (i == 0) + bit_idx = start_bit_idx; + } + + do { + xen_ulong_t bits; + int port; + + bits = MASK_LSBS(pending_bits, bit_idx); + + /* If we masked out all events, move on. */ + if (bits == 0) + break; + + bit_idx = EVTCHN_FIRST_BIT(bits); + + /* Process port. */ + port = (word_idx * BITS_PER_EVTCHN_WORD) + bit_idx; + irq = get_evtchn_to_irq(port); + + if (irq != -1) + generic_handle_irq(irq); + + bit_idx = (bit_idx + 1) % BITS_PER_EVTCHN_WORD; + + /* Next caller starts at last processed + 1 */ + __this_cpu_write(current_word_idx, + bit_idx ? word_idx : + (word_idx+1) % BITS_PER_EVTCHN_WORD); + __this_cpu_write(current_bit_idx, bit_idx); + } while (bit_idx != 0); + + /* Scan start_l1i twice; all others once. */ + if ((word_idx != start_word_idx) || (i != 0)) + pending_words &= ~(1UL << word_idx); + + word_idx = (word_idx + 1) % BITS_PER_EVTCHN_WORD; + } +} + +irqreturn_t xen_debug_interrupt(int irq, void *dev_id) +{ + struct shared_info *sh = HYPERVISOR_shared_info; + int cpu = smp_processor_id(); + xen_ulong_t *cpu_evtchn = per_cpu(cpu_evtchn_mask, cpu); + int i; + unsigned long flags; + static DEFINE_SPINLOCK(debug_lock); + struct vcpu_info *v; + + spin_lock_irqsave(&debug_lock, flags); + + printk("\nvcpu %d\n ", cpu); + + for_each_online_cpu(i) { + int pending; + v = per_cpu(xen_vcpu, i); + pending = (get_irq_regs() && i == cpu) + ? 
xen_irqs_disabled(get_irq_regs()) + : v->evtchn_upcall_mask; + printk("%d: masked=%d pending=%d event_sel %0*"PRI_xen_ulong"\n ", i, + pending, v->evtchn_upcall_pending, + (int)(sizeof(v->evtchn_pending_sel)*2), + v->evtchn_pending_sel); + } + v = per_cpu(xen_vcpu, cpu); + + printk("\npending:\n "); + for (i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)sizeof(sh->evtchn_pending[0])*2, + sh->evtchn_pending[i], + i % 8 == 0 ? "\n " : " "); + printk("\nglobal mask:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nglobally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + sh->evtchn_pending[i] & ~sh->evtchn_mask[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocal cpu%d mask:\n ", cpu); + for (i = (EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD)-1; i >= 0; i--) + printk("%0*"PRI_xen_ulong"%s", (int)(sizeof(cpu_evtchn[0])*2), + cpu_evtchn[i], + i % 8 == 0 ? "\n " : " "); + + printk("\nlocally unmasked:\n "); + for (i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--) { + xen_ulong_t pending = sh->evtchn_pending[i] + & ~sh->evtchn_mask[i] + & cpu_evtchn[i]; + printk("%0*"PRI_xen_ulong"%s", + (int)(sizeof(sh->evtchn_mask[0])*2), + pending, i % 8 == 0 ? "\n " : " "); + } + + printk("\npending list:\n"); + for (i = 0; i < EVTCHN_2L_NR_CHANNELS; i++) { + if (sync_test_bit(i, BM(sh->evtchn_pending))) { + int word_idx = i / BITS_PER_EVTCHN_WORD; + printk(" %d: event %d -> irq %d%s%s%s\n", + cpu_from_evtchn(i), i, + get_evtchn_to_irq(i), + sync_test_bit(word_idx, BM(&v->evtchn_pending_sel)) + ? "" : " l2-clear", + !sync_test_bit(i, BM(sh->evtchn_mask)) + ? "" : " globally-masked", + sync_test_bit(i, BM(cpu_evtchn)) + ? "" : " locally-masked"); + } + } + + spin_unlock_irqrestore(&debug_lock, flags); + + return IRQ_HANDLED; +} + +static void evtchn_2l_resume(void) +{ + int i; + + for_each_online_cpu(i) + memset(per_cpu(cpu_evtchn_mask, i), 0, sizeof(xen_ulong_t) * + EVTCHN_2L_NR_CHANNELS/BITS_PER_EVTCHN_WORD); +} + +static const struct evtchn_ops evtchn_ops_2l = { + .max_channels = evtchn_2l_max_channels, + .nr_channels = evtchn_2l_max_channels, + .bind_to_cpu = evtchn_2l_bind_to_cpu, + .clear_pending = evtchn_2l_clear_pending, + .set_pending = evtchn_2l_set_pending, + .is_pending = evtchn_2l_is_pending, + .test_and_set_mask = evtchn_2l_test_and_set_mask, + .mask = evtchn_2l_mask, + .unmask = evtchn_2l_unmask, + .handle_events = evtchn_2l_handle_events, + .resume = evtchn_2l_resume, +}; + +void __init xen_evtchn_2l_init(void) +{ + pr_info("Using 2-level ABI\n"); + evtchn_ops = &evtchn_ops_2l; +} diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c new file mode 100644 index 000000000..383879504 --- /dev/null +++ b/drivers/xen/events/events_base.c @@ -0,0 +1,1705 @@ +/* + * Xen event channels + * + * Xen models interrupts with abstract event channels. Because each + * domain gets 1024 event channels, but NR_IRQ is not that large, we + * must dynamically map irqs<->event channels. The event channels + * interface with the rest of the kernel by defining a xen interrupt + * chip. When an event is received, it is mapped to an irq and sent + * through the normal interrupt processing path. + * + * There are four kinds of events which can be mapped to an event + * channel: + * + * 1. Inter-domain notifications. 
This includes all the virtual + * device events, since they're driven by front-ends in another domain + * (typically dom0). + * 2. VIRQs, typically used for timers. These are per-cpu events. + * 3. IPIs. + * 4. PIRQs - Hardware interrupts. + * + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/bootmem.h> +#include <linux/slab.h> +#include <linux/irqnr.h> +#include <linux/pci.h> + +#ifdef CONFIG_X86 +#include <asm/desc.h> +#include <asm/ptrace.h> +#include <asm/irq.h> +#include <asm/idle.h> +#include <asm/io_apic.h> +#include <asm/xen/page.h> +#include <asm/xen/pci.h> +#endif +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/hvm.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/hvm/hvm_op.h> +#include <xen/interface/hvm/params.h> +#include <xen/interface/physdev.h> +#include <xen/interface/sched.h> +#include <xen/interface/vcpu.h> +#include <asm/hw_irq.h> + +#include "events_internal.h" + +const struct evtchn_ops *evtchn_ops; + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_MUTEX(irq_mapping_update_lock); + +static LIST_HEAD(xen_irq_list_head); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int [NR_VIRQS], virq_to_irq) = {[0 ... NR_VIRQS-1] = -1}; + +/* IRQ <-> IPI mapping */ +static DEFINE_PER_CPU(int [XEN_NR_IPIS], ipi_to_irq) = {[0 ... XEN_NR_IPIS-1] = -1}; + +int **evtchn_to_irq; +#ifdef CONFIG_X86 +static unsigned long *pirq_eoi_map; +#endif +static bool (*pirq_needs_eoi)(unsigned irq); + +#define EVTCHN_ROW(e) (e / (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_COL(e) (e % (PAGE_SIZE/sizeof(**evtchn_to_irq))) +#define EVTCHN_PER_ROW (PAGE_SIZE / sizeof(**evtchn_to_irq)) + +/* Xen will never allocate port zero for any purpose. 
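+ * Treating port 0 as invalid therefore lets VALID_EVTCHN() below double as
+ * a "not yet bound" check (xen_irq_info_cleanup() zeroes info->evtchn when
+ * a binding is torn down).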
*/ +#define VALID_EVTCHN(chn) ((chn) != 0) + +static struct irq_chip xen_dynamic_chip; +static struct irq_chip xen_percpu_chip; +static struct irq_chip xen_pirq_chip; +static void enable_dynirq(struct irq_data *data); +static void disable_dynirq(struct irq_data *data); + +static void clear_evtchn_to_irq_row(unsigned row) +{ + unsigned col; + + for (col = 0; col < EVTCHN_PER_ROW; col++) + evtchn_to_irq[row][col] = -1; +} + +static void clear_evtchn_to_irq_all(void) +{ + unsigned row; + + for (row = 0; row < EVTCHN_ROW(xen_evtchn_max_channels()); row++) { + if (evtchn_to_irq[row] == NULL) + continue; + clear_evtchn_to_irq_row(row); + } +} + +static int set_evtchn_to_irq(unsigned evtchn, unsigned irq) +{ + unsigned row; + unsigned col; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + row = EVTCHN_ROW(evtchn); + col = EVTCHN_COL(evtchn); + + if (evtchn_to_irq[row] == NULL) { + /* Unallocated irq entries return -1 anyway */ + if (irq == -1) + return 0; + + evtchn_to_irq[row] = (int *)get_zeroed_page(GFP_KERNEL); + if (evtchn_to_irq[row] == NULL) + return -ENOMEM; + + clear_evtchn_to_irq_row(row); + } + + evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)] = irq; + return 0; +} + +int get_evtchn_to_irq(unsigned evtchn) +{ + if (evtchn >= xen_evtchn_max_channels()) + return -1; + if (evtchn_to_irq[EVTCHN_ROW(evtchn)] == NULL) + return -1; + return evtchn_to_irq[EVTCHN_ROW(evtchn)][EVTCHN_COL(evtchn)]; +} + +/* Get info for IRQ */ +struct irq_info *info_for_irq(unsigned irq) +{ + return irq_get_handler_data(irq); +} + +/* Constructors for packed IRQ information. */ +static int xen_irq_info_common_setup(struct irq_info *info, + unsigned irq, + enum xen_irq_type type, + unsigned evtchn, + unsigned short cpu) +{ + int ret; + + BUG_ON(info->type != IRQT_UNBOUND && info->type != type); + + info->type = type; + info->irq = irq; + info->evtchn = evtchn; + info->cpu = cpu; + + ret = set_evtchn_to_irq(evtchn, irq); + if (ret < 0) + return ret; + + irq_clear_status_flags(irq, IRQ_NOREQUEST|IRQ_NOAUTOEN); + + return xen_evtchn_port_setup(info); +} + +static int xen_irq_info_evtchn_setup(unsigned irq, + unsigned evtchn) +{ + struct irq_info *info = info_for_irq(irq); + + return xen_irq_info_common_setup(info, irq, IRQT_EVTCHN, evtchn, 0); +} + +static int xen_irq_info_ipi_setup(unsigned cpu, + unsigned irq, + unsigned evtchn, + enum ipi_vector ipi) +{ + struct irq_info *info = info_for_irq(irq); + + info->u.ipi = ipi; + + per_cpu(ipi_to_irq, cpu)[ipi] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_IPI, evtchn, 0); +} + +static int xen_irq_info_virq_setup(unsigned cpu, + unsigned irq, + unsigned evtchn, + unsigned virq) +{ + struct irq_info *info = info_for_irq(irq); + + info->u.virq = virq; + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + return xen_irq_info_common_setup(info, irq, IRQT_VIRQ, evtchn, 0); +} + +static int xen_irq_info_pirq_setup(unsigned irq, + unsigned evtchn, + unsigned pirq, + unsigned gsi, + uint16_t domid, + unsigned char flags) +{ + struct irq_info *info = info_for_irq(irq); + + info->u.pirq.pirq = pirq; + info->u.pirq.gsi = gsi; + info->u.pirq.domid = domid; + info->u.pirq.flags = flags; + + return xen_irq_info_common_setup(info, irq, IRQT_PIRQ, evtchn, 0); +} + +static void xen_irq_info_cleanup(struct irq_info *info) +{ + set_evtchn_to_irq(info->evtchn, -1); + info->evtchn = 0; +} + +/* + * Accessors for packed IRQ information. 
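+ * These return fields of the struct irq_info stored as the irq's handler
+ * data (see info_for_irq()); the type-specific ones BUG_ON() a mismatched
+ * info->type.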
+ */ +unsigned int evtchn_from_irq(unsigned irq) +{ + if (unlikely(WARN(irq >= nr_irqs, "Invalid irq %d!\n", irq))) + return 0; + + return info_for_irq(irq)->evtchn; +} + +unsigned irq_from_evtchn(unsigned int evtchn) +{ + return get_evtchn_to_irq(evtchn); +} +EXPORT_SYMBOL_GPL(irq_from_evtchn); + +int irq_from_virq(unsigned int cpu, unsigned int virq) +{ + return per_cpu(virq_to_irq, cpu)[virq]; +} + +static enum ipi_vector ipi_from_irq(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_IPI); + + return info->u.ipi; +} + +static unsigned virq_from_irq(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_VIRQ); + + return info->u.virq; +} + +static unsigned pirq_from_irq(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info == NULL); + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.pirq; +} + +static enum xen_irq_type type_from_irq(unsigned irq) +{ + return info_for_irq(irq)->type; +} + +unsigned cpu_from_irq(unsigned irq) +{ + return info_for_irq(irq)->cpu; +} + +unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + int irq = get_evtchn_to_irq(evtchn); + unsigned ret = 0; + + if (irq != -1) + ret = cpu_from_irq(irq); + + return ret; +} + +#ifdef CONFIG_X86 +static bool pirq_check_eoi_map(unsigned irq) +{ + return test_bit(pirq_from_irq(irq), pirq_eoi_map); +} +#endif + +static bool pirq_needs_eoi_flag(unsigned irq) +{ + struct irq_info *info = info_for_irq(irq); + BUG_ON(info->type != IRQT_PIRQ); + + return info->u.pirq.flags & PIRQ_NEEDS_EOI; +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq = get_evtchn_to_irq(chn); + struct irq_info *info = info_for_irq(irq); + + BUG_ON(irq == -1); +#ifdef CONFIG_SMP + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(cpu)); +#endif + xen_evtchn_port_bind_to_cpu(info, cpu); + + info->cpu = cpu; +} + +static void xen_evtchn_mask_all(void) +{ + unsigned int evtchn; + + for (evtchn = 0; evtchn < xen_evtchn_nr_channels(); evtchn++) + mask_evtchn(evtchn); +} + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +static void xen_irq_init(unsigned irq) +{ + struct irq_info *info; +#ifdef CONFIG_SMP + /* By default all event channels notify CPU#0. 
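+ * The affinity mask is only seeded here; rebind_irq_to_cpu() (reached via
+ * set_affinity_irq()) moves the binding later when the irq affinity changes.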
*/ + cpumask_copy(irq_get_irq_data(irq)->affinity, cpumask_of(0)); +#endif + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (info == NULL) + panic("Unable to allocate metadata for IRQ%d\n", irq); + + info->type = IRQT_UNBOUND; + info->refcnt = -1; + + irq_set_handler_data(irq, info); + + list_add_tail(&info->list, &xen_irq_list_head); +} + +static int __must_check xen_allocate_irqs_dynamic(int nvec) +{ + int i, irq = irq_alloc_descs(-1, 0, nvec, -1); + + if (irq >= 0) { + for (i = 0; i < nvec; i++) + xen_irq_init(irq + i); + } + + return irq; +} + +static inline int __must_check xen_allocate_irq_dynamic(void) +{ + + return xen_allocate_irqs_dynamic(1); +} + +static int __must_check xen_allocate_irq_gsi(unsigned gsi) +{ + int irq; + + /* + * A PV guest has no concept of a GSI (since it has no ACPI + * nor access to/knowledge of the physical APICs). Therefore + * all IRQs are dynamically allocated from the entire IRQ + * space. + */ + if (xen_pv_domain() && !xen_initial_domain()) + return xen_allocate_irq_dynamic(); + + /* Legacy IRQ descriptors are already allocated by the arch. */ + if (gsi < NR_IRQS_LEGACY) + irq = gsi; + else + irq = irq_alloc_desc_at(gsi, -1); + + xen_irq_init(irq); + + return irq; +} + +static void xen_free_irq(unsigned irq) +{ + struct irq_info *info = irq_get_handler_data(irq); + + if (WARN_ON(!info)) + return; + + list_del(&info->list); + + irq_set_handler_data(irq, NULL); + + WARN_ON(info->refcnt > 0); + + kfree(info); + + /* Legacy IRQ descriptors are managed by the arch. */ + if (irq < NR_IRQS_LEGACY) + return; + + irq_free_desc(irq); +} + +static void xen_evtchn_close(unsigned int port) +{ + struct evtchn_close close; + + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); +} + +static void pirq_query_unmask(int irq) +{ + struct physdev_irq_status_query irq_status; + struct irq_info *info = info_for_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + irq_status.irq = pirq_from_irq(irq); + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + irq_status.flags = 0; + + info->u.pirq.flags &= ~PIRQ_NEEDS_EOI; + if (irq_status.flags & XENIRQSTAT_needs_eoi) + info->u.pirq.flags |= PIRQ_NEEDS_EOI; +} + +static void eoi_pirq(struct irq_data *data) +{ + int evtchn = evtchn_from_irq(data->irq); + struct physdev_eoi eoi = { .irq = pirq_from_irq(data->irq) }; + int rc = 0; + + irq_move_irq(data); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); + + if (pirq_needs_eoi(data->irq)) { + rc = HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); + WARN_ON(rc); + } +} + +static void mask_ack_pirq(struct irq_data *data) +{ + disable_dynirq(data); + eoi_pirq(data); +} + +static unsigned int __startup_pirq(unsigned int irq) +{ + struct evtchn_bind_pirq bind_pirq; + struct irq_info *info = info_for_irq(irq); + int evtchn = evtchn_from_irq(irq); + int rc; + + BUG_ON(info->type != IRQT_PIRQ); + + if (VALID_EVTCHN(evtchn)) + goto out; + + bind_pirq.pirq = pirq_from_irq(irq); + /* NB. We are happy to share unless we are probing. */ + bind_pirq.flags = info->u.pirq.flags & PIRQ_SHAREABLE ? 
+ BIND_PIRQ__WILL_SHARE : 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq); + if (rc != 0) { + pr_warn("Failed to obtain physical IRQ %d\n", irq); + return 0; + } + evtchn = bind_pirq.port; + + pirq_query_unmask(irq); + + rc = set_evtchn_to_irq(evtchn, irq); + if (rc) + goto err; + + info->evtchn = evtchn; + bind_evtchn_to_cpu(evtchn, 0); + + rc = xen_evtchn_port_setup(info); + if (rc) + goto err; + +out: + unmask_evtchn(evtchn); + eoi_pirq(irq_get_irq_data(irq)); + + return 0; + +err: + pr_err("irq%d: Failed to set port to irq mapping (%d)\n", irq, rc); + xen_evtchn_close(evtchn); + return 0; +} + +static unsigned int startup_pirq(struct irq_data *data) +{ + return __startup_pirq(data->irq); +} + +static void shutdown_pirq(struct irq_data *data) +{ + unsigned int irq = data->irq; + struct irq_info *info = info_for_irq(irq); + unsigned evtchn = evtchn_from_irq(irq); + + BUG_ON(info->type != IRQT_PIRQ); + + if (!VALID_EVTCHN(evtchn)) + return; + + mask_evtchn(evtchn); + xen_evtchn_close(evtchn); + xen_irq_info_cleanup(info); +} + +static void enable_pirq(struct irq_data *data) +{ + startup_pirq(data); +} + +static void disable_pirq(struct irq_data *data) +{ + disable_dynirq(data); +} + +int xen_irq_from_gsi(unsigned gsi) +{ + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) + continue; + + if (info->u.pirq.gsi == gsi) + return info->irq; + } + + return -1; +} +EXPORT_SYMBOL_GPL(xen_irq_from_gsi); + +static void __unbind_from_irq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + struct irq_info *info = irq_get_handler_data(irq); + + if (info->refcnt > 0) { + info->refcnt--; + if (info->refcnt != 0) + return; + } + + if (VALID_EVTCHN(evtchn)) { + unsigned int cpu = cpu_from_irq(irq); + + xen_evtchn_close(evtchn); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu)[virq_from_irq(irq)] = -1; + break; + case IRQT_IPI: + per_cpu(ipi_to_irq, cpu)[ipi_from_irq(irq)] = -1; + break; + default: + break; + } + + xen_irq_info_cleanup(info); + } + + BUG_ON(info_for_irq(irq)->type == IRQT_UNBOUND); + + xen_free_irq(irq); +} + +/* + * Do not make any assumptions regarding the relationship between the + * IRQ number returned here and the Xen pirq argument. + * + * Note: We don't assign an event channel until the irq actually started + * up. Return an existing irq if we've already got one for the gsi. + * + * Shareable implies level triggered, not shareable implies edge + * triggered here. + */ +int xen_bind_pirq_gsi_to_irq(unsigned gsi, + unsigned pirq, int shareable, char *name) +{ + int irq = -1; + struct physdev_irq irq_op; + int ret; + + mutex_lock(&irq_mapping_update_lock); + + irq = xen_irq_from_gsi(gsi); + if (irq != -1) { + pr_info("%s: returning irq %d for gsi %u\n", + __func__, irq, gsi); + goto out; + } + + irq = xen_allocate_irq_gsi(gsi); + if (irq < 0) + goto out; + + irq_op.irq = irq; + irq_op.vector = 0; + + /* Only the privileged domain can do this. For non-priv, the pcifront + * driver provides a PCI bus that does the call to do exactly + * this in the priv domain. */ + if (xen_initial_domain() && + HYPERVISOR_physdev_op(PHYSDEVOP_alloc_irq_vector, &irq_op)) { + xen_free_irq(irq); + irq = -ENOSPC; + goto out; + } + + ret = xen_irq_info_pirq_setup(irq, 0, pirq, gsi, DOMID_SELF, + shareable ? 
PIRQ_SHAREABLE : 0); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + + pirq_query_unmask(irq); + /* We try to use the handler with the appropriate semantic for the + * type of interrupt: if the interrupt is an edge triggered + * interrupt we use handle_edge_irq. + * + * On the other hand if the interrupt is level triggered we use + * handle_fasteoi_irq like the native code does for this kind of + * interrupts. + * + * Depending on the Xen version, pirq_needs_eoi might return true + * not only for level triggered interrupts but for edge triggered + * interrupts too. In any case Xen always honors the eoi mechanism, + * not injecting any more pirqs of the same kind if the first one + * hasn't received an eoi yet. Therefore using the fasteoi handler + * is the right choice either way. + */ + if (shareable) + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_fasteoi_irq, name); + else + irq_set_chip_and_handler_name(irq, &xen_pirq_chip, + handle_edge_irq, name); + +out: + mutex_unlock(&irq_mapping_update_lock); + + return irq; +} + +#ifdef CONFIG_PCI_MSI +int xen_allocate_pirq_msi(struct pci_dev *dev, struct msi_desc *msidesc) +{ + int rc; + struct physdev_get_free_pirq op_get_free_pirq; + + op_get_free_pirq.type = MAP_PIRQ_TYPE_MSI; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_get_free_pirq, &op_get_free_pirq); + + WARN_ONCE(rc == -ENOSYS, + "hypervisor does not support the PHYSDEVOP_get_free_pirq interface\n"); + + return rc ? -1 : op_get_free_pirq.pirq; +} + +int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, + int pirq, int nvec, const char *name, domid_t domid) +{ + int i, irq, ret; + + mutex_lock(&irq_mapping_update_lock); + + irq = xen_allocate_irqs_dynamic(nvec); + if (irq < 0) + goto out; + + for (i = 0; i < nvec; i++) { + irq_set_chip_and_handler_name(irq + i, &xen_pirq_chip, handle_edge_irq, name); + + ret = xen_irq_info_pirq_setup(irq + i, 0, pirq + i, 0, domid, + i == 0 ? 0 : PIRQ_MSI_GROUP); + if (ret < 0) + goto error_irq; + } + + ret = irq_set_msi_desc(irq, msidesc); + if (ret < 0) + goto error_irq; +out: + mutex_unlock(&irq_mapping_update_lock); + return irq; +error_irq: + for (; i >= 0; i--) + __unbind_from_irq(irq + i); + mutex_unlock(&irq_mapping_update_lock); + return ret; +} +#endif + +int xen_destroy_irq(int irq) +{ + struct physdev_unmap_pirq unmap_irq; + struct irq_info *info = info_for_irq(irq); + int rc = -ENOENT; + + mutex_lock(&irq_mapping_update_lock); + + /* + * If trying to remove a vector in a MSI group different + * than the first one skip the PIRQ unmap unless this vector + * is the first one in the group. + */ + if (xen_initial_domain() && !(info->u.pirq.flags & PIRQ_MSI_GROUP)) { + unmap_irq.pirq = info->u.pirq.pirq; + unmap_irq.domid = info->u.pirq.domid; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_unmap_pirq, &unmap_irq); + /* If another domain quits without making the pci_disable_msix + * call, the Xen hypervisor takes care of freeing the PIRQs + * (free_domain_pirqs). 
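+ * In that case the -ESRCH from PHYSDEVOP_unmap_pirq is expected and is only
+ * logged below rather than treated as a failure.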
+ */ + if ((rc == -ESRCH && info->u.pirq.domid != DOMID_SELF)) + pr_info("domain %d does not have %d anymore\n", + info->u.pirq.domid, info->u.pirq.pirq); + else if (rc) { + pr_warn("unmap irq failed %d\n", rc); + goto out; + } + } + + xen_free_irq(irq); + +out: + mutex_unlock(&irq_mapping_update_lock); + return rc; +} + +int xen_irq_from_pirq(unsigned pirq) +{ + int irq; + + struct irq_info *info; + + mutex_lock(&irq_mapping_update_lock); + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) + continue; + irq = info->irq; + if (info->u.pirq.pirq == pirq) + goto out; + } + irq = -1; +out: + mutex_unlock(&irq_mapping_update_lock); + + return irq; +} + + +int xen_pirq_from_irq(unsigned irq) +{ + return pirq_from_irq(irq); +} +EXPORT_SYMBOL_GPL(xen_pirq_from_irq); + +int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + int ret; + + if (evtchn >= xen_evtchn_max_channels()) + return -ENOMEM; + + mutex_lock(&irq_mapping_update_lock); + + irq = get_evtchn_to_irq(evtchn); + + if (irq == -1) { + irq = xen_allocate_irq_dynamic(); + if (irq < 0) + goto out; + + irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_edge_irq, "event"); + + ret = xen_irq_info_evtchn_setup(irq, evtchn); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + /* New interdomain events are bound to VCPU 0. */ + bind_evtchn_to_cpu(evtchn, 0); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_EVTCHN); + } + +out: + mutex_unlock(&irq_mapping_update_lock); + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irq); + +static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int evtchn, irq; + int ret; + + mutex_lock(&irq_mapping_update_lock); + + irq = per_cpu(ipi_to_irq, cpu)[ipi]; + + if (irq == -1) { + irq = xen_allocate_irq_dynamic(); + if (irq < 0) + goto out; + + irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + handle_percpu_irq, "ipi"); + + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + ret = xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + bind_evtchn_to_cpu(evtchn, cpu); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_IPI); + } + + out: + mutex_unlock(&irq_mapping_update_lock); + return irq; +} + +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port) +{ + struct evtchn_bind_interdomain bind_interdomain; + int err; + + bind_interdomain.remote_dom = remote_domain; + bind_interdomain.remote_port = remote_port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + + return err ? 
: bind_evtchn_to_irq(bind_interdomain.local_port); +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irq); + +static int find_virq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_status status; + int port, rc = -ENOENT; + + memset(&status, 0, sizeof(status)); + for (port = 0; port < xen_evtchn_max_channels(); port++) { + status.dom = DOMID_SELF; + status.port = port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); + if (rc < 0) + continue; + if (status.status != EVTCHNSTAT_virq) + continue; + if (status.u.virq == virq && status.vcpu == cpu) { + rc = port; + break; + } + } + return rc; +} + +/** + * xen_evtchn_nr_channels - number of usable event channel ports + * + * This may be less than the maximum supported by the current + * hypervisor ABI. Use xen_evtchn_max_channels() for the maximum + * supported. + */ +unsigned xen_evtchn_nr_channels(void) +{ + return evtchn_ops->nr_channels(); +} +EXPORT_SYMBOL_GPL(xen_evtchn_nr_channels); + +int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq, ret; + + mutex_lock(&irq_mapping_update_lock); + + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { + irq = xen_allocate_irq_dynamic(); + if (irq < 0) + goto out; + + if (percpu) + irq_set_chip_and_handler_name(irq, &xen_percpu_chip, + handle_percpu_irq, "virq"); + else + irq_set_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_edge_irq, "virq"); + + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (ret == 0) + evtchn = bind_virq.port; + else { + if (ret == -EEXIST) + ret = find_virq(virq, cpu); + BUG_ON(ret < 0); + evtchn = ret; + } + + ret = xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + if (ret < 0) { + __unbind_from_irq(irq); + irq = ret; + goto out; + } + + bind_evtchn_to_cpu(evtchn, cpu); + } else { + struct irq_info *info = info_for_irq(irq); + WARN_ON(info == NULL || info->type != IRQT_VIRQ); + } + +out: + mutex_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + mutex_lock(&irq_mapping_update_lock); + __unbind_from_irq(irq); + mutex_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, void *dev_id) +{ + int irq, retval; + + irq = bind_evtchn_to_irq(evtchn); + if (irq < 0) + return irq; + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, + unsigned int remote_port, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); + if (irq < 0) + return irq; + + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, const char *devname, void *dev_id) +{ + int irq, retval; + + irq = bind_virq_to_irq(virq, cpu, irqflags & IRQF_PERCPU); + if (irq < 0) + return irq; + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + 
unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +int bind_ipi_to_irqhandler(enum ipi_vector ipi, + unsigned int cpu, + irq_handler_t handler, + unsigned long irqflags, + const char *devname, + void *dev_id) +{ + int irq, retval; + + irq = bind_ipi_to_irq(ipi, cpu); + if (irq < 0) + return irq; + + irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME; + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + struct irq_info *info = irq_get_handler_data(irq); + + if (WARN_ON(!info)) + return; + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +/** + * xen_set_irq_priority() - set an event channel priority. + * @irq:irq bound to an event channel. + * @priority: priority between XEN_IRQ_PRIORITY_MAX and XEN_IRQ_PRIORITY_MIN. + */ +int xen_set_irq_priority(unsigned irq, unsigned priority) +{ + struct evtchn_set_priority set_priority; + + set_priority.port = evtchn_from_irq(irq); + set_priority.priority = priority; + + return HYPERVISOR_event_channel_op(EVTCHNOP_set_priority, + &set_priority); +} +EXPORT_SYMBOL_GPL(xen_set_irq_priority); + +int evtchn_make_refcounted(unsigned int evtchn) +{ + int irq = get_evtchn_to_irq(evtchn); + struct irq_info *info; + + if (irq == -1) + return -ENOENT; + + info = irq_get_handler_data(irq); + + if (!info) + return -ENOENT; + + WARN_ON(info->refcnt != -1); + + info->refcnt = 1; + + return 0; +} +EXPORT_SYMBOL_GPL(evtchn_make_refcounted); + +int evtchn_get(unsigned int evtchn) +{ + int irq; + struct irq_info *info; + int err = -ENOENT; + + if (evtchn >= xen_evtchn_max_channels()) + return -EINVAL; + + mutex_lock(&irq_mapping_update_lock); + + irq = get_evtchn_to_irq(evtchn); + if (irq == -1) + goto done; + + info = irq_get_handler_data(irq); + + if (!info) + goto done; + + err = -EINVAL; + if (info->refcnt <= 0) + goto done; + + info->refcnt++; + err = 0; + done: + mutex_unlock(&irq_mapping_update_lock); + + return err; +} +EXPORT_SYMBOL_GPL(evtchn_get); + +void evtchn_put(unsigned int evtchn) +{ + int irq = get_evtchn_to_irq(evtchn); + if (WARN_ON(irq == -1)) + return; + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(evtchn_put); + +void xen_send_IPI_one(unsigned int cpu, enum ipi_vector vector) +{ + int irq; + +#ifdef CONFIG_X86 + if (unlikely(vector == XEN_NMI_VECTOR)) { + int rc = HYPERVISOR_vcpu_op(VCPUOP_send_nmi, cpu, NULL); + if (rc < 0) + printk(KERN_WARNING "Sending nmi to CPU%d failed (rc:%d)\n", cpu, rc); + return; + } +#endif + irq = per_cpu(ipi_to_irq, cpu)[vector]; + BUG_ON(irq < 0); + notify_remote_via_irq(irq); +} + +static DEFINE_PER_CPU(unsigned, xed_nesting_count); + +static void __xen_evtchn_do_upcall(void) +{ + struct vcpu_info *vcpu_info = __this_cpu_read(xen_vcpu); + int cpu = get_cpu(); + unsigned count; + + do { + vcpu_info->evtchn_upcall_pending = 0; + + if (__this_cpu_inc_return(xed_nesting_count) - 1) + goto out; + + xen_evtchn_handle_events(cpu); + + BUG_ON(!irqs_disabled()); + + count = __this_cpu_read(xed_nesting_count); + __this_cpu_write(xed_nesting_count, 0); + } while (count != 1 || vcpu_info->evtchn_upcall_pending); + +out: + + put_cpu(); +} + +void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + irq_enter(); +#ifdef CONFIG_X86 + exit_idle(); + inc_irq_stat(irq_hv_callback_count); +#endif + + 
__xen_evtchn_do_upcall(); + + irq_exit(); + set_irq_regs(old_regs); +} + +void xen_hvm_evtchn_do_upcall(void) +{ + __xen_evtchn_do_upcall(); +} +EXPORT_SYMBOL_GPL(xen_hvm_evtchn_do_upcall); + +/* Rebind a new event channel to an existing irq. */ +void rebind_evtchn_irq(int evtchn, int irq) +{ + struct irq_info *info = info_for_irq(irq); + + if (WARN_ON(!info)) + return; + + /* Make sure the irq is masked, since the new event channel + will also be masked. */ + disable_irq(irq); + + mutex_lock(&irq_mapping_update_lock); + + /* After resume the irq<->evtchn mappings are all cleared out */ + BUG_ON(get_evtchn_to_irq(evtchn) != -1); + /* Expect irq to have been bound before, + so there should be a proper type */ + BUG_ON(info->type == IRQT_UNBOUND); + + (void)xen_irq_info_evtchn_setup(irq, evtchn); + + mutex_unlock(&irq_mapping_update_lock); + + bind_evtchn_to_cpu(evtchn, info->cpu); + /* This will be deferred until interrupt is processed */ + irq_set_affinity(irq, cpumask_of(info->cpu)); + + /* Unmask the event channel. */ + enable_irq(irq); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + int masked; + + if (!VALID_EVTCHN(evtchn)) + return -1; + + /* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ + if (xen_hvm_domain() && !xen_have_vector_callback) + return -1; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = tcpu; + + /* + * Mask the event while changing the VCPU binding to prevent + * it being delivered on an unexpected VCPU. + */ + masked = test_and_set_mask(evtchn); + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. 
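+ * (i.e. only update the local binding with bind_evtchn_to_cpu() when the
+ * EVTCHNOP_bind_vcpu hypercall actually succeeded, as done below).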
+ */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); + + if (!masked) + unmask_evtchn(evtchn); + + return 0; +} + +static int set_affinity_irq(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + unsigned tcpu = cpumask_first_and(dest, cpu_online_mask); + + return rebind_irq_to_cpu(data->irq, tcpu); +} + +static void enable_dynirq(struct irq_data *data) +{ + int evtchn = evtchn_from_irq(data->irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(struct irq_data *data) +{ + int evtchn = evtchn_from_irq(data->irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(struct irq_data *data) +{ + int evtchn = evtchn_from_irq(data->irq); + + irq_move_irq(data); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static void mask_ack_dynirq(struct irq_data *data) +{ + disable_dynirq(data); + ack_dynirq(data); +} + +static int retrigger_dynirq(struct irq_data *data) +{ + unsigned int evtchn = evtchn_from_irq(data->irq); + int masked; + + if (!VALID_EVTCHN(evtchn)) + return 0; + + masked = test_and_set_mask(evtchn); + set_evtchn(evtchn); + if (!masked) + unmask_evtchn(evtchn); + + return 1; +} + +static void restore_pirqs(void) +{ + int pirq, rc, irq, gsi; + struct physdev_map_pirq map_irq; + struct irq_info *info; + + list_for_each_entry(info, &xen_irq_list_head, list) { + if (info->type != IRQT_PIRQ) + continue; + + pirq = info->u.pirq.pirq; + gsi = info->u.pirq.gsi; + irq = info->irq; + + /* save/restore of PT devices doesn't work, so at this point the + * only devices present are GSI based emulated devices */ + if (!gsi) + continue; + + map_irq.domid = DOMID_SELF; + map_irq.type = MAP_PIRQ_TYPE_GSI; + map_irq.index = gsi; + map_irq.pirq = pirq; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); + if (rc) { + pr_warn("xen map irq failed gsi=%d irq=%d pirq=%d rc=%d\n", + gsi, irq, pirq, rc); + xen_free_irq(irq); + continue; + } + + printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq); + + __startup_pirq(irq); + } +} + +static void restore_cpu_virqs(unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int virq, irq, evtchn; + + for (virq = 0; virq < NR_VIRQS; virq++) { + if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) + continue; + + BUG_ON(virq_from_irq(irq) != virq); + + /* Get a new binding from Xen. */ + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + /* Record the new mapping. */ + (void)xen_irq_info_virq_setup(cpu, irq, evtchn, virq); + bind_evtchn_to_cpu(evtchn, cpu); + } +} + +static void restore_cpu_ipis(unsigned int cpu) +{ + struct evtchn_bind_ipi bind_ipi; + int ipi, irq, evtchn; + + for (ipi = 0; ipi < XEN_NR_IPIS; ipi++) { + if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) + continue; + + BUG_ON(ipi_from_irq(irq) != ipi); + + /* Get a new binding from Xen. */ + bind_ipi.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, + &bind_ipi) != 0) + BUG(); + evtchn = bind_ipi.port; + + /* Record the new mapping. 
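+ * The Linux irq number survives suspend/resume; only the event channel
+ * behind it is re-allocated by Xen, so previously requested handlers
+ * remain valid.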
*/ + (void)xen_irq_info_ipi_setup(cpu, irq, evtchn, ipi); + bind_evtchn_to_cpu(evtchn, cpu); + } +} + +/* Clear an irq's pending state, in preparation for polling on it */ +void xen_clear_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} +EXPORT_SYMBOL(xen_clear_irq_pending); +void xen_set_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + set_evtchn(evtchn); +} + +bool xen_test_irq_pending(int irq) +{ + int evtchn = evtchn_from_irq(irq); + bool ret = false; + + if (VALID_EVTCHN(evtchn)) + ret = test_evtchn(evtchn); + + return ret; +} + +/* Poll waiting for an irq to become pending with timeout. In the usual case, + * the irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq_timeout(int irq, u64 timeout) +{ + evtchn_port_t evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) { + struct sched_poll poll; + + poll.nr_ports = 1; + poll.timeout = timeout; + set_xen_guest_handle(poll.ports, &evtchn); + + if (HYPERVISOR_sched_op(SCHEDOP_poll, &poll) != 0) + BUG(); + } +} +EXPORT_SYMBOL(xen_poll_irq_timeout); +/* Poll waiting for an irq to become pending. In the usual case, the + * irq will be disabled so it won't deliver an interrupt. */ +void xen_poll_irq(int irq) +{ + xen_poll_irq_timeout(irq, 0 /* no timeout */); +} + +/* Check whether the IRQ line is shared with other guests. */ +int xen_test_irq_shared(int irq) +{ + struct irq_info *info = info_for_irq(irq); + struct physdev_irq_status_query irq_status; + + if (WARN_ON(!info)) + return -ENOENT; + + irq_status.irq = info->u.pirq.pirq; + + if (HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status)) + return 0; + return !(irq_status.flags & XENIRQSTAT_shared); +} +EXPORT_SYMBOL_GPL(xen_test_irq_shared); + +void xen_irq_resume(void) +{ + unsigned int cpu; + struct irq_info *info; + + /* New event-channel space is not 'live' yet. */ + xen_evtchn_mask_all(); + xen_evtchn_resume(); + + /* No IRQ <-> event-channel mappings. 
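+ * Zap every cached binding below and clear the evtchn_to_irq table before
+ * the VIRQs, IPIs and PIRQs are re-bound from scratch.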
*/ + list_for_each_entry(info, &xen_irq_list_head, list) + info->evtchn = 0; /* zap event-channel binding */ + + clear_evtchn_to_irq_all(); + + for_each_possible_cpu(cpu) { + restore_cpu_virqs(cpu); + restore_cpu_ipis(cpu); + } + + restore_pirqs(); +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = ack_dynirq, + .irq_mask_ack = mask_ack_dynirq, + + .irq_set_affinity = set_affinity_irq, + .irq_retrigger = retrigger_dynirq, +}; + +static struct irq_chip xen_pirq_chip __read_mostly = { + .name = "xen-pirq", + + .irq_startup = startup_pirq, + .irq_shutdown = shutdown_pirq, + .irq_enable = enable_pirq, + .irq_disable = disable_pirq, + + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = eoi_pirq, + .irq_eoi = eoi_pirq, + .irq_mask_ack = mask_ack_pirq, + + .irq_set_affinity = set_affinity_irq, + + .irq_retrigger = retrigger_dynirq, +}; + +static struct irq_chip xen_percpu_chip __read_mostly = { + .name = "xen-percpu", + + .irq_disable = disable_dynirq, + .irq_mask = disable_dynirq, + .irq_unmask = enable_dynirq, + + .irq_ack = ack_dynirq, +}; + +int xen_set_callback_via(uint64_t via) +{ + struct xen_hvm_param a; + a.domid = DOMID_SELF; + a.index = HVM_PARAM_CALLBACK_IRQ; + a.value = via; + return HYPERVISOR_hvm_op(HVMOP_set_param, &a); +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + +#ifdef CONFIG_XEN_PVHVM +/* Vector callbacks are better than PCI interrupts to receive event + * channel notifications because we can receive vector callbacks on any + * vcpu and we don't need PCI support or APIC interactions. */ +void xen_callback_vector(void) +{ + int rc; + uint64_t callback_via; + if (xen_have_vector_callback) { + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); + rc = xen_set_callback_via(callback_via); + if (rc) { + pr_err("Request for Xen HVM callback vector failed\n"); + xen_have_vector_callback = 0; + return; + } + pr_info("Xen HVM callback vector for event delivery is enabled\n"); + /* in the restore case the vector has already been allocated */ + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); + } +} +#else +void xen_callback_vector(void) {} +#endif + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "xen." + +static bool fifo_events = true; +module_param(fifo_events, bool, 0); + +void __init xen_init_IRQ(void) +{ + int ret = -EINVAL; + + if (fifo_events) + ret = xen_evtchn_fifo_init(); + if (ret < 0) + xen_evtchn_2l_init(); + + evtchn_to_irq = kcalloc(EVTCHN_ROW(xen_evtchn_max_channels()), + sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); + + /* No event channels are 'live' right now. 
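+ * Mask everything to start with; ports are unmasked individually once they
+ * are bound and their irqs enabled.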
*/ + xen_evtchn_mask_all(); + + pirq_needs_eoi = pirq_needs_eoi_flag; + +#ifdef CONFIG_X86 + if (xen_pv_domain()) { + irq_ctx_init(smp_processor_id()); + if (xen_initial_domain()) + pci_xen_initial_domain(); + } + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_callback_vector(); + + if (xen_hvm_domain()) { + native_init_IRQ(); + /* pci_xen_hvm_init must be called after native_init_IRQ so that + * __acpi_register_gsi can point at the right function */ + pci_xen_hvm_init(); + } else { + int rc; + struct physdev_pirq_eoi_gmfn eoi_gmfn; + + pirq_eoi_map = (void *)__get_free_page(GFP_KERNEL|__GFP_ZERO); + eoi_gmfn.gmfn = virt_to_mfn(pirq_eoi_map); + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pirq_eoi_gmfn_v2, &eoi_gmfn); + /* TODO: No PVH support for PIRQ EOI */ + if (rc != 0) { + free_page((unsigned long) pirq_eoi_map); + pirq_eoi_map = NULL; + } else + pirq_needs_eoi = pirq_check_eoi_map; + } +#endif +} diff --git a/drivers/xen/events/events_fifo.c b/drivers/xen/events/events_fifo.c new file mode 100644 index 000000000..417415d73 --- /dev/null +++ b/drivers/xen/events/events_fifo.c @@ -0,0 +1,450 @@ +/* + * Xen event channels (FIFO-based ABI) + * + * Copyright (C) 2013 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * Or, when distributed separately from the Linux kernel or + * incorporated into other software packages, subject to the following + * license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/linkage.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/percpu.h> +#include <linux/cpu.h> + +#include <asm/sync_bitops.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> + +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/events.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> + +#include "events_internal.h" + +#define EVENT_WORDS_PER_PAGE (PAGE_SIZE / sizeof(event_word_t)) +#define MAX_EVENT_ARRAY_PAGES (EVTCHN_FIFO_NR_CHANNELS / EVENT_WORDS_PER_PAGE) + +struct evtchn_fifo_queue { + uint32_t head[EVTCHN_FIFO_MAX_QUEUES]; +}; + +static DEFINE_PER_CPU(struct evtchn_fifo_control_block *, cpu_control_block); +static DEFINE_PER_CPU(struct evtchn_fifo_queue, cpu_queue); +static event_word_t *event_array[MAX_EVENT_ARRAY_PAGES] __read_mostly; +static unsigned event_array_pages __read_mostly; + +/* + * sync_set_bit() and friends must be unsigned long aligned. + */ +#if BITS_PER_LONG > 32 + +#define BM(w) (unsigned long *)((unsigned long)w & ~0x7UL) +#define EVTCHN_FIFO_BIT(b, w) \ + (((unsigned long)w & 0x4UL) ? (EVTCHN_FIFO_ ##b + 32) : EVTCHN_FIFO_ ##b) + +#else + +#define BM(w) ((unsigned long *)(w)) +#define EVTCHN_FIFO_BIT(b, w) EVTCHN_FIFO_ ##b + +#endif + +static inline event_word_t *event_word_from_port(unsigned port) +{ + unsigned i = port / EVENT_WORDS_PER_PAGE; + + return event_array[i] + port % EVENT_WORDS_PER_PAGE; +} + +static unsigned evtchn_fifo_max_channels(void) +{ + return EVTCHN_FIFO_NR_CHANNELS; +} + +static unsigned evtchn_fifo_nr_channels(void) +{ + return event_array_pages * EVENT_WORDS_PER_PAGE; +} + +static int init_control_block(int cpu, + struct evtchn_fifo_control_block *control_block) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + struct evtchn_init_control init_control; + unsigned int i; + + /* Reset the control block and the local HEADs. */ + clear_page(control_block); + for (i = 0; i < EVTCHN_FIFO_MAX_QUEUES; i++) + q->head[i] = 0; + + init_control.control_gfn = virt_to_mfn(control_block); + init_control.offset = 0; + init_control.vcpu = cpu; + + return HYPERVISOR_event_channel_op(EVTCHNOP_init_control, &init_control); +} + +static void free_unused_array_pages(void) +{ + unsigned i; + + for (i = event_array_pages; i < MAX_EVENT_ARRAY_PAGES; i++) { + if (!event_array[i]) + break; + free_page((unsigned long)event_array[i]); + event_array[i] = NULL; + } +} + +static void init_array_page(event_word_t *array_page) +{ + unsigned i; + + for (i = 0; i < EVENT_WORDS_PER_PAGE; i++) + array_page[i] = 1 << EVTCHN_FIFO_MASKED; +} + +static int evtchn_fifo_setup(struct irq_info *info) +{ + unsigned port = info->evtchn; + unsigned new_array_pages; + int ret; + + new_array_pages = port / EVENT_WORDS_PER_PAGE + 1; + + if (new_array_pages > MAX_EVENT_ARRAY_PAGES) + return -EINVAL; + + while (event_array_pages < new_array_pages) { + void *array_page; + struct evtchn_expand_array expand_array; + + /* Might already have a page if we've resumed. */ + array_page = event_array[event_array_pages]; + if (!array_page) { + array_page = (void *)__get_free_page(GFP_KERNEL); + if (array_page == NULL) { + ret = -ENOMEM; + goto error; + } + event_array[event_array_pages] = array_page; + } + + /* Mask all events in this page before adding it. 
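+ * (init_array_page() sets the MASKED bit in every event word, so nothing
+ * can fire from this page until a port is explicitly unmasked.)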
*/ + init_array_page(array_page); + + expand_array.array_gfn = virt_to_mfn(array_page); + + ret = HYPERVISOR_event_channel_op(EVTCHNOP_expand_array, &expand_array); + if (ret < 0) + goto error; + + event_array_pages++; + } + return 0; + + error: + if (event_array_pages == 0) + panic("xen: unable to expand event array with initial page (%d)\n", ret); + else + pr_err("unable to expand event array (%d)\n", ret); + free_unused_array_pages(); + return ret; +} + +static void evtchn_fifo_bind_to_cpu(struct irq_info *info, unsigned cpu) +{ + /* no-op */ +} + +static void evtchn_fifo_clear_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_clear_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static void evtchn_fifo_set_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_is_pending(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(PENDING, word), BM(word)); +} + +static bool evtchn_fifo_test_and_set_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_and_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static void evtchn_fifo_mask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + sync_set_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} + +static bool evtchn_fifo_is_masked(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + return sync_test_bit(EVTCHN_FIFO_BIT(MASKED, word), BM(word)); +} +/* + * Clear MASKED, spinning if BUSY is set. + */ +static void clear_masked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w & ~(1 << EVTCHN_FIFO_BUSY); + new = old & ~(1 << EVTCHN_FIFO_MASKED); + w = sync_cmpxchg(word, old, new); + } while (w != old); +} + +static void evtchn_fifo_unmask(unsigned port) +{ + event_word_t *word = event_word_from_port(port); + + BUG_ON(!irqs_disabled()); + + clear_masked(word); + if (evtchn_fifo_is_pending(port)) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } +} + +static uint32_t clear_linked(volatile event_word_t *word) +{ + event_word_t new, old, w; + + w = *word; + + do { + old = w; + new = (w & ~((1 << EVTCHN_FIFO_LINKED) + | EVTCHN_FIFO_LINK_MASK)); + } while ((w = sync_cmpxchg(word, old, new)) != old); + + return w & EVTCHN_FIFO_LINK_MASK; +} + +static void handle_irq_for_port(unsigned port) +{ + int irq; + + irq = get_evtchn_to_irq(port); + if (irq != -1) + generic_handle_irq(irq); +} + +static void consume_one_event(unsigned cpu, + struct evtchn_fifo_control_block *control_block, + unsigned priority, unsigned long *ready) +{ + struct evtchn_fifo_queue *q = &per_cpu(cpu_queue, cpu); + uint32_t head; + unsigned port; + event_word_t *word; + + head = q->head[priority]; + + /* + * Reached the tail last time? Read the new HEAD from the + * control block. + */ + if (head == 0) { + rmb(); /* Ensure word is up-to-date before reading head. */ + head = control_block->head[priority]; + } + + port = head; + word = event_word_from_port(port); + head = clear_linked(word); + + /* + * If the link is non-zero, there are more events in the + * queue, otherwise the queue is empty. + * + * If the queue is empty, clear this priority from our local + * copy of the ready word. 
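+ * The handler loop re-reads the shared ready word after each event, so a
+ * queue that refills is noticed on the next iteration.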
+ */ + if (head == 0) + clear_bit(priority, ready); + + if (evtchn_fifo_is_pending(port) && !evtchn_fifo_is_masked(port)) + handle_irq_for_port(port); + + q->head[priority] = head; +} + +static void evtchn_fifo_handle_events(unsigned cpu) +{ + struct evtchn_fifo_control_block *control_block; + unsigned long ready; + unsigned q; + + control_block = per_cpu(cpu_control_block, cpu); + + ready = xchg(&control_block->ready, 0); + + while (ready) { + q = find_first_bit(&ready, EVTCHN_FIFO_MAX_QUEUES); + consume_one_event(cpu, control_block, q, &ready); + ready |= xchg(&control_block->ready, 0); + } +} + +static void evtchn_fifo_resume(void) +{ + unsigned cpu; + + for_each_possible_cpu(cpu) { + void *control_block = per_cpu(cpu_control_block, cpu); + int ret; + + if (!control_block) + continue; + + /* + * If this CPU is offline, take the opportunity to + * free the control block while it is not being + * used. + */ + if (!cpu_online(cpu)) { + free_page((unsigned long)control_block); + per_cpu(cpu_control_block, cpu) = NULL; + continue; + } + + ret = init_control_block(cpu, control_block); + if (ret < 0) + BUG(); + } + + /* + * The event array starts out as empty again and is extended + * as normal when events are bound. The existing pages will + * be reused. + */ + event_array_pages = 0; +} + +static const struct evtchn_ops evtchn_ops_fifo = { + .max_channels = evtchn_fifo_max_channels, + .nr_channels = evtchn_fifo_nr_channels, + .setup = evtchn_fifo_setup, + .bind_to_cpu = evtchn_fifo_bind_to_cpu, + .clear_pending = evtchn_fifo_clear_pending, + .set_pending = evtchn_fifo_set_pending, + .is_pending = evtchn_fifo_is_pending, + .test_and_set_mask = evtchn_fifo_test_and_set_mask, + .mask = evtchn_fifo_mask, + .unmask = evtchn_fifo_unmask, + .handle_events = evtchn_fifo_handle_events, + .resume = evtchn_fifo_resume, +}; + +static int evtchn_fifo_alloc_control_block(unsigned cpu) +{ + void *control_block = NULL; + int ret = -ENOMEM; + + control_block = (void *)__get_free_page(GFP_KERNEL); + if (control_block == NULL) + goto error; + + ret = init_control_block(cpu, control_block); + if (ret < 0) + goto error; + + per_cpu(cpu_control_block, cpu) = control_block; + + return 0; + + error: + free_page((unsigned long)control_block); + return ret; +} + +static int evtchn_fifo_cpu_notification(struct notifier_block *self, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!per_cpu(cpu_control_block, cpu)) + ret = evtchn_fifo_alloc_control_block(cpu); + break; + default: + break; + } + return ret < 0 ? NOTIFY_BAD : NOTIFY_OK; +} + +static struct notifier_block evtchn_fifo_cpu_notifier = { + .notifier_call = evtchn_fifo_cpu_notification, +}; + +int __init xen_evtchn_fifo_init(void) +{ + int cpu = get_cpu(); + int ret; + + ret = evtchn_fifo_alloc_control_block(cpu); + if (ret < 0) + goto out; + + pr_info("Using FIFO-based ABI\n"); + + evtchn_ops = &evtchn_ops_fifo; + + register_cpu_notifier(&evtchn_fifo_cpu_notifier); +out: + put_cpu(); + return ret; +} diff --git a/drivers/xen/events/events_internal.h b/drivers/xen/events/events_internal.h new file mode 100644 index 000000000..50c2050a1 --- /dev/null +++ b/drivers/xen/events/events_internal.h @@ -0,0 +1,151 @@ +/* + * Xen Event Channels (internal header) + * + * Copyright (C) 2013 Citrix Systems R&D Ltd. + * + * This source code is licensed under the GNU General Public License, + * Version 2 or later. See the file COPYING for more details. 
+ */ +#ifndef __EVENTS_INTERNAL_H__ +#define __EVENTS_INTERNAL_H__ + +/* Interrupt types. */ +enum xen_irq_type { + IRQT_UNBOUND = 0, + IRQT_PIRQ, + IRQT_VIRQ, + IRQT_IPI, + IRQT_EVTCHN +}; + +/* + * Packed IRQ information: + * type - enum xen_irq_type + * event channel - irq->event channel mapping + * cpu - cpu this event channel is bound to + * index - type-specific information: + * PIRQ - vector, with MSB being "needs EIO", or physical IRQ of the HVM + * guest, or GSI (real passthrough IRQ) of the device. + * VIRQ - virq number + * IPI - IPI vector + * EVTCHN - + */ +struct irq_info { + struct list_head list; + int refcnt; + enum xen_irq_type type; /* type */ + unsigned irq; + unsigned int evtchn; /* event channel */ + unsigned short cpu; /* cpu bound */ + + union { + unsigned short virq; + enum ipi_vector ipi; + struct { + unsigned short pirq; + unsigned short gsi; + unsigned char vector; + unsigned char flags; + uint16_t domid; + } pirq; + } u; +}; + +#define PIRQ_NEEDS_EOI (1 << 0) +#define PIRQ_SHAREABLE (1 << 1) +#define PIRQ_MSI_GROUP (1 << 2) + +struct evtchn_ops { + unsigned (*max_channels)(void); + unsigned (*nr_channels)(void); + + int (*setup)(struct irq_info *info); + void (*bind_to_cpu)(struct irq_info *info, unsigned cpu); + + void (*clear_pending)(unsigned port); + void (*set_pending)(unsigned port); + bool (*is_pending)(unsigned port); + bool (*test_and_set_mask)(unsigned port); + void (*mask)(unsigned port); + void (*unmask)(unsigned port); + + void (*handle_events)(unsigned cpu); + void (*resume)(void); +}; + +extern const struct evtchn_ops *evtchn_ops; + +extern int **evtchn_to_irq; +int get_evtchn_to_irq(unsigned int evtchn); + +struct irq_info *info_for_irq(unsigned irq); +unsigned cpu_from_irq(unsigned irq); +unsigned cpu_from_evtchn(unsigned int evtchn); + +static inline unsigned xen_evtchn_max_channels(void) +{ + return evtchn_ops->max_channels(); +} + +/* + * Do any ABI specific setup for a bound event channel before it can + * be unmasked and used. + */ +static inline int xen_evtchn_port_setup(struct irq_info *info) +{ + if (evtchn_ops->setup) + return evtchn_ops->setup(info); + return 0; +} + +static inline void xen_evtchn_port_bind_to_cpu(struct irq_info *info, + unsigned cpu) +{ + evtchn_ops->bind_to_cpu(info, cpu); +} + +static inline void clear_evtchn(unsigned port) +{ + evtchn_ops->clear_pending(port); +} + +static inline void set_evtchn(unsigned port) +{ + evtchn_ops->set_pending(port); +} + +static inline bool test_evtchn(unsigned port) +{ + return evtchn_ops->is_pending(port); +} + +static inline bool test_and_set_mask(unsigned port) +{ + return evtchn_ops->test_and_set_mask(port); +} + +static inline void mask_evtchn(unsigned port) +{ + return evtchn_ops->mask(port); +} + +static inline void unmask_evtchn(unsigned port) +{ + return evtchn_ops->unmask(port); +} + +static inline void xen_evtchn_handle_events(unsigned cpu) +{ + return evtchn_ops->handle_events(cpu); +} + +static inline void xen_evtchn_resume(void) +{ + if (evtchn_ops->resume) + evtchn_ops->resume(); +} + +void xen_evtchn_2l_init(void); +int xen_evtchn_fifo_init(void); + +#endif /* #ifndef __EVENTS_INTERNAL_H__ */ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c new file mode 100644 index 000000000..00f40f051 --- /dev/null +++ b/drivers/xen/evtchn.c @@ -0,0 +1,586 @@ +/****************************************************************************** + * evtchn.c + * + * Driver for receiving and demuxing event-channel signals. 
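+ * Each open of this device gets its own per_user_data with a page-sized
+ * ring of pending ports. A minimal user (sketch, not part of this file)
+ * opens /dev/xen/evtchn, binds a port with IOCTL_EVTCHN_BIND_UNBOUND_PORT
+ * (or the _VIRQ/_INTERDOMAIN variants), read()s evtchn_port_t values as
+ * events arrive and write()s them back to re-enable delivery.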
+ * + * Copyright (c) 2004-2005, K A Fraser + * Multi-process extensions Copyright (c) 2004, Steven Smith + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/major.h> +#include <linux/proc_fs.h> +#include <linux/stat.h> +#include <linux/poll.h> +#include <linux/irq.h> +#include <linux/init.h> +#include <linux/mutex.h> +#include <linux/cpu.h> + +#include <xen/xen.h> +#include <xen/events.h> +#include <xen/evtchn.h> +#include <asm/xen/hypervisor.h> + +struct per_user_data { + struct mutex bind_mutex; /* serialize bind/unbind operations */ + struct rb_root evtchns; + + /* Notification ring, accessed via /dev/xen/evtchn. */ +#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + evtchn_port_t *ring; + unsigned int ring_cons, ring_prod, ring_overflow; + struct mutex ring_cons_mutex; /* protect against concurrent readers */ + spinlock_t ring_prod_lock; /* product against concurrent interrupts */ + + /* Processes wait on this queue when ring is empty. */ + wait_queue_head_t evtchn_wait; + struct fasync_struct *evtchn_async_queue; + const char *name; +}; + +struct user_evtchn { + struct rb_node node; + struct per_user_data *user; + unsigned port; + bool enabled; +}; + +static int add_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) +{ + struct rb_node **new = &(u->evtchns.rb_node), *parent = NULL; + + while (*new) { + struct user_evtchn *this; + + this = container_of(*new, struct user_evtchn, node); + + parent = *new; + if (this->port < evtchn->port) + new = &((*new)->rb_left); + else if (this->port > evtchn->port) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + /* Add new node and rebalance tree. 
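+ * (standard rbtree idiom: link at the leaf slot found above, then let
+ * rb_insert_color() rebalance the tree)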
*/ + rb_link_node(&evtchn->node, parent, new); + rb_insert_color(&evtchn->node, &u->evtchns); + + return 0; +} + +static void del_evtchn(struct per_user_data *u, struct user_evtchn *evtchn) +{ + rb_erase(&evtchn->node, &u->evtchns); + kfree(evtchn); +} + +static struct user_evtchn *find_evtchn(struct per_user_data *u, unsigned port) +{ + struct rb_node *node = u->evtchns.rb_node; + + while (node) { + struct user_evtchn *evtchn; + + evtchn = container_of(node, struct user_evtchn, node); + + if (evtchn->port < port) + node = node->rb_left; + else if (evtchn->port > port) + node = node->rb_right; + else + return evtchn; + } + return NULL; +} + +static irqreturn_t evtchn_interrupt(int irq, void *data) +{ + struct user_evtchn *evtchn = data; + struct per_user_data *u = evtchn->user; + + WARN(!evtchn->enabled, + "Interrupt for port %d, but apparently not enabled; per-user %p\n", + evtchn->port, u); + + disable_irq_nosync(irq); + evtchn->enabled = false; + + spin_lock(&u->ring_prod_lock); + + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = evtchn->port; + wmb(); /* Ensure ring contents visible */ + if (u->ring_cons == u->ring_prod++) { + wake_up_interruptible(&u->evtchn_wait); + kill_fasync(&u->evtchn_async_queue, + SIGIO, POLL_IN); + } + } else + u->ring_overflow = 1; + + spin_unlock(&u->ring_prod_lock); + + return IRQ_HANDLED; +} + +static ssize_t evtchn_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + struct per_user_data *u = file->private_data; + + /* Whole number of ports. */ + count &= ~(sizeof(evtchn_port_t)-1); + + if (count == 0) + return 0; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + for (;;) { + mutex_lock(&u->ring_cons_mutex); + + rc = -EFBIG; + if (u->ring_overflow) + goto unlock_out; + + c = u->ring_cons; + p = u->ring_prod; + if (c != p) + break; + + mutex_unlock(&u->ring_cons_mutex); + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + rc = wait_event_interruptible(u->evtchn_wait, + u->ring_cons != u->ring_prod); + if (rc) + return rc; + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + sizeof(evtchn_port_t); + bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); + } else { + bytes1 = (p - c) * sizeof(evtchn_port_t); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if (bytes1 > count) { + bytes1 = count; + bytes2 = 0; + } else if ((bytes1 + bytes2) > count) { + bytes2 = count - bytes1; + } + + rc = -EFAULT; + rmb(); /* Ensure that we see the port before we copy it. */ + if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + ((bytes2 != 0) && + copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) + goto unlock_out; + + u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); + rc = bytes1 + bytes2; + + unlock_out: + mutex_unlock(&u->ring_cons_mutex); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + struct per_user_data *u = file->private_data; + + if (kbuf == NULL) + return -ENOMEM; + + /* Whole number of ports. 
*/ + count &= ~(sizeof(evtchn_port_t)-1); + + rc = 0; + if (count == 0) + goto out; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + rc = -EFAULT; + if (copy_from_user(kbuf, buf, count) != 0) + goto out; + + mutex_lock(&u->bind_mutex); + + for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) { + unsigned port = kbuf[i]; + struct user_evtchn *evtchn; + + evtchn = find_evtchn(u, port); + if (evtchn && !evtchn->enabled) { + evtchn->enabled = true; + enable_irq(irq_from_evtchn(port)); + } + } + + mutex_unlock(&u->bind_mutex); + + rc = count; + + out: + free_page((unsigned long)kbuf); + return rc; +} + +static int evtchn_bind_to_user(struct per_user_data *u, int port) +{ + struct user_evtchn *evtchn; + struct evtchn_close close; + int rc = 0; + + /* + * Ports are never reused, so every caller should pass in a + * unique port. + * + * (Locking not necessary because we haven't registered the + * interrupt handler yet, and our caller has already + * serialized bind operations.) + */ + + evtchn = kzalloc(sizeof(*evtchn), GFP_KERNEL); + if (!evtchn) + return -ENOMEM; + + evtchn->user = u; + evtchn->port = port; + evtchn->enabled = true; /* start enabled */ + + rc = add_evtchn(u, evtchn); + if (rc < 0) + goto err; + + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, 0, + u->name, evtchn); + if (rc < 0) + goto err; + + rc = evtchn_make_refcounted(port); + return rc; + +err: + /* bind failed, should close the port now */ + close.port = port; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + del_evtchn(u, evtchn); + return rc; +} + +static void evtchn_unbind_from_user(struct per_user_data *u, + struct user_evtchn *evtchn) +{ + int irq = irq_from_evtchn(evtchn->port); + + BUG_ON(irq < 0); + + unbind_from_irqhandler(irq, evtchn); + + del_evtchn(u, evtchn); +} + +static long evtchn_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc; + struct per_user_data *u = file->private_data; + void __user *uarg = (void __user *) arg; + + /* Prevent bind from racing with unbind */ + mutex_lock(&u->bind_mutex); + + switch (cmd) { + case IOCTL_EVTCHN_BIND_VIRQ: { + struct ioctl_evtchn_bind_virq bind; + struct evtchn_bind_virq bind_virq; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + bind_virq.virq = bind.virq; + bind_virq.vcpu = 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_virq.port); + if (rc == 0) + rc = bind_virq.port; + break; + } + + case IOCTL_EVTCHN_BIND_INTERDOMAIN: { + struct ioctl_evtchn_bind_interdomain bind; + struct evtchn_bind_interdomain bind_interdomain; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + bind_interdomain.remote_dom = bind.remote_domain; + bind_interdomain.remote_port = bind.remote_port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_interdomain.local_port); + if (rc == 0) + rc = bind_interdomain.local_port; + break; + } + + case IOCTL_EVTCHN_BIND_UNBOUND_PORT: { + struct ioctl_evtchn_bind_unbound_port bind; + struct evtchn_alloc_unbound alloc_unbound; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = bind.remote_domain; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, alloc_unbound.port); + if (rc == 0) + rc = 
alloc_unbound.port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_UNBIND: {
+		struct ioctl_evtchn_unbind unbind;
+		struct user_evtchn *evtchn;
+
+		rc = -EFAULT;
+		if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+			break;
+
+		rc = -EINVAL;
+		if (unbind.port >= xen_evtchn_nr_channels())
+			break;
+
+		rc = -ENOTCONN;
+		evtchn = find_evtchn(u, unbind.port);
+		if (!evtchn)
+			break;
+
+		disable_irq(irq_from_evtchn(unbind.port));
+		evtchn_unbind_from_user(u, evtchn);
+		rc = 0;
+		break;
+	}
+
+	case IOCTL_EVTCHN_NOTIFY: {
+		struct ioctl_evtchn_notify notify;
+		struct user_evtchn *evtchn;
+
+		rc = -EFAULT;
+		if (copy_from_user(&notify, uarg, sizeof(notify)))
+			break;
+
+		rc = -ENOTCONN;
+		evtchn = find_evtchn(u, notify.port);
+		if (evtchn) {
+			notify_remote_via_evtchn(notify.port);
+			rc = 0;
+		}
+		break;
+	}
+
+	case IOCTL_EVTCHN_RESET: {
+		/* Initialise the ring to empty. Clear errors. */
+		mutex_lock(&u->ring_cons_mutex);
+		spin_lock_irq(&u->ring_prod_lock);
+		u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+		spin_unlock_irq(&u->ring_prod_lock);
+		mutex_unlock(&u->ring_cons_mutex);
+		rc = 0;
+		break;
+	}
+
+	default:
+		rc = -ENOSYS;
+		break;
+	}
+	mutex_unlock(&u->bind_mutex);
+
+	return rc;
+}
+
+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
+{
+	unsigned int mask = POLLOUT | POLLWRNORM;
+	struct per_user_data *u = file->private_data;
+
+	poll_wait(file, &u->evtchn_wait, wait);
+	if (u->ring_cons != u->ring_prod)
+		mask |= POLLIN | POLLRDNORM;
+	if (u->ring_overflow)
+		mask = POLLERR;
+	return mask;
+}
+
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+	struct per_user_data *u = filp->private_data;
+	return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
+}
+
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+	struct per_user_data *u;
+
+	u = kzalloc(sizeof(*u), GFP_KERNEL);
+	if (u == NULL)
+		return -ENOMEM;
+
+	u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+	if (u->name == NULL) {
+		kfree(u);
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&u->evtchn_wait);
+
+	u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+	if (u->ring == NULL) {
+		kfree(u->name);
+		kfree(u);
+		return -ENOMEM;
+	}
+
+	mutex_init(&u->bind_mutex);
+	mutex_init(&u->ring_cons_mutex);
+	spin_lock_init(&u->ring_prod_lock);
+
+	filp->private_data = u;
+
+	return nonseekable_open(inode, filp);
+}
+
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+	struct per_user_data *u = filp->private_data;
+	struct rb_node *node;
+
+	while ((node = u->evtchns.rb_node)) {
+		struct user_evtchn *evtchn;
+
+		evtchn = rb_entry(node, struct user_evtchn, node);
+		disable_irq(irq_from_evtchn(evtchn->port));
+		evtchn_unbind_from_user(u, evtchn);
+	}
+
+	free_page((unsigned long)u->ring);
+	kfree(u->name);
+	kfree(u);
+
+	return 0;
+}
+
+static const struct file_operations evtchn_fops = {
+	.owner   = THIS_MODULE,
+	.read    = evtchn_read,
+	.write   = evtchn_write,
+	.unlocked_ioctl = evtchn_ioctl,
+	.poll    = evtchn_poll,
+	.fasync  = evtchn_fasync,
+	.open    = evtchn_open,
+	.release = evtchn_release,
+	.llseek  = no_llseek,
+};
+
+static struct miscdevice evtchn_miscdev = {
+	.minor        = MISC_DYNAMIC_MINOR,
+	.name         = "xen/evtchn",
+	.fops         = &evtchn_fops,
+};
+static int __init evtchn_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	/* Create '/dev/xen/evtchn'.
*/ + err = misc_register(&evtchn_miscdev); + if (err != 0) { + pr_err("Could not register /dev/xen/evtchn\n"); + return err; + } + + pr_info("Event-channel device installed\n"); + + return 0; +} + +static void __exit evtchn_cleanup(void) +{ + misc_deregister(&evtchn_miscdev); +} + +module_init(evtchn_init); +module_exit(evtchn_cleanup); + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/fallback.c b/drivers/xen/fallback.c new file mode 100644 index 000000000..b04fb64c5 --- /dev/null +++ b/drivers/xen/fallback.c @@ -0,0 +1,81 @@ +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/bug.h> +#include <linux/export.h> +#include <asm/hypervisor.h> +#include <asm/xen/hypercall.h> + +int xen_event_channel_op_compat(int cmd, void *arg) +{ + struct evtchn_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, event_channel_op_compat, &op); + + switch (cmd) { + case EVTCHNOP_close: + case EVTCHNOP_send: + case EVTCHNOP_bind_vcpu: + case EVTCHNOP_unmask: + /* no output */ + break; + +#define COPY_BACK(eop) \ + case EVTCHNOP_##eop: \ + memcpy(arg, &op.u.eop, sizeof(op.u.eop)); \ + break + + COPY_BACK(bind_interdomain); + COPY_BACK(bind_virq); + COPY_BACK(bind_pirq); + COPY_BACK(status); + COPY_BACK(alloc_unbound); + COPY_BACK(bind_ipi); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_event_channel_op_compat); + +int xen_physdev_op_compat(int cmd, void *arg) +{ + struct physdev_op op; + int rc; + + op.cmd = cmd; + memcpy(&op.u, arg, sizeof(op.u)); + rc = _hypercall1(int, physdev_op_compat, &op); + + switch (cmd) { + case PHYSDEVOP_IRQ_UNMASK_NOTIFY: + case PHYSDEVOP_set_iopl: + case PHYSDEVOP_set_iobitmap: + case PHYSDEVOP_apic_write: + /* no output */ + break; + +#define COPY_BACK(pop, fld) \ + case PHYSDEVOP_##pop: \ + memcpy(arg, &op.u.fld, sizeof(op.u.fld)); \ + break + + COPY_BACK(irq_status_query, irq_status_query); + COPY_BACK(apic_read, apic_op); + COPY_BACK(ASSIGN_VECTOR, irq_op); +#undef COPY_BACK + + default: + WARN_ON(rc != -ENOSYS); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(xen_physdev_op_compat); diff --git a/drivers/xen/features.c b/drivers/xen/features.c new file mode 100644 index 000000000..99eda169c --- /dev/null +++ b/drivers/xen/features.c @@ -0,0 +1,33 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include <linux/types.h> +#include <linux/cache.h> +#include <linux/module.h> + +#include <asm/xen/hypercall.h> + +#include <xen/interface/xen.h> +#include <xen/interface/version.h> +#include <xen/features.h> + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j = 0; j < 32; j++) + xen_features[i * 32 + j] = !!(fi.submap & 1<<j); + } +} diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c new file mode 100644 index 000000000..e53fe1917 --- /dev/null +++ b/drivers/xen/gntalloc.c @@ -0,0 +1,608 @@ +/****************************************************************************** + * gntalloc.c + * + * Device for creating grant references (in user-space) that may be shared + * with other domains. 
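+ *
+ * A minimal sketch of the intended userspace flow (illustrative only,
+ * assuming the ioctl structures declared in xen/gntalloc.h; error handling
+ * is omitted and "remote_domid" is a placeholder for the peer domain):
+ *
+ *	int fd = open("/dev/xen/gntalloc", O_RDWR);
+ *	struct ioctl_gntalloc_alloc_gref op = {
+ *		.domid = remote_domid,
+ *		.flags = GNTALLOC_FLAG_WRITABLE,
+ *		.count = 1,
+ *	};
+ *	ioctl(fd, IOCTL_GNTALLOC_ALLOC_GREF, &op);	// op.gref_ids[0] holds the grant ref
+ *	void *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+ *			    MAP_SHARED, fd, op.index);	// map the freshly granted page
+ *	// ... publish op.gref_ids[0] to the peer (e.g. via xenstore), then
+ *	// communicate through *shared; munmap() and close() to tear down ...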
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * This driver exists to allow userspace programs in Linux to allocate kernel + * memory that will later be shared with another domain. Without this device, + * Linux userspace programs cannot create grant references. + * + * How this stuff works: + * X -> granting a page to Y + * Y -> mapping the grant from X + * + * 1. X uses the gntalloc device to allocate a page of kernel memory, P. + * 2. X creates an entry in the grant table that says domid(Y) can access P. + * This is done without a hypercall unless the grant table needs expansion. + * 3. X gives the grant reference identifier, GREF, to Y. + * 4. Y maps the page, either directly into kernel memory for use in a backend + * driver, or via a the gntdev device to map into the address space of an + * application running in Y. This is the first point at which Xen does any + * tracking of the page. + * 5. A program in X mmap()s a segment of the gntalloc device that corresponds + * to the shared page, and can now communicate with Y over the shared page. + * + * + * NOTE TO USERSPACE LIBRARIES: + * The grant allocation and mmap()ing are, naturally, two separate operations. + * You set up the sharing by calling the create ioctl() and then the mmap(). + * Teardown requires munmap() and either close() or ioctl(). + * + * WARNING: Since Xen does not allow a guest to forcibly end the use of a grant + * reference, this device can be used to consume kernel memory by leaving grant + * references mapped by another domain when an application exits. Therefore, + * there is a global limit on the number of pages that can be allocated. When + * all references to the page are unmapped, it will be freed during the next + * grant operation. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/module.h> +#include <linux/miscdevice.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/mm.h> +#include <linux/uaccess.h> +#include <linux/types.h> +#include <linux/list.h> +#include <linux/highmem.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> +#include <xen/gntalloc.h> +#include <xen/events.h> + +static int limit = 1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be allocated by " + "the gntalloc device"); + +static LIST_HEAD(gref_list); +static DEFINE_MUTEX(gref_mutex); +static int gref_size; + +struct notify_info { + uint16_t pgoff:12; /* Bits 0-11: Offset of the byte to clear */ + uint16_t flags:2; /* Bits 12-13: Unmap notification flags */ + int event; /* Port (event channel) to notify */ +}; + +/* Metadata on a grant reference. 
*/ +struct gntalloc_gref { + struct list_head next_gref; /* list entry gref_list */ + struct list_head next_file; /* list entry file->list, if open */ + struct page *page; /* The shared page */ + uint64_t file_index; /* File offset for mmap() */ + unsigned int users; /* Use count - when zero, waiting on Xen */ + grant_ref_t gref_id; /* The grant reference number */ + struct notify_info notify; /* Unmap notification */ +}; + +struct gntalloc_file_private_data { + struct list_head list; + uint64_t index; +}; + +struct gntalloc_vma_private_data { + struct gntalloc_gref *gref; + int users; + int count; +}; + +static void __del_gref(struct gntalloc_gref *gref); + +static void do_cleanup(void) +{ + struct gntalloc_gref *gref, *n; + list_for_each_entry_safe(gref, n, &gref_list, next_gref) { + if (!gref->users) + __del_gref(gref); + } +} + +static int add_grefs(struct ioctl_gntalloc_alloc_gref *op, + uint32_t *gref_ids, struct gntalloc_file_private_data *priv) +{ + int i, rc, readonly; + LIST_HEAD(queue_gref); + LIST_HEAD(queue_file); + struct gntalloc_gref *gref, *next; + + readonly = !(op->flags & GNTALLOC_FLAG_WRITABLE); + rc = -ENOMEM; + for (i = 0; i < op->count; i++) { + gref = kzalloc(sizeof(*gref), GFP_KERNEL); + if (!gref) + goto undo; + list_add_tail(&gref->next_gref, &queue_gref); + list_add_tail(&gref->next_file, &queue_file); + gref->users = 1; + gref->file_index = op->index + i * PAGE_SIZE; + gref->page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!gref->page) + goto undo; + + /* Grant foreign access to the page. */ + rc = gnttab_grant_foreign_access(op->domid, + pfn_to_mfn(page_to_pfn(gref->page)), readonly); + if (rc < 0) + goto undo; + gref_ids[i] = gref->gref_id = rc; + } + + /* Add to gref lists. */ + mutex_lock(&gref_mutex); + list_splice_tail(&queue_gref, &gref_list); + list_splice_tail(&queue_file, &priv->list); + mutex_unlock(&gref_mutex); + + return 0; + +undo: + mutex_lock(&gref_mutex); + gref_size -= (op->count - i); + + list_for_each_entry_safe(gref, next, &queue_file, next_file) { + list_del(&gref->next_file); + __del_gref(gref); + } + + /* It's possible for the target domain to map the just-allocated grant + * references by blindly guessing their IDs; if this is done, then + * __del_gref will leave them in the queue_gref list. They need to be + * added to the global list so that we can free them when they are no + * longer referenced. 
+ */ + if (unlikely(!list_empty(&queue_gref))) + list_splice_tail(&queue_gref, &gref_list); + mutex_unlock(&gref_mutex); + return rc; +} + +static void __del_gref(struct gntalloc_gref *gref) +{ + if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { + uint8_t *tmp = kmap(gref->page); + tmp[gref->notify.pgoff] = 0; + kunmap(gref->page); + } + if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { + notify_remote_via_evtchn(gref->notify.event); + evtchn_put(gref->notify.event); + } + + gref->notify.flags = 0; + + if (gref->gref_id) { + if (gnttab_query_foreign_access(gref->gref_id)) + return; + + if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) + return; + + gnttab_free_grant_reference(gref->gref_id); + } + + gref_size--; + list_del(&gref->next_gref); + + if (gref->page) + __free_page(gref->page); + + kfree(gref); +} + +/* finds contiguous grant references in a file, returns the first */ +static struct gntalloc_gref *find_grefs(struct gntalloc_file_private_data *priv, + uint64_t index, uint32_t count) +{ + struct gntalloc_gref *rv = NULL, *gref; + list_for_each_entry(gref, &priv->list, next_file) { + if (gref->file_index == index && !rv) + rv = gref; + if (rv) { + if (gref->file_index != index) + return NULL; + index += PAGE_SIZE; + count--; + if (count == 0) + return rv; + } + } + return NULL; +} + +/* + * ------------------------------------- + * File operations. + * ------------------------------------- + */ +static int gntalloc_open(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_nomem; + INIT_LIST_HEAD(&priv->list); + + filp->private_data = priv; + + pr_debug("%s: priv %p\n", __func__, priv); + + return 0; + +out_nomem: + return -ENOMEM; +} + +static int gntalloc_release(struct inode *inode, struct file *filp) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_gref *gref; + + pr_debug("%s: priv %p\n", __func__, priv); + + mutex_lock(&gref_mutex); + while (!list_empty(&priv->list)) { + gref = list_entry(priv->list.next, + struct gntalloc_gref, next_file); + list_del(&gref->next_file); + gref->users--; + if (gref->users == 0) + __del_gref(gref); + } + kfree(priv); + mutex_unlock(&gref_mutex); + + return 0; +} + +static long gntalloc_ioctl_alloc(struct gntalloc_file_private_data *priv, + struct ioctl_gntalloc_alloc_gref __user *arg) +{ + int rc = 0; + struct ioctl_gntalloc_alloc_gref op; + uint32_t *gref_ids; + + pr_debug("%s: priv %p\n", __func__, priv); + + if (copy_from_user(&op, arg, sizeof(op))) { + rc = -EFAULT; + goto out; + } + + gref_ids = kcalloc(op.count, sizeof(gref_ids[0]), GFP_TEMPORARY); + if (!gref_ids) { + rc = -ENOMEM; + goto out; + } + + mutex_lock(&gref_mutex); + /* Clean up pages that were at zero (local) users but were still mapped + * by remote domains. Since those pages count towards the limit that we + * are about to enforce, removing them here is a good idea. + */ + do_cleanup(); + if (gref_size + op.count > limit) { + mutex_unlock(&gref_mutex); + rc = -ENOSPC; + goto out_free; + } + gref_size += op.count; + op.index = priv->index; + priv->index += op.count * PAGE_SIZE; + mutex_unlock(&gref_mutex); + + rc = add_grefs(&op, gref_ids, priv); + if (rc < 0) + goto out_free; + + /* Once we finish add_grefs, it is unsafe to touch the new reference, + * since it is possible for a concurrent ioctl to remove it (by guessing + * its index). 
If the userspace application doesn't provide valid memory + * to write the IDs to, then it will need to close the file in order to + * release - which it will do by segfaulting when it tries to access the + * IDs to close them. + */ + if (copy_to_user(arg, &op, sizeof(op))) { + rc = -EFAULT; + goto out_free; + } + if (copy_to_user(arg->gref_ids, gref_ids, + sizeof(gref_ids[0]) * op.count)) { + rc = -EFAULT; + goto out_free; + } + +out_free: + kfree(gref_ids); +out: + return rc; +} + +static long gntalloc_ioctl_dealloc(struct gntalloc_file_private_data *priv, + void __user *arg) +{ + int i, rc = 0; + struct ioctl_gntalloc_dealloc_gref op; + struct gntalloc_gref *gref, *n; + + pr_debug("%s: priv %p\n", __func__, priv); + + if (copy_from_user(&op, arg, sizeof(op))) { + rc = -EFAULT; + goto dealloc_grant_out; + } + + mutex_lock(&gref_mutex); + gref = find_grefs(priv, op.index, op.count); + if (gref) { + /* Remove from the file list only, and decrease reference count. + * The later call to do_cleanup() will remove from gref_list and + * free the memory if the pages aren't mapped anywhere. + */ + for (i = 0; i < op.count; i++) { + n = list_entry(gref->next_file.next, + struct gntalloc_gref, next_file); + list_del(&gref->next_file); + gref->users--; + gref = n; + } + } else { + rc = -EINVAL; + } + + do_cleanup(); + + mutex_unlock(&gref_mutex); +dealloc_grant_out: + return rc; +} + +static long gntalloc_ioctl_unmap_notify(struct gntalloc_file_private_data *priv, + void __user *arg) +{ + struct ioctl_gntalloc_unmap_notify op; + struct gntalloc_gref *gref; + uint64_t index; + int pgoff; + int rc; + + if (copy_from_user(&op, arg, sizeof(op))) + return -EFAULT; + + index = op.index & ~(PAGE_SIZE - 1); + pgoff = op.index & (PAGE_SIZE - 1); + + mutex_lock(&gref_mutex); + + gref = find_grefs(priv, index, 1); + if (!gref) { + rc = -ENOENT; + goto unlock_out; + } + + if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT)) { + rc = -EINVAL; + goto unlock_out; + } + + /* We need to grab a reference to the event channel we are going to use + * to send the notify before releasing the reference we may already have + * (if someone has called this ioctl twice). This is required so that + * it is possible to change the clear_byte part of the notification + * without disturbing the event channel part, which may now be the last + * reference to that event channel. 
+ */ + if (op.action & UNMAP_NOTIFY_SEND_EVENT) { + if (evtchn_get(op.event_channel_port)) { + rc = -EINVAL; + goto unlock_out; + } + } + + if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) + evtchn_put(gref->notify.event); + + gref->notify.flags = op.action; + gref->notify.pgoff = pgoff; + gref->notify.event = op.event_channel_port; + rc = 0; + + unlock_out: + mutex_unlock(&gref_mutex); + return rc; +} + +static long gntalloc_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + + switch (cmd) { + case IOCTL_GNTALLOC_ALLOC_GREF: + return gntalloc_ioctl_alloc(priv, (void __user *)arg); + + case IOCTL_GNTALLOC_DEALLOC_GREF: + return gntalloc_ioctl_dealloc(priv, (void __user *)arg); + + case IOCTL_GNTALLOC_SET_UNMAP_NOTIFY: + return gntalloc_ioctl_unmap_notify(priv, (void __user *)arg); + + default: + return -ENOIOCTLCMD; + } + + return 0; +} + +static void gntalloc_vma_open(struct vm_area_struct *vma) +{ + struct gntalloc_vma_private_data *priv = vma->vm_private_data; + + if (!priv) + return; + + mutex_lock(&gref_mutex); + priv->users++; + mutex_unlock(&gref_mutex); +} + +static void gntalloc_vma_close(struct vm_area_struct *vma) +{ + struct gntalloc_vma_private_data *priv = vma->vm_private_data; + struct gntalloc_gref *gref, *next; + int i; + + if (!priv) + return; + + mutex_lock(&gref_mutex); + priv->users--; + if (priv->users == 0) { + gref = priv->gref; + for (i = 0; i < priv->count; i++) { + gref->users--; + next = list_entry(gref->next_gref.next, + struct gntalloc_gref, next_gref); + if (gref->users == 0) + __del_gref(gref); + gref = next; + } + kfree(priv); + } + mutex_unlock(&gref_mutex); +} + +static struct vm_operations_struct gntalloc_vmops = { + .open = gntalloc_vma_open, + .close = gntalloc_vma_close, +}; + +static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_vma_private_data *vm_priv; + struct gntalloc_gref *gref; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rv, i; + + if (!(vma->vm_flags & VM_SHARED)) { + pr_err("%s: Mapping must be shared\n", __func__); + return -EINVAL; + } + + vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL); + if (!vm_priv) + return -ENOMEM; + + mutex_lock(&gref_mutex); + + pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__, + priv, vm_priv, vma->vm_pgoff, count); + + gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count); + if (gref == NULL) { + rv = -ENOENT; + pr_debug("%s: Could not find grant reference", + __func__); + kfree(vm_priv); + goto out_unlock; + } + + vm_priv->gref = gref; + vm_priv->users = 1; + vm_priv->count = count; + + vma->vm_private_data = vm_priv; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + vma->vm_ops = &gntalloc_vmops; + + for (i = 0; i < count; i++) { + gref->users++; + rv = vm_insert_page(vma, vma->vm_start + i * PAGE_SIZE, + gref->page); + if (rv) + goto out_unlock; + + gref = list_entry(gref->next_file.next, + struct gntalloc_gref, next_file); + } + rv = 0; + +out_unlock: + mutex_unlock(&gref_mutex); + return rv; +} + +static const struct file_operations gntalloc_fops = { + .owner = THIS_MODULE, + .open = gntalloc_open, + .release = gntalloc_release, + .unlocked_ioctl = gntalloc_ioctl, + .mmap = gntalloc_mmap +}; + +/* + * ------------------------------------- + * Module creation/destruction. 
+ * ------------------------------------- + */ +static struct miscdevice gntalloc_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntalloc", + .fops = &gntalloc_fops, +}; + +static int __init gntalloc_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&gntalloc_miscdev); + if (err != 0) { + pr_err("Could not register misc gntalloc device\n"); + return err; + } + + pr_debug("Created grant allocation device at %d,%d\n", + MISC_MAJOR, gntalloc_miscdev.minor); + + return 0; +} + +static void __exit gntalloc_exit(void) +{ + misc_deregister(&gntalloc_miscdev); +} + +module_init(gntalloc_init); +module_exit(gntalloc_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Carter Weatherly <carter.weatherly@jhuapl.edu>, " + "Daniel De Graaf <dgdegra@tycho.nsa.gov>"); +MODULE_DESCRIPTION("User-space grant reference allocator driver"); diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c new file mode 100644 index 000000000..892748507 --- /dev/null +++ b/drivers/xen/gntdev.c @@ -0,0 +1,924 @@ +/****************************************************************************** + * gntdev.c + * + * Device for accessing (in user-space) pages that have been granted by other + * domains. + * + * Copyright (c) 2006-2007, D G Murray. + * (c) 2009 Gerd Hoffmann <kraxel@redhat.com> + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#undef DEBUG + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/miscdevice.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/mmu_notifier.h> +#include <linux/types.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/highmem.h> + +#include <xen/xen.h> +#include <xen/grant_table.h> +#include <xen/balloon.h> +#include <xen/gntdev.h> +#include <xen/events.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Derek G. Murray <Derek.Murray@cl.cam.ac.uk>, " + "Gerd Hoffmann <kraxel@redhat.com>"); +MODULE_DESCRIPTION("User-space granted page access driver"); + +static int limit = 1024*1024; +module_param(limit, int, 0644); +MODULE_PARM_DESC(limit, "Maximum number of grants that may be mapped by " + "the gntdev device"); + +static atomic_t pages_mapped = ATOMIC_INIT(0); + +static int use_ptemod; +#define populate_freeable_maps use_ptemod + +struct gntdev_priv { + /* maps with visible offsets in the file descriptor */ + struct list_head maps; + /* maps that are not visible; will be freed on munmap. 
+ * Only populated if populate_freeable_maps == 1 */ + struct list_head freeable_maps; + /* lock protects maps and freeable_maps */ + struct mutex lock; + struct mm_struct *mm; + struct mmu_notifier mn; +}; + +struct unmap_notify { + int flags; + /* Address relative to the start of the grant_map */ + int addr; + int event; +}; + +struct grant_map { + struct list_head next; + struct vm_area_struct *vma; + int index; + int count; + int flags; + atomic_t users; + struct unmap_notify notify; + struct ioctl_gntdev_grant_ref *grants; + struct gnttab_map_grant_ref *map_ops; + struct gnttab_unmap_grant_ref *unmap_ops; + struct gnttab_map_grant_ref *kmap_ops; + struct gnttab_unmap_grant_ref *kunmap_ops; + struct page **pages; + unsigned long pages_vm_start; +}; + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages); + +/* ------------------------------------------------------------------ */ + +static void gntdev_print_maps(struct gntdev_priv *priv, + char *text, int text_index) +{ +#ifdef DEBUG + struct grant_map *map; + + pr_debug("%s: maps list (priv %p)\n", __func__, priv); + list_for_each_entry(map, &priv->maps, next) + pr_debug(" index %2d, count %2d %s\n", + map->index, map->count, + map->index == text_index && text ? text : ""); +#endif +} + +static void gntdev_free_map(struct grant_map *map) +{ + if (map == NULL) + return; + + if (map->pages) + gnttab_free_pages(map->count, map->pages); + kfree(map->pages); + kfree(map->grants); + kfree(map->map_ops); + kfree(map->unmap_ops); + kfree(map->kmap_ops); + kfree(map->kunmap_ops); + kfree(map); +} + +static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) +{ + struct grant_map *add; + int i; + + add = kzalloc(sizeof(struct grant_map), GFP_KERNEL); + if (NULL == add) + return NULL; + + add->grants = kcalloc(count, sizeof(add->grants[0]), GFP_KERNEL); + add->map_ops = kcalloc(count, sizeof(add->map_ops[0]), GFP_KERNEL); + add->unmap_ops = kcalloc(count, sizeof(add->unmap_ops[0]), GFP_KERNEL); + add->kmap_ops = kcalloc(count, sizeof(add->kmap_ops[0]), GFP_KERNEL); + add->kunmap_ops = kcalloc(count, sizeof(add->kunmap_ops[0]), GFP_KERNEL); + add->pages = kcalloc(count, sizeof(add->pages[0]), GFP_KERNEL); + if (NULL == add->grants || + NULL == add->map_ops || + NULL == add->unmap_ops || + NULL == add->kmap_ops || + NULL == add->kunmap_ops || + NULL == add->pages) + goto err; + + if (gnttab_alloc_pages(count, add->pages)) + goto err; + + for (i = 0; i < count; i++) { + add->map_ops[i].handle = -1; + add->unmap_ops[i].handle = -1; + add->kmap_ops[i].handle = -1; + add->kunmap_ops[i].handle = -1; + } + + add->index = 0; + add->count = count; + atomic_set(&add->users, 1); + + return add; + +err: + gntdev_free_map(add); + return NULL; +} + +static void gntdev_add_map(struct gntdev_priv *priv, struct grant_map *add) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (add->index + add->count < map->index) { + list_add_tail(&add->next, &map->next); + goto done; + } + add->index = map->index + map->count; + } + list_add_tail(&add->next, &priv->maps); + +done: + gntdev_print_maps(priv, "[new]", add->index); +} + +static struct grant_map *gntdev_find_map_index(struct gntdev_priv *priv, + int index, int count) +{ + struct grant_map *map; + + list_for_each_entry(map, &priv->maps, next) { + if (map->index != index) + continue; + if (count && map->count != count) + continue; + return map; + } + return NULL; +} + +static void gntdev_put_map(struct gntdev_priv *priv, struct grant_map *map) 
+{ + if (!map) + return; + + if (!atomic_dec_and_test(&map->users)) + return; + + atomic_sub(map->count, &pages_mapped); + + if (map->notify.flags & UNMAP_NOTIFY_SEND_EVENT) { + notify_remote_via_evtchn(map->notify.event); + evtchn_put(map->notify.event); + } + + if (populate_freeable_maps && priv) { + mutex_lock(&priv->lock); + list_del(&map->next); + mutex_unlock(&priv->lock); + } + + if (map->pages && !use_ptemod) + unmap_grant_pages(map, 0, map->count); + gntdev_free_map(map); +} + +/* ------------------------------------------------------------------ */ + +static int find_grant_ptes(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) +{ + struct grant_map *map = data; + unsigned int pgnr = (addr - map->vma->vm_start) >> PAGE_SHIFT; + int flags = map->flags | GNTMAP_application_map | GNTMAP_contains_pte; + u64 pte_maddr; + + BUG_ON(pgnr >= map->count); + pte_maddr = arbitrary_virt_to_machine(pte).maddr; + + /* + * Set the PTE as special to force get_user_pages_fast() fall + * back to the slow path. If this is not supported as part of + * the grant map, it will be done afterwards. + */ + if (xen_feature(XENFEAT_gnttab_map_avail_bits)) + flags |= (1 << _GNTMAP_guest_avail0); + + gnttab_set_map_op(&map->map_ops[pgnr], pte_maddr, flags, + map->grants[pgnr].ref, + map->grants[pgnr].domid); + gnttab_set_unmap_op(&map->unmap_ops[pgnr], pte_maddr, flags, + -1 /* handle */); + return 0; +} + +#ifdef CONFIG_X86 +static int set_grant_ptes_as_special(pte_t *pte, pgtable_t token, + unsigned long addr, void *data) +{ + set_pte_at(current->mm, addr, pte, pte_mkspecial(*pte)); + return 0; +} +#endif + +static int map_grant_pages(struct grant_map *map) +{ + int i, err = 0; + + if (!use_ptemod) { + /* Note: it could already be mapped */ + if (map->map_ops[0].handle != -1) + return 0; + for (i = 0; i < map->count; i++) { + unsigned long addr = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + gnttab_set_map_op(&map->map_ops[i], addr, map->flags, + map->grants[i].ref, + map->grants[i].domid); + gnttab_set_unmap_op(&map->unmap_ops[i], addr, + map->flags, -1 /* handle */); + } + } else { + /* + * Setup the map_ops corresponding to the pte entries pointing + * to the kernel linear addresses of the struct pages. + * These ptes are completely different from the user ptes dealt + * with find_grant_ptes. + */ + for (i = 0; i < map->count; i++) { + unsigned long address = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + BUG_ON(PageHighMem(map->pages[i])); + + gnttab_set_map_op(&map->kmap_ops[i], address, + map->flags | GNTMAP_host_map, + map->grants[i].ref, + map->grants[i].domid); + gnttab_set_unmap_op(&map->kunmap_ops[i], address, + map->flags | GNTMAP_host_map, -1); + } + } + + pr_debug("map %d+%d\n", map->index, map->count); + err = gnttab_map_refs(map->map_ops, use_ptemod ? 
map->kmap_ops : NULL, + map->pages, map->count); + if (err) + return err; + + for (i = 0; i < map->count; i++) { + if (map->map_ops[i].status) { + err = -EINVAL; + continue; + } + + map->unmap_ops[i].handle = map->map_ops[i].handle; + if (use_ptemod) + map->kunmap_ops[i].handle = map->kmap_ops[i].handle; + } + return err; +} + +static int __unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int i, err = 0; + struct gntab_unmap_queue_data unmap_data; + + if (map->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) { + int pgno = (map->notify.addr >> PAGE_SHIFT); + if (pgno >= offset && pgno < offset + pages) { + /* No need for kmap, pages are in lowmem */ + uint8_t *tmp = pfn_to_kaddr(page_to_pfn(map->pages[pgno])); + tmp[map->notify.addr & (PAGE_SIZE-1)] = 0; + map->notify.flags &= ~UNMAP_NOTIFY_CLEAR_BYTE; + } + } + + unmap_data.unmap_ops = map->unmap_ops + offset; + unmap_data.kunmap_ops = use_ptemod ? map->kunmap_ops + offset : NULL; + unmap_data.pages = map->pages + offset; + unmap_data.count = pages; + + err = gnttab_unmap_refs_sync(&unmap_data); + if (err) + return err; + + for (i = 0; i < pages; i++) { + if (map->unmap_ops[offset+i].status) + err = -EINVAL; + pr_debug("unmap handle=%d st=%d\n", + map->unmap_ops[offset+i].handle, + map->unmap_ops[offset+i].status); + map->unmap_ops[offset+i].handle = -1; + } + return err; +} + +static int unmap_grant_pages(struct grant_map *map, int offset, int pages) +{ + int range, err = 0; + + pr_debug("unmap %d+%d [%d+%d]\n", map->index, map->count, offset, pages); + + /* It is possible the requested range will have a "hole" where we + * already unmapped some of the grants. Only unmap valid ranges. + */ + while (pages && !err) { + while (pages && map->unmap_ops[offset].handle == -1) { + offset++; + pages--; + } + range = 0; + while (range < pages) { + if (map->unmap_ops[offset+range].handle == -1) { + range--; + break; + } + range++; + } + err = __unmap_grant_pages(map, offset, range); + offset += range; + pages -= range; + } + + return err; +} + +/* ------------------------------------------------------------------ */ + +static void gntdev_vma_open(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + + pr_debug("gntdev_vma_open %p\n", vma); + atomic_inc(&map->users); +} + +static void gntdev_vma_close(struct vm_area_struct *vma) +{ + struct grant_map *map = vma->vm_private_data; + struct file *file = vma->vm_file; + struct gntdev_priv *priv = file->private_data; + + pr_debug("gntdev_vma_close %p\n", vma); + if (use_ptemod) { + /* It is possible that an mmu notifier could be running + * concurrently, so take priv->lock to ensure that the vma won't + * vanishing during the unmap_grant_pages call, since we will + * spin here until that completes. Such a concurrent call will + * not do any unmapping, since that has been done prior to + * closing the vma, but it may still iterate the unmap_ops list. 
+ */ + mutex_lock(&priv->lock); + map->vma = NULL; + mutex_unlock(&priv->lock); + } + vma->vm_private_data = NULL; + gntdev_put_map(priv, map); +} + +static struct page *gntdev_vma_find_special_page(struct vm_area_struct *vma, + unsigned long addr) +{ + struct grant_map *map = vma->vm_private_data; + + return map->pages[(addr - map->pages_vm_start) >> PAGE_SHIFT]; +} + +static struct vm_operations_struct gntdev_vmops = { + .open = gntdev_vma_open, + .close = gntdev_vma_close, + .find_special_page = gntdev_vma_find_special_page, +}; + +/* ------------------------------------------------------------------ */ + +static void unmap_if_in_range(struct grant_map *map, + unsigned long start, unsigned long end) +{ + unsigned long mstart, mend; + int err; + + if (!map->vma) + return; + if (map->vma->vm_start >= end) + return; + if (map->vma->vm_end <= start) + return; + mstart = max(start, map->vma->vm_start); + mend = min(end, map->vma->vm_end); + pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end, + start, end, mstart, mend); + err = unmap_grant_pages(map, + (mstart - map->vma->vm_start) >> PAGE_SHIFT, + (mend - mstart) >> PAGE_SHIFT); + WARN_ON(err); +} + +static void mn_invl_range_start(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + + mutex_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + unmap_if_in_range(map, start, end); + } + list_for_each_entry(map, &priv->freeable_maps, next) { + unmap_if_in_range(map, start, end); + } + mutex_unlock(&priv->lock); +} + +static void mn_invl_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + mn_invl_range_start(mn, mm, address, address + PAGE_SIZE); +} + +static void mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); + struct grant_map *map; + int err; + + mutex_lock(&priv->lock); + list_for_each_entry(map, &priv->maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } + list_for_each_entry(map, &priv->freeable_maps, next) { + if (!map->vma) + continue; + pr_debug("map %d+%d (%lx %lx)\n", + map->index, map->count, + map->vma->vm_start, map->vma->vm_end); + err = unmap_grant_pages(map, /* offset */ 0, map->count); + WARN_ON(err); + } + mutex_unlock(&priv->lock); +} + +static struct mmu_notifier_ops gntdev_mmu_ops = { + .release = mn_release, + .invalidate_page = mn_invl_page, + .invalidate_range_start = mn_invl_range_start, +}; + +/* ------------------------------------------------------------------ */ + +static int gntdev_open(struct inode *inode, struct file *flip) +{ + struct gntdev_priv *priv; + int ret = 0; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + INIT_LIST_HEAD(&priv->maps); + INIT_LIST_HEAD(&priv->freeable_maps); + mutex_init(&priv->lock); + + if (use_ptemod) { + priv->mm = get_task_mm(current); + if (!priv->mm) { + kfree(priv); + return -ENOMEM; + } + priv->mn.ops = &gntdev_mmu_ops; + ret = mmu_notifier_register(&priv->mn, priv->mm); + mmput(priv->mm); + } + + if (ret) { + kfree(priv); + return ret; + } + + flip->private_data = priv; + pr_debug("priv %p\n", priv); + + return 0; +} + +static 
int gntdev_release(struct inode *inode, struct file *flip)
+{
+	struct gntdev_priv *priv = flip->private_data;
+	struct grant_map *map;
+
+	pr_debug("priv %p\n", priv);
+
+	while (!list_empty(&priv->maps)) {
+		map = list_entry(priv->maps.next, struct grant_map, next);
+		list_del(&map->next);
+		gntdev_put_map(NULL /* already removed */, map);
+	}
+	WARN_ON(!list_empty(&priv->freeable_maps));
+
+	if (use_ptemod)
+		mmu_notifier_unregister(&priv->mn, priv->mm);
+	kfree(priv);
+	return 0;
+}
+
+static long gntdev_ioctl_map_grant_ref(struct gntdev_priv *priv,
+				       struct ioctl_gntdev_map_grant_ref __user *u)
+{
+	struct ioctl_gntdev_map_grant_ref op;
+	struct grant_map *map;
+	int err;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, add %d\n", priv, op.count);
+	if (unlikely(op.count <= 0))
+		return -EINVAL;
+
+	err = -ENOMEM;
+	map = gntdev_alloc_map(priv, op.count);
+	if (!map)
+		return err;
+
+	if (unlikely(atomic_add_return(op.count, &pages_mapped) > limit)) {
+		pr_debug("can't map: over limit\n");
+		gntdev_put_map(NULL, map);
+		return err;
+	}
+
+	if (copy_from_user(map->grants, &u->refs,
+			   sizeof(map->grants[0]) * op.count) != 0) {
+		gntdev_put_map(NULL, map);
+		return -EFAULT;
+	}
+
+	mutex_lock(&priv->lock);
+	gntdev_add_map(priv, map);
+	op.index = map->index << PAGE_SHIFT;
+	mutex_unlock(&priv->lock);
+
+	if (copy_to_user(u, &op, sizeof(op)) != 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv,
+					 struct ioctl_gntdev_unmap_grant_ref __user *u)
+{
+	struct ioctl_gntdev_unmap_grant_ref op;
+	struct grant_map *map;
+	int err = -ENOENT;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, del %d+%d\n", priv, (int)op.index, (int)op.count);
+
+	mutex_lock(&priv->lock);
+	map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count);
+	if (map) {
+		list_del(&map->next);
+		if (populate_freeable_maps)
+			list_add_tail(&map->next, &priv->freeable_maps);
+		err = 0;
+	}
+	mutex_unlock(&priv->lock);
+	if (map)
+		gntdev_put_map(priv, map);
+	return err;
+}
+
+static long gntdev_ioctl_get_offset_for_vaddr(struct gntdev_priv *priv,
+					      struct ioctl_gntdev_get_offset_for_vaddr __user *u)
+{
+	struct ioctl_gntdev_get_offset_for_vaddr op;
+	struct vm_area_struct *vma;
+	struct grant_map *map;
+	int rv = -EINVAL;
+
+	if (copy_from_user(&op, u, sizeof(op)) != 0)
+		return -EFAULT;
+	pr_debug("priv %p, offset for vaddr %lx\n", priv, (unsigned long)op.vaddr);
+
+	down_read(&current->mm->mmap_sem);
+	vma = find_vma(current->mm, op.vaddr);
+	if (!vma || vma->vm_ops != &gntdev_vmops)
+		goto out_unlock;
+
+	map = vma->vm_private_data;
+	if (!map)
+		goto out_unlock;
+
+	op.offset = map->index << PAGE_SHIFT;
+	op.count = map->count;
+	rv = 0;
+
+ out_unlock:
+	up_read(&current->mm->mmap_sem);
+
+	if (rv == 0 && copy_to_user(u, &op, sizeof(op)) != 0)
+		return -EFAULT;
+	return rv;
+}
+
+static long gntdev_ioctl_notify(struct gntdev_priv *priv, void __user *u)
+{
+	struct ioctl_gntdev_unmap_notify op;
+	struct grant_map *map;
+	int rc;
+	int out_flags;
+	unsigned int out_event;
+
+	if (copy_from_user(&op, u, sizeof(op)))
+		return -EFAULT;
+
+	if (op.action & ~(UNMAP_NOTIFY_CLEAR_BYTE|UNMAP_NOTIFY_SEND_EVENT))
+		return -EINVAL;
+
+	/* We need to grab a reference to the event channel we are going to use
+	 * to send the notify before releasing the reference we may already have
+	 * (if someone has called this ioctl twice).
This is required so that + * it is possible to change the clear_byte part of the notification + * without disturbing the event channel part, which may now be the last + * reference to that event channel. + */ + if (op.action & UNMAP_NOTIFY_SEND_EVENT) { + if (evtchn_get(op.event_channel_port)) + return -EINVAL; + } + + out_flags = op.action; + out_event = op.event_channel_port; + + mutex_lock(&priv->lock); + + list_for_each_entry(map, &priv->maps, next) { + uint64_t begin = map->index << PAGE_SHIFT; + uint64_t end = (map->index + map->count) << PAGE_SHIFT; + if (op.index >= begin && op.index < end) + goto found; + } + rc = -ENOENT; + goto unlock_out; + + found: + if ((op.action & UNMAP_NOTIFY_CLEAR_BYTE) && + (map->flags & GNTMAP_readonly)) { + rc = -EINVAL; + goto unlock_out; + } + + out_flags = map->notify.flags; + out_event = map->notify.event; + + map->notify.flags = op.action; + map->notify.addr = op.index - (map->index << PAGE_SHIFT); + map->notify.event = op.event_channel_port; + + rc = 0; + + unlock_out: + mutex_unlock(&priv->lock); + + /* Drop the reference to the event channel we did not save in the map */ + if (out_flags & UNMAP_NOTIFY_SEND_EVENT) + evtchn_put(out_event); + + return rc; +} + +static long gntdev_ioctl(struct file *flip, + unsigned int cmd, unsigned long arg) +{ + struct gntdev_priv *priv = flip->private_data; + void __user *ptr = (void __user *)arg; + + switch (cmd) { + case IOCTL_GNTDEV_MAP_GRANT_REF: + return gntdev_ioctl_map_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_UNMAP_GRANT_REF: + return gntdev_ioctl_unmap_grant_ref(priv, ptr); + + case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: + return gntdev_ioctl_get_offset_for_vaddr(priv, ptr); + + case IOCTL_GNTDEV_SET_UNMAP_NOTIFY: + return gntdev_ioctl_notify(priv, ptr); + + default: + pr_debug("priv %p, unknown cmd %x\n", priv, cmd); + return -ENOIOCTLCMD; + } + + return 0; +} + +static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma) +{ + struct gntdev_priv *priv = flip->private_data; + int index = vma->vm_pgoff; + int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + struct grant_map *map; + int i, err = -EINVAL; + + if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + pr_debug("map %d+%d at %lx (pgoff %lx)\n", + index, count, vma->vm_start, vma->vm_pgoff); + + mutex_lock(&priv->lock); + map = gntdev_find_map_index(priv, index, count); + if (!map) + goto unlock_out; + if (use_ptemod && map->vma) + goto unlock_out; + if (use_ptemod && priv->mm != vma->vm_mm) { + pr_warn("Huh? 
Other mm?\n"); + goto unlock_out; + } + + atomic_inc(&map->users); + + vma->vm_ops = &gntdev_vmops; + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + if (use_ptemod) + vma->vm_flags |= VM_DONTCOPY; + + vma->vm_private_data = map; + + if (use_ptemod) + map->vma = vma; + + if (map->flags) { + if ((vma->vm_flags & VM_WRITE) && + (map->flags & GNTMAP_readonly)) + goto out_unlock_put; + } else { + map->flags = GNTMAP_host_map; + if (!(vma->vm_flags & VM_WRITE)) + map->flags |= GNTMAP_readonly; + } + + mutex_unlock(&priv->lock); + + if (use_ptemod) { + err = apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + find_grant_ptes, map); + if (err) { + pr_warn("find_grant_ptes() failure.\n"); + goto out_put_map; + } + } + + err = map_grant_pages(map); + if (err) + goto out_put_map; + + if (!use_ptemod) { + for (i = 0; i < count; i++) { + err = vm_insert_page(vma, vma->vm_start + i*PAGE_SIZE, + map->pages[i]); + if (err) + goto out_put_map; + } + } else { +#ifdef CONFIG_X86 + /* + * If the PTEs were not made special by the grant map + * hypercall, do so here. + * + * This is racy since the mapping is already visible + * to userspace but userspace should be well-behaved + * enough to not touch it until the mmap() call + * returns. + */ + if (!xen_feature(XENFEAT_gnttab_map_avail_bits)) { + apply_to_page_range(vma->vm_mm, vma->vm_start, + vma->vm_end - vma->vm_start, + set_grant_ptes_as_special, NULL); + } +#endif + map->pages_vm_start = vma->vm_start; + } + + return 0; + +unlock_out: + mutex_unlock(&priv->lock); + return err; + +out_unlock_put: + mutex_unlock(&priv->lock); +out_put_map: + if (use_ptemod) + map->vma = NULL; + gntdev_put_map(priv, map); + return err; +} + +static const struct file_operations gntdev_fops = { + .owner = THIS_MODULE, + .open = gntdev_open, + .release = gntdev_release, + .mmap = gntdev_mmap, + .unlocked_ioctl = gntdev_ioctl +}; + +static struct miscdevice gntdev_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/gntdev", + .fops = &gntdev_fops, +}; + +/* ------------------------------------------------------------------ */ + +static int __init gntdev_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + use_ptemod = !xen_feature(XENFEAT_auto_translated_physmap); + + err = misc_register(&gntdev_miscdev); + if (err != 0) { + pr_err("Could not register gntdev device\n"); + return err; + } + return 0; +} + +static void __exit gntdev_exit(void) +{ + misc_deregister(&gntdev_miscdev); +} + +module_init(gntdev_init); +module_exit(gntdev_exit); + +/* ------------------------------------------------------------------ */ diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c new file mode 100644 index 000000000..b1c7170e5 --- /dev/null +++ b/drivers/xen/grant-table.c @@ -0,0 +1,1114 @@ +/****************************************************************************** + * grant_table.c + * + * Granting foreign access to our memory reservation. 
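+ *
+ * In-kernel users typically go through the exported helpers defined in this
+ * file; a minimal sketch of the grant/revoke cycle (illustrative only, error
+ * handling omitted, "page" and "remote_domid" are placeholders supplied by
+ * the caller):
+ *
+ *	int ref = gnttab_grant_foreign_access(remote_domid,
+ *					      pfn_to_mfn(page_to_pfn(page)),
+ *					      0);	// readonly == 0: read-write grant
+ *	// ... advertise 'ref' to the remote domain and let it map the page ...
+ *	if (gnttab_end_foreign_access_ref(ref, 0))	// fails (returns 0) while still mapped
+ *		gnttab_free_grant_reference(ref);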
+ * + * Copyright (c) 2005-2006, Christopher Clark + * Copyright (c) 2004-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/uaccess.h> +#include <linux/io.h> +#include <linux/delay.h> +#include <linux/hardirq.h> +#include <linux/workqueue.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <xen/page.h> +#include <xen/grant_table.h> +#include <xen/interface/memory.h> +#include <xen/hvc-console.h> +#include <xen/swiotlb-xen.h> +#include <xen/balloon.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/interface.h> + +#include <asm/pgtable.h> +#include <asm/sync_bitops.h> + +/* External tools reserve first few grant table entries. */ +#define NR_RESERVED_ENTRIES 8 +#define GNTTAB_LIST_END 0xffffffff + +static grant_ref_t **gnttab_list; +static unsigned int nr_grant_frames; +static int gnttab_free_count; +static grant_ref_t gnttab_free_head; +static DEFINE_SPINLOCK(gnttab_list_lock); +struct grant_frames xen_auto_xlat_grant_frames; + +static union { + struct grant_entry_v1 *v1; + void *addr; +} gnttab_shared; + +/*This is a structure of function pointers for grant table*/ +struct gnttab_ops { + /* + * Mapping a list of frames for storing grant entries. Frames parameter + * is used to store grant table address when grant table being setup, + * nr_gframes is the number of frames to map grant table. Returning + * GNTST_okay means success and negative value means failure. + */ + int (*map_frames)(xen_pfn_t *frames, unsigned int nr_gframes); + /* + * Release a list of frames which are mapped in map_frames for grant + * entry status. + */ + void (*unmap_frames)(void); + /* + * Introducing a valid entry into the grant table, granting the frame of + * this grant entry to domain for accessing or transfering. Ref + * parameter is reference of this introduced grant entry, domid is id of + * granted domain, frame is the page frame to be granted, and flags is + * status of the grant entry to be updated. 
+ */ + void (*update_entry)(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned flags); + /* + * Stop granting a grant entry to domain for accessing. Ref parameter is + * reference of a grant entry whose grant access will be stopped, + * readonly is not in use in this function. If the grant entry is + * currently mapped for reading or writing, just return failure(==0) + * directly and don't tear down the grant access. Otherwise, stop grant + * access for this entry and return success(==1). + */ + int (*end_foreign_access_ref)(grant_ref_t ref, int readonly); + /* + * Stop granting a grant entry to domain for transfer. Ref parameter is + * reference of a grant entry whose grant transfer will be stopped. If + * tranfer has not started, just reclaim the grant entry and return + * failure(==0). Otherwise, wait for the transfer to complete and then + * return the frame. + */ + unsigned long (*end_foreign_transfer_ref)(grant_ref_t ref); + /* + * Query the status of a grant entry. Ref parameter is reference of + * queried grant entry, return value is the status of queried entry. + * Detailed status(writing/reading) can be gotten from the return value + * by bit operations. + */ + int (*query_foreign_access)(grant_ref_t ref); +}; + +struct unmap_refs_callback_data { + struct completion completion; + int result; +}; + +static struct gnttab_ops *gnttab_interface; + +static int grant_table_version; +static int grefs_per_grant_frame; + +static struct gnttab_free_callback *gnttab_free_callback_list; + +static int gnttab_expand(unsigned int req_entries); + +#define RPP (PAGE_SIZE / sizeof(grant_ref_t)) +#define SPP (PAGE_SIZE / sizeof(grant_status_t)) + +static inline grant_ref_t *__gnttab_entry(grant_ref_t entry) +{ + return &gnttab_list[(entry) / RPP][(entry) % RPP]; +} +/* This can be used as an l-value */ +#define gnttab_entry(entry) (*__gnttab_entry(entry)) + +static int get_free_entries(unsigned count) +{ + unsigned long flags; + int ref, rc = 0; + grant_ref_t head; + + spin_lock_irqsave(&gnttab_list_lock, flags); + + if ((gnttab_free_count < count) && + ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) { + spin_unlock_irqrestore(&gnttab_list_lock, flags); + return rc; + } + + ref = head = gnttab_free_head; + gnttab_free_count -= count; + while (count-- > 1) + head = gnttab_entry(head); + gnttab_free_head = gnttab_entry(head); + gnttab_entry(head) = GNTTAB_LIST_END; + + spin_unlock_irqrestore(&gnttab_list_lock, flags); + + return ref; +} + +static void do_free_callbacks(void) +{ + struct gnttab_free_callback *callback, *next; + + callback = gnttab_free_callback_list; + gnttab_free_callback_list = NULL; + + while (callback != NULL) { + next = callback->next; + if (gnttab_free_count >= callback->count) { + callback->next = NULL; + callback->fn(callback->arg); + } else { + callback->next = gnttab_free_callback_list; + gnttab_free_callback_list = callback; + } + callback = next; + } +} + +static inline void check_free_callbacks(void) +{ + if (unlikely(gnttab_free_callback_list)) + do_free_callbacks(); +} + +static void put_free_entry(grant_ref_t ref) +{ + unsigned long flags; + spin_lock_irqsave(&gnttab_list_lock, flags); + gnttab_entry(ref) = gnttab_free_head; + gnttab_free_head = ref; + gnttab_free_count++; + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +/* + * Following applies to gnttab_update_entry_v1. + * Introducing a valid entry into the grant table: + * 1. Write ent->domid. + * 2. 
Write ent->frame: + * GTF_permit_access: Frame to which access is permitted. + * GTF_accept_transfer: Pseudo-phys frame slot being filled by new + * frame, or zero if none. + * 3. Write memory barrier (WMB). + * 4. Write ent->flags, inc. valid type. + */ +static void gnttab_update_entry_v1(grant_ref_t ref, domid_t domid, + unsigned long frame, unsigned flags) +{ + gnttab_shared.v1[ref].domid = domid; + gnttab_shared.v1[ref].frame = frame; + wmb(); + gnttab_shared.v1[ref].flags = flags; +} + +/* + * Public grant-issuing interface functions + */ +void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, + unsigned long frame, int readonly) +{ + gnttab_interface->update_entry(ref, domid, frame, + GTF_permit_access | (readonly ? GTF_readonly : 0)); +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); + +int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, + int readonly) +{ + int ref; + + ref = get_free_entries(1); + if (unlikely(ref < 0)) + return -ENOSPC; + + gnttab_grant_foreign_access_ref(ref, domid, frame, readonly); + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); + +static int gnttab_query_foreign_access_v1(grant_ref_t ref) +{ + return gnttab_shared.v1[ref].flags & (GTF_reading|GTF_writing); +} + +int gnttab_query_foreign_access(grant_ref_t ref) +{ + return gnttab_interface->query_foreign_access(ref); +} +EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); + +static int gnttab_end_foreign_access_ref_v1(grant_ref_t ref, int readonly) +{ + u16 flags, nflags; + u16 *pflags; + + pflags = &gnttab_shared.v1[ref].flags; + nflags = *pflags; + do { + flags = nflags; + if (flags & (GTF_reading|GTF_writing)) + return 0; + } while ((nflags = sync_cmpxchg(pflags, flags, 0)) != flags); + + return 1; +} + +static inline int _gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ + return gnttab_interface->end_foreign_access_ref(ref, readonly); +} + +int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) +{ + if (_gnttab_end_foreign_access_ref(ref, readonly)) + return 1; + pr_warn("WARNING: g.e. %#x still in use!\n", ref); + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); + +struct deferred_entry { + struct list_head list; + grant_ref_t ref; + bool ro; + uint16_t warn_delay; + struct page *page; +}; +static LIST_HEAD(deferred_list); +static void gnttab_handle_deferred(unsigned long); +static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred, 0, 0); + +static void gnttab_handle_deferred(unsigned long unused) +{ + unsigned int nr = 10; + struct deferred_entry *first = NULL; + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + while (nr--) { + struct deferred_entry *entry + = list_first_entry(&deferred_list, + struct deferred_entry, list); + + if (entry == first) + break; + list_del(&entry->list); + spin_unlock_irqrestore(&gnttab_list_lock, flags); + if (_gnttab_end_foreign_access_ref(entry->ref, entry->ro)) { + put_free_entry(entry->ref); + if (entry->page) { + pr_debug("freeing g.e. %#x (pfn %#lx)\n", + entry->ref, page_to_pfn(entry->page)); + __free_page(entry->page); + } else + pr_info("freeing g.e. %#x\n", entry->ref); + kfree(entry); + entry = NULL; + } else { + if (!--entry->warn_delay) + pr_info("g.e. 
%#x still pending\n", entry->ref); + if (!first) + first = entry; + } + spin_lock_irqsave(&gnttab_list_lock, flags); + if (entry) + list_add_tail(&entry->list, &deferred_list); + else if (list_empty(&deferred_list)) + break; + } + if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} + +static void gnttab_add_deferred(grant_ref_t ref, bool readonly, + struct page *page) +{ + struct deferred_entry *entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + const char *what = KERN_WARNING "leaking"; + + if (entry) { + unsigned long flags; + + entry->ref = ref; + entry->ro = readonly; + entry->page = page; + entry->warn_delay = 60; + spin_lock_irqsave(&gnttab_list_lock, flags); + list_add_tail(&entry->list, &deferred_list); + if (!timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); + what = KERN_DEBUG "deferring"; + } + printk("%s g.e. %#x (pfn %#lx)\n", + what, ref, page ? page_to_pfn(page) : -1); +} + +void gnttab_end_foreign_access(grant_ref_t ref, int readonly, + unsigned long page) +{ + if (gnttab_end_foreign_access_ref(ref, readonly)) { + put_free_entry(ref); + if (page != 0) + free_page(page); + } else + gnttab_add_deferred(ref, readonly, + page ? virt_to_page(page) : NULL); +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); + +int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) +{ + int ref; + + ref = get_free_entries(1); + if (unlikely(ref < 0)) + return -ENOSPC; + gnttab_grant_foreign_transfer_ref(ref, domid, pfn); + + return ref; +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); + +void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, + unsigned long pfn) +{ + gnttab_interface->update_entry(ref, domid, pfn, GTF_accept_transfer); +} +EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); + +static unsigned long gnttab_end_foreign_transfer_ref_v1(grant_ref_t ref) +{ + unsigned long frame; + u16 flags; + u16 *pflags; + + pflags = &gnttab_shared.v1[ref].flags; + + /* + * If a transfer is not even yet started, try to reclaim the grant + * reference and return failure (== 0). + */ + while (!((flags = *pflags) & GTF_transfer_committed)) { + if (sync_cmpxchg(pflags, flags, 0) == flags) + return 0; + cpu_relax(); + } + + /* If a transfer is in progress then wait until it is completed. */ + while (!(flags & GTF_transfer_completed)) { + flags = *pflags; + cpu_relax(); + } + + rmb(); /* Read the frame number /after/ reading completion status. 
*/ + frame = gnttab_shared.v1[ref].frame; + BUG_ON(frame == 0); + + return frame; +} + +unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) +{ + return gnttab_interface->end_foreign_transfer_ref(ref); +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); + +unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) +{ + unsigned long frame = gnttab_end_foreign_transfer_ref(ref); + put_free_entry(ref); + return frame; +} +EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer); + +void gnttab_free_grant_reference(grant_ref_t ref) +{ + put_free_entry(ref); +} +EXPORT_SYMBOL_GPL(gnttab_free_grant_reference); + +void gnttab_free_grant_references(grant_ref_t head) +{ + grant_ref_t ref; + unsigned long flags; + int count = 1; + if (head == GNTTAB_LIST_END) + return; + spin_lock_irqsave(&gnttab_list_lock, flags); + ref = head; + while (gnttab_entry(ref) != GNTTAB_LIST_END) { + ref = gnttab_entry(ref); + count++; + } + gnttab_entry(ref) = gnttab_free_head; + gnttab_free_head = head; + gnttab_free_count += count; + check_free_callbacks(); + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_free_grant_references); + +int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) +{ + int h = get_free_entries(count); + + if (h < 0) + return -ENOSPC; + + *head = h; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); + +int gnttab_empty_grant_references(const grant_ref_t *private_head) +{ + return (*private_head == GNTTAB_LIST_END); +} +EXPORT_SYMBOL_GPL(gnttab_empty_grant_references); + +int gnttab_claim_grant_reference(grant_ref_t *private_head) +{ + grant_ref_t g = *private_head; + if (unlikely(g == GNTTAB_LIST_END)) + return -ENOSPC; + *private_head = gnttab_entry(g); + return g; +} +EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference); + +void gnttab_release_grant_reference(grant_ref_t *private_head, + grant_ref_t release) +{ + gnttab_entry(release) = *private_head; + *private_head = release; +} +EXPORT_SYMBOL_GPL(gnttab_release_grant_reference); + +void gnttab_request_free_callback(struct gnttab_free_callback *callback, + void (*fn)(void *), void *arg, u16 count) +{ + unsigned long flags; + struct gnttab_free_callback *cb; + + spin_lock_irqsave(&gnttab_list_lock, flags); + + /* Check if the callback is already on the list */ + cb = gnttab_free_callback_list; + while (cb) { + if (cb == callback) + goto out; + cb = cb->next; + } + + callback->fn = fn; + callback->arg = arg; + callback->count = count; + callback->next = gnttab_free_callback_list; + gnttab_free_callback_list = callback; + check_free_callbacks(); +out: + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_request_free_callback); + +void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) +{ + struct gnttab_free_callback **pcb; + unsigned long flags; + + spin_lock_irqsave(&gnttab_list_lock, flags); + for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) { + if (*pcb == callback) { + *pcb = callback->next; + break; + } + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +} +EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback); + +static int grow_gnttab_list(unsigned int more_frames) +{ + unsigned int new_nr_grant_frames, extra_entries, i; + unsigned int nr_glist_frames, new_nr_glist_frames; + + BUG_ON(grefs_per_grant_frame == 0); + + new_nr_grant_frames = nr_grant_frames + more_frames; + extra_entries = more_frames * grefs_per_grant_frame; + + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + new_nr_glist_frames 
= + (new_nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + for (i = nr_glist_frames; i < new_nr_glist_frames; i++) { + gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); + if (!gnttab_list[i]) + goto grow_nomem; + } + + + for (i = grefs_per_grant_frame * nr_grant_frames; + i < grefs_per_grant_frame * new_nr_grant_frames - 1; i++) + gnttab_entry(i) = i + 1; + + gnttab_entry(i) = gnttab_free_head; + gnttab_free_head = grefs_per_grant_frame * nr_grant_frames; + gnttab_free_count += extra_entries; + + nr_grant_frames = new_nr_grant_frames; + + check_free_callbacks(); + + return 0; + +grow_nomem: + while (i-- > nr_glist_frames) + free_page((unsigned long) gnttab_list[i]); + return -ENOMEM; +} + +static unsigned int __max_nr_grant_frames(void) +{ + struct gnttab_query_size query; + int rc; + + query.dom = DOMID_SELF; + + rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1); + if ((rc < 0) || (query.status != GNTST_okay)) + return 4; /* Legacy max supported number of frames */ + + return query.max_nr_frames; +} + +unsigned int gnttab_max_grant_frames(void) +{ + unsigned int xen_max = __max_nr_grant_frames(); + static unsigned int boot_max_nr_grant_frames; + + /* First time, initialize it properly. */ + if (!boot_max_nr_grant_frames) + boot_max_nr_grant_frames = __max_nr_grant_frames(); + + if (xen_max > boot_max_nr_grant_frames) + return boot_max_nr_grant_frames; + return xen_max; +} +EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); + +int gnttab_setup_auto_xlat_frames(phys_addr_t addr) +{ + xen_pfn_t *pfn; + unsigned int max_nr_gframes = __max_nr_grant_frames(); + unsigned int i; + void *vaddr; + + if (xen_auto_xlat_grant_frames.count) + return -EINVAL; + + vaddr = xen_remap(addr, PAGE_SIZE * max_nr_gframes); + if (vaddr == NULL) { + pr_warn("Failed to ioremap gnttab share frames (addr=%pa)!\n", + &addr); + return -ENOMEM; + } + pfn = kcalloc(max_nr_gframes, sizeof(pfn[0]), GFP_KERNEL); + if (!pfn) { + xen_unmap(vaddr); + return -ENOMEM; + } + for (i = 0; i < max_nr_gframes; i++) + pfn[i] = PFN_DOWN(addr) + i; + + xen_auto_xlat_grant_frames.vaddr = vaddr; + xen_auto_xlat_grant_frames.pfn = pfn; + xen_auto_xlat_grant_frames.count = max_nr_gframes; + + return 0; +} +EXPORT_SYMBOL_GPL(gnttab_setup_auto_xlat_frames); + +void gnttab_free_auto_xlat_frames(void) +{ + if (!xen_auto_xlat_grant_frames.count) + return; + kfree(xen_auto_xlat_grant_frames.pfn); + xen_unmap(xen_auto_xlat_grant_frames.vaddr); + + xen_auto_xlat_grant_frames.pfn = NULL; + xen_auto_xlat_grant_frames.count = 0; + xen_auto_xlat_grant_frames.vaddr = NULL; +} +EXPORT_SYMBOL_GPL(gnttab_free_auto_xlat_frames); + +/** + * gnttab_alloc_pages - alloc pages suitable for grant mapping into + * @nr_pages: number of pages to alloc + * @pages: returns the pages + */ +int gnttab_alloc_pages(int nr_pages, struct page **pages) +{ + int i; + int ret; + + ret = alloc_xenballooned_pages(nr_pages, pages, false); + if (ret < 0) + return ret; + + for (i = 0; i < nr_pages; i++) { +#if BITS_PER_LONG < 64 + struct xen_page_foreign *foreign; + + foreign = kzalloc(sizeof(*foreign), GFP_KERNEL); + if (!foreign) { + gnttab_free_pages(nr_pages, pages); + return -ENOMEM; + } + set_page_private(pages[i], (unsigned long)foreign); +#endif + SetPagePrivate(pages[i]); + } + + return 0; +} +EXPORT_SYMBOL(gnttab_alloc_pages); + +/** + * gnttab_free_pages - free pages allocated by gnttab_alloc_pages() + * @nr_pages; number of pages to free + * @pages: the pages + */ +void gnttab_free_pages(int nr_pages, struct page **pages) +{ + int i; + + for 
(i = 0; i < nr_pages; i++) { + if (PagePrivate(pages[i])) { +#if BITS_PER_LONG < 64 + kfree((void *)page_private(pages[i])); +#endif + ClearPagePrivate(pages[i]); + } + } + free_xenballooned_pages(nr_pages, pages); +} +EXPORT_SYMBOL(gnttab_free_pages); + +/* Handling of paged out grant targets (GNTST_eagain) */ +#define MAX_DELAY 256 +static inline void +gnttab_retry_eagain_gop(unsigned int cmd, void *gop, int16_t *status, + const char *func) +{ + unsigned delay = 1; + + do { + BUG_ON(HYPERVISOR_grant_table_op(cmd, gop, 1)); + if (*status == GNTST_eagain) + msleep(delay++); + } while ((*status == GNTST_eagain) && (delay < MAX_DELAY)); + + if (delay >= MAX_DELAY) { + pr_err("%s: %s eagain grant\n", func, current->comm); + *status = GNTST_bad_page; + } +} + +void gnttab_batch_map(struct gnttab_map_grant_ref *batch, unsigned count) +{ + struct gnttab_map_grant_ref *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_map); + +void gnttab_batch_copy(struct gnttab_copy *batch, unsigned count) +{ + struct gnttab_copy *op; + + if (HYPERVISOR_grant_table_op(GNTTABOP_copy, batch, count)) + BUG(); + for (op = batch; op < batch + count; op++) + if (op->status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_copy, op, + &op->status, __func__); +} +EXPORT_SYMBOL_GPL(gnttab_batch_copy); + +int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) +{ + int i, ret; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) { + /* Retry eagain maps */ + if (map_ops[i].status == GNTST_eagain) + gnttab_retry_eagain_gop(GNTTABOP_map_grant_ref, map_ops + i, + &map_ops[i].status, __func__); + + if (map_ops[i].status == GNTST_okay) { + struct xen_page_foreign *foreign; + + SetPageForeign(pages[i]); + foreign = xen_page_foreign(pages[i]); + foreign->domid = map_ops[i].dom; + foreign->gref = map_ops[i].ref; + } + } + + return set_foreign_p2m_mapping(map_ops, kmap_ops, pages, count); +} +EXPORT_SYMBOL_GPL(gnttab_map_refs); + +int gnttab_unmap_refs(struct gnttab_unmap_grant_ref *unmap_ops, + struct gnttab_unmap_grant_ref *kunmap_ops, + struct page **pages, unsigned int count) +{ + unsigned int i; + int ret; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap_ops, count); + if (ret) + return ret; + + for (i = 0; i < count; i++) + ClearPageForeign(pages[i]); + + return clear_foreign_p2m_mapping(unmap_ops, kunmap_ops, pages, count); +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs); + +#define GNTTAB_UNMAP_REFS_DELAY 5 + +static void __gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item); + +static void gnttab_unmap_work(struct work_struct *work) +{ + struct gntab_unmap_queue_data + *unmap_data = container_of(work, + struct gntab_unmap_queue_data, + gnttab_work.work); + if (unmap_data->age != UINT_MAX) + unmap_data->age++; + __gnttab_unmap_refs_async(unmap_data); +} + +static void __gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item) +{ + int ret; + int pc; + + for (pc = 0; pc < item->count; pc++) { + if (page_count(item->pages[pc]) > 1) { + unsigned long delay = GNTTAB_UNMAP_REFS_DELAY * (item->age + 1); + schedule_delayed_work(&item->gnttab_work, + msecs_to_jiffies(delay)); + return; + } + } + + ret = 
gnttab_unmap_refs(item->unmap_ops, item->kunmap_ops, + item->pages, item->count); + item->done(ret, item); +} + +void gnttab_unmap_refs_async(struct gntab_unmap_queue_data* item) +{ + INIT_DELAYED_WORK(&item->gnttab_work, gnttab_unmap_work); + item->age = 0; + + __gnttab_unmap_refs_async(item); +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs_async); + +static void unmap_refs_callback(int result, + struct gntab_unmap_queue_data *data) +{ + struct unmap_refs_callback_data *d = data->data; + + d->result = result; + complete(&d->completion); +} + +int gnttab_unmap_refs_sync(struct gntab_unmap_queue_data *item) +{ + struct unmap_refs_callback_data data; + + init_completion(&data.completion); + item->data = &data; + item->done = &unmap_refs_callback; + gnttab_unmap_refs_async(item); + wait_for_completion(&data.completion); + + return data.result; +} +EXPORT_SYMBOL_GPL(gnttab_unmap_refs_sync); + +static int gnttab_map_frames_v1(xen_pfn_t *frames, unsigned int nr_gframes) +{ + int rc; + + rc = arch_gnttab_map_shared(frames, nr_gframes, + gnttab_max_grant_frames(), + &gnttab_shared.addr); + BUG_ON(rc); + + return 0; +} + +static void gnttab_unmap_frames_v1(void) +{ + arch_gnttab_unmap(gnttab_shared.addr, nr_grant_frames); +} + +static int gnttab_map(unsigned int start_idx, unsigned int end_idx) +{ + struct gnttab_setup_table setup; + xen_pfn_t *frames; + unsigned int nr_gframes = end_idx + 1; + int rc; + + if (xen_feature(XENFEAT_auto_translated_physmap)) { + struct xen_add_to_physmap xatp; + unsigned int i = end_idx; + rc = 0; + BUG_ON(xen_auto_xlat_grant_frames.count < nr_gframes); + /* + * Loop backwards, so that the first hypercall has the largest + * index, ensuring that the table will grow only once. + */ + do { + xatp.domid = DOMID_SELF; + xatp.idx = i; + xatp.space = XENMAPSPACE_grant_table; + xatp.gpfn = xen_auto_xlat_grant_frames.pfn[i]; + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp); + if (rc != 0) { + pr_warn("grant table add_to_physmap failed, err=%d\n", + rc); + break; + } + } while (i-- > start_idx); + + return rc; + } + + /* No need for kzalloc as it is initialized in following hypercall + * GNTTABOP_setup_table. + */ + frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); + if (!frames) + return -ENOMEM; + + setup.dom = DOMID_SELF; + setup.nr_frames = nr_gframes; + set_xen_guest_handle(setup.frame_list, frames); + + rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); + if (rc == -ENOSYS) { + kfree(frames); + return -ENOSYS; + } + + BUG_ON(rc || setup.status); + + rc = gnttab_interface->map_frames(frames, nr_gframes); + + kfree(frames); + + return rc; +} + +static struct gnttab_ops gnttab_v1_ops = { + .map_frames = gnttab_map_frames_v1, + .unmap_frames = gnttab_unmap_frames_v1, + .update_entry = gnttab_update_entry_v1, + .end_foreign_access_ref = gnttab_end_foreign_access_ref_v1, + .end_foreign_transfer_ref = gnttab_end_foreign_transfer_ref_v1, + .query_foreign_access = gnttab_query_foreign_access_v1, +}; + +static void gnttab_request_version(void) +{ + /* Only version 1 is used, which will always be available. 
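+ * With 4 KiB pages and the 8-byte struct grant_entry_v1 this gives
+ * grefs_per_grant_frame = 4096 / 8 = 512 grant entries per frame, so a
+ * hypervisor allowing, say, 32 grant frames (a common default, but query
+ * gnttab_max_grant_frames() rather than assuming it) can hand out up to
+ * 32 * 512 = 16384 grant references.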
*/ + grant_table_version = 1; + grefs_per_grant_frame = PAGE_SIZE / sizeof(struct grant_entry_v1); + gnttab_interface = &gnttab_v1_ops; + + pr_info("Grant tables using version %d layout\n", grant_table_version); +} + +static int gnttab_setup(void) +{ + unsigned int max_nr_gframes; + + max_nr_gframes = gnttab_max_grant_frames(); + if (max_nr_gframes < nr_grant_frames) + return -ENOSYS; + + if (xen_feature(XENFEAT_auto_translated_physmap) && gnttab_shared.addr == NULL) { + gnttab_shared.addr = xen_auto_xlat_grant_frames.vaddr; + if (gnttab_shared.addr == NULL) { + pr_warn("gnttab share frames (addr=0x%08lx) is not mapped!\n", + (unsigned long)xen_auto_xlat_grant_frames.vaddr); + return -ENOMEM; + } + } + return gnttab_map(0, nr_grant_frames - 1); +} + +int gnttab_resume(void) +{ + gnttab_request_version(); + return gnttab_setup(); +} + +int gnttab_suspend(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + gnttab_interface->unmap_frames(); + return 0; +} + +static int gnttab_expand(unsigned int req_entries) +{ + int rc; + unsigned int cur, extra; + + BUG_ON(grefs_per_grant_frame == 0); + cur = nr_grant_frames; + extra = ((req_entries + (grefs_per_grant_frame-1)) / + grefs_per_grant_frame); + if (cur + extra > gnttab_max_grant_frames()) + return -ENOSPC; + + rc = gnttab_map(cur, cur + extra - 1); + if (rc == 0) + rc = grow_gnttab_list(extra); + + return rc; +} + +int gnttab_init(void) +{ + int i; + unsigned long max_nr_grant_frames; + unsigned int max_nr_glist_frames, nr_glist_frames; + unsigned int nr_init_grefs; + int ret; + + gnttab_request_version(); + max_nr_grant_frames = gnttab_max_grant_frames(); + nr_grant_frames = 1; + + /* Determine the maximum number of frames required for the + * grant reference free list on the current hypervisor. + */ + BUG_ON(grefs_per_grant_frame == 0); + max_nr_glist_frames = (max_nr_grant_frames * + grefs_per_grant_frame / RPP); + + gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *), + GFP_KERNEL); + if (gnttab_list == NULL) + return -ENOMEM; + + nr_glist_frames = (nr_grant_frames * grefs_per_grant_frame + RPP - 1) / RPP; + for (i = 0; i < nr_glist_frames; i++) { + gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL); + if (gnttab_list[i] == NULL) { + ret = -ENOMEM; + goto ini_nomem; + } + } + + ret = arch_gnttab_init(max_nr_grant_frames); + if (ret < 0) + goto ini_nomem; + + if (gnttab_setup() < 0) { + ret = -ENODEV; + goto ini_nomem; + } + + nr_init_grefs = nr_grant_frames * grefs_per_grant_frame; + + for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++) + gnttab_entry(i) = i + 1; + + gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END; + gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES; + gnttab_free_head = NR_RESERVED_ENTRIES; + + printk("Grant table initialized\n"); + return 0; + + ini_nomem: + for (i--; i >= 0; i--) + free_page((unsigned long)gnttab_list[i]); + kfree(gnttab_list); + return ret; +} +EXPORT_SYMBOL_GPL(gnttab_init); + +static int __gnttab_init(void) +{ + /* Delay grant-table initialization in the PV on HVM case */ + if (xen_hvm_domain()) + return 0; + + if (!xen_pv_domain()) + return -ENODEV; + + return gnttab_init(); +} +/* Starts after core_initcall so that xen_pvh_gnttab_setup can be called + * beforehand to initialize xen_auto_xlat_grant_frames. 
*/ +core_initcall_sync(__gnttab_init); diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c new file mode 100644 index 000000000..9e6a85104 --- /dev/null +++ b/drivers/xen/manage.c @@ -0,0 +1,355 @@ +/* + * Handle extern requests for shutdown, reboot and sysrq + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/reboot.h> +#include <linux/sysrq.h> +#include <linux/stop_machine.h> +#include <linux/freezer.h> +#include <linux/syscore_ops.h> +#include <linux/export.h> + +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/grant_table.h> +#include <xen/events.h> +#include <xen/hvc-console.h> +#include <xen/xen-ops.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> +#include <asm/xen/hypervisor.h> + +enum shutdown_state { + SHUTDOWN_INVALID = -1, + SHUTDOWN_POWEROFF = 0, + SHUTDOWN_SUSPEND = 2, + /* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only + report a crash, not be instructed to crash! + HALT is the same as POWEROFF, as far as we're concerned. The tools use + the distinction when we return the reason code to them. */ + SHUTDOWN_HALT = 4, +}; + +/* Ignore multiple shutdown requests. */ +static enum shutdown_state shutting_down = SHUTDOWN_INVALID; + +struct suspend_info { + int cancelled; +}; + +static RAW_NOTIFIER_HEAD(xen_resume_notifier); + +void xen_resume_notifier_register(struct notifier_block *nb) +{ + raw_notifier_chain_register(&xen_resume_notifier, nb); +} +EXPORT_SYMBOL_GPL(xen_resume_notifier_register); + +void xen_resume_notifier_unregister(struct notifier_block *nb) +{ + raw_notifier_chain_unregister(&xen_resume_notifier, nb); +} +EXPORT_SYMBOL_GPL(xen_resume_notifier_unregister); + +#ifdef CONFIG_HIBERNATE_CALLBACKS +static int xen_suspend(void *data) +{ + struct suspend_info *si = data; + int err; + + BUG_ON(!irqs_disabled()); + + err = syscore_suspend(); + if (err) { + pr_err("%s: system core suspend failed: %d\n", __func__, err); + return err; + } + + gnttab_suspend(); + xen_arch_pre_suspend(); + + /* + * This hypercall returns 1 if suspend was cancelled + * or the domain was merely checkpointed, and 0 if it + * is resuming in a new domain. + */ + si->cancelled = HYPERVISOR_suspend(xen_pv_domain() + ? virt_to_mfn(xen_start_info) + : 0); + + xen_arch_post_suspend(si->cancelled); + gnttab_resume(); + + if (!si->cancelled) { + xen_irq_resume(); + xen_timer_resume(); + } + + syscore_resume(); + + return 0; +} + +static void do_suspend(void) +{ + int err; + struct suspend_info si; + + shutting_down = SHUTDOWN_SUSPEND; + + err = freeze_processes(); + if (err) { + pr_err("%s: freeze processes failed %d\n", __func__, err); + goto out; + } + + err = freeze_kernel_threads(); + if (err) { + pr_err("%s: freeze kernel threads failed %d\n", __func__, err); + goto out_thaw; + } + + err = dpm_suspend_start(PMSG_FREEZE); + if (err) { + pr_err("%s: dpm_suspend_start %d\n", __func__, err); + goto out_thaw; + } + + printk(KERN_DEBUG "suspending xenstore...\n"); + xs_suspend(); + + err = dpm_suspend_end(PMSG_FREEZE); + if (err) { + pr_err("dpm_suspend_end failed: %d\n", err); + si.cancelled = 0; + goto out_resume; + } + + xen_arch_suspend(); + + si.cancelled = 1; + + err = stop_machine(xen_suspend, &si, cpumask_of(0)); + + /* Resume console as early as possible. */ + if (!si.cancelled) + xen_console_resume(); + + raw_notifier_call_chain(&xen_resume_notifier, 0, NULL); + + dpm_resume_start(si.cancelled ? 
PMSG_THAW : PMSG_RESTORE); + + if (err) { + pr_err("failed to start xen_suspend: %d\n", err); + si.cancelled = 1; + } + + xen_arch_resume(); + +out_resume: + if (!si.cancelled) + xs_resume(); + else + xs_suspend_cancel(); + + dpm_resume_end(si.cancelled ? PMSG_THAW : PMSG_RESTORE); + +out_thaw: + thaw_processes(); +out: + shutting_down = SHUTDOWN_INVALID; +} +#endif /* CONFIG_HIBERNATE_CALLBACKS */ + +struct shutdown_handler { + const char *command; + void (*cb)(void); +}; + +static int poweroff_nb(struct notifier_block *cb, unsigned long code, void *unused) +{ + switch (code) { + case SYS_DOWN: + case SYS_HALT: + case SYS_POWER_OFF: + shutting_down = SHUTDOWN_POWEROFF; + default: + break; + } + return NOTIFY_DONE; +} +static void do_poweroff(void) +{ + switch (system_state) { + case SYSTEM_BOOTING: + orderly_poweroff(true); + break; + case SYSTEM_RUNNING: + orderly_poweroff(false); + break; + default: + /* Don't do it when we are halting/rebooting. */ + pr_info("Ignoring Xen toolstack shutdown.\n"); + break; + } +} + +static void do_reboot(void) +{ + shutting_down = SHUTDOWN_POWEROFF; /* ? */ + ctrl_alt_del(); +} + +static void shutdown_handler(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + char *str; + struct xenbus_transaction xbt; + int err; + static struct shutdown_handler handlers[] = { + { "poweroff", do_poweroff }, + { "halt", do_poweroff }, + { "reboot", do_reboot }, +#ifdef CONFIG_HIBERNATE_CALLBACKS + { "suspend", do_suspend }, +#endif + {NULL, NULL}, + }; + static struct shutdown_handler *handler; + + if (shutting_down != SHUTDOWN_INVALID) + return; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + + str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); + /* Ignore read errors and empty reads. */ + if (XENBUS_IS_ERR_READ(str)) { + xenbus_transaction_end(xbt, 1); + return; + } + + for (handler = &handlers[0]; handler->command; handler++) { + if (strcmp(str, handler->command) == 0) + break; + } + + /* Only acknowledge commands which we are prepared to handle. 
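+ * For example, "xl shutdown <domid>" makes the toolstack write the string
+ * "poweroff" into this domain's control/shutdown node; roughly the same
+ * request can be issued by hand from dom0 with something like
+ * (path shown for illustration only)
+ *
+ *	xenstore-write /local/domain/<domid>/control/shutdown poweroff
+ *
+ * Acknowledging means writing the empty string back within the same
+ * transaction, so the toolstack can see the request was picked up.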
*/ + if (handler->cb) + xenbus_write(xbt, "control", "shutdown", ""); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) { + kfree(str); + goto again; + } + + if (handler->cb) { + handler->cb(); + } else { + pr_info("Ignoring shutdown request: %s\n", str); + shutting_down = SHUTDOWN_INVALID; + } + + kfree(str); +} + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handler(struct xenbus_watch *watch, const char **vec, + unsigned int len) +{ + char sysrq_key = '\0'; + struct xenbus_transaction xbt; + int err; + + again: + err = xenbus_transaction_start(&xbt); + if (err) + return; + if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { + pr_err("Unable to read sysrq code in control/sysrq\n"); + xenbus_transaction_end(xbt, 1); + return; + } + + if (sysrq_key != '\0') + xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + + if (sysrq_key != '\0') + handle_sysrq(sysrq_key); +} + +static struct xenbus_watch sysrq_watch = { + .node = "control/sysrq", + .callback = sysrq_handler +}; +#endif + +static struct xenbus_watch shutdown_watch = { + .node = "control/shutdown", + .callback = shutdown_handler +}; + +static struct notifier_block xen_reboot_nb = { + .notifier_call = poweroff_nb, +}; + +static int setup_shutdown_watcher(void) +{ + int err; + + err = register_xenbus_watch(&shutdown_watch); + if (err) { + pr_err("Failed to set shutdown watcher\n"); + return err; + } + + +#ifdef CONFIG_MAGIC_SYSRQ + err = register_xenbus_watch(&sysrq_watch); + if (err) { + pr_err("Failed to set sysrq watcher\n"); + return err; + } +#endif + + return 0; +} + +static int shutdown_event(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + setup_shutdown_watcher(); + return NOTIFY_DONE; +} + +int xen_setup_shutdown_event(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = shutdown_event + }; + + if (!xen_domain()) + return -ENODEV; + register_xenstore_notifier(&xenstore_notifier); + register_reboot_notifier(&xen_reboot_nb); + + return 0; +} +EXPORT_SYMBOL_GPL(xen_setup_shutdown_event); + +subsys_initcall(xen_setup_shutdown_event); diff --git a/drivers/xen/mcelog.c b/drivers/xen/mcelog.c new file mode 100644 index 000000000..a493c7315 --- /dev/null +++ b/drivers/xen/mcelog.c @@ -0,0 +1,417 @@ +/****************************************************************************** + * mcelog.c + * Driver for receiving and transferring machine check error infomation + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * Author: Ke, Liping <liping.ke@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included 
in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen_mcelog: " fmt + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/fs.h> +#include <linux/device.h> +#include <linux/miscdevice.h> +#include <linux/uaccess.h> +#include <linux/capability.h> +#include <linux/poll.h> +#include <linux/sched.h> + +#include <xen/interface/xen.h> +#include <xen/events.h> +#include <xen/interface/vcpu.h> +#include <xen/xen.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +static struct mc_info g_mi; +static struct mcinfo_logical_cpu *g_physinfo; +static uint32_t ncpus; + +static DEFINE_MUTEX(mcelog_lock); + +static struct xen_mce_log xen_mcelog = { + .signature = XEN_MCE_LOG_SIGNATURE, + .len = XEN_MCE_LOG_LEN, + .recordlen = sizeof(struct xen_mce), +}; + +static DEFINE_SPINLOCK(xen_mce_chrdev_state_lock); +static int xen_mce_chrdev_open_count; /* #times opened */ +static int xen_mce_chrdev_open_exclu; /* already open exclusive? */ + +static DECLARE_WAIT_QUEUE_HEAD(xen_mce_chrdev_wait); + +static int xen_mce_chrdev_open(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + if (xen_mce_chrdev_open_exclu || + (xen_mce_chrdev_open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&xen_mce_chrdev_state_lock); + + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + xen_mce_chrdev_open_exclu = 1; + xen_mce_chrdev_open_count++; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return nonseekable_open(inode, file); +} + +static int xen_mce_chrdev_release(struct inode *inode, struct file *file) +{ + spin_lock(&xen_mce_chrdev_state_lock); + + xen_mce_chrdev_open_count--; + xen_mce_chrdev_open_exclu = 0; + + spin_unlock(&xen_mce_chrdev_state_lock); + + return 0; +} + +static ssize_t xen_mce_chrdev_read(struct file *filp, char __user *ubuf, + size_t usize, loff_t *off) +{ + char __user *buf = ubuf; + unsigned num; + int i, err; + + mutex_lock(&mcelog_lock); + + num = xen_mcelog.next; + + /* Only supports full reads right now */ + err = -EINVAL; + if (*off != 0 || usize < XEN_MCE_LOG_LEN*sizeof(struct xen_mce)) + goto out; + + err = 0; + for (i = 0; i < num; i++) { + struct xen_mce *m = &xen_mcelog.entry[i]; + + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); + } + + memset(xen_mcelog.entry, 0, num * sizeof(struct xen_mce)); + xen_mcelog.next = 0; + + if (err) + err = -EFAULT; + +out: + mutex_unlock(&mcelog_lock); + + return err ? 
err : buf - ubuf; +} + +static unsigned int xen_mce_chrdev_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &xen_mce_chrdev_wait, wait); + + if (xen_mcelog.next) + return POLLIN | POLLRDNORM; + + return 0; +} + +static long xen_mce_chrdev_ioctl(struct file *f, unsigned int cmd, + unsigned long arg) +{ + int __user *p = (int __user *)arg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case MCE_GET_RECORD_LEN: + return put_user(sizeof(struct xen_mce), p); + case MCE_GET_LOG_LEN: + return put_user(XEN_MCE_LOG_LEN, p); + case MCE_GETCLEAR_FLAGS: { + unsigned flags; + + do { + flags = xen_mcelog.flags; + } while (cmpxchg(&xen_mcelog.flags, flags, 0) != flags); + + return put_user(flags, p); + } + default: + return -ENOTTY; + } +} + +static const struct file_operations xen_mce_chrdev_ops = { + .open = xen_mce_chrdev_open, + .release = xen_mce_chrdev_release, + .read = xen_mce_chrdev_read, + .poll = xen_mce_chrdev_poll, + .unlocked_ioctl = xen_mce_chrdev_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice xen_mce_chrdev_device = { + MISC_MCELOG_MINOR, + "mcelog", + &xen_mce_chrdev_ops, +}; + +/* + * Caller should hold the mcelog_lock + */ +static void xen_mce_log(struct xen_mce *mce) +{ + unsigned entry; + + entry = xen_mcelog.next; + + /* + * When the buffer fills up discard new entries. + * Assume that the earlier errors are the more + * interesting ones: + */ + if (entry >= XEN_MCE_LOG_LEN) { + set_bit(XEN_MCE_OVERFLOW, + (unsigned long *)&xen_mcelog.flags); + return; + } + + memcpy(xen_mcelog.entry + entry, mce, sizeof(struct xen_mce)); + + xen_mcelog.next++; +} + +static int convert_log(struct mc_info *mi) +{ + struct mcinfo_common *mic; + struct mcinfo_global *mc_global; + struct mcinfo_bank *mc_bank; + struct xen_mce m; + uint32_t i; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_GLOBAL); + if (unlikely(!mic)) { + pr_warn("Failed to find global error info\n"); + return -ENODEV; + } + + memset(&m, 0, sizeof(struct xen_mce)); + + mc_global = (struct mcinfo_global *)mic; + m.mcgstatus = mc_global->mc_gstatus; + m.apicid = mc_global->mc_apicid; + + for (i = 0; i < ncpus; i++) + if (g_physinfo[i].mc_apicid == m.apicid) + break; + if (unlikely(i == ncpus)) { + pr_warn("Failed to match cpu with apicid %d\n", m.apicid); + return -ENODEV; + } + + m.socketid = g_physinfo[i].mc_chipid; + m.cpu = m.extcpu = g_physinfo[i].mc_cpunr; + m.cpuvendor = (__u8)g_physinfo[i].mc_vendor; + m.mcgcap = g_physinfo[i].mc_msrvalues[__MC_MSR_MCGCAP].value; + + mic = NULL; + x86_mcinfo_lookup(&mic, mi, MC_TYPE_BANK); + if (unlikely(!mic)) { + pr_warn("Fail to find bank error info\n"); + return -ENODEV; + } + + do { + if ((!mic) || (mic->size == 0) || + (mic->type != MC_TYPE_GLOBAL && + mic->type != MC_TYPE_BANK && + mic->type != MC_TYPE_EXTENDED && + mic->type != MC_TYPE_RECOVERY)) + break; + + if (mic->type == MC_TYPE_BANK) { + mc_bank = (struct mcinfo_bank *)mic; + m.misc = mc_bank->mc_misc; + m.status = mc_bank->mc_status; + m.addr = mc_bank->mc_addr; + m.tsc = mc_bank->mc_tsc; + m.bank = mc_bank->mc_bank; + m.finished = 1; + /*log this record*/ + xen_mce_log(&m); + } + mic = x86_mcinfo_next(mic); + } while (1); + + return 0; +} + +static int mc_queue_handle(uint32_t flags) +{ + struct xen_mc mc_op; + int ret = 0; + + mc_op.cmd = XEN_MC_fetch; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_fetch.data, &g_mi); + do { + mc_op.u.mc_fetch.flags = flags; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to fetch 
%surgent error log\n", + flags == XEN_MC_URGENT ? "" : "non"); + break; + } + + if (mc_op.u.mc_fetch.flags & XEN_MC_NODATA || + mc_op.u.mc_fetch.flags & XEN_MC_FETCHFAILED) + break; + else { + ret = convert_log(&g_mi); + if (ret) + pr_warn("Failed to convert this error log, continue acking it anyway\n"); + + mc_op.u.mc_fetch.flags = flags | XEN_MC_ACK; + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to ack previous error log\n"); + break; + } + } + } while (1); + + return ret; +} + +/* virq handler for machine check error info*/ +static void xen_mce_work_fn(struct work_struct *work) +{ + int err; + + mutex_lock(&mcelog_lock); + + /* urgent mc_info */ + err = mc_queue_handle(XEN_MC_URGENT); + if (err) + pr_err("Failed to handle urgent mc_info queue, continue handling nonurgent mc_info queue anyway\n"); + + /* nonurgent mc_info */ + err = mc_queue_handle(XEN_MC_NONURGENT); + if (err) + pr_err("Failed to handle nonurgent mc_info queue\n"); + + /* wake processes polling /dev/mcelog */ + wake_up_interruptible(&xen_mce_chrdev_wait); + + mutex_unlock(&mcelog_lock); +} +static DECLARE_WORK(xen_mce_work, xen_mce_work_fn); + +static irqreturn_t xen_mce_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_mce_work); + return IRQ_HANDLED; +} + +static int bind_virq_for_mce(void) +{ + int ret; + struct xen_mc mc_op; + + memset(&mc_op, 0, sizeof(struct xen_mc)); + + /* Fetch physical CPU Numbers */ + mc_op.cmd = XEN_MC_physcpuinfo; + mc_op.interface_version = XEN_MCA_INTERFACE_VERSION; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to get CPU numbers\n"); + return ret; + } + + /* Fetch each CPU Physical Info for later reference*/ + ncpus = mc_op.u.mc_physcpuinfo.ncpus; + g_physinfo = kcalloc(ncpus, sizeof(struct mcinfo_logical_cpu), + GFP_KERNEL); + if (!g_physinfo) + return -ENOMEM; + set_xen_guest_handle(mc_op.u.mc_physcpuinfo.info, g_physinfo); + ret = HYPERVISOR_mca(&mc_op); + if (ret) { + pr_err("Failed to get CPU info\n"); + kfree(g_physinfo); + return ret; + } + + ret = bind_virq_to_irqhandler(VIRQ_MCA, 0, + xen_mce_interrupt, 0, "mce", NULL); + if (ret < 0) { + pr_err("Failed to bind virq\n"); + kfree(g_physinfo); + return ret; + } + + return 0; +} + +static int __init xen_late_init_mcelog(void) +{ + int ret; + + /* Only DOM0 is responsible for MCE logging */ + if (!xen_initial_domain()) + return -ENODEV; + + /* register character device /dev/mcelog for xen mcelog */ + ret = misc_register(&xen_mce_chrdev_device); + if (ret) + return ret; + + ret = bind_virq_for_mce(); + if (ret) + goto deregister; + + return 0; + +deregister: + misc_deregister(&xen_mce_chrdev_device); + return ret; +} +device_initcall(xen_late_init_mcelog); diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c new file mode 100644 index 000000000..7494dbeb4 --- /dev/null +++ b/drivers/xen/pci.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Author: Weidong Han <weidong.han@intel.com> + */ + +#include <linux/pci.h> +#include <linux/acpi.h> +#include <linux/pci-acpi.h> +#include <xen/xen.h> +#include <xen/interface/physdev.h> +#include <xen/interface/xen.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include "../pci/pci.h" +#ifdef CONFIG_PCI_MMCONFIG +#include <asm/pci_x86.h> +#endif + +static bool __read_mostly pci_seg_supported = true; + +static int xen_add_device(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); +#ifdef CONFIG_PCI_IOV + struct pci_dev *physfn = pci_dev->physfn; +#endif + + if (pci_seg_supported) { + struct { + struct physdev_pci_device_add add; + uint32_t pxm; + } add_ext = { + .add.seg = pci_domain_nr(pci_dev->bus), + .add.bus = pci_dev->bus->number, + .add.devfn = pci_dev->devfn + }; + struct physdev_pci_device_add *add = &add_ext.add; + +#ifdef CONFIG_ACPI + acpi_handle handle; +#endif + +#ifdef CONFIG_PCI_IOV + if (pci_dev->is_virtfn) { + add->flags = XEN_PCI_DEV_VIRTFN; + add->physfn.bus = physfn->bus->number; + add->physfn.devfn = physfn->devfn; + } else +#endif + if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) + add->flags = XEN_PCI_DEV_EXTFN; + +#ifdef CONFIG_ACPI + handle = ACPI_HANDLE(&pci_dev->dev); +#ifdef CONFIG_PCI_IOV + if (!handle && pci_dev->is_virtfn) + handle = ACPI_HANDLE(physfn->bus->bridge); +#endif + if (!handle) { + /* + * This device was not listed in the ACPI name space at + * all. Try to get acpi handle of parent pci bus. + */ + struct pci_bus *pbus; + for (pbus = pci_dev->bus; pbus; pbus = pbus->parent) { + handle = acpi_pci_get_bridge_handle(pbus); + if (handle) + break; + } + } + if (handle) { + acpi_status status; + + do { + unsigned long long pxm; + + status = acpi_evaluate_integer(handle, "_PXM", + NULL, &pxm); + if (ACPI_SUCCESS(status)) { + add->optarr[0] = pxm; + add->flags |= XEN_PCI_DEV_PXM; + break; + } + status = acpi_get_parent(handle, &handle); + } while (ACPI_SUCCESS(status)); + } +#endif /* CONFIG_ACPI */ + + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, add); + if (r != -ENOSYS) + return r; + pci_seg_supported = false; + } + + if (pci_domain_nr(pci_dev->bus)) + r = -ENOSYS; +#ifdef CONFIG_PCI_IOV + else if (pci_dev->is_virtfn) { + struct physdev_manage_pci_ext manage_pci_ext = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + .is_virtfn = 1, + .physfn.bus = physfn->bus->number, + .physfn.devfn = physfn->devfn, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } +#endif + else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { + struct physdev_manage_pci_ext manage_pci_ext = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + .is_extfn = 1, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, + &manage_pci_ext); + } else { + struct physdev_manage_pci manage_pci = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn, + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add, + &manage_pci); + } + + return r; +} + +static int xen_remove_device(struct device *dev) +{ + int r; + struct pci_dev *pci_dev = to_pci_dev(dev); + + if (pci_seg_supported) { + struct physdev_pci_device device = { + .seg = pci_domain_nr(pci_dev->bus), + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; 
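+		/*
+		 * Mirror of xen_add_device() above: tell the hypervisor the
+		 * device is going away so it can drop the segment/bus/devfn
+		 * bookkeeping registered at add time (Xen tracks dom0's PCI
+		 * devices to support passthrough and MSI/MSI-X setup).
+		 */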
+ + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove, + &device); + } else if (pci_domain_nr(pci_dev->bus)) + r = -ENOSYS; + else { + struct physdev_manage_pci manage_pci = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, + &manage_pci); + } + + return r; +} + +static int xen_pci_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + int r = 0; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + r = xen_add_device(dev); + break; + case BUS_NOTIFY_DEL_DEVICE: + r = xen_remove_device(dev); + break; + default: + return NOTIFY_DONE; + } + if (r) + dev_err(dev, "Failed to %s - passthrough or MSI/MSI-X might fail!\n", + action == BUS_NOTIFY_ADD_DEVICE ? "add" : + (action == BUS_NOTIFY_DEL_DEVICE ? "delete" : "?")); + return NOTIFY_OK; +} + +static struct notifier_block device_nb = { + .notifier_call = xen_pci_notifier, +}; + +static int __init register_xen_pci_notifier(void) +{ + if (!xen_initial_domain()) + return 0; + + return bus_register_notifier(&pci_bus_type, &device_nb); +} + +arch_initcall(register_xen_pci_notifier); + +#ifdef CONFIG_PCI_MMCONFIG +static int __init xen_mcfg_late(void) +{ + struct pci_mmcfg_region *cfg; + int rc; + + if (!xen_initial_domain()) + return 0; + + if ((pci_probe & PCI_PROBE_MMCONF) == 0) + return 0; + + if (list_empty(&pci_mmcfg_list)) + return 0; + + /* Check whether they are in the right area. */ + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + struct physdev_pci_mmcfg_reserved r; + + r.address = cfg->address; + r.segment = cfg->segment; + r.start_bus = cfg->start_bus; + r.end_bus = cfg->end_bus; + r.flags = XEN_PCI_MMCFG_RESERVED; + + rc = HYPERVISOR_physdev_op(PHYSDEVOP_pci_mmcfg_reserved, &r); + switch (rc) { + case 0: + case -ENOSYS: + continue; + + default: + pr_warn("Failed to report MMCONFIG reservation" + " state for %s to hypervisor" + " (%d)\n", + cfg->name, rc); + } + } + return 0; +} +/* + * Needs to be done after acpi_init which are subsys_initcall. + */ +subsys_initcall_sync(xen_mcfg_late); +#endif diff --git a/drivers/xen/pcpu.c b/drivers/xen/pcpu.c new file mode 100644 index 000000000..49e88f2ce --- /dev/null +++ b/drivers/xen/pcpu.c @@ -0,0 +1,418 @@ +/****************************************************************************** + * pcpu.c + * Management physical cpu in dom0, get pcpu info and provide sys interface + * + * Copyright (c) 2012 Intel Corporation + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * Author: Jiang, Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) "xen_cpu: " fmt + +#include <linux/interrupt.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include <linux/stat.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/acpi.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + + +/* + * @cpu_id: Xen physical cpu logic number + * @flags: Xen physical cpu status flag + * - XEN_PCPU_FLAGS_ONLINE: cpu is online + * - XEN_PCPU_FLAGS_INVALID: cpu is not present + */ +struct pcpu { + struct list_head list; + struct device dev; + uint32_t cpu_id; + uint32_t flags; +}; + +static struct bus_type xen_pcpu_subsys = { + .name = "xen_cpu", + .dev_name = "xen_cpu", +}; + +static DEFINE_MUTEX(xen_pcpu_lock); + +static LIST_HEAD(xen_pcpus); + +static int xen_pcpu_down(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_offline, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_pcpu_up(uint32_t cpu_id) +{ + struct xen_platform_op op = { + .cmd = XENPF_cpu_online, + .interface_version = XENPF_INTERFACE_VERSION, + .u.cpu_ol.cpuid = cpu_id, + }; + + return HYPERVISOR_dom0_op(&op); +} + +static ssize_t show_online(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pcpu *cpu = container_of(dev, struct pcpu, dev); + + return sprintf(buf, "%u\n", !!(cpu->flags & XEN_PCPU_FLAGS_ONLINE)); +} + +static ssize_t __ref store_online(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + unsigned long long val; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoull(buf, 0, &val) < 0) + return -EINVAL; + + switch (val) { + case 0: + ret = xen_pcpu_down(pcpu->cpu_id); + break; + case 1: + ret = xen_pcpu_up(pcpu->cpu_id); + break; + default: + ret = -EINVAL; + } + + if (ret >= 0) + ret = count; + return ret; +} +static DEVICE_ATTR(online, S_IRUGO | S_IWUSR, show_online, store_online); + +static struct attribute *pcpu_dev_attrs[] = { + &dev_attr_online.attr, + NULL +}; + +static umode_t pcpu_dev_is_visible(struct kobject *kobj, + struct attribute *attr, int idx) +{ + struct device *dev = kobj_to_dev(kobj); + /* + * Xen never offline cpu0 due to several restrictions + * and assumptions. This basically doesn't add a sys control + * to user, one cannot attempt to offline BSP. + */ + return dev->id ? 
attr->mode : 0; +} + +static const struct attribute_group pcpu_dev_group = { + .attrs = pcpu_dev_attrs, + .is_visible = pcpu_dev_is_visible, +}; + +static const struct attribute_group *pcpu_dev_groups[] = { + &pcpu_dev_group, + NULL +}; + +static bool xen_pcpu_online(uint32_t flags) +{ + return !!(flags & XEN_PCPU_FLAGS_ONLINE); +} + +static void pcpu_online_status(struct xenpf_pcpuinfo *info, + struct pcpu *pcpu) +{ + if (xen_pcpu_online(info->flags) && + !xen_pcpu_online(pcpu->flags)) { + /* the pcpu is onlined */ + pcpu->flags |= XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_ONLINE); + } else if (!xen_pcpu_online(info->flags) && + xen_pcpu_online(pcpu->flags)) { + /* The pcpu is offlined */ + pcpu->flags &= ~XEN_PCPU_FLAGS_ONLINE; + kobject_uevent(&pcpu->dev.kobj, KOBJ_OFFLINE); + } +} + +static struct pcpu *get_pcpu(uint32_t cpu_id) +{ + struct pcpu *pcpu; + + list_for_each_entry(pcpu, &xen_pcpus, list) { + if (pcpu->cpu_id == cpu_id) + return pcpu; + } + + return NULL; +} + +static void pcpu_release(struct device *dev) +{ + struct pcpu *pcpu = container_of(dev, struct pcpu, dev); + + list_del(&pcpu->list); + kfree(pcpu); +} + +static void unregister_and_remove_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + + if (!pcpu) + return; + + dev = &pcpu->dev; + /* pcpu remove would be implicitly done */ + device_unregister(dev); +} + +static int register_pcpu(struct pcpu *pcpu) +{ + struct device *dev; + int err = -EINVAL; + + if (!pcpu) + return err; + + dev = &pcpu->dev; + dev->bus = &xen_pcpu_subsys; + dev->id = pcpu->cpu_id; + dev->release = pcpu_release; + dev->groups = pcpu_dev_groups; + + err = device_register(dev); + if (err) { + pcpu_release(dev); + return err; + } + + return 0; +} + +static struct pcpu *create_and_register_pcpu(struct xenpf_pcpuinfo *info) +{ + struct pcpu *pcpu; + int err; + + if (info->flags & XEN_PCPU_FLAGS_INVALID) + return ERR_PTR(-ENODEV); + + pcpu = kzalloc(sizeof(struct pcpu), GFP_KERNEL); + if (!pcpu) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&pcpu->list); + pcpu->cpu_id = info->xen_cpuid; + pcpu->flags = info->flags; + + /* Need hold on xen_pcpu_lock before pcpu list manipulations */ + list_add_tail(&pcpu->list, &xen_pcpus); + + err = register_pcpu(pcpu); + if (err) { + pr_warn("Failed to register pcpu%u\n", info->xen_cpuid); + return ERR_PTR(-ENOENT); + } + + return pcpu; +} + +/* + * Caller should hold the xen_pcpu_lock + */ +static int sync_pcpu(uint32_t cpu, uint32_t *max_cpu) +{ + int ret; + struct pcpu *pcpu = NULL; + struct xenpf_pcpuinfo *info; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.pcpu_info.xen_cpuid = cpu, + }; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return ret; + + info = &op.u.pcpu_info; + if (max_cpu) + *max_cpu = info->max_present; + + pcpu = get_pcpu(cpu); + + /* + * Only those at cpu present map has its sys interface. 
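+ * The registered devices appear under the "xen_cpu" subsystem, so from
+ * dom0 a physical CPU can be taken offline or brought back with e.g.
+ * (sysfs path assumed from the subsystem name used in this file)
+ *
+ *	echo 0 > /sys/devices/system/xen_cpu/xen_cpu3/online
+ *	echo 1 > /sys/devices/system/xen_cpu/xen_cpu3/online
+ *
+ * which ends up in xen_pcpu_down()/xen_pcpu_up(); xen_cpu0 never exposes
+ * an "online" attribute because Xen does not allow offlining the boot CPU.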
+ */ + if (info->flags & XEN_PCPU_FLAGS_INVALID) { + unregister_and_remove_pcpu(pcpu); + return 0; + } + + if (!pcpu) { + pcpu = create_and_register_pcpu(info); + if (IS_ERR_OR_NULL(pcpu)) + return -ENODEV; + } else + pcpu_online_status(info, pcpu); + + return 0; +} + +/* + * Sync dom0's pcpu information with xen hypervisor's + */ +static int xen_sync_pcpus(void) +{ + /* + * Boot cpu always have cpu_id 0 in xen + */ + uint32_t cpu = 0, max_cpu = 0; + int err = 0; + struct pcpu *pcpu, *tmp; + + mutex_lock(&xen_pcpu_lock); + + while (!err && (cpu <= max_cpu)) { + err = sync_pcpu(cpu, &max_cpu); + cpu++; + } + + if (err) + list_for_each_entry_safe(pcpu, tmp, &xen_pcpus, list) + unregister_and_remove_pcpu(pcpu); + + mutex_unlock(&xen_pcpu_lock); + + return err; +} + +static void xen_pcpu_work_fn(struct work_struct *work) +{ + xen_sync_pcpus(); +} +static DECLARE_WORK(xen_pcpu_work, xen_pcpu_work_fn); + +static irqreturn_t xen_pcpu_interrupt(int irq, void *dev_id) +{ + schedule_work(&xen_pcpu_work); + return IRQ_HANDLED; +} + +/* Sync with Xen hypervisor after cpu hotadded */ +void xen_pcpu_hotplug_sync(void) +{ + schedule_work(&xen_pcpu_work); +} +EXPORT_SYMBOL_GPL(xen_pcpu_hotplug_sync); + +/* + * For hypervisor presented cpu, return logic cpu id; + * For hypervisor non-presented cpu, return -ENODEV. + */ +int xen_pcpu_id(uint32_t acpi_id) +{ + int cpu_id = 0, max_id = 0; + struct xen_platform_op op; + + op.cmd = XENPF_get_cpuinfo; + while (cpu_id <= max_id) { + op.u.pcpu_info.xen_cpuid = cpu_id; + if (HYPERVISOR_dom0_op(&op)) { + cpu_id++; + continue; + } + + if (acpi_id == op.u.pcpu_info.acpi_id) + return cpu_id; + if (op.u.pcpu_info.max_present > max_id) + max_id = op.u.pcpu_info.max_present; + cpu_id++; + } + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(xen_pcpu_id); + +static int __init xen_pcpu_init(void) +{ + int irq, ret; + + if (!xen_initial_domain()) + return -ENODEV; + + irq = bind_virq_to_irqhandler(VIRQ_PCPU_STATE, 0, + xen_pcpu_interrupt, 0, + "xen-pcpu", NULL); + if (irq < 0) { + pr_warn("Failed to bind pcpu virq\n"); + return irq; + } + + ret = subsys_system_register(&xen_pcpu_subsys, NULL); + if (ret) { + pr_warn("Failed to register pcpu subsys\n"); + goto err1; + } + + ret = xen_sync_pcpus(); + if (ret) { + pr_warn("Failed to sync pcpu info\n"); + goto err2; + } + + return 0; + +err2: + bus_unregister(&xen_pcpu_subsys); +err1: + unbind_from_irqhandler(irq, NULL); + return ret; +} +arch_initcall(xen_pcpu_init); diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c new file mode 100644 index 000000000..3454973dc --- /dev/null +++ b/drivers/xen/platform-pci.c @@ -0,0 +1,200 @@ +/****************************************************************************** + * platform-pci.c + * + * Xen platform PCI device driver + * Copyright (c) 2005, Intel Corporation. + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + + +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/module.h> +#include <linux/pci.h> + +#include <xen/platform_pci.h> +#include <xen/grant_table.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/hvm.h> +#include <xen/xen-ops.h> + +#define DRV_NAME "xen-platform-pci" + +MODULE_AUTHOR("ssmith@xensource.com and stefano.stabellini@eu.citrix.com"); +MODULE_DESCRIPTION("Xen platform PCI device"); +MODULE_LICENSE("GPL"); + +static unsigned long platform_mmio; +static unsigned long platform_mmio_alloc; +static unsigned long platform_mmiolen; +static uint64_t callback_via; + +static unsigned long alloc_xen_mmio(unsigned long len) +{ + unsigned long addr; + + addr = platform_mmio + platform_mmio_alloc; + platform_mmio_alloc += len; + BUG_ON(platform_mmio_alloc > platform_mmiolen); + + return addr; +} + +static uint64_t get_callback_via(struct pci_dev *pdev) +{ + u8 pin; + int irq; + + irq = pdev->irq; + if (irq < 16) + return irq; /* ISA IRQ */ + + pin = pdev->pin; + + /* We don't know the GSI. Specify the PCI INTx line instead. */ + return ((uint64_t)0x01 << 56) | /* PCI INTx identifier */ + ((uint64_t)pci_domain_nr(pdev->bus) << 32) | + ((uint64_t)pdev->bus->number << 16) | + ((uint64_t)(pdev->devfn & 0xff) << 8) | + ((uint64_t)(pin - 1) & 3); +} + +static irqreturn_t do_hvm_evtchn_intr(int irq, void *dev_id) +{ + xen_hvm_evtchn_do_upcall(); + return IRQ_HANDLED; +} + +static int xen_allocate_irq(struct pci_dev *pdev) +{ + return request_irq(pdev->irq, do_hvm_evtchn_intr, + IRQF_NOBALANCING | IRQF_TRIGGER_RISING, + "xen-platform-pci", pdev); +} + +static int platform_pci_resume(struct pci_dev *pdev) +{ + int err; + if (xen_have_vector_callback) + return 0; + err = xen_set_callback_via(callback_via); + if (err) { + dev_err(&pdev->dev, "platform_pci_resume failure!\n"); + return err; + } + return 0; +} + +static int platform_pci_init(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + int i, ret; + long ioaddr; + long mmio_addr, mmio_len; + unsigned int max_nr_gframes; + unsigned long grant_frames; + + if (!xen_domain()) + return -ENODEV; + + i = pci_enable_device(pdev); + if (i) + return i; + + ioaddr = pci_resource_start(pdev, 0); + + mmio_addr = pci_resource_start(pdev, 1); + mmio_len = pci_resource_len(pdev, 1); + + if (mmio_addr == 0 || ioaddr == 0) { + dev_err(&pdev->dev, "no resources found\n"); + ret = -ENOENT; + goto pci_out; + } + + ret = pci_request_region(pdev, 1, DRV_NAME); + if (ret < 0) + goto pci_out; + + ret = pci_request_region(pdev, 0, DRV_NAME); + if (ret < 0) + goto mem_out; + + platform_mmio = mmio_addr; + platform_mmiolen = mmio_len; + + if (!xen_have_vector_callback) { + ret = xen_allocate_irq(pdev); + if (ret) { + dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); + goto out; + } + callback_via = get_callback_via(pdev); + ret = xen_set_callback_via(callback_via); + if (ret) { + dev_warn(&pdev->dev, "Unable to set the evtchn callback " + "err=%d\n", ret); + goto out; + } + } + + max_nr_gframes = gnttab_max_grant_frames(); + grant_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes); + ret = gnttab_setup_auto_xlat_frames(grant_frames); + if (ret) + goto out; + ret = gnttab_init(); + if (ret) + goto grant_out; + xenbus_probe(NULL); + return 0; +grant_out: + gnttab_free_auto_xlat_frames(); +out: + 
pci_release_region(pdev, 0); +mem_out: + pci_release_region(pdev, 1); +pci_out: + pci_disable_device(pdev); + return ret; +} + +static struct pci_device_id platform_pci_tbl[] = { + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {0,} +}; + +MODULE_DEVICE_TABLE(pci, platform_pci_tbl); + +static struct pci_driver platform_driver = { + .name = DRV_NAME, + .probe = platform_pci_init, + .id_table = platform_pci_tbl, +#ifdef CONFIG_PM + .resume_early = platform_pci_resume, +#endif +}; + +static int __init platform_pci_module_init(void) +{ + return pci_register_driver(&platform_driver); +} + +module_init(platform_pci_module_init); diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c new file mode 100644 index 000000000..a1800c150 --- /dev/null +++ b/drivers/xen/preempt.c @@ -0,0 +1,44 @@ +/* + * Preemptible hypercalls + * + * Copyright (C) 2014 Citrix Systems R&D ltd. + * + * This source code is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + */ + +#include <linux/sched.h> +#include <xen/xen-ops.h> + +#ifndef CONFIG_PREEMPT + +/* + * Some hypercalls issued by the toolstack can take many 10s of + * seconds. Allow tasks running hypercalls via the privcmd driver to + * be voluntarily preempted even if full kernel preemption is + * disabled. + * + * Such preemptible hypercalls are bracketed by + * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() + * calls. + */ + +DEFINE_PER_CPU(bool, xen_in_preemptible_hcall); +EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall); + +asmlinkage __visible void xen_maybe_preempt_hcall(void) +{ + if (unlikely(__this_cpu_read(xen_in_preemptible_hcall) + && should_resched())) { + /* + * Clear flag as we may be rescheduled on a different + * cpu. + */ + __this_cpu_write(xen_in_preemptible_hcall, false); + _cond_resched(); + __this_cpu_write(xen_in_preemptible_hcall, true); + } +} +#endif /* CONFIG_PREEMPT */ diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c new file mode 100644 index 000000000..5a296161d --- /dev/null +++ b/drivers/xen/privcmd.c @@ -0,0 +1,679 @@ +/****************************************************************************** + * privcmd.c + * + * Interface to privileged domain-0 commands. 
+ * + * Copyright (c) 2002-2004, K A Fraser, B Dragovic + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/uaccess.h> +#include <linux/swap.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/miscdevice.h> + +#include <asm/pgalloc.h> +#include <asm/pgtable.h> +#include <asm/tlb.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/privcmd.h> +#include <xen/interface/xen.h> +#include <xen/features.h> +#include <xen/page.h> +#include <xen/xen-ops.h> +#include <xen/balloon.h> + +#include "privcmd.h" + +MODULE_LICENSE("GPL"); + +#define PRIV_VMA_LOCKED ((void *)1) + +static int privcmd_vma_range_is_mapped( + struct vm_area_struct *vma, + unsigned long addr, + unsigned long nr_pages); + +static long privcmd_ioctl_hypercall(void __user *udata) +{ + struct privcmd_hypercall hypercall; + long ret; + + if (copy_from_user(&hypercall, udata, sizeof(hypercall))) + return -EFAULT; + + xen_preemptible_hcall_begin(); + ret = privcmd_call(hypercall.op, + hypercall.arg[0], hypercall.arg[1], + hypercall.arg[2], hypercall.arg[3], + hypercall.arg[4]); + xen_preemptible_hcall_end(); + + return ret; +} + +static void free_page_list(struct list_head *pages) +{ + struct page *p, *n; + + list_for_each_entry_safe(p, n, pages, lru) + __free_page(p); + + INIT_LIST_HEAD(pages); +} + +/* + * Given an array of items in userspace, return a list of pages + * containing the data. If copying fails, either because of memory + * allocation failure or a problem reading user memory, return an + * error code; its up to the caller to dispose of any partial list. + */ +static int gather_array(struct list_head *pagelist, + unsigned nelem, size_t size, + const void __user *data) +{ + unsigned pageidx; + void *pagedata; + int ret; + + if (size > PAGE_SIZE) + return 0; + + pageidx = PAGE_SIZE; + pagedata = NULL; /* quiet, gcc */ + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page = alloc_page(GFP_KERNEL); + + ret = -ENOMEM; + if (page == NULL) + goto fail; + + pagedata = page_address(page); + + list_add_tail(&page->lru, pagelist); + pageidx = 0; + } + + ret = -EFAULT; + if (copy_from_user(pagedata + pageidx, data, size)) + goto fail; + + data += size; + pageidx += size; + } + + ret = 0; + +fail: + return ret; +} + +/* + * Call function "fn" on each element of the array fragmented + * over a list of pages. + */ +static int traverse_pages(unsigned nelem, size_t size, + struct list_head *pos, + int (*fn)(void *data, void *state), + void *state) +{ + void *pagedata; + unsigned pageidx; + int ret = 0; + + BUG_ON(size > PAGE_SIZE); + + pageidx = PAGE_SIZE; + pagedata = NULL; /* hush, gcc */ + + while (nelem--) { + if (pageidx > PAGE_SIZE-size) { + struct page *page; + pos = pos->next; + page = list_entry(pos, struct page, lru); + pagedata = page_address(page); + pageidx = 0; + } + + ret = (*fn)(pagedata + pageidx, state); + if (ret) + break; + pageidx += size; + } + + return ret; +} + +/* + * Similar to traverse_pages, but use each page as a "block" of + * data to be processed as one unit. 
+ */ +static int traverse_pages_block(unsigned nelem, size_t size, + struct list_head *pos, + int (*fn)(void *data, int nr, void *state), + void *state) +{ + void *pagedata; + unsigned pageidx; + int ret = 0; + + BUG_ON(size > PAGE_SIZE); + + pageidx = PAGE_SIZE; + + while (nelem) { + int nr = (PAGE_SIZE/size); + struct page *page; + if (nr > nelem) + nr = nelem; + pos = pos->next; + page = list_entry(pos, struct page, lru); + pagedata = page_address(page); + ret = (*fn)(pagedata, nr, state); + if (ret) + break; + nelem -= nr; + } + + return ret; +} + +struct mmap_mfn_state { + unsigned long va; + struct vm_area_struct *vma; + domid_t domain; +}; + +static int mmap_mfn_range(void *data, void *state) +{ + struct privcmd_mmap_entry *msg = data; + struct mmap_mfn_state *st = state; + struct vm_area_struct *vma = st->vma; + int rc; + + /* Do not allow range to wrap the address space. */ + if ((msg->npages > (LONG_MAX >> PAGE_SHIFT)) || + ((unsigned long)(msg->npages << PAGE_SHIFT) >= -st->va)) + return -EINVAL; + + /* Range chunks must be contiguous in va space. */ + if ((msg->va != st->va) || + ((msg->va+(msg->npages<<PAGE_SHIFT)) > vma->vm_end)) + return -EINVAL; + + rc = xen_remap_domain_mfn_range(vma, + msg->va & PAGE_MASK, + msg->mfn, msg->npages, + vma->vm_page_prot, + st->domain, NULL); + if (rc < 0) + return rc; + + st->va += msg->npages << PAGE_SHIFT; + + return 0; +} + +static long privcmd_ioctl_mmap(void __user *udata) +{ + struct privcmd_mmap mmapcmd; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int rc; + LIST_HEAD(pagelist); + struct mmap_mfn_state state; + + /* We only support privcmd_ioctl_mmap_batch for auto translated. */ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return -ENOSYS; + + if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) + return -EFAULT; + + rc = gather_array(&pagelist, + mmapcmd.num, sizeof(struct privcmd_mmap_entry), + mmapcmd.entry); + + if (rc || list_empty(&pagelist)) + goto out; + + down_write(&mm->mmap_sem); + + { + struct page *page = list_first_entry(&pagelist, + struct page, lru); + struct privcmd_mmap_entry *msg = page_address(page); + + vma = find_vma(mm, msg->va); + rc = -EINVAL; + + if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) + goto out_up; + vma->vm_private_data = PRIV_VMA_LOCKED; + } + + state.va = vma->vm_start; + state.vma = vma; + state.domain = mmapcmd.dom; + + rc = traverse_pages(mmapcmd.num, sizeof(struct privcmd_mmap_entry), + &pagelist, + mmap_mfn_range, &state); + + +out_up: + up_write(&mm->mmap_sem); + +out: + free_page_list(&pagelist); + + return rc; +} + +struct mmap_batch_state { + domid_t domain; + unsigned long va; + struct vm_area_struct *vma; + int index; + /* A tristate: + * 0 for no errors + * 1 if at least one error has happened (and no + * -ENOENT errors have happened) + * -ENOENT if at least 1 -ENOENT has happened. + */ + int global_error; + int version; + + /* User-space mfn array to store errors in the second pass for V1. */ + xen_pfn_t __user *user_mfn; + /* User-space int array to store errors in the second pass for V2. */ + int __user *user_err; +}; + +/* auto translated dom0 note: if domU being created is PV, then mfn is + * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP). 
+ */ +static int mmap_batch_fn(void *data, int nr, void *state) +{ + xen_pfn_t *mfnp = data; + struct mmap_batch_state *st = state; + struct vm_area_struct *vma = st->vma; + struct page **pages = vma->vm_private_data; + struct page **cur_pages = NULL; + int ret; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + cur_pages = &pages[st->index]; + + BUG_ON(nr < 0); + ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr, + (int *)mfnp, st->vma->vm_page_prot, + st->domain, cur_pages); + + /* Adjust the global_error? */ + if (ret != nr) { + if (ret == -ENOENT) + st->global_error = -ENOENT; + else { + /* Record that at least one error has happened. */ + if (st->global_error == 0) + st->global_error = 1; + } + } + st->va += PAGE_SIZE * nr; + st->index += nr; + + return 0; +} + +static int mmap_return_error(int err, struct mmap_batch_state *st) +{ + int ret; + + if (st->version == 1) { + if (err) { + xen_pfn_t mfn; + + ret = get_user(mfn, st->user_mfn); + if (ret < 0) + return ret; + /* + * V1 encodes the error codes in the 32bit top + * nibble of the mfn (with its known + * limitations vis-a-vis 64 bit callers). + */ + mfn |= (err == -ENOENT) ? + PRIVCMD_MMAPBATCH_PAGED_ERROR : + PRIVCMD_MMAPBATCH_MFN_ERROR; + return __put_user(mfn, st->user_mfn++); + } else + st->user_mfn++; + } else { /* st->version == 2 */ + if (err) + return __put_user(err, st->user_err++); + else + st->user_err++; + } + + return 0; +} + +static int mmap_return_errors(void *data, int nr, void *state) +{ + struct mmap_batch_state *st = state; + int *errs = data; + int i; + int ret; + + for (i = 0; i < nr; i++) { + ret = mmap_return_error(errs[i], st); + if (ret < 0) + return ret; + } + return 0; +} + +/* Allocate pfns that are then mapped with gmfns from foreign domid. Update + * the vma with the page info to use later. + * Returns: 0 if success, otherwise -errno + */ +static int alloc_empty_pages(struct vm_area_struct *vma, int numpgs) +{ + int rc; + struct page **pages; + + pages = kcalloc(numpgs, sizeof(pages[0]), GFP_KERNEL); + if (pages == NULL) + return -ENOMEM; + + rc = alloc_xenballooned_pages(numpgs, pages, 0); + if (rc != 0) { + pr_warn("%s Could not alloc %d pfns rc:%d\n", __func__, + numpgs, rc); + kfree(pages); + return -ENOMEM; + } + BUG_ON(vma->vm_private_data != NULL); + vma->vm_private_data = pages; + + return 0; +} + +static struct vm_operations_struct privcmd_vm_ops; + +static long privcmd_ioctl_mmap_batch(void __user *udata, int version) +{ + int ret; + struct privcmd_mmapbatch_v2 m; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long nr_pages; + LIST_HEAD(pagelist); + struct mmap_batch_state state; + + switch (version) { + case 1: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch))) + return -EFAULT; + /* Returns per-frame error in m.arr. */ + m.err = NULL; + if (!access_ok(VERIFY_WRITE, m.arr, m.num * sizeof(*m.arr))) + return -EFAULT; + break; + case 2: + if (copy_from_user(&m, udata, sizeof(struct privcmd_mmapbatch_v2))) + return -EFAULT; + /* Returns per-frame error code in m.err. 
*/ + if (!access_ok(VERIFY_WRITE, m.err, m.num * (sizeof(*m.err)))) + return -EFAULT; + break; + default: + return -EINVAL; + } + + nr_pages = m.num; + if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) + return -EINVAL; + + ret = gather_array(&pagelist, m.num, sizeof(xen_pfn_t), m.arr); + + if (ret) + goto out; + if (list_empty(&pagelist)) { + ret = -EINVAL; + goto out; + } + + if (version == 2) { + /* Zero error array now to only copy back actual errors. */ + if (clear_user(m.err, sizeof(int) * m.num)) { + ret = -EFAULT; + goto out; + } + } + + down_write(&mm->mmap_sem); + + vma = find_vma(mm, m.addr); + if (!vma || + vma->vm_ops != &privcmd_vm_ops) { + ret = -EINVAL; + goto out_unlock; + } + + /* + * Caller must either: + * + * Map the whole VMA range, which will also allocate all the + * pages required for the auto_translated_physmap case. + * + * Or + * + * Map unmapped holes left from a previous map attempt (e.g., + * because those foreign frames were previously paged out). + */ + if (vma->vm_private_data == NULL) { + if (m.addr != vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) != vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (xen_feature(XENFEAT_auto_translated_physmap)) { + ret = alloc_empty_pages(vma, m.num); + if (ret < 0) + goto out_unlock; + } else + vma->vm_private_data = PRIV_VMA_LOCKED; + } else { + if (m.addr < vma->vm_start || + m.addr + (nr_pages << PAGE_SHIFT) > vma->vm_end) { + ret = -EINVAL; + goto out_unlock; + } + if (privcmd_vma_range_is_mapped(vma, m.addr, nr_pages)) { + ret = -EINVAL; + goto out_unlock; + } + } + + state.domain = m.dom; + state.vma = vma; + state.va = m.addr; + state.index = 0; + state.global_error = 0; + state.version = version; + + /* mmap_batch_fn guarantees ret == 0 */ + BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_batch_fn, &state)); + + up_write(&mm->mmap_sem); + + if (state.global_error) { + /* Write back errors in second pass. */ + state.user_mfn = (xen_pfn_t *)m.arr; + state.user_err = m.err; + ret = traverse_pages_block(m.num, sizeof(xen_pfn_t), + &pagelist, mmap_return_errors, &state); + } else + ret = 0; + + /* If we have not had any EFAULT-like global errors then set the global + * error to -ENOENT if necessary. */ + if ((ret == 0) && (state.global_error == -ENOENT)) + ret = -ENOENT; + +out: + free_page_list(&pagelist); + return ret; + +out_unlock: + up_write(&mm->mmap_sem); + goto out; +} + +static long privcmd_ioctl(struct file *file, + unsigned int cmd, unsigned long data) +{ + int ret = -ENOSYS; + void __user *udata = (void __user *) data; + + switch (cmd) { + case IOCTL_PRIVCMD_HYPERCALL: + ret = privcmd_ioctl_hypercall(udata); + break; + + case IOCTL_PRIVCMD_MMAP: + ret = privcmd_ioctl_mmap(udata); + break; + + case IOCTL_PRIVCMD_MMAPBATCH: + ret = privcmd_ioctl_mmap_batch(udata, 1); + break; + + case IOCTL_PRIVCMD_MMAPBATCH_V2: + ret = privcmd_ioctl_mmap_batch(udata, 2); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static void privcmd_close(struct vm_area_struct *vma) +{ + struct page **pages = vma->vm_private_data; + int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + int rc; + + if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages) + return; + + rc = xen_unmap_domain_mfn_range(vma, numpgs, pages); + if (rc == 0) + free_xenballooned_pages(numpgs, pages); + else + pr_crit("unable to unmap MFN range: leaking %d pages. 
rc=%d\n",
+		numpgs, rc);
+	kfree(pages);
+}
+
+static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n",
+	       vma, vma->vm_start, vma->vm_end,
+	       vmf->pgoff, vmf->virtual_address);
+
+	return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct privcmd_vm_ops = {
+	.close = privcmd_close,
+	.fault = privcmd_fault
+};
+
+static int privcmd_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	/* DONTCOPY is essential for Xen because copy_page_range doesn't know
+	 * how to recreate these mappings */
+	vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTCOPY |
+			 VM_DONTEXPAND | VM_DONTDUMP;
+	vma->vm_ops = &privcmd_vm_ops;
+	vma->vm_private_data = NULL;
+
+	return 0;
+}
+
+/*
+ * For MMAPBATCH*. This allows asserting the singleshot mapping
+ * on a per pfn/pte basis. Mapping calls that fail with ENOENT
+ * can then be retried until they succeed.
+ */
+static int is_mapped_fn(pte_t *pte, struct page *pmd_page,
+			unsigned long addr, void *data)
+{
+	return pte_none(*pte) ? 0 : -EBUSY;
+}
+
+static int privcmd_vma_range_is_mapped(
+	struct vm_area_struct *vma,
+	unsigned long addr,
+	unsigned long nr_pages)
+{
+	return apply_to_page_range(vma->vm_mm, addr, nr_pages << PAGE_SHIFT,
+				   is_mapped_fn, NULL) != 0;
+}
+
+const struct file_operations xen_privcmd_fops = {
+	.owner = THIS_MODULE,
+	.unlocked_ioctl = privcmd_ioctl,
+	.mmap = privcmd_mmap,
+};
+EXPORT_SYMBOL_GPL(xen_privcmd_fops);
+
+static struct miscdevice privcmd_dev = {
+	.minor = MISC_DYNAMIC_MINOR,
+	.name = "xen/privcmd",
+	.fops = &xen_privcmd_fops,
+};
+
+static int __init privcmd_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	err = misc_register(&privcmd_dev);
+	if (err != 0) {
+		pr_err("Could not register Xen privcmd device\n");
+		return err;
+	}
+	return 0;
+}
+
+static void __exit privcmd_exit(void)
+{
+	misc_deregister(&privcmd_dev);
+}
+
+module_init(privcmd_init);
+module_exit(privcmd_exit);
diff --git a/drivers/xen/privcmd.h b/drivers/xen/privcmd.h
new file mode 100644
index 000000000..14facaeed
--- /dev/null
+++ b/drivers/xen/privcmd.h
@@ -0,0 +1,3 @@
+#include <linux/fs.h>
+
+extern const struct file_operations xen_privcmd_fops;
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
new file mode 100644
index 000000000..4c549323c
--- /dev/null
+++ b/drivers/xen/swiotlb-xen.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright 2010
+ * by Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
+ *
+ * This code provides an IOMMU for Xen PV guests with PCI passthrough.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License v2.0 as published by
+ * the Free Software Foundation
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * PV guests under Xen are running in a non-contiguous memory architecture.
+ *
+ * When PCI pass-through is utilized, this necessitates an IOMMU for
+ * translating bus (DMA) to virtual and vice-versa and also providing a
+ * mechanism to have contiguous pages for device driver operations (say DMA
+ * operations).
+ *
+ * Specifically, under Xen the Linux idea of pages is an illusion. It
+ * assumes that pages start at zero and go up to the available memory.
To + * help with that, the Linux Xen MMU provides a lookup mechanism to + * translate the page frame numbers (PFN) to machine frame numbers (MFN) + * and vice-versa. The MFN are the "real" frame numbers. Furthermore + * memory is not contiguous. Xen hypervisor stitches memory for guests + * from different pools, which means there is no guarantee that PFN==MFN + * and PFN+1==MFN+1. Lastly with Xen 4.0, pages (in debug mode) are + * allocated in descending order (high to low), meaning the guest might + * never get any MFN's under the 4GB mark. + * + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/bootmem.h> +#include <linux/dma-mapping.h> +#include <linux/export.h> +#include <xen/swiotlb-xen.h> +#include <xen/page.h> +#include <xen/xen-ops.h> +#include <xen/hvc-console.h> + +#include <asm/dma-mapping.h> +#include <asm/xen/page-coherent.h> + +#include <trace/events/swiotlb.h> +/* + * Used to do a quick range check in swiotlb_tbl_unmap_single and + * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this + * API. + */ + +#ifndef CONFIG_X86 +static unsigned long dma_alloc_coherent_mask(struct device *dev, + gfp_t gfp) +{ + unsigned long dma_mask = 0; + + dma_mask = dev->coherent_dma_mask; + if (!dma_mask) + dma_mask = (gfp & GFP_DMA) ? DMA_BIT_MASK(24) : DMA_BIT_MASK(32); + + return dma_mask; +} +#endif + +static char *xen_io_tlb_start, *xen_io_tlb_end; +static unsigned long xen_io_tlb_nslabs; +/* + * Quick lookup value of the bus address of the IOTLB. + */ + +static u64 start_dma_addr; + +/* + * Both of these functions should avoid PFN_PHYS because phys_addr_t + * can be 32bit when dma_addr_t is 64bit leading to a loss in + * information if the shift is done before casting to 64bit. + */ +static inline dma_addr_t xen_phys_to_bus(phys_addr_t paddr) +{ + unsigned long mfn = pfn_to_mfn(PFN_DOWN(paddr)); + dma_addr_t dma = (dma_addr_t)mfn << PAGE_SHIFT; + + dma |= paddr & ~PAGE_MASK; + + return dma; +} + +static inline phys_addr_t xen_bus_to_phys(dma_addr_t baddr) +{ + unsigned long pfn = mfn_to_pfn(PFN_DOWN(baddr)); + dma_addr_t dma = (dma_addr_t)pfn << PAGE_SHIFT; + phys_addr_t paddr = dma; + + paddr |= baddr & ~PAGE_MASK; + + return paddr; +} + +static inline dma_addr_t xen_virt_to_bus(void *address) +{ + return xen_phys_to_bus(virt_to_phys(address)); +} + +static int check_pages_physically_contiguous(unsigned long pfn, + unsigned int offset, + size_t length) +{ + unsigned long next_mfn; + int i; + int nr_pages; + + next_mfn = pfn_to_mfn(pfn); + nr_pages = (offset + length + PAGE_SIZE-1) >> PAGE_SHIFT; + + for (i = 1; i < nr_pages; i++) { + if (pfn_to_mfn(++pfn) != ++next_mfn) + return 0; + } + return 1; +} + +static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) +{ + unsigned long pfn = PFN_DOWN(p); + unsigned int offset = p & ~PAGE_MASK; + + if (offset + size <= PAGE_SIZE) + return 0; + if (check_pages_physically_contiguous(pfn, offset, size)) + return 0; + return 1; +} + +static int is_xen_swiotlb_buffer(dma_addr_t dma_addr) +{ + unsigned long mfn = PFN_DOWN(dma_addr); + unsigned long pfn = mfn_to_local_pfn(mfn); + phys_addr_t paddr; + + /* If the address is outside our domain, it CAN + * have the same virtual address as another address + * in our domain. Therefore _only_ check address within our domain. 
+ */ + if (pfn_valid(pfn)) { + paddr = PFN_PHYS(pfn); + return paddr >= virt_to_phys(xen_io_tlb_start) && + paddr < virt_to_phys(xen_io_tlb_end); + } + return 0; +} + +static int max_dma_bits = 32; + +static int +xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) +{ + int i, rc; + int dma_bits; + dma_addr_t dma_handle; + phys_addr_t p = virt_to_phys(buf); + + dma_bits = get_order(IO_TLB_SEGSIZE << IO_TLB_SHIFT) + PAGE_SHIFT; + + i = 0; + do { + int slabs = min(nslabs - i, (unsigned long)IO_TLB_SEGSIZE); + + do { + rc = xen_create_contiguous_region( + p + (i << IO_TLB_SHIFT), + get_order(slabs << IO_TLB_SHIFT), + dma_bits, &dma_handle); + } while (rc && dma_bits++ < max_dma_bits); + if (rc) + return rc; + + i += slabs; + } while (i < nslabs); + return 0; +} +static unsigned long xen_set_nslabs(unsigned long nr_tbl) +{ + if (!nr_tbl) { + xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); + xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); + } else + xen_io_tlb_nslabs = nr_tbl; + + return xen_io_tlb_nslabs << IO_TLB_SHIFT; +} + +enum xen_swiotlb_err { + XEN_SWIOTLB_UNKNOWN = 0, + XEN_SWIOTLB_ENOMEM, + XEN_SWIOTLB_EFIXUP +}; + +static const char *xen_swiotlb_error(enum xen_swiotlb_err err) +{ + switch (err) { + case XEN_SWIOTLB_ENOMEM: + return "Cannot allocate Xen-SWIOTLB buffer\n"; + case XEN_SWIOTLB_EFIXUP: + return "Failed to get contiguous memory for DMA from Xen!\n"\ + "You either: don't have the permissions, do not have"\ + " enough free memory under 4GB, or the hypervisor memory"\ + " is too fragmented!"; + default: + break; + } + return ""; +} +int __ref xen_swiotlb_init(int verbose, bool early) +{ + unsigned long bytes, order; + int rc = -ENOMEM; + enum xen_swiotlb_err m_ret = XEN_SWIOTLB_UNKNOWN; + unsigned int repeat = 3; + + xen_io_tlb_nslabs = swiotlb_nr_tbl(); +retry: + bytes = xen_set_nslabs(xen_io_tlb_nslabs); + order = get_order(xen_io_tlb_nslabs << IO_TLB_SHIFT); + /* + * Get IO TLB memory from any location. + */ + if (early) + xen_io_tlb_start = alloc_bootmem_pages(PAGE_ALIGN(bytes)); + else { +#define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) +#define IO_TLB_MIN_SLABS ((1<<20) >> IO_TLB_SHIFT) + while ((SLABS_PER_PAGE << order) > IO_TLB_MIN_SLABS) { + xen_io_tlb_start = (void *)xen_get_swiotlb_free_pages(order); + if (xen_io_tlb_start) + break; + order--; + } + if (order != get_order(bytes)) { + pr_warn("Warning: only able to allocate %ld MB for software IO TLB\n", + (PAGE_SIZE << order) >> 20); + xen_io_tlb_nslabs = SLABS_PER_PAGE << order; + bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; + } + } + if (!xen_io_tlb_start) { + m_ret = XEN_SWIOTLB_ENOMEM; + goto error; + } + xen_io_tlb_end = xen_io_tlb_start + bytes; + /* + * And replace that memory with pages under 4GB. 
+ */ + rc = xen_swiotlb_fixup(xen_io_tlb_start, + bytes, + xen_io_tlb_nslabs); + if (rc) { + if (early) + free_bootmem(__pa(xen_io_tlb_start), PAGE_ALIGN(bytes)); + else { + free_pages((unsigned long)xen_io_tlb_start, order); + xen_io_tlb_start = NULL; + } + m_ret = XEN_SWIOTLB_EFIXUP; + goto error; + } + start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); + if (early) { + if (swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, + verbose)) + panic("Cannot allocate SWIOTLB buffer"); + rc = 0; + } else + rc = swiotlb_late_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs); + return rc; +error: + if (repeat--) { + xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ + (xen_io_tlb_nslabs >> 1)); + pr_info("Lowering to %luMB\n", + (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); + goto retry; + } + pr_err("%s (rc:%d)\n", xen_swiotlb_error(m_ret), rc); + if (early) + panic("%s (rc:%d)", xen_swiotlb_error(m_ret), rc); + else + free_pages((unsigned long)xen_io_tlb_start, order); + return rc; +} +void * +xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs) +{ + void *ret; + int order = get_order(size); + u64 dma_mask = DMA_BIT_MASK(32); + phys_addr_t phys; + dma_addr_t dev_addr; + + /* + * Ignore region specifiers - the kernel's ideas of + * pseudo-phys memory layout has nothing to do with the + * machine physical layout. We can't allocate highmem + * because we can't return a pointer to it. + */ + flags &= ~(__GFP_DMA | __GFP_HIGHMEM); + + if (dma_alloc_from_coherent(hwdev, size, dma_handle, &ret)) + return ret; + + /* On ARM this function returns an ioremap'ped virtual address for + * which virt_to_phys doesn't return the corresponding physical + * address. In fact on ARM virt_to_phys only works for kernel direct + * mapped RAM memory. Also see comment below. + */ + ret = xen_alloc_coherent_pages(hwdev, size, dma_handle, flags, attrs); + + if (!ret) + return ret; + + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = dma_alloc_coherent_mask(hwdev, flags); + + /* At this point dma_handle is the physical address, next we are + * going to set it to the machine address. + * Do not use virt_to_phys(ret) because on ARM it doesn't correspond + * to *dma_handle. 
*/ + phys = *dma_handle; + dev_addr = xen_phys_to_bus(phys); + if (((dev_addr + size - 1 <= dma_mask)) && + !range_straddles_page_boundary(phys, size)) + *dma_handle = dev_addr; + else { + if (xen_create_contiguous_region(phys, order, + fls64(dma_mask), dma_handle) != 0) { + xen_free_coherent_pages(hwdev, size, ret, (dma_addr_t)phys, attrs); + return NULL; + } + } + memset(ret, 0, size); + return ret; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); + +void +xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, + dma_addr_t dev_addr, struct dma_attrs *attrs) +{ + int order = get_order(size); + phys_addr_t phys; + u64 dma_mask = DMA_BIT_MASK(32); + + if (dma_release_from_coherent(hwdev, order, vaddr)) + return; + + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = hwdev->coherent_dma_mask; + + /* do not use virt_to_phys because on ARM it doesn't return you the + * physical address */ + phys = xen_bus_to_phys(dev_addr); + + if (((dev_addr + size - 1 > dma_mask)) || + range_straddles_page_boundary(phys, size)) + xen_destroy_contiguous_region(phys, order); + + xen_free_coherent_pages(hwdev, size, vaddr, (dma_addr_t)phys, attrs); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); + + +/* + * Map a single buffer of the indicated size for DMA in streaming mode. The + * physical address to use is returned. + * + * Once the device is given the dma address, the device owns this memory until + * either xen_swiotlb_unmap_page or xen_swiotlb_dma_sync_single is performed. + */ +dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + phys_addr_t map, phys = page_to_phys(page) + offset; + dma_addr_t dev_addr = xen_phys_to_bus(phys); + + BUG_ON(dir == DMA_NONE); + /* + * If the address happens to be in the device's DMA window, + * we can safely return the device addr and not worry about bounce + * buffering it. + */ + if (dma_capable(dev, dev_addr, size) && + !range_straddles_page_boundary(phys, size) && + !xen_arch_need_swiotlb(dev, PFN_DOWN(phys), PFN_DOWN(dev_addr)) && + !swiotlb_force) { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(dev, page, dev_addr, offset, size, dir, attrs); + return dev_addr; + } + + /* + * Oh well, have to allocate and map a bounce buffer. + */ + trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); + + map = swiotlb_tbl_map_single(dev, start_dma_addr, phys, size, dir); + if (map == SWIOTLB_MAP_ERROR) + return DMA_ERROR_CODE; + + xen_dma_map_page(dev, pfn_to_page(map >> PAGE_SHIFT), + dev_addr, map & ~PAGE_MASK, size, dir, attrs); + dev_addr = xen_phys_to_bus(map); + + /* + * Ensure that the address returned is DMA'ble + */ + if (!dma_capable(dev, dev_addr, size)) { + swiotlb_tbl_unmap_single(dev, map, size, dir); + dev_addr = 0; + } + return dev_addr; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); + +/* + * Unmap a single streaming mode DMA translation. The dma_addr and size must + * match what was provided for in a previous xen_swiotlb_map_page call. All + * other usages are undefined. + * + * After this call, reads by the cpu to the buffer are guaranteed to see + * whatever the device wrote there. 
+ */
+static void xen_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+			     size_t size, enum dma_data_direction dir,
+			     struct dma_attrs *attrs)
+{
+	phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+
+	xen_dma_unmap_page(hwdev, dev_addr, size, dir, attrs);
+
+	/* NOTE: We use dev_addr here, not paddr! */
+	if (is_xen_swiotlb_buffer(dev_addr)) {
+		swiotlb_tbl_unmap_single(hwdev, paddr, size, dir);
+		return;
+	}
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	/*
+	 * phys_to_virt doesn't work with a highmem page, but we could
+	 * call dma_mark_clean() with a highmem page here. However, we
+	 * are fine since dma_mark_clean() is null on POWERPC. We can
+	 * make dma_mark_clean() take a physical address if necessary.
+	 */
+	dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void xen_swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+			    size_t size, enum dma_data_direction dir,
+			    struct dma_attrs *attrs)
+{
+	xen_unmap_single(hwdev, dev_addr, size, dir, attrs);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_page);
+
+/*
+ * Make physical memory consistent for a single streaming mode DMA translation
+ * after a transfer.
+ *
+ * If you perform a xen_swiotlb_map_page() but wish to interrogate the buffer
+ * using the cpu, yet do not wish to tear down the dma mapping, you must
+ * call this function before doing so. At the next point you give the dma
+ * address back to the card, you must first perform a
+ * xen_swiotlb_dma_sync_for_device, and then the device again owns the buffer.
+ */
+static void
+xen_swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
+			size_t size, enum dma_data_direction dir,
+			enum dma_sync_target target)
+{
+	phys_addr_t paddr = xen_bus_to_phys(dev_addr);
+
+	BUG_ON(dir == DMA_NONE);
+
+	if (target == SYNC_FOR_CPU)
+		xen_dma_sync_single_for_cpu(hwdev, dev_addr, size, dir);
+
+	/* NOTE: We use dev_addr here, not paddr! */
+	if (is_xen_swiotlb_buffer(dev_addr))
+		swiotlb_tbl_sync_single(hwdev, paddr, size, dir, target);
+
+	if (target == SYNC_FOR_DEVICE)
+		xen_dma_sync_single_for_device(hwdev, dev_addr, size, dir);
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	dma_mark_clean(phys_to_virt(paddr), size);
+}
+
+void
+xen_swiotlb_sync_single_for_cpu(struct device *hwdev, dma_addr_t dev_addr,
+				size_t size, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_CPU);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_cpu);
+
+void
+xen_swiotlb_sync_single_for_device(struct device *hwdev, dma_addr_t dev_addr,
+				   size_t size, enum dma_data_direction dir)
+{
+	xen_swiotlb_sync_single(hwdev, dev_addr, size, dir, SYNC_FOR_DEVICE);
+}
+EXPORT_SYMBOL_GPL(xen_swiotlb_sync_single_for_device);
+
+/*
+ * Map a set of buffers described by scatterlist in streaming mode for DMA.
+ * This is the scatter-gather version of the above xen_swiotlb_map_page
+ * interface. Here the scatter gather list elements are each tagged with the
+ * appropriate dma address and length. They are obtained via
+ * sg_dma_{address,length}(SG).
+ *
+ * NOTE: An implementation may be able to use a smaller number of
+ * DMA address/length pairs than there are SG table elements.
+ * (for example via virtual mapping capabilities)
+ * The routine returns the number of addr/length pairs actually
+ * used, at most nents.
+ *
+ * Device ownership issues as mentioned above for xen_swiotlb_map_page are the
+ * same here.
+ */ +int +xen_swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) { + phys_addr_t paddr = sg_phys(sg); + dma_addr_t dev_addr = xen_phys_to_bus(paddr); + + if (swiotlb_force || + xen_arch_need_swiotlb(hwdev, PFN_DOWN(paddr), PFN_DOWN(dev_addr)) || + !dma_capable(hwdev, dev_addr, sg->length) || + range_straddles_page_boundary(paddr, sg->length)) { + phys_addr_t map = swiotlb_tbl_map_single(hwdev, + start_dma_addr, + sg_phys(sg), + sg->length, + dir); + if (map == SWIOTLB_MAP_ERROR) { + dev_warn(hwdev, "swiotlb buffer is full\n"); + /* Don't panic here, we expect map_sg users + to do proper error handling. */ + xen_swiotlb_unmap_sg_attrs(hwdev, sgl, i, dir, + attrs); + sg_dma_len(sgl) = 0; + return 0; + } + xen_dma_map_page(hwdev, pfn_to_page(map >> PAGE_SHIFT), + dev_addr, + map & ~PAGE_MASK, + sg->length, + dir, + attrs); + sg->dma_address = xen_phys_to_bus(map); + } else { + /* we are not interested in the dma_addr returned by + * xen_dma_map_page, only in the potential cache flushes executed + * by the function. */ + xen_dma_map_page(hwdev, pfn_to_page(paddr >> PAGE_SHIFT), + dev_addr, + paddr & ~PAGE_MASK, + sg->length, + dir, + attrs); + sg->dma_address = dev_addr; + } + sg_dma_len(sg) = sg->length; + } + return nelems; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_map_sg_attrs); + +/* + * Unmap a set of streaming mode DMA translations. Again, cpu read rules + * concerning calls here are the same as for swiotlb_unmap_page() above. + */ +void +xen_swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + struct scatterlist *sg; + int i; + + BUG_ON(dir == DMA_NONE); + + for_each_sg(sgl, sg, nelems, i) + xen_unmap_single(hwdev, sg->dma_address, sg_dma_len(sg), dir, attrs); + +} +EXPORT_SYMBOL_GPL(xen_swiotlb_unmap_sg_attrs); + +/* + * Make physical memory consistent for a set of streaming mode DMA translations + * after a transfer. + * + * The same as swiotlb_sync_single_* but for a scatter-gather list, same rules + * and usage. + */ +static void +xen_swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, + int nelems, enum dma_data_direction dir, + enum dma_sync_target target) +{ + struct scatterlist *sg; + int i; + + for_each_sg(sgl, sg, nelems, i) + xen_swiotlb_sync_single(hwdev, sg->dma_address, + sg_dma_len(sg), dir, target); +} + +void +xen_swiotlb_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_CPU); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_cpu); + +void +xen_swiotlb_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg, + int nelems, enum dma_data_direction dir) +{ + xen_swiotlb_sync_sg(hwdev, sg, nelems, dir, SYNC_FOR_DEVICE); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_sync_sg_for_device); + +int +xen_swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) +{ + return !dma_addr; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mapping_error); + +/* + * Return whether the given device DMA address mask can be supported + * properly. For example, if your device can only drive the low 24-bits + * during bus mastering, then you would pass 0x00ffffff as the mask to + * this function. 
+ */ +int +xen_swiotlb_dma_supported(struct device *hwdev, u64 mask) +{ + return xen_virt_to_bus(xen_io_tlb_end - 1) <= mask; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_supported); + +int +xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) +{ + if (!dev->dma_mask || !xen_swiotlb_dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + + return 0; +} +EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c new file mode 100644 index 000000000..96453f8a8 --- /dev/null +++ b/drivers/xen/sys-hypervisor.c @@ -0,0 +1,460 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day <ncmike@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kobject.h> +#include <linux/err.h> + +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> + +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/interface/xen.h> +#include <xen/interface/version.h> + +#define HYPERVISOR_ATTR_RO(_name) \ +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) + +#define HYPERVISOR_ATTR_RW(_name) \ +static struct hyp_sysfs_attr _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +struct hyp_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct hyp_sysfs_attr *, char *); + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); + void *hyp_attr_data; +}; + +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + return sprintf(buffer, "xen\n"); +} + +HYPERVISOR_ATTR_RO(type); + +static int __init xen_sysfs_type_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &type_attr.attr); +} + +static void xen_sysfs_type_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &type_attr.attr); +} + +/* xen version attributes */ +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version >> 16); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(major); + +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version & 0xff); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(minor); + +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *extra; + + extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL); + if (extra) { + ret = HYPERVISOR_xen_version(XENVER_extraversion, extra); + if (!ret) + ret = sprintf(buffer, "%s\n", extra); + kfree(extra); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(extra); + +static struct attribute *version_attrs[] = { + &major_attr.attr, + &minor_attr.attr, + &extra_attr.attr, + NULL +}; + +static const struct attribute_group version_group = { + .name = "version", + .attrs = version_attrs, +}; + +static int __init xen_sysfs_version_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &version_group); +} + +static void xen_sysfs_version_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &version_group); +} + +/* UUID */ + +static ssize_t uuid_show_fallback(struct hyp_sysfs_attr *attr, char *buffer) +{ + char *vm, *val; + int ret; + extern int xenstored_ready; + + if (!xenstored_ready) + 
return -EBUSY; + + vm = xenbus_read(XBT_NIL, "vm", "", NULL); + if (IS_ERR(vm)) + return PTR_ERR(vm); + val = xenbus_read(XBT_NIL, vm, "uuid", NULL); + kfree(vm); + if (IS_ERR(val)) + return PTR_ERR(val); + ret = sprintf(buffer, "%s\n", val); + kfree(val); + return ret; +} + +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + xen_domain_handle_t uuid; + int ret; + ret = HYPERVISOR_xen_version(XENVER_guest_handle, uuid); + if (ret) + return uuid_show_fallback(attr, buffer); + ret = sprintf(buffer, "%pU\n", uuid); + return ret; +} + +HYPERVISOR_ATTR_RO(uuid); + +static int __init xen_sysfs_uuid_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr); +} + +static void xen_sysfs_uuid_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); +} + +/* xen compilation attributes */ + +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compiler); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiler); + +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_by); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiled_by); + +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_date); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compile_date); + +static struct attribute *xen_compile_attrs[] = { + &compiler_attr.attr, + &compiled_by_attr.attr, + &compile_date_attr.attr, + NULL +}; + +static const struct attribute_group xen_compilation_group = { + .name = "compilation", + .attrs = xen_compile_attrs, +}; + +static int __init xen_compilation_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); +} + +static void xen_compilation_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); +} + +/* xen properties info */ + +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *caps; + + caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL); + if (caps) { + ret = HYPERVISOR_xen_version(XENVER_capabilities, caps); + if (!ret) + ret = sprintf(buffer, "%s\n", caps); + kfree(caps); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(capabilities); + +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *cset; + + cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL); + if (cset) { + ret = HYPERVISOR_xen_version(XENVER_changeset, cset); + if (!ret) + ret = sprintf(buffer, "%s\n", cset); + kfree(cset); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(changeset); + +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_platform_parameters *parms; + + parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL); 
+ if (parms) { + ret = HYPERVISOR_xen_version(XENVER_platform_parameters, + parms); + if (!ret) + ret = sprintf(buffer, "%"PRI_xen_ulong"\n", + parms->virt_start); + kfree(parms); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(virtual_start); + +static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + + ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL); + if (ret > 0) + ret = sprintf(buffer, "%x\n", ret); + + return ret; +} + +HYPERVISOR_ATTR_RO(pagesize); + +static ssize_t xen_feature_show(int index, char *buffer) +{ + ssize_t ret; + struct xen_feature_info info; + + info.submap_idx = index; + ret = HYPERVISOR_xen_version(XENVER_get_features, &info); + if (!ret) + ret = sprintf(buffer, "%08x", info.submap); + + return ret; +} + +static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + ssize_t len; + int i; + + len = 0; + for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) { + int ret = xen_feature_show(i, buffer + len); + if (ret < 0) { + if (len == 0) + len = ret; + break; + } + len += ret; + } + if (len > 0) + buffer[len++] = '\n'; + + return len; +} + +HYPERVISOR_ATTR_RO(features); + +static struct attribute *xen_properties_attrs[] = { + &capabilities_attr.attr, + &changeset_attr.attr, + &virtual_start_attr.attr, + &pagesize_attr.attr, + &features_attr.attr, + NULL +}; + +static const struct attribute_group xen_properties_group = { + .name = "properties", + .attrs = xen_properties_attrs, +}; + +static int __init xen_properties_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_properties_group); +} + +static void xen_properties_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); +} + +static int __init hyper_sysfs_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + ret = xen_sysfs_type_init(); + if (ret) + goto out; + ret = xen_sysfs_version_init(); + if (ret) + goto version_out; + ret = xen_compilation_init(); + if (ret) + goto comp_out; + ret = xen_sysfs_uuid_init(); + if (ret) + goto uuid_out; + ret = xen_properties_init(); + if (ret) + goto prop_out; + + goto out; + +prop_out: + xen_sysfs_uuid_destroy(); +uuid_out: + xen_compilation_destroy(); +comp_out: + xen_sysfs_version_destroy(); +version_out: + xen_sysfs_type_destroy(); +out: + return ret; +} + +static void __exit hyper_sysfs_exit(void) +{ + xen_properties_destroy(); + xen_compilation_destroy(); + xen_sysfs_uuid_destroy(); + xen_sysfs_version_destroy(); + xen_sysfs_type_destroy(); + +} +module_init(hyper_sysfs_init); +module_exit(hyper_sysfs_exit); + +static ssize_t hyp_sysfs_show(struct kobject *kobj, + struct attribute *attr, + char *buffer) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->show) + return hyp_attr->show(hyp_attr, buffer); + return 0; +} + +static ssize_t hyp_sysfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t len) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->store) + return hyp_attr->store(hyp_attr, buffer, len); + return 0; +} + +static const struct sysfs_ops hyp_sysfs_ops = { + .show = hyp_sysfs_show, + .store = hyp_sysfs_store, +}; + +static struct kobj_type hyp_sysfs_kobj_type = { + .sysfs_ops = &hyp_sysfs_ops, +}; + +static int __init hypervisor_subsys_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type; + return 0; +} +device_initcall(hypervisor_subsys_init); diff --git 
a/drivers/xen/tmem.c b/drivers/xen/tmem.c new file mode 100644 index 000000000..c4211a316 --- /dev/null +++ b/drivers/xen/tmem.c @@ -0,0 +1,428 @@ +/* + * Xen implementation for transcendent memory (tmem) + * + * Copyright (C) 2009-2011 Oracle Corp. All rights reserved. + * Author: Dan Magenheimer + */ + +#define pr_fmt(fmt) "xen:" KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/init.h> +#include <linux/pagemap.h> +#include <linux/cleancache.h> +#include <linux/frontswap.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <asm/xen/hypercall.h> +#include <asm/xen/page.h> +#include <asm/xen/hypervisor.h> +#include <xen/tmem.h> + +#ifndef CONFIG_XEN_TMEM_MODULE +bool __read_mostly tmem_enabled = false; + +static int __init enable_tmem(char *s) +{ + tmem_enabled = true; + return 1; +} +__setup("tmem", enable_tmem); +#endif + +#ifdef CONFIG_CLEANCACHE +static bool cleancache __read_mostly = true; +module_param(cleancache, bool, S_IRUGO); +static bool selfballooning __read_mostly = true; +module_param(selfballooning, bool, S_IRUGO); +#endif /* CONFIG_CLEANCACHE */ + +#ifdef CONFIG_FRONTSWAP +static bool frontswap __read_mostly = true; +module_param(frontswap, bool, S_IRUGO); +#else /* CONFIG_FRONTSWAP */ +#define frontswap (0) +#endif /* CONFIG_FRONTSWAP */ + +#ifdef CONFIG_XEN_SELFBALLOONING +static bool selfshrinking __read_mostly = true; +module_param(selfshrinking, bool, S_IRUGO); +#endif /* CONFIG_XEN_SELFBALLOONING */ + +#define TMEM_CONTROL 0 +#define TMEM_NEW_POOL 1 +#define TMEM_DESTROY_POOL 2 +#define TMEM_NEW_PAGE 3 +#define TMEM_PUT_PAGE 4 +#define TMEM_GET_PAGE 5 +#define TMEM_FLUSH_PAGE 6 +#define TMEM_FLUSH_OBJECT 7 +#define TMEM_READ 8 +#define TMEM_WRITE 9 +#define TMEM_XCHG 10 + +/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 +#define TMEM_POOL_PAGESIZE_SHIFT 4 +#define TMEM_VERSION_SHIFT 24 + + +struct tmem_pool_uuid { + u64 uuid_lo; + u64 uuid_hi; +}; + +struct tmem_oid { + u64 oid[3]; +}; + +#define TMEM_POOL_PRIVATE_UUID { 0, 0 } + +/* flags for tmem_ops.new_pool */ +#define TMEM_POOL_PERSIST 1 +#define TMEM_POOL_SHARED 2 + +/* xen tmem foundation ops/hypercalls */ + +static inline int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, struct tmem_oid oid, + u32 index, unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len) +{ + struct tmem_op op; + int rc = 0; + + op.cmd = tmem_cmd; + op.pool_id = tmem_pool; + op.u.gen.oid[0] = oid.oid[0]; + op.u.gen.oid[1] = oid.oid[1]; + op.u.gen.oid[2] = oid.oid[2]; + op.u.gen.index = index; + op.u.gen.tmem_offset = tmem_offset; + op.u.gen.pfn_offset = pfn_offset; + op.u.gen.len = len; + set_xen_guest_handle(op.u.gen.gmfn, (void *)gmfn); + rc = HYPERVISOR_tmem_op(&op); + return rc; +} + +static int xen_tmem_new_pool(struct tmem_pool_uuid uuid, + u32 flags, unsigned long pagesize) +{ + struct tmem_op op; + int rc = 0, pageshift; + + for (pageshift = 0; pagesize != 1; pageshift++) + pagesize >>= 1; + flags |= (pageshift - 12) << TMEM_POOL_PAGESIZE_SHIFT; + flags |= TMEM_SPEC_VERSION << TMEM_VERSION_SHIFT; + op.cmd = TMEM_NEW_POOL; + op.u.new.uuid[0] = uuid.uuid_lo; + op.u.new.uuid[1] = uuid.uuid_hi; + op.u.new.flags = flags; + rc = HYPERVISOR_tmem_op(&op); + return rc; +} + +/* xen generic tmem ops */ + +static int xen_tmem_put_page(u32 pool_id, struct tmem_oid oid, + u32 index, unsigned long pfn) +{ + unsigned long gmfn = xen_pv_domain() ? 
pfn_to_mfn(pfn) : pfn; + + return xen_tmem_op(TMEM_PUT_PAGE, pool_id, oid, index, + gmfn, 0, 0, 0); +} + +static int xen_tmem_get_page(u32 pool_id, struct tmem_oid oid, + u32 index, unsigned long pfn) +{ + unsigned long gmfn = xen_pv_domain() ? pfn_to_mfn(pfn) : pfn; + + return xen_tmem_op(TMEM_GET_PAGE, pool_id, oid, index, + gmfn, 0, 0, 0); +} + +static int xen_tmem_flush_page(u32 pool_id, struct tmem_oid oid, u32 index) +{ + return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, oid, index, + 0, 0, 0, 0); +} + +static int xen_tmem_flush_object(u32 pool_id, struct tmem_oid oid) +{ + return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, oid, 0, 0, 0, 0, 0); +} + + +#ifdef CONFIG_CLEANCACHE +static int xen_tmem_destroy_pool(u32 pool_id) +{ + struct tmem_oid oid = { { 0 } }; + + return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, oid, 0, 0, 0, 0, 0); +} + +/* cleancache ops */ + +static void tmem_cleancache_put_page(int pool, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + unsigned long pfn = page_to_pfn(page); + + if (pool < 0) + return; + if (ind != index) + return; + mb(); /* ensure page is quiescent; tmem may address it with an alias */ + (void)xen_tmem_put_page((u32)pool, oid, ind, pfn); +} + +static int tmem_cleancache_get_page(int pool, struct cleancache_filekey key, + pgoff_t index, struct page *page) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + unsigned long pfn = page_to_pfn(page); + int ret; + + /* translate return values to linux semantics */ + if (pool < 0) + return -1; + if (ind != index) + return -1; + ret = xen_tmem_get_page((u32)pool, oid, ind, pfn); + if (ret == 1) + return 0; + else + return -1; +} + +static void tmem_cleancache_flush_page(int pool, struct cleancache_filekey key, + pgoff_t index) +{ + u32 ind = (u32) index; + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (pool < 0) + return; + if (ind != index) + return; + (void)xen_tmem_flush_page((u32)pool, oid, ind); +} + +static void tmem_cleancache_flush_inode(int pool, struct cleancache_filekey key) +{ + struct tmem_oid oid = *(struct tmem_oid *)&key; + + if (pool < 0) + return; + (void)xen_tmem_flush_object((u32)pool, oid); +} + +static void tmem_cleancache_flush_fs(int pool) +{ + if (pool < 0) + return; + (void)xen_tmem_destroy_pool((u32)pool); +} + +static int tmem_cleancache_init_fs(size_t pagesize) +{ + struct tmem_pool_uuid uuid_private = TMEM_POOL_PRIVATE_UUID; + + return xen_tmem_new_pool(uuid_private, 0, pagesize); +} + +static int tmem_cleancache_init_shared_fs(char *uuid, size_t pagesize) +{ + struct tmem_pool_uuid shared_uuid; + + shared_uuid.uuid_lo = *(u64 *)uuid; + shared_uuid.uuid_hi = *(u64 *)(&uuid[8]); + return xen_tmem_new_pool(shared_uuid, TMEM_POOL_SHARED, pagesize); +} + +static struct cleancache_ops tmem_cleancache_ops = { + .put_page = tmem_cleancache_put_page, + .get_page = tmem_cleancache_get_page, + .invalidate_page = tmem_cleancache_flush_page, + .invalidate_inode = tmem_cleancache_flush_inode, + .invalidate_fs = tmem_cleancache_flush_fs, + .init_shared_fs = tmem_cleancache_init_shared_fs, + .init_fs = tmem_cleancache_init_fs +}; +#endif + +#ifdef CONFIG_FRONTSWAP +/* frontswap tmem operations */ + +/* a single tmem poolid is used for all frontswap "types" (swapfiles) */ +static int tmem_frontswap_poolid; + +/* + * Swizzling increases objects per swaptype, increasing tmem concurrency + * for heavy swaploads. 
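 * Editorial worked example (not in the original source), using the
 * SWIZ_BITS = 4 definitions just below: a page at swap offset 0x2f of
 * swap type 1 is stored under object id (1 << 4) | (0x2f & 0xf) = 0x1f
 * with in-object index 0x2f >> 4 = 0x2, so each swap device spreads its
 * pages over 16 tmem objects while neighbouring offsets land in the
 * same object.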
Later, larger nr_cpus -> larger SWIZ_BITS + */ +#define SWIZ_BITS 4 +#define SWIZ_MASK ((1 << SWIZ_BITS) - 1) +#define _oswiz(_type, _ind) ((_type << SWIZ_BITS) | (_ind & SWIZ_MASK)) +#define iswiz(_ind) (_ind >> SWIZ_BITS) + +static inline struct tmem_oid oswiz(unsigned type, u32 ind) +{ + struct tmem_oid oid = { .oid = { 0 } }; + oid.oid[0] = _oswiz(type, ind); + return oid; +} + +/* returns 0 if the page was successfully put into frontswap, -1 if not */ +static int tmem_frontswap_store(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int pool = tmem_frontswap_poolid; + int ret; + + if (pool < 0) + return -1; + if (ind64 != ind) + return -1; + mb(); /* ensure page is quiescent; tmem may address it with an alias */ + ret = xen_tmem_put_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* + * returns 0 if the page was successfully gotten from frontswap, -1 if + * was not present (should never happen!) + */ +static int tmem_frontswap_load(unsigned type, pgoff_t offset, + struct page *page) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + unsigned long pfn = page_to_pfn(page); + int pool = tmem_frontswap_poolid; + int ret; + + if (pool < 0) + return -1; + if (ind64 != ind) + return -1; + ret = xen_tmem_get_page(pool, oswiz(type, ind), iswiz(ind), pfn); + /* translate Xen tmem return values to linux semantics */ + if (ret == 1) + return 0; + else + return -1; +} + +/* flush a single page from frontswap */ +static void tmem_frontswap_flush_page(unsigned type, pgoff_t offset) +{ + u64 ind64 = (u64)offset; + u32 ind = (u32)offset; + int pool = tmem_frontswap_poolid; + + if (pool < 0) + return; + if (ind64 != ind) + return; + (void) xen_tmem_flush_page(pool, oswiz(type, ind), iswiz(ind)); +} + +/* flush all pages from the passed swaptype */ +static void tmem_frontswap_flush_area(unsigned type) +{ + int pool = tmem_frontswap_poolid; + int ind; + + if (pool < 0) + return; + for (ind = SWIZ_MASK; ind >= 0; ind--) + (void)xen_tmem_flush_object(pool, oswiz(type, ind)); +} + +static void tmem_frontswap_init(unsigned ignored) +{ + struct tmem_pool_uuid private = TMEM_POOL_PRIVATE_UUID; + + /* a single tmem poolid is used for all frontswap "types" (swapfiles) */ + if (tmem_frontswap_poolid < 0) + tmem_frontswap_poolid = + xen_tmem_new_pool(private, TMEM_POOL_PERSIST, PAGE_SIZE); +} + +static struct frontswap_ops tmem_frontswap_ops = { + .store = tmem_frontswap_store, + .load = tmem_frontswap_load, + .invalidate_page = tmem_frontswap_flush_page, + .invalidate_area = tmem_frontswap_flush_area, + .init = tmem_frontswap_init +}; +#endif + +static int __init xen_tmem_init(void) +{ + if (!xen_domain()) + return 0; +#ifdef CONFIG_FRONTSWAP + if (tmem_enabled && frontswap) { + char *s = ""; + struct frontswap_ops *old_ops; + + tmem_frontswap_poolid = -1; + old_ops = frontswap_register_ops(&tmem_frontswap_ops); + if (IS_ERR(old_ops) || old_ops) { + if (IS_ERR(old_ops)) + return PTR_ERR(old_ops); + s = " (WARNING: frontswap_ops overridden)"; + } + pr_info("frontswap enabled, RAM provided by Xen Transcendent Memory%s\n", + s); + } +#endif +#ifdef CONFIG_CLEANCACHE + BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid)); + if (tmem_enabled && cleancache) { + int err; + + err = cleancache_register_ops(&tmem_cleancache_ops); + if (err) + pr_warn("xen-tmem: failed to enable cleancache: 
%d\n", + err); + else + pr_info("cleancache enabled, RAM provided by " + "Xen Transcendent Memory\n"); + } +#endif +#ifdef CONFIG_XEN_SELFBALLOONING + /* + * There is no point of driving pages to the swap system if they + * aren't going anywhere in tmem universe. + */ + if (!frontswap) { + selfshrinking = false; + selfballooning = false; + } + xen_selfballoon_init(selfballooning, selfshrinking); +#endif + return 0; +} + +module_init(xen_tmem_init) +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>"); +MODULE_DESCRIPTION("Shim to Xen transcendent memory"); diff --git a/drivers/xen/xen-acpi-cpuhotplug.c b/drivers/xen/xen-acpi-cpuhotplug.c new file mode 100644 index 000000000..3e62ee4b3 --- /dev/null +++ b/drivers/xen/xen-acpi-cpuhotplug.c @@ -0,0 +1,462 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/acpi.h> +#include <linux/uaccess.h> +#include <acpi/processor.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_cpu_hotplug:" + +#define INSTALL_NOTIFY_HANDLER 0 +#define UNINSTALL_NOTIFY_HANDLER 1 + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr); + +/* -------------------------------------------------------------------------- + Driver Interface +-------------------------------------------------------------------------- */ + +static int xen_acpi_processor_enable(struct acpi_device *device) +{ + acpi_status status = 0; + unsigned long long value; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + struct acpi_processor *pr; + + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Cannot find driver data\n"); + return -EINVAL; + } + + if (!strcmp(acpi_device_hid(device), ACPI_PROCESSOR_OBJECT_HID)) { + /* Declared with "Processor" statement; match ProcessorID */ + status = acpi_evaluate_object(pr->handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor object\n"); + return -ENODEV; + } + + pr->acpi_id = object.processor.proc_id; + } else { + /* Declared with "Device" statement; match _UID */ + status = acpi_evaluate_integer(pr->handle, METHOD_NAME__UID, + NULL, &value); + if (ACPI_FAILURE(status)) { + pr_err(PREFIX "Evaluating processor _UID\n"); + return -ENODEV; + } + + pr->acpi_id = value; + } + + pr->id = xen_pcpu_id(pr->acpi_id); + + if ((int)pr->id < 0) + /* This cpu is not presented at hypervisor, try to hotadd it */ + if (ACPI_FAILURE(xen_acpi_cpu_hotadd(pr))) { + pr_err(PREFIX "Hotadd CPU (acpi_id = %d) failed.\n", + pr->acpi_id); + return -ENODEV; + } + + return 0; +} + +static int xen_acpi_processor_add(struct acpi_device 
*device) +{ + int ret; + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (!pr) + return -ENOMEM; + + pr->handle = device->handle; + strcpy(acpi_device_name(device), ACPI_PROCESSOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_CLASS); + device->driver_data = pr; + + ret = xen_acpi_processor_enable(device); + if (ret) + pr_err(PREFIX "Error when enabling Xen processor\n"); + + return ret; +} + +static int xen_acpi_processor_remove(struct acpi_device *device) +{ + struct acpi_processor *pr; + + if (!device) + return -EINVAL; + + pr = acpi_driver_data(device); + if (!pr) + return -EINVAL; + + kfree(pr); + return 0; +} + +/*-------------------------------------------------------------- + Acpi processor hotplug support +--------------------------------------------------------------*/ + +static int is_processor_present(acpi_handle handle) +{ + acpi_status status; + unsigned long long sta = 0; + + + status = acpi_evaluate_integer(handle, "_STA", NULL, &sta); + + if (ACPI_SUCCESS(status) && (sta & ACPI_STA_DEVICE_PRESENT)) + return 1; + + /* + * _STA is mandatory for a processor that supports hot plug + */ + if (status == AE_NOT_FOUND) + pr_info(PREFIX "Processor does not support hot plug\n"); + else + pr_info(PREFIX "Processor Device is not present"); + return 0; +} + +static int xen_apic_id(acpi_handle handle) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + struct acpi_madt_local_apic *lapic; + int apic_id; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_MAT", NULL, &buffer))) + return -EINVAL; + + if (!buffer.length || !buffer.pointer) + return -EINVAL; + + obj = buffer.pointer; + if (obj->type != ACPI_TYPE_BUFFER || + obj->buffer.length < sizeof(*lapic)) { + kfree(buffer.pointer); + return -EINVAL; + } + + lapic = (struct acpi_madt_local_apic *)obj->buffer.pointer; + + if (lapic->header.type != ACPI_MADT_TYPE_LOCAL_APIC || + !(lapic->lapic_flags & ACPI_MADT_ENABLED)) { + kfree(buffer.pointer); + return -EINVAL; + } + + apic_id = (uint32_t)lapic->id; + kfree(buffer.pointer); + buffer.length = ACPI_ALLOCATE_BUFFER; + buffer.pointer = NULL; + + return apic_id; +} + +static int xen_hotadd_cpu(struct acpi_processor *pr) +{ + int cpu_id, apic_id, pxm; + struct xen_platform_op op; + + apic_id = xen_apic_id(pr->handle); + if (apic_id < 0) { + pr_err(PREFIX "Failed to get apic_id for acpi_id %d\n", + pr->acpi_id); + return -ENODEV; + } + + pxm = xen_acpi_get_pxm(pr->handle); + if (pxm < 0) { + pr_err(PREFIX "Failed to get _PXM for acpi_id %d\n", + pr->acpi_id); + return pxm; + } + + op.cmd = XENPF_cpu_hotadd; + op.u.cpu_add.apic_id = apic_id; + op.u.cpu_add.acpi_id = pr->acpi_id; + op.u.cpu_add.pxm = pxm; + + cpu_id = HYPERVISOR_dom0_op(&op); + if (cpu_id < 0) + pr_err(PREFIX "Failed to hotadd CPU for acpi_id %d\n", + pr->acpi_id); + + return cpu_id; +} + +static acpi_status xen_acpi_cpu_hotadd(struct acpi_processor *pr) +{ + if (!is_processor_present(pr->handle)) + return AE_ERROR; + + pr->id = xen_hotadd_cpu(pr); + if ((int)pr->id < 0) + return AE_ERROR; + + /* + * Sync with Xen hypervisor, providing new /sys/.../xen_cpuX + * interface after cpu hotadded. 
+ */ + xen_pcpu_hotplug_sync(); + + return AE_OK; +} + +static int acpi_processor_device_remove(struct acpi_device *device) +{ + pr_debug(PREFIX "Xen does not support CPU hotremove\n"); + + return -ENOSYS; +} + +static void acpi_processor_hotplug_notify(acpi_handle handle, + u32 event, void *data) +{ + struct acpi_processor *pr; + struct acpi_device *device = NULL; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + int result; + + acpi_scan_lock_acquire(); + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + case ACPI_NOTIFY_DEVICE_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Processor driver received %s event\n", + (event == ACPI_NOTIFY_BUS_CHECK) ? + "ACPI_NOTIFY_BUS_CHECK" : "ACPI_NOTIFY_DEVICE_CHECK")); + + if (!is_processor_present(handle)) + break; + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + break; + + result = acpi_bus_scan(handle); + if (result) { + pr_err(PREFIX "Unable to add the device\n"); + break; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_err(PREFIX "Missing device object\n"); + break; + } + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "received ACPI_NOTIFY_EJECT_REQUEST\n")); + + if (acpi_bus_get_device(handle, &device)) { + pr_err(PREFIX "Device don't exist, dropping EJECT\n"); + break; + } + pr = acpi_driver_data(device); + if (!pr) { + pr_err(PREFIX "Driver data is NULL, dropping EJECT\n"); + break; + } + + /* + * TBD: implement acpi_processor_device_remove if Xen support + * CPU hotremove in the future. + */ + acpi_processor_device_remove(device); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + + /* non-hotplug event; possibly handled by other handler */ + goto out; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + +out: + acpi_scan_lock_release(); +} + +static acpi_status is_processor_device(acpi_handle handle) +{ + struct acpi_device_info *info; + char *hid; + acpi_status status; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (info->type == ACPI_TYPE_PROCESSOR) { + kfree(info); + return AE_OK; /* found a processor object */ + } + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hid = info->hardware_id.string; + if ((hid == NULL) || strcmp(hid, ACPI_PROCESSOR_DEVICE_HID)) { + kfree(info); + return AE_ERROR; + } + + kfree(info); + return AE_OK; /* found a processor device object */ +} + +static acpi_status +processor_walk_namespace_cb(acpi_handle handle, + u32 lvl, void *context, void **rv) +{ + acpi_status status; + int *action = context; + + status = is_processor_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* not a processor; continue to walk */ + + switch (*action) { + case INSTALL_NOTIFY_HANDLER: + acpi_install_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify, + NULL); + break; + case UNINSTALL_NOTIFY_HANDLER: + acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_processor_hotplug_notify); + break; + default: + break; + } + + /* found a processor; skip walking underneath */ + return AE_CTRL_DEPTH; +} + +static +void acpi_processor_install_hotplug_notify(void) +{ + int action = INSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static +void 
acpi_processor_uninstall_hotplug_notify(void) +{ + int action = UNINSTALL_NOTIFY_HANDLER; + acpi_walk_namespace(ACPI_TYPE_ANY, + ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + processor_walk_namespace_cb, NULL, &action, NULL); +} + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, processor_device_ids); + +static struct acpi_driver xen_acpi_processor_driver = { + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, + .ops = { + .add = xen_acpi_processor_add, + .remove = xen_acpi_processor_remove, + }, +}; + +static int __init xen_acpi_processor_init(void) +{ + int result = 0; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_processor_exit(); + + result = acpi_bus_register_driver(&xen_acpi_processor_driver); + if (result < 0) { + xen_stub_processor_init(); + return result; + } + + acpi_processor_install_hotplug_notify(); + return 0; +} + +static void __exit xen_acpi_processor_exit(void) +{ + if (!xen_initial_domain()) + return; + + acpi_processor_uninstall_hotplug_notify(); + + acpi_bus_unregister_driver(&xen_acpi_processor_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_processor_init(); + return; +} + +module_init(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); +ACPI_MODULE_NAME("xen-acpi-cpuhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug CPU Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-memhotplug.c b/drivers/xen/xen-acpi-memhotplug.c new file mode 100644 index 000000000..4fc886cd5 --- /dev/null +++ b/drivers/xen/xen-acpi-memhotplug.c @@ -0,0 +1,485 @@ +/* + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +#define PREFIX "ACPI:xen_memory_hotplug:" + +struct acpi_memory_info { + struct list_head list; + u64 start_addr; /* Memory Range start physical addr */ + u64 length; /* Memory Range length */ + unsigned short caching; /* memory cache attribute */ + unsigned short write_protect; /* memory read/write attribute */ + /* copied from buffer getting from _CRS */ + unsigned int enabled:1; +}; + +struct acpi_memory_device { + struct acpi_device *device; + struct list_head res_list; +}; + +static bool acpi_hotmem_initialized __read_mostly; + +static int xen_hotadd_memory(int pxm, struct acpi_memory_info *info) +{ + int rc; + struct xen_platform_op op; + + op.cmd = XENPF_mem_hotadd; + op.u.mem_add.spfn = info->start_addr >> PAGE_SHIFT; + op.u.mem_add.epfn = (info->start_addr + info->length) >> PAGE_SHIFT; + op.u.mem_add.pxm = pxm; + + rc = HYPERVISOR_dom0_op(&op); + if (rc) + pr_err(PREFIX "Xen Hotplug Memory Add failed on " + "0x%lx -> 0x%lx, _PXM: %d, error: %d\n", + (unsigned long)info->start_addr, + (unsigned long)(info->start_addr + info->length), + pxm, rc); + + return rc; +} + +static int xen_acpi_memory_enable_device(struct acpi_memory_device *mem_device) +{ + int pxm, result; + int num_enabled = 0; + struct acpi_memory_info *info; + + if (!mem_device) + return -EINVAL; + + pxm = xen_acpi_get_pxm(mem_device->device->handle); + if (pxm < 0) + return pxm; + + list_for_each_entry(info, &mem_device->res_list, list) { + if (info->enabled) { /* just sanity check...*/ + num_enabled++; + continue; + } + + if (!info->length) + continue; + + result = xen_hotadd_memory(pxm, info); + if (result) + continue; + info->enabled = 1; + num_enabled++; + } + + if (!num_enabled) + return -ENODEV; + + return 0; +} + +static acpi_status +acpi_memory_get_resource(struct acpi_resource *resource, void *context) +{ + struct acpi_memory_device *mem_device = context; + struct acpi_resource_address64 address64; + struct acpi_memory_info *info, *new; + acpi_status status; + + status = acpi_resource_to_address64(resource, &address64); + if (ACPI_FAILURE(status) || + (address64.resource_type != ACPI_MEMORY_RANGE)) + return AE_OK; + + list_for_each_entry(info, &mem_device->res_list, list) { + if ((info->caching == address64.info.mem.caching) && + (info->write_protect == address64.info.mem.write_protect) && + (info->start_addr + info->length == address64.address.minimum)) { + info->length += address64.address.address_length; + return AE_OK; + } + } + + new = kzalloc(sizeof(struct acpi_memory_info), GFP_KERNEL); + if (!new) + return AE_ERROR; + + INIT_LIST_HEAD(&new->list); + new->caching = address64.info.mem.caching; + new->write_protect = address64.info.mem.write_protect; + new->start_addr = address64.address.minimum; + new->length = address64.address.address_length; + list_add_tail(&new->list, &mem_device->res_list); + + return AE_OK; +} + +static int +acpi_memory_get_device_resources(struct acpi_memory_device *mem_device) +{ + acpi_status status; + struct acpi_memory_info *info, *n; + + if (!list_empty(&mem_device->res_list)) + return 0; + + status = acpi_walk_resources(mem_device->device->handle, + METHOD_NAME__CRS, acpi_memory_get_resource, mem_device); + + if (ACPI_FAILURE(status)) { + list_for_each_entry_safe(info, n, &mem_device->res_list, 
list) + kfree(info); + INIT_LIST_HEAD(&mem_device->res_list); + return -EINVAL; + } + + return 0; +} + +static int acpi_memory_get_device(acpi_handle handle, + struct acpi_memory_device **mem_device) +{ + struct acpi_device *device = NULL; + int result = 0; + + acpi_scan_lock_acquire(); + + acpi_bus_get_device(handle, &device); + if (acpi_device_enumerated(device)) + goto end; + + /* + * Now add the notified device. This creates the acpi_device + * and invokes .add function + */ + result = acpi_bus_scan(handle); + if (result) { + pr_warn(PREFIX "ACPI namespace scan failed\n"); + result = -EINVAL; + goto out; + } + device = NULL; + acpi_bus_get_device(handle, &device); + if (!acpi_device_enumerated(device)) { + pr_warn(PREFIX "Missing device object\n"); + result = -EINVAL; + goto out; + } + +end: + *mem_device = acpi_driver_data(device); + if (!(*mem_device)) { + pr_err(PREFIX "driver data not found\n"); + result = -ENODEV; + goto out; + } + +out: + acpi_scan_lock_release(); + return result; +} + +static int acpi_memory_check_device(struct acpi_memory_device *mem_device) +{ + unsigned long long current_status; + + /* Get device present/absent information from the _STA */ + if (ACPI_FAILURE(acpi_evaluate_integer(mem_device->device->handle, + "_STA", NULL, ¤t_status))) + return -ENODEV; + /* + * Check for device status. Device should be + * present/enabled/functioning. + */ + if (!((current_status & ACPI_STA_DEVICE_PRESENT) + && (current_status & ACPI_STA_DEVICE_ENABLED) + && (current_status & ACPI_STA_DEVICE_FUNCTIONING))) + return -ENODEV; + + return 0; +} + +static int acpi_memory_disable_device(struct acpi_memory_device *mem_device) +{ + pr_debug(PREFIX "Xen does not support memory hotremove\n"); + + return -ENOSYS; +} + +static void acpi_memory_device_notify(acpi_handle handle, u32 event, void *data) +{ + struct acpi_memory_device *mem_device; + struct acpi_device *device; + u32 ost_code = ACPI_OST_SC_NON_SPECIFIC_FAILURE; /* default */ + + switch (event) { + case ACPI_NOTIFY_BUS_CHECK: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived BUS CHECK notification for device\n")); + /* Fall Through */ + case ACPI_NOTIFY_DEVICE_CHECK: + if (event == ACPI_NOTIFY_DEVICE_CHECK) + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived DEVICE CHECK notification for device\n")); + + if (acpi_memory_get_device(handle, &mem_device)) { + pr_err(PREFIX "Cannot find driver data\n"); + break; + } + + ost_code = ACPI_OST_SC_SUCCESS; + break; + + case ACPI_NOTIFY_EJECT_REQUEST: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "\nReceived EJECT REQUEST notification for device\n")); + + acpi_scan_lock_acquire(); + if (acpi_bus_get_device(handle, &device)) { + acpi_scan_lock_release(); + pr_err(PREFIX "Device doesn't exist\n"); + break; + } + mem_device = acpi_driver_data(device); + if (!mem_device) { + acpi_scan_lock_release(); + pr_err(PREFIX "Driver Data is NULL\n"); + break; + } + + /* + * TBD: implement acpi_memory_disable_device and invoke + * acpi_bus_remove if Xen support hotremove in the future + */ + acpi_memory_disable_device(mem_device); + acpi_scan_lock_release(); + break; + + default: + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Unsupported event [0x%x]\n", event)); + /* non-hotplug event; possibly handled by other handler */ + return; + } + + (void) acpi_evaluate_ost(handle, event, ost_code, NULL); + return; +} + +static int xen_acpi_memory_device_add(struct acpi_device *device) +{ + int result; + struct acpi_memory_device *mem_device = NULL; + + + if (!device) + return -EINVAL; + + mem_device = kzalloc(sizeof(struct 
acpi_memory_device), GFP_KERNEL); + if (!mem_device) + return -ENOMEM; + + INIT_LIST_HEAD(&mem_device->res_list); + mem_device->device = device; + sprintf(acpi_device_name(device), "%s", ACPI_MEMORY_DEVICE_NAME); + sprintf(acpi_device_class(device), "%s", ACPI_MEMORY_DEVICE_CLASS); + device->driver_data = mem_device; + + /* Get the range from the _CRS */ + result = acpi_memory_get_device_resources(mem_device); + if (result) { + kfree(mem_device); + return result; + } + + /* + * For booting existed memory devices, early boot code has recognized + * memory area by EFI/E820. If DSDT shows these memory devices on boot, + * hotplug is not necessary for them. + * For hot-added memory devices during runtime, it need hypercall to + * Xen hypervisor to add memory. + */ + if (!acpi_hotmem_initialized) + return 0; + + if (!acpi_memory_check_device(mem_device)) + result = xen_acpi_memory_enable_device(mem_device); + + return result; +} + +static int xen_acpi_memory_device_remove(struct acpi_device *device) +{ + struct acpi_memory_device *mem_device = NULL; + + if (!device || !acpi_driver_data(device)) + return -EINVAL; + + mem_device = acpi_driver_data(device); + kfree(mem_device); + + return 0; +} + +/* + * Helper function to check for memory device + */ +static acpi_status is_memory_device(acpi_handle handle) +{ + char *hardware_id; + acpi_status status; + struct acpi_device_info *info; + + status = acpi_get_object_info(handle, &info); + if (ACPI_FAILURE(status)) + return status; + + if (!(info->valid & ACPI_VALID_HID)) { + kfree(info); + return AE_ERROR; + } + + hardware_id = info->hardware_id.string; + if ((hardware_id == NULL) || + (strcmp(hardware_id, ACPI_MEMORY_DEVICE_HID))) + status = AE_ERROR; + + kfree(info); + return status; +} + +static acpi_status +acpi_memory_register_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_install_notify_handler(handle, ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify, NULL); + /* continue */ + return AE_OK; +} + +static acpi_status +acpi_memory_deregister_notify_handler(acpi_handle handle, + u32 level, void *ctxt, void **retv) +{ + acpi_status status; + + status = is_memory_device(handle); + if (ACPI_FAILURE(status)) + return AE_OK; /* continue */ + + status = acpi_remove_notify_handler(handle, + ACPI_SYSTEM_NOTIFY, + acpi_memory_device_notify); + + return AE_OK; /* continue */ +} + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, memory_device_ids); + +static struct acpi_driver xen_acpi_memory_device_driver = { + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, + .ops = { + .add = xen_acpi_memory_device_add, + .remove = xen_acpi_memory_device_remove, + }, +}; + +static int __init xen_acpi_memory_device_init(void) +{ + int result; + acpi_status status; + + if (!xen_initial_domain()) + return -ENODEV; + + /* unregister the stub which only used to reserve driver space */ + xen_stub_memory_device_exit(); + + result = acpi_bus_register_driver(&xen_acpi_memory_device_driver); + if (result < 0) { + xen_stub_memory_device_init(); + return -ENODEV; + } + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_register_notify_handler, + NULL, NULL, NULL); + + if (ACPI_FAILURE(status)) { + pr_warn(PREFIX "walk_namespace failed\n"); + 
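		/*
		 * Editorial note, not part of the original diff: this error
		 * path undoes the registration above and re-installs the stub
		 * driver, so the native memory-hotplug driver still cannot
		 * bind to these ACPI memory devices.
		 */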
acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + xen_stub_memory_device_init(); + return -ENODEV; + } + + acpi_hotmem_initialized = true; + return 0; +} + +static void __exit xen_acpi_memory_device_exit(void) +{ + acpi_status status; + + if (!xen_initial_domain()) + return; + + status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + acpi_memory_deregister_notify_handler, + NULL, NULL, NULL); + if (ACPI_FAILURE(status)) + pr_warn(PREFIX "walk_namespace failed\n"); + + acpi_bus_unregister_driver(&xen_acpi_memory_device_driver); + + /* + * stub reserve space again to prevent any chance of native + * driver loading. + */ + xen_stub_memory_device_init(); + return; +} + +module_init(xen_acpi_memory_device_init); +module_exit(xen_acpi_memory_device_exit); +ACPI_MODULE_NAME("xen-acpi-memhotplug"); +MODULE_AUTHOR("Liu Jinsong <jinsong.liu@intel.com>"); +MODULE_DESCRIPTION("Xen Hotplug Mem Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-acpi-pad.c b/drivers/xen/xen-acpi-pad.c new file mode 100644 index 000000000..f83b75450 --- /dev/null +++ b/drivers/xen/xen-acpi-pad.c @@ -0,0 +1,170 @@ +/* + * xen-acpi-pad.c - Xen pad interface + * + * Copyright (c) 2012, Intel Corporation. + * Author: Liu, Jinsong <jinsong.liu@intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/interface/version.h> +#include <xen/xen-ops.h> +#include <asm/xen/hypercall.h> + +#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" +#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" +#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 +static DEFINE_MUTEX(xen_cpu_lock); + +static int xen_acpi_pad_idle_cpus(unsigned int idle_nums) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_SET; + op.u.core_parking.idle_nums = idle_nums; + + return HYPERVISOR_dom0_op(&op); +} + +static int xen_acpi_pad_idle_cpus_num(void) +{ + struct xen_platform_op op; + + op.cmd = XENPF_core_parking; + op.u.core_parking.type = XEN_CORE_PARKING_GET; + + return HYPERVISOR_dom0_op(&op) + ?: op.u.core_parking.idle_nums; +} + +/* + * Query firmware how many CPUs should be idle + * return -1 on failure + */ +static int acpi_pad_pur(acpi_handle handle) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *package; + int num = -1; + + if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer))) + return num; + + if (!buffer.length || !buffer.pointer) + return num; + + package = buffer.pointer; + + if (package->type == ACPI_TYPE_PACKAGE && + package->package.count == 2 && + package->package.elements[0].integer.value == 1) /* rev 1 */ + num = package->package.elements[1].integer.value; + + kfree(buffer.pointer); + return num; +} + +static void acpi_pad_handle_notify(acpi_handle handle) +{ + int idle_nums; + struct acpi_buffer param = { + .length = 4, + .pointer = (void *)&idle_nums, + }; + + + mutex_lock(&xen_cpu_lock); + idle_nums = 
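	/*
	 * Editorial note, not part of the original diff: acpi_pad_pur() above
	 * returns the number of CPUs firmware asks to be idled (the second
	 * element of a revision-1 _PUR package); that count is passed to
	 * XENPF_core_parking below and the number actually parked is reported
	 * back to firmware through _OST.
	 */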
acpi_pad_pur(handle); + if (idle_nums < 0) { + mutex_unlock(&xen_cpu_lock); + return; + } + + idle_nums = xen_acpi_pad_idle_cpus(idle_nums) + ?: xen_acpi_pad_idle_cpus_num(); + if (idle_nums >= 0) + acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, + 0, ¶m); + mutex_unlock(&xen_cpu_lock); +} + +static void acpi_pad_notify(acpi_handle handle, u32 event, + void *data) +{ + switch (event) { + case ACPI_PROCESSOR_AGGREGATOR_NOTIFY: + acpi_pad_handle_notify(handle); + break; + default: + pr_warn("Unsupported event [0x%x]\n", event); + break; + } +} + +static int acpi_pad_add(struct acpi_device *device) +{ + acpi_status status; + + strcpy(acpi_device_name(device), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME); + strcpy(acpi_device_class(device), ACPI_PROCESSOR_AGGREGATOR_CLASS); + + status = acpi_install_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify, device); + if (ACPI_FAILURE(status)) + return -ENODEV; + + return 0; +} + +static int acpi_pad_remove(struct acpi_device *device) +{ + mutex_lock(&xen_cpu_lock); + xen_acpi_pad_idle_cpus(0); + mutex_unlock(&xen_cpu_lock); + + acpi_remove_notify_handler(device->handle, + ACPI_DEVICE_NOTIFY, acpi_pad_notify); + return 0; +} + +static const struct acpi_device_id pad_device_ids[] = { + {"ACPI000C", 0}, + {"", 0}, +}; + +static struct acpi_driver acpi_pad_driver = { + .name = "processor_aggregator", + .class = ACPI_PROCESSOR_AGGREGATOR_CLASS, + .ids = pad_device_ids, + .ops = { + .add = acpi_pad_add, + .remove = acpi_pad_remove, + }, +}; + +static int __init xen_acpi_pad_init(void) +{ + /* Only DOM0 is responsible for Xen acpi pad */ + if (!xen_initial_domain()) + return -ENODEV; + + /* Only Xen4.2 or later support Xen acpi pad */ + if (!xen_running_on_version_or_later(4, 2)) + return -ENODEV; + + return acpi_bus_register_driver(&acpi_pad_driver); +} +subsys_initcall(xen_acpi_pad_init); diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c new file mode 100644 index 000000000..59fc190f1 --- /dev/null +++ b/drivers/xen/xen-acpi-processor.c @@ -0,0 +1,597 @@ +/* + * Copyright 2012 by Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This code borrows ideas from https://lkml.org/lkml/2011/11/30/249 + * so many thanks go to Kevin Tian <kevin.tian@intel.com> + * and Yu Ke <ke.yu@intel.com>. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpumask.h> +#include <linux/cpufreq.h> +#include <linux/freezer.h> +#include <linux/kernel.h> +#include <linux/kthread.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <acpi/processor.h> +#include <xen/xen.h> +#include <xen/xen-ops.h> +#include <xen/interface/platform.h> +#include <asm/xen/hypercall.h> + +static int no_hypercall; +MODULE_PARM_DESC(off, "Inhibit the hypercall."); +module_param_named(off, no_hypercall, int, 0400); + +/* + * Note: Do not convert the acpi_id* below to cpumask_var_t or use cpumask_bit + * - as those shrink to nr_cpu_bits (which is dependent on possible_cpu), which + * can be less than what we want to put in. Instead use the 'nr_acpi_bits' + * which is dynamically computed based on the MADT or x2APIC table. + */ +static unsigned int nr_acpi_bits; +/* Mutex to protect the acpi_ids_done - for CPU hotplug use. */ +static DEFINE_MUTEX(acpi_ids_mutex); +/* Which ACPI ID we have processed from 'struct acpi_processor'. */ +static unsigned long *acpi_ids_done; +/* Which ACPI ID exist in the SSDT/DSDT processor definitions. */ +static unsigned long *acpi_id_present; +/* And if there is an _CST definition (or a PBLK) for the ACPI IDs */ +static unsigned long *acpi_id_cst_present; + +static int push_cxx_to_hypervisor(struct acpi_processor *_pr) +{ + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = _pr->acpi_id, + .u.set_pminfo.type = XEN_PM_CX, + }; + struct xen_processor_cx *dst_cx, *dst_cx_states = NULL; + struct acpi_processor_cx *cx; + unsigned int i, ok; + int ret = 0; + + dst_cx_states = kcalloc(_pr->power.count, + sizeof(struct xen_processor_cx), GFP_KERNEL); + if (!dst_cx_states) + return -ENOMEM; + + for (ok = 0, i = 1; i <= _pr->power.count; i++) { + cx = &_pr->power.states[i]; + if (!cx->valid) + continue; + + dst_cx = &(dst_cx_states[ok++]); + + dst_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO; + if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) { + dst_cx->reg.bit_width = 8; + dst_cx->reg.bit_offset = 0; + dst_cx->reg.access_size = 1; + } else { + dst_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE; + if (cx->entry_method == ACPI_CSTATE_FFH) { + /* NATIVE_CSTATE_BEYOND_HALT */ + dst_cx->reg.bit_offset = 2; + dst_cx->reg.bit_width = 1; /* VENDOR_INTEL */ + } + dst_cx->reg.access_size = 0; + } + dst_cx->reg.address = cx->address; + + dst_cx->type = cx->type; + dst_cx->latency = cx->latency; + + dst_cx->dpcnt = 0; + set_xen_guest_handle(dst_cx->dp, NULL); + } + if (!ok) { + pr_debug("No _Cx for ACPI CPU %u\n", _pr->acpi_id); + kfree(dst_cx_states); + return -EINVAL; + } + op.u.set_pminfo.power.count = ok; + op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control; + op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check; + op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst; + op.u.set_pminfo.power.flags.power_setup_done = + _pr->flags.power_setup_done; + + set_xen_guest_handle(op.u.set_pminfo.power.states, dst_cx_states); + + if (!no_hypercall) + ret = HYPERVISOR_dom0_op(&op); + + if (!ret) { + pr_debug("ACPI CPU%u - C-states uploaded.\n", _pr->acpi_id); + for (i = 1; i <= _pr->power.count; i++) { + cx = &_pr->power.states[i]; + if (!cx->valid) + continue; + pr_debug(" C%d: %s %d uS\n", + cx->type, cx->desc, (u32)cx->latency); + } + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) + /* EINVAL means the ACPI ID is incorrect - 
meaning the ACPI + * table is referencing a non-existing CPU - which can happen + * with broken ACPI tables. */ + pr_err("(CX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); + + kfree(dst_cx_states); + + return ret; +} +static struct xen_processor_px * +xen_copy_pss_data(struct acpi_processor *_pr, + struct xen_processor_performance *dst_perf) +{ + struct xen_processor_px *dst_states = NULL; + unsigned int i; + + BUILD_BUG_ON(sizeof(struct xen_processor_px) != + sizeof(struct acpi_processor_px)); + + dst_states = kcalloc(_pr->performance->state_count, + sizeof(struct xen_processor_px), GFP_KERNEL); + if (!dst_states) + return ERR_PTR(-ENOMEM); + + dst_perf->state_count = _pr->performance->state_count; + for (i = 0; i < _pr->performance->state_count; i++) { + /* Fortunatly for us, they are both the same size */ + memcpy(&(dst_states[i]), &(_pr->performance->states[i]), + sizeof(struct acpi_processor_px)); + } + return dst_states; +} +static int xen_copy_psd_data(struct acpi_processor *_pr, + struct xen_processor_performance *dst) +{ + struct acpi_psd_package *pdomain; + + BUILD_BUG_ON(sizeof(struct xen_psd_package) != + sizeof(struct acpi_psd_package)); + + /* This information is enumerated only if acpi_processor_preregister_performance + * has been called. + */ + dst->shared_type = _pr->performance->shared_type; + + pdomain = &(_pr->performance->domain_info); + + /* 'acpi_processor_preregister_performance' does not parse if the + * num_processors <= 1, but Xen still requires it. Do it manually here. + */ + if (pdomain->num_processors <= 1) { + if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ALL) + dst->shared_type = CPUFREQ_SHARED_TYPE_ALL; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_HW_ALL) + dst->shared_type = CPUFREQ_SHARED_TYPE_HW; + else if (pdomain->coord_type == DOMAIN_COORD_TYPE_SW_ANY) + dst->shared_type = CPUFREQ_SHARED_TYPE_ANY; + + } + memcpy(&(dst->domain_info), pdomain, sizeof(struct acpi_psd_package)); + return 0; +} +static int xen_copy_pct_data(struct acpi_pct_register *pct, + struct xen_pct_register *dst_pct) +{ + /* It would be nice if you could just do 'memcpy(pct, dst_pct') but + * sadly the Xen structure did not have the proper padding so the + * descriptor field takes two (dst_pct) bytes instead of one (pct). 
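 * (Editorial clarification, an assumption based on the note above: the
 * ACPI register struct is packed while the Xen one is not, so the
 * compiler pads the one-byte descriptor to align the 16-bit length
 * field; a raw memcpy() would then shift every later member, which is
 * why the fields are copied one by one below.)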
+ */ + dst_pct->descriptor = pct->descriptor; + dst_pct->length = pct->length; + dst_pct->space_id = pct->space_id; + dst_pct->bit_width = pct->bit_width; + dst_pct->bit_offset = pct->bit_offset; + dst_pct->reserved = pct->reserved; + dst_pct->address = pct->address; + return 0; +} +static int push_pxx_to_hypervisor(struct acpi_processor *_pr) +{ + int ret = 0; + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .interface_version = XENPF_INTERFACE_VERSION, + .u.set_pminfo.id = _pr->acpi_id, + .u.set_pminfo.type = XEN_PM_PX, + }; + struct xen_processor_performance *dst_perf; + struct xen_processor_px *dst_states = NULL; + + dst_perf = &op.u.set_pminfo.perf; + + dst_perf->platform_limit = _pr->performance_platform_limit; + dst_perf->flags |= XEN_PX_PPC; + xen_copy_pct_data(&(_pr->performance->control_register), + &dst_perf->control_register); + xen_copy_pct_data(&(_pr->performance->status_register), + &dst_perf->status_register); + dst_perf->flags |= XEN_PX_PCT; + dst_states = xen_copy_pss_data(_pr, dst_perf); + if (!IS_ERR_OR_NULL(dst_states)) { + set_xen_guest_handle(dst_perf->states, dst_states); + dst_perf->flags |= XEN_PX_PSS; + } + if (!xen_copy_psd_data(_pr, dst_perf)) + dst_perf->flags |= XEN_PX_PSD; + + if (dst_perf->flags != (XEN_PX_PSD | XEN_PX_PSS | XEN_PX_PCT | XEN_PX_PPC)) { + pr_warn("ACPI CPU%u missing some P-state data (%x), skipping\n", + _pr->acpi_id, dst_perf->flags); + ret = -ENODEV; + goto err_free; + } + + if (!no_hypercall) + ret = HYPERVISOR_dom0_op(&op); + + if (!ret) { + struct acpi_processor_performance *perf; + unsigned int i; + + perf = _pr->performance; + pr_debug("ACPI CPU%u - P-states uploaded.\n", _pr->acpi_id); + for (i = 0; i < perf->state_count; i++) { + pr_debug(" %cP%d: %d MHz, %d mW, %d uS\n", + (i == perf->state ? '*' : ' '), i, + (u32) perf->states[i].core_frequency, + (u32) perf->states[i].power, + (u32) perf->states[i].transition_latency); + } + } else if ((ret != -EINVAL) && (ret != -ENOSYS)) + /* EINVAL means the ACPI ID is incorrect - meaning the ACPI + * table is referencing a non-existing CPU - which can happen + * with broken ACPI tables. */ + pr_warn("(_PXX): Hypervisor error (%d) for ACPI CPU%u\n", + ret, _pr->acpi_id); +err_free: + if (!IS_ERR_OR_NULL(dst_states)) + kfree(dst_states); + + return ret; +} +static int upload_pm_data(struct acpi_processor *_pr) +{ + int err = 0; + + mutex_lock(&acpi_ids_mutex); + if (__test_and_set_bit(_pr->acpi_id, acpi_ids_done)) { + mutex_unlock(&acpi_ids_mutex); + return -EBUSY; + } + if (_pr->flags.power) + err = push_cxx_to_hypervisor(_pr); + + if (_pr->performance && _pr->performance->states) + err |= push_pxx_to_hypervisor(_pr); + + mutex_unlock(&acpi_ids_mutex); + return err; +} +static unsigned int __init get_max_acpi_id(void) +{ + struct xenpf_pcpuinfo *info; + struct xen_platform_op op = { + .cmd = XENPF_get_cpuinfo, + .interface_version = XENPF_INTERFACE_VERSION, + }; + int ret = 0; + unsigned int i, last_cpu, max_acpi_id = 0; + + info = &op.u.pcpu_info; + info->xen_cpuid = 0; + + ret = HYPERVISOR_dom0_op(&op); + if (ret) + return NR_CPUS; + + /* The max_present is the same irregardless of the xen_cpuid */ + last_cpu = op.u.pcpu_info.max_present; + for (i = 0; i <= last_cpu; i++) { + info->xen_cpuid = i; + ret = HYPERVISOR_dom0_op(&op); + if (ret) + continue; + max_acpi_id = max(info->acpi_id, max_acpi_id); + } + max_acpi_id *= 2; /* Slack for CPU hotplug support. 
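 * Editorial example: if the largest ACPI ID reported by XENPF_get_cpuinfo
 * is 7, this returns 14 and nr_acpi_bits becomes 15, leaving bitmap room
 * for processors hot-added after boot.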
*/ + pr_debug("Max ACPI ID: %u\n", max_acpi_id); + return max_acpi_id; +} +/* + * The read_acpi_id and check_acpi_ids are there to support the Xen + * oddity of virtual CPUs != physical CPUs in the initial domain. + * The user can supply 'xen_max_vcpus=X' on the Xen hypervisor line + * which will band the amount of CPUs the initial domain can see. + * In general that is OK, except it plays havoc with any of the + * for_each_[present|online]_cpu macros which are banded to the virtual + * CPU amount. + */ +static acpi_status +read_acpi_id(acpi_handle handle, u32 lvl, void *context, void **rv) +{ + u32 acpi_id; + acpi_status status; + acpi_object_type acpi_type; + unsigned long long tmp; + union acpi_object object = { 0 }; + struct acpi_buffer buffer = { sizeof(union acpi_object), &object }; + acpi_io_address pblk = 0; + + status = acpi_get_type(handle, &acpi_type); + if (ACPI_FAILURE(status)) + return AE_OK; + + switch (acpi_type) { + case ACPI_TYPE_PROCESSOR: + status = acpi_evaluate_object(handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) + return AE_OK; + acpi_id = object.processor.proc_id; + pblk = object.processor.pblk_address; + break; + case ACPI_TYPE_DEVICE: + status = acpi_evaluate_integer(handle, "_UID", NULL, &tmp); + if (ACPI_FAILURE(status)) + return AE_OK; + acpi_id = tmp; + break; + default: + return AE_OK; + } + /* There are more ACPI Processor objects than in x2APIC or MADT. + * This can happen with incorrect ACPI SSDT declerations. */ + if (acpi_id > nr_acpi_bits) { + pr_debug("We only have %u, trying to set %u\n", + nr_acpi_bits, acpi_id); + return AE_OK; + } + /* OK, There is a ACPI Processor object */ + __set_bit(acpi_id, acpi_id_present); + + pr_debug("ACPI CPU%u w/ PBLK:0x%lx\n", acpi_id, (unsigned long)pblk); + + status = acpi_evaluate_object(handle, "_CST", NULL, &buffer); + if (ACPI_FAILURE(status)) { + if (!pblk) + return AE_OK; + } + /* .. and it has a C-state */ + __set_bit(acpi_id, acpi_id_cst_present); + + return AE_OK; +} +static int check_acpi_ids(struct acpi_processor *pr_backup) +{ + + if (!pr_backup) + return -ENODEV; + + if (acpi_id_present && acpi_id_cst_present) + /* OK, done this once .. skip to uploading */ + goto upload; + + /* All online CPUs have been processed at this stage. Now verify + * whether in fact "online CPUs" == physical CPUs. 
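 * Editorial example: if the initial domain is limited to four VCPUs on an
 * eight-CPU host, only four struct acpi_processor entries exist, so the
 * namespace walk below discovers the remaining ACPI IDs and
 * upload_pm_data() is replayed for them through pr_backup.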
+ */ + acpi_id_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_id_present) + return -ENOMEM; + + acpi_id_cst_present = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_id_cst_present) { + kfree(acpi_id_present); + return -ENOMEM; + } + + acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, + ACPI_UINT32_MAX, + read_acpi_id, NULL, NULL, NULL); + acpi_get_devices("ACPI0007", read_acpi_id, NULL, NULL); + +upload: + if (!bitmap_equal(acpi_id_present, acpi_ids_done, nr_acpi_bits)) { + unsigned int i; + for_each_set_bit(i, acpi_id_present, nr_acpi_bits) { + pr_backup->acpi_id = i; + /* Mask out C-states if there are no _CST or PBLK */ + pr_backup->flags.power = test_bit(i, acpi_id_cst_present); + (void)upload_pm_data(pr_backup); + } + } + + return 0; +} +static int __init check_prereq(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + + if (!xen_initial_domain()) + return -ENODEV; + + if (!acpi_gbl_FADT.smi_command) + return -ENODEV; + + if (c->x86_vendor == X86_VENDOR_INTEL) { + if (!cpu_has(c, X86_FEATURE_EST)) + return -ENODEV; + + return 0; + } + if (c->x86_vendor == X86_VENDOR_AMD) { + /* Copied from powernow-k8.h, can't include ../cpufreq/powernow + * as we get compile warnings for the static functions. + */ +#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007 +#define USE_HW_PSTATE 0x00000080 + u32 eax, ebx, ecx, edx; + cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx); + if ((edx & USE_HW_PSTATE) != USE_HW_PSTATE) + return -ENODEV; + return 0; + } + return -ENODEV; +} +/* acpi_perf_data is a pointer to percpu data. */ +static struct acpi_processor_performance __percpu *acpi_perf_data; + +static void free_acpi_perf_data(void) +{ + unsigned int i; + + /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. 
*/ + for_each_possible_cpu(i) + free_cpumask_var(per_cpu_ptr(acpi_perf_data, i) + ->shared_cpu_map); + free_percpu(acpi_perf_data); +} + +static int xen_upload_processor_pm_data(void) +{ + struct acpi_processor *pr_backup = NULL; + unsigned int i; + int rc = 0; + + pr_info("Uploading Xen processor PM info\n"); + + for_each_possible_cpu(i) { + struct acpi_processor *_pr; + _pr = per_cpu(processors, i /* APIC ID */); + if (!_pr) + continue; + + if (!pr_backup) { + pr_backup = kzalloc(sizeof(struct acpi_processor), GFP_KERNEL); + if (pr_backup) + memcpy(pr_backup, _pr, sizeof(struct acpi_processor)); + } + (void)upload_pm_data(_pr); + } + + rc = check_acpi_ids(pr_backup); + kfree(pr_backup); + + return rc; +} + +static int xen_acpi_processor_resume(struct notifier_block *nb, + unsigned long action, void *data) +{ + bitmap_zero(acpi_ids_done, nr_acpi_bits); + return xen_upload_processor_pm_data(); +} + +struct notifier_block xen_acpi_processor_resume_nb = { + .notifier_call = xen_acpi_processor_resume, +}; + +static int __init xen_acpi_processor_init(void) +{ + unsigned int i; + int rc = check_prereq(); + + if (rc) + return rc; + + nr_acpi_bits = get_max_acpi_id() + 1; + acpi_ids_done = kcalloc(BITS_TO_LONGS(nr_acpi_bits), sizeof(unsigned long), GFP_KERNEL); + if (!acpi_ids_done) + return -ENOMEM; + + acpi_perf_data = alloc_percpu(struct acpi_processor_performance); + if (!acpi_perf_data) { + pr_debug("Memory allocation error for acpi_perf_data\n"); + kfree(acpi_ids_done); + return -ENOMEM; + } + for_each_possible_cpu(i) { + if (!zalloc_cpumask_var_node( + &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map, + GFP_KERNEL, cpu_to_node(i))) { + rc = -ENOMEM; + goto err_out; + } + } + + /* Do initialization in ACPI core. It is OK to fail here. */ + (void)acpi_processor_preregister_performance(acpi_perf_data); + + for_each_possible_cpu(i) { + struct acpi_processor *pr; + struct acpi_processor_performance *perf; + + pr = per_cpu(processors, i); + perf = per_cpu_ptr(acpi_perf_data, i); + if (!pr) + continue; + + pr->performance = perf; + rc = acpi_processor_get_performance_info(pr); + if (rc) + goto err_out; + } + + rc = xen_upload_processor_pm_data(); + if (rc) + goto err_unregister; + + xen_resume_notifier_register(&xen_acpi_processor_resume_nb); + + return 0; +err_unregister: + for_each_possible_cpu(i) { + struct acpi_processor_performance *perf; + perf = per_cpu_ptr(acpi_perf_data, i); + acpi_processor_unregister_performance(perf, i); + } +err_out: + /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */ + free_acpi_perf_data(); + kfree(acpi_ids_done); + return rc; +} +static void __exit xen_acpi_processor_exit(void) +{ + int i; + + xen_resume_notifier_unregister(&xen_acpi_processor_resume_nb); + kfree(acpi_ids_done); + kfree(acpi_id_present); + kfree(acpi_id_cst_present); + for_each_possible_cpu(i) { + struct acpi_processor_performance *perf; + perf = per_cpu_ptr(acpi_perf_data, i); + acpi_processor_unregister_performance(perf, i); + } + free_acpi_perf_data(); +} + +MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>"); +MODULE_DESCRIPTION("Xen ACPI Processor P-states (and Cx) driver which uploads PM data to Xen hypervisor"); +MODULE_LICENSE("GPL"); + +/* We want to be loaded before the CPU freq scaling drivers are loaded. + * They are loaded in late_initcall. 
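 * Editorial note: in current kernels device_initcall() runs at initcall
 * level 6 and late_initcall() at level 7, so a built-in
 * xen-acpi-processor uploads its PM data before the cpufreq drivers
 * register.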
*/ +device_initcall(xen_acpi_processor_init); +module_exit(xen_acpi_processor_exit); diff --git a/drivers/xen/xen-balloon.c b/drivers/xen/xen-balloon.c new file mode 100644 index 000000000..39e7ef8d3 --- /dev/null +++ b/drivers/xen/xen-balloon.c @@ -0,0 +1,254 @@ +/****************************************************************************** + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/interface/xen.h> +#include <xen/balloon.h> +#include <xen/xenbus.h> +#include <xen/features.h> +#include <xen/page.h> + +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) + +#define BALLOON_CLASS_NAME "xen_memory" + +static struct device balloon_dev; + +static int register_balloon(struct device *dev); + +/* React to a change in the target key */ +static void watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); + if (err != 1) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. 
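 * Editorial example, assuming 4 KiB pages: PAGE_SHIFT - 10 = 2, so a
 * xenstore target of 1048576 KiB (1 GiB) becomes 1048576 >> 2 = 262144
 * pages.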
+ */ + balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); +} +static struct xenbus_watch target_watch = { + .node = "memory/target", + .callback = watch_target, +}; + + +static int balloon_init_watcher(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + pr_err("Failed to set balloon watcher\n"); + + return NOTIFY_DONE; +} + +static struct notifier_block xenstore_notifier = { + .notifier_call = balloon_init_watcher, +}; + +static int __init balloon_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + pr_info("Initialising balloon driver\n"); + + register_balloon(&balloon_dev); + + register_xen_selfballooning(&balloon_dev); + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} +subsys_initcall(balloon_init); + +static void balloon_exit(void) +{ + /* XXX - release balloon here */ + return; +} + +module_exit(balloon_exit); + +#define BALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(balloon_stats.current_pages)); +BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_low)); +BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(balloon_stats.balloon_high)); + +static DEVICE_ULONG_ATTR(schedule_delay, 0444, balloon_stats.schedule_delay); +static DEVICE_ULONG_ATTR(max_schedule_delay, 0644, balloon_stats.max_schedule_delay); +static DEVICE_ULONG_ATTR(retry_count, 0444, balloon_stats.retry_count); +static DEVICE_ULONG_ATTR(max_retry_count, 0644, balloon_stats.max_retry_count); + +static ssize_t show_target_kb(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(balloon_stats.target_pages)); +} + +static ssize_t store_target_kb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = simple_strtoull(buf, &endchar, 0) * 1024; + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static DEVICE_ATTR(target_kb, S_IRUGO | S_IWUSR, + show_target_kb, store_target_kb); + + +static ssize_t show_target(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)balloon_stats.target_pages + << PAGE_SHIFT); +} + +static ssize_t store_target(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + char *endchar; + unsigned long long target_bytes; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + target_bytes = memparse(buf, &endchar); + + balloon_set_new_target(target_bytes >> PAGE_SHIFT); + + return count; +} + +static DEVICE_ATTR(target, S_IRUGO | S_IWUSR, + show_target, store_target); + + +static struct attribute *balloon_attrs[] = { + &dev_attr_target_kb.attr, + &dev_attr_target.attr, + &dev_attr_schedule_delay.attr.attr, + &dev_attr_max_schedule_delay.attr.attr, + &dev_attr_retry_count.attr.attr, + &dev_attr_max_retry_count.attr.attr, + NULL +}; + +static const struct attribute_group balloon_group = { + .attrs = balloon_attrs +}; + +static struct attribute *balloon_info_attrs[] = { + &dev_attr_current_kb.attr, + &dev_attr_low_kb.attr, + &dev_attr_high_kb.attr, + NULL +}; + +static const struct attribute_group balloon_info_group = 
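/*
 * Editorial example for store_target_kb()/store_target() above, assuming
 * 4 KiB pages and not part of the original source: writing "524288" to
 * target_kb is read as 524288 KiB = 536870912 bytes, i.e. 536870912 >> 12
 * = 131072 pages - the same request as writing "512M" to the byte-based
 * "target" attribute, which is parsed with memparse().
 */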
{ + .name = "info", + .attrs = balloon_info_attrs +}; + +static const struct attribute_group *balloon_groups[] = { + &balloon_group, + &balloon_info_group, + NULL +}; + +static struct bus_type balloon_subsys = { + .name = BALLOON_CLASS_NAME, + .dev_name = BALLOON_CLASS_NAME, +}; + +static int register_balloon(struct device *dev) +{ + int error; + + error = subsys_system_register(&balloon_subsys, NULL); + if (error) + return error; + + dev->id = 0; + dev->bus = &balloon_subsys; + dev->groups = balloon_groups; + + error = device_register(dev); + if (error) { + bus_unregister(&balloon_subsys); + return error; + } + + return 0; +} + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xen-pciback/Makefile b/drivers/xen/xen-pciback/Makefile new file mode 100644 index 000000000..ffe0ad343 --- /dev/null +++ b/drivers/xen/xen-pciback/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback.o + +xen-pciback-y := pci_stub.o pciback_ops.o xenbus.o +xen-pciback-y += conf_space.o conf_space_header.o \ + conf_space_capability.o \ + conf_space_quirks.o vpci.o \ + passthrough.o diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c new file mode 100644 index 000000000..9c234209d --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.c @@ -0,0 +1,438 @@ +/* + * PCI Backend - Functions for creating a virtual configuration space for + * exported PCI Devices. + * It's dangerous to allow PCI Driver Domains to change their + * device's resources (memory, i/o ports, interrupts). We need to + * restrict changes to certain PCI Configuration registers: + * BARs, INTERRUPT_PIN, most registers in the header... + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +bool xen_pcibk_permissive; +module_param_named(permissive, xen_pcibk_permissive, bool, 0644); + +/* This is where xen_pcibk_read_config_byte, xen_pcibk_read_config_word, + * xen_pcibk_write_config_word, and xen_pcibk_write_config_byte are created. 
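+ *
+ * For example, DEFINE_PCI_CONFIG(read, byte, u8 *) below expands (roughly,
+ * ignoring token-pasting details) to:
+ *
+ *	int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset,
+ *				       u8 *value, void *data)
+ *	{
+ *		return pci_read_config_byte(dev, offset, value);
+ *	}
+ *
+ * i.e. a thin pass-through to the real configuration space accessor.  The
+ * otherwise unused "data" argument only exists so that these helpers match
+ * the conf_*_read and conf_*_write callback signatures used by config_field.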
*/ +#define DEFINE_PCI_CONFIG(op, size, type) \ +int xen_pcibk_##op##_config_##size \ +(struct pci_dev *dev, int offset, type value, void *data) \ +{ \ + return pci_##op##_config_##size(dev, offset, value); \ +} + +DEFINE_PCI_CONFIG(read, byte, u8 *) +DEFINE_PCI_CONFIG(read, word, u16 *) +DEFINE_PCI_CONFIG(read, dword, u32 *) + +DEFINE_PCI_CONFIG(write, byte, u8) +DEFINE_PCI_CONFIG(write, word, u16) +DEFINE_PCI_CONFIG(write, dword, u32) + +static int conf_space_read(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 *value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + *value = 0; + + switch (field->size) { + case 1: + if (field->u.b.read) + ret = field->u.b.read(dev, offset, (u8 *) value, + entry->data); + break; + case 2: + if (field->u.w.read) + ret = field->u.w.read(dev, offset, (u16 *) value, + entry->data); + break; + case 4: + if (field->u.dw.read) + ret = field->u.dw.read(dev, offset, value, entry->data); + break; + } + return ret; +} + +static int conf_space_write(struct pci_dev *dev, + const struct config_field_entry *entry, + int offset, u32 value) +{ + int ret = 0; + const struct config_field *field = entry->field; + + switch (field->size) { + case 1: + if (field->u.b.write) + ret = field->u.b.write(dev, offset, (u8) value, + entry->data); + break; + case 2: + if (field->u.w.write) + ret = field->u.w.write(dev, offset, (u16) value, + entry->data); + break; + case 4: + if (field->u.dw.write) + ret = field->u.dw.write(dev, offset, value, + entry->data); + break; + } + return ret; +} + +static inline u32 get_mask(int size) +{ + if (size == 1) + return 0xff; + else if (size == 2) + return 0xffff; + else + return 0xffffffff; +} + +static inline int valid_request(int offset, int size) +{ + /* Validate request (no un-aligned requests) */ + if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0) + return 1; + return 0; +} + +static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, + int offset) +{ + if (offset >= 0) { + new_val_mask <<= (offset * 8); + new_val <<= (offset * 8); + } else { + new_val_mask >>= (offset * -8); + new_val >>= (offset * -8); + } + val = (val & ~new_val_mask) | (new_val & new_val_mask); + + return val; +} + +static int xen_pcibios_err_to_errno(int err) +{ + switch (err) { + case PCIBIOS_SUCCESSFUL: + return XEN_PCI_ERR_success; + case PCIBIOS_DEVICE_NOT_FOUND: + return XEN_PCI_ERR_dev_not_found; + case PCIBIOS_BAD_REGISTER_NUMBER: + return XEN_PCI_ERR_invalid_offset; + case PCIBIOS_FUNC_NOT_SUPPORTED: + return XEN_PCI_ERR_not_implemented; + case PCIBIOS_SET_FAILED: + return XEN_PCI_ERR_access_denied; + } + return err; +} + +int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, + u32 *ret_val) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + int req_start, req_end, field_start, field_end; + /* if read fails for any reason, return 0 + * (as if device didn't respond) */ + u32 value = 0, tmp_val; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x\n", + pci_name(dev), size, offset); + + if (!valid_request(offset, size)) { + err = XEN_PCI_ERR_invalid_offset; + goto out; + } + + /* Get the real value first, then modify as appropriate */ + switch (size) { + case 1: + err = pci_read_config_byte(dev, offset, (u8 *) &value); + break; + case 2: + err = pci_read_config_word(dev, offset, (u16 *) &value); + break; + case 4: + 
err = pci_read_config_dword(dev, offset, &value); + break; + } + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + req_start = offset; + req_end = offset + size; + field_start = OFFSET(cfg_entry); + field_end = OFFSET(cfg_entry) + field->size; + + if ((req_start >= field_start && req_start < field_end) + || (req_end > field_start && req_end <= field_end)) { + err = conf_space_read(dev, cfg_entry, field_start, + &tmp_val); + if (err) + goto out; + + value = merge_value(value, tmp_val, + get_mask(field->size), + field_start - req_start); + } + } + +out: + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: read %d bytes at 0x%x = %x\n", + pci_name(dev), size, offset, value); + + *ret_val = value; + return xen_pcibios_err_to_errno(err); +} + +int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, u32 value) +{ + int err = 0, handled = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + u32 tmp_val; + int req_start, req_end, field_start, field_end; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG + DRV_NAME ": %s: write request %d bytes at 0x%x = %x\n", + pci_name(dev), size, offset, value); + + if (!valid_request(offset, size)) + return XEN_PCI_ERR_invalid_offset; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + req_start = offset; + req_end = offset + size; + field_start = OFFSET(cfg_entry); + field_end = OFFSET(cfg_entry) + field->size; + + if ((req_start >= field_start && req_start < field_end) + || (req_end > field_start && req_end <= field_end)) { + tmp_val = 0; + + err = xen_pcibk_config_read(dev, field_start, + field->size, &tmp_val); + if (err) + break; + + tmp_val = merge_value(tmp_val, value, get_mask(size), + req_start - field_start); + + err = conf_space_write(dev, cfg_entry, field_start, + tmp_val); + + /* handled is set true here, but not every byte + * may have been written! Properly detecting if + * every byte is handled is unnecessary as the + * flag is used to detect devices that need + * special helpers to work correctly. + */ + handled = 1; + } + } + + if (!handled && !err) { + /* By default, anything not specificially handled above is + * read-only. The permissive flag changes this behavior so + * that anything not specifically handled above is writable. + * This means that some fields may still be read-only because + * they have entries in the config_field list that intercept + * the write and do nothing. */ + if (dev_data->permissive || xen_pcibk_permissive) { + switch (size) { + case 1: + err = pci_write_config_byte(dev, offset, + (u8) value); + break; + case 2: + err = pci_write_config_word(dev, offset, + (u16) value); + break; + case 4: + err = pci_write_config_dword(dev, offset, + (u32) value); + break; + } + } else if (!dev_data->warned_on_write) { + dev_data->warned_on_write = 1; + dev_warn(&dev->dev, "Driver tried to write to a " + "read-only configuration space field at offset" + " 0x%x, size %d. 
This may be harmless, but if " + "you have problems with your device:\n" + "1) see permissive attribute in sysfs\n" + "2) report problems to the xen-devel " + "mailing list along with details of your " + "device obtained from lspci.\n", offset, size); + } + } + + return xen_pcibios_err_to_errno(err); +} + +void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry, *t; + const struct config_field *field; + + dev_dbg(&dev->dev, "free-ing dynamically allocated virtual " + "configuration space fields\n"); + if (!dev_data) + return; + + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { + field = cfg_entry->field; + + if (field->clean) { + field->clean((struct config_field *)field); + + kfree(cfg_entry->data); + + list_del(&cfg_entry->list); + kfree(cfg_entry); + } + + } +} + +void xen_pcibk_config_reset_dev(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + const struct config_field_entry *cfg_entry; + const struct config_field *field; + + dev_dbg(&dev->dev, "resetting virtual configuration space\n"); + if (!dev_data) + return; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + + if (field->reset) + field->reset(dev, OFFSET(cfg_entry), cfg_entry->data); + } +} + +void xen_pcibk_config_free_dev(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry, *t; + const struct config_field *field; + + dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n"); + if (!dev_data) + return; + + list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) { + list_del(&cfg_entry->list); + + field = cfg_entry->field; + + if (field->release) + field->release(dev, OFFSET(cfg_entry), cfg_entry->data); + + kfree(cfg_entry); + } +} + +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int base_offset) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry; + void *tmp; + + cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL); + if (!cfg_entry) { + err = -ENOMEM; + goto out; + } + + cfg_entry->data = NULL; + cfg_entry->field = field; + cfg_entry->base_offset = base_offset; + + /* silently ignore duplicate fields */ + err = xen_pcibk_field_is_dup(dev, OFFSET(cfg_entry)); + if (err) + goto out; + + if (field->init) { + tmp = field->init(dev, OFFSET(cfg_entry)); + + if (IS_ERR(tmp)) { + err = PTR_ERR(tmp); + goto out; + } + + cfg_entry->data = tmp; + } + + dev_dbg(&dev->dev, "added config field at offset 0x%02x\n", + OFFSET(cfg_entry)); + list_add_tail(&cfg_entry->list, &dev_data->config_fields); + +out: + if (err) + kfree(cfg_entry); + + return err; +} + +/* This sets up the device's virtual configuration space to keep track of + * certain registers (like the base address registers (BARs) so that we can + * keep the client from manipulating them directly. 
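+ *
+ * The fields are layered in three passes, in the function below: the
+ * standard header registers (xen_pcibk_config_header_add_fields), any
+ * capability-list registers (xen_pcibk_config_capability_add_fields), and
+ * finally per-device quirks (xen_pcibk_config_quirks_init).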
+ */ +int xen_pcibk_config_init_dev(struct pci_dev *dev) +{ + int err = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + dev_dbg(&dev->dev, "initializing virtual configuration space\n"); + + INIT_LIST_HEAD(&dev_data->config_fields); + + err = xen_pcibk_config_header_add_fields(dev); + if (err) + goto out; + + err = xen_pcibk_config_capability_add_fields(dev); + if (err) + goto out; + + err = xen_pcibk_config_quirks_init(dev); + +out: + return err; +} + +int xen_pcibk_config_init(void) +{ + return xen_pcibk_config_capability_init(); +} diff --git a/drivers/xen/xen-pciback/conf_space.h b/drivers/xen/xen-pciback/conf_space.h new file mode 100644 index 000000000..62461a8ba --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space.h @@ -0,0 +1,128 @@ +/* + * PCI Backend - Common data structures for overriding the configuration space + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_H__ +#define __XEN_PCIBACK_CONF_SPACE_H__ + +#include <linux/list.h> +#include <linux/err.h> + +/* conf_field_init can return an errno in a ptr with ERR_PTR() */ +typedef void *(*conf_field_init) (struct pci_dev *dev, int offset); +typedef void (*conf_field_reset) (struct pci_dev *dev, int offset, void *data); +typedef void (*conf_field_free) (struct pci_dev *dev, int offset, void *data); + +typedef int (*conf_dword_write) (struct pci_dev *dev, int offset, u32 value, + void *data); +typedef int (*conf_word_write) (struct pci_dev *dev, int offset, u16 value, + void *data); +typedef int (*conf_byte_write) (struct pci_dev *dev, int offset, u8 value, + void *data); +typedef int (*conf_dword_read) (struct pci_dev *dev, int offset, u32 *value, + void *data); +typedef int (*conf_word_read) (struct pci_dev *dev, int offset, u16 *value, + void *data); +typedef int (*conf_byte_read) (struct pci_dev *dev, int offset, u8 *value, + void *data); + +/* These are the fields within the configuration space which we + * are interested in intercepting reads/writes to and changing their + * values. 
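+ *
+ * For illustration only (this exact entry is not part of the real tables),
+ * a read-only 16-bit register at offset 0x2c could be described as:
+ *
+ *	{
+ *		.offset    = 0x2c,
+ *		.size      = 2,
+ *		.u.w.read  = xen_pcibk_read_config_word,
+ *		.u.w.write = NULL,
+ *	}
+ *
+ * A NULL write handler means guest writes to the field are silently
+ * discarded.  Arrays of these entries are terminated by an all-zero entry,
+ * since the add_fields helpers stop at size == 0.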
+ */ +struct config_field { + unsigned int offset; + unsigned int size; + unsigned int mask; + conf_field_init init; + conf_field_reset reset; + conf_field_free release; + void (*clean) (struct config_field *field); + union { + struct { + conf_dword_write write; + conf_dword_read read; + } dw; + struct { + conf_word_write write; + conf_word_read read; + } w; + struct { + conf_byte_write write; + conf_byte_read read; + } b; + } u; + struct list_head list; +}; + +struct config_field_entry { + struct list_head list; + const struct config_field *field; + unsigned int base_offset; + void *data; +}; + +extern bool xen_pcibk_permissive; + +#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) + +/* Add fields to a device - the add_fields macro expects to get a pointer to + * the first entry in an array (of which the ending is marked by size==0) + */ +int xen_pcibk_config_add_field_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset); + +static inline int xen_pcibk_config_add_field(struct pci_dev *dev, + const struct config_field *field) +{ + return xen_pcibk_config_add_field_offset(dev, field, 0); +} + +static inline int xen_pcibk_config_add_fields(struct pci_dev *dev, + const struct config_field *field) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = xen_pcibk_config_add_field(dev, &field[i]); + if (err) + break; + } + return err; +} + +static inline int xen_pcibk_config_add_fields_offset(struct pci_dev *dev, + const struct config_field *field, + unsigned int offset) +{ + int i, err = 0; + for (i = 0; field[i].size != 0; i++) { + err = xen_pcibk_config_add_field_offset(dev, &field[i], offset); + if (err) + break; + } + return err; +} + +/* Read/Write the real configuration space */ +int xen_pcibk_read_config_byte(struct pci_dev *dev, int offset, u8 *value, + void *data); +int xen_pcibk_read_config_word(struct pci_dev *dev, int offset, u16 *value, + void *data); +int xen_pcibk_read_config_dword(struct pci_dev *dev, int offset, u32 *value, + void *data); +int xen_pcibk_write_config_byte(struct pci_dev *dev, int offset, u8 value, + void *data); +int xen_pcibk_write_config_word(struct pci_dev *dev, int offset, u16 value, + void *data); +int xen_pcibk_write_config_dword(struct pci_dev *dev, int offset, u32 value, + void *data); + +int xen_pcibk_config_capability_init(void); + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev); +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev); + +#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c new file mode 100644 index 000000000..7f83e9083 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_capability.c @@ -0,0 +1,207 @@ +/* + * PCI Backend - Handles the virtual fields found on the capability lists + * in the configuration space. 
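+ * Capabilities are described by xen_pcibk_config_capability entries kept on
+ * a static list: xen_pcibk_config_capability_init() registers the PM and
+ * VPD handlers, and xen_pcibk_config_capability_add_fields() attaches each
+ * handler's fields at whatever offset pci_find_capability() reports for the
+ * device, prefixed by a read-only copy of the capability list header.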
+ * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" + +static LIST_HEAD(capabilities); +struct xen_pcibk_config_capability { + struct list_head cap_list; + + int capability; + + /* If the device has the capability found above, add these fields */ + const struct config_field *fields; +}; + +static const struct config_field caplist_header[] = { + { + .offset = PCI_CAP_LIST_ID, + .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */ + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = NULL, + }, + {} +}; + +static inline void register_capability(struct xen_pcibk_config_capability *cap) +{ + list_add_tail(&cap->cap_list, &capabilities); +} + +int xen_pcibk_config_capability_add_fields(struct pci_dev *dev) +{ + int err = 0; + struct xen_pcibk_config_capability *cap; + int cap_offset; + + list_for_each_entry(cap, &capabilities, cap_list) { + cap_offset = pci_find_capability(dev, cap->capability); + if (cap_offset) { + dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n", + cap->capability, cap_offset); + + err = xen_pcibk_config_add_fields_offset(dev, + caplist_header, + cap_offset); + if (err) + goto out; + err = xen_pcibk_config_add_fields_offset(dev, + cap->fields, + cap_offset); + if (err) + goto out; + } + } + +out: + return err; +} + +static int vpd_address_write(struct pci_dev *dev, int offset, u16 value, + void *data) +{ + /* Disallow writes to the vital product data */ + if (value & PCI_VPD_ADDR_F) + return PCIBIOS_SET_FAILED; + else + return pci_write_config_word(dev, offset, value); +} + +static const struct config_field caplist_vpd[] = { + { + .offset = PCI_VPD_ADDR, + .size = 2, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = vpd_address_write, + }, + { + .offset = PCI_VPD_DATA, + .size = 4, + .u.dw.read = xen_pcibk_read_config_dword, + .u.dw.write = NULL, + }, + {} +}; + +static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, + void *data) +{ + int err; + u16 real_value; + + err = pci_read_config_word(dev, offset, &real_value); + if (err) + goto out; + + *value = real_value & ~PCI_PM_CAP_PME_MASK; + +out: + return err; +} + +/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. 
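+ * In pm_ctrl_write() below the guest-supplied value is masked with
+ * PM_OK_BITS and merged with the current hardware value, roughly
+ *
+ *	new = (old & ~PM_OK_BITS) | (guest & PM_OK_BITS)
+ *
+ * while the requested power state bits are handed to pci_set_power_state()
+ * so that the PCI core performs the actual D-state transition.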
+ * Can't allow driver domain to enable PMEs - they're shared */ +#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) + +static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, + void *data) +{ + int err; + u16 old_value; + pci_power_t new_state, old_state; + + err = pci_read_config_word(dev, offset, &old_value); + if (err) + goto out; + + old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); + new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); + + new_value &= PM_OK_BITS; + if ((old_value & PM_OK_BITS) != new_value) { + new_value = (old_value & ~PM_OK_BITS) | new_value; + err = pci_write_config_word(dev, offset, new_value); + if (err) + goto out; + } + + /* Let pci core handle the power management change */ + dev_dbg(&dev->dev, "set power state to %x\n", new_state); + err = pci_set_power_state(dev, new_state); + if (err) { + err = PCIBIOS_SET_FAILED; + goto out; + } + + out: + return err; +} + +/* Ensure PMEs are disabled */ +static void *pm_ctrl_init(struct pci_dev *dev, int offset) +{ + int err; + u16 value; + + err = pci_read_config_word(dev, offset, &value); + if (err) + goto out; + + if (value & PCI_PM_CTRL_PME_ENABLE) { + value &= ~PCI_PM_CTRL_PME_ENABLE; + err = pci_write_config_word(dev, offset, value); + } + +out: + return ERR_PTR(err); +} + +static const struct config_field caplist_pm[] = { + { + .offset = PCI_PM_PMC, + .size = 2, + .u.w.read = pm_caps_read, + }, + { + .offset = PCI_PM_CTRL, + .size = 2, + .init = pm_ctrl_init, + .u.w.read = xen_pcibk_read_config_word, + .u.w.write = pm_ctrl_write, + }, + { + .offset = PCI_PM_PPB_EXTENSIONS, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + .offset = PCI_PM_DATA_REGISTER, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + {} +}; + +static struct xen_pcibk_config_capability xen_pcibk_config_capability_pm = { + .capability = PCI_CAP_ID_PM, + .fields = caplist_pm, +}; +static struct xen_pcibk_config_capability xen_pcibk_config_capability_vpd = { + .capability = PCI_CAP_ID_VPD, + .fields = caplist_vpd, +}; + +int xen_pcibk_config_capability_init(void) +{ + register_capability(&xen_pcibk_config_capability_vpd); + register_capability(&xen_pcibk_config_capability_pm); + + return 0; +} diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c new file mode 100644 index 000000000..ad3d17d29 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -0,0 +1,433 @@ +/* + * PCI Backend - Handles the virtual fields in the configuration space headers. + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" + +struct pci_cmd_info { + u16 val; +}; + +struct pci_bar_info { + u32 val; + u32 len_val; + int which; +}; + +#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) +#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) + +/* Bits guests are allowed to control in permissive mode. 
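+ * command_read() below returns the live hardware value for these bits and
+ * the emulated copy (cmd->val) for everything else; in permissive mode
+ * command_write() forwards, roughly,
+ *
+ *	(guest & PCI_COMMAND_GUEST) | (hardware & ~PCI_COMMAND_GUEST)
+ *
+ * so bits such as SERR# enable or parity-error response only ever come from
+ * the host-side value.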
*/ +#define PCI_COMMAND_GUEST (PCI_COMMAND_MASTER|PCI_COMMAND_SPECIAL| \ + PCI_COMMAND_INVALIDATE|PCI_COMMAND_VGA_PALETTE| \ + PCI_COMMAND_WAIT|PCI_COMMAND_FAST_BACK) + +static void *command_init(struct pci_dev *dev, int offset) +{ + struct pci_cmd_info *cmd = kmalloc(sizeof(*cmd), GFP_KERNEL); + int err; + + if (!cmd) + return ERR_PTR(-ENOMEM); + + err = pci_read_config_word(dev, PCI_COMMAND, &cmd->val); + if (err) { + kfree(cmd); + return ERR_PTR(err); + } + + return cmd; +} + +static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) +{ + int ret = pci_read_config_word(dev, offset, value); + const struct pci_cmd_info *cmd = data; + + *value &= PCI_COMMAND_GUEST; + *value |= cmd->val & ~PCI_COMMAND_GUEST; + + return ret; +} + +static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) +{ + struct xen_pcibk_dev_data *dev_data; + int err; + u16 val; + struct pci_cmd_info *cmd = data; + + dev_data = pci_get_drvdata(dev); + if (!pci_is_enabled(dev) && is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable\n", + pci_name(dev)); + err = pci_enable_device(dev); + if (err) + return err; + if (dev_data) + dev_data->enable_intx = 1; + } else if (pci_is_enabled(dev) && !is_enable_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: disable\n", + pci_name(dev)); + pci_disable_device(dev); + if (dev_data) + dev_data->enable_intx = 0; + } + + if (!dev->is_busmaster && is_master_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: set bus master\n", + pci_name(dev)); + pci_set_master(dev); + } else if (dev->is_busmaster && !is_master_cmd(value)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: clear bus master\n", + pci_name(dev)); + pci_clear_master(dev); + } + + if (!(cmd->val & PCI_COMMAND_INVALIDATE) && + (value & PCI_COMMAND_INVALIDATE)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG + DRV_NAME ": %s: enable memory-write-invalidate\n", + pci_name(dev)); + err = pci_set_mwi(dev); + if (err) { + pr_warn("%s: cannot enable memory-write-invalidate (%d)\n", + pci_name(dev), err); + value &= ~PCI_COMMAND_INVALIDATE; + } + } else if ((cmd->val & PCI_COMMAND_INVALIDATE) && + !(value & PCI_COMMAND_INVALIDATE)) { + if (unlikely(verbose_request)) + printk(KERN_DEBUG + DRV_NAME ": %s: disable memory-write-invalidate\n", + pci_name(dev)); + pci_clear_mwi(dev); + } + + cmd->val = value; + + if (!xen_pcibk_permissive && (!dev_data || !dev_data->permissive)) + return 0; + + /* Only allow the guest to control certain bits. */ + err = pci_read_config_word(dev, offset, &val); + if (err || val == value) + return err; + + value &= PCI_COMMAND_GUEST; + value |= val & ~PCI_COMMAND_GUEST; + + return pci_write_config_word(dev, offset, value); +} + +static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + pr_warn(DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. + * This does not (yet) support writing individual bytes + */ + if (value == ~PCI_ROM_ADDRESS_ENABLE) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. 
*/ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + /* Do we need to support enabling/disabling the rom address here? */ + + return 0; +} + +/* For the BARs, only allow writes which write ~0 or + * the correct resource information + * (Needed for when the driver probes the resource usage) + */ +static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + pr_warn(DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + /* A write to obtain the length must happen as a 32-bit write. + * This does not (yet) support writing individual bytes + */ + if (value == ~0) + bar->which = 1; + else { + u32 tmpval; + pci_read_config_dword(dev, offset, &tmpval); + if (tmpval != bar->val && value == bar->val) { + /* Allow restoration of bar value. */ + pci_write_config_dword(dev, offset, bar->val); + } + bar->which = 0; + } + + return 0; +} + +static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) +{ + struct pci_bar_info *bar = data; + + if (unlikely(!bar)) { + pr_warn(DRV_NAME ": driver data not found for %s\n", + pci_name(dev)); + return XEN_PCI_ERR_op_failed; + } + + *value = bar->which ? bar->len_val : bar->val; + + return 0; +} + +static inline void read_dev_bar(struct pci_dev *dev, + struct pci_bar_info *bar_info, int offset, + u32 len_mask) +{ + int pos; + struct resource *res = dev->resource; + + if (offset == PCI_ROM_ADDRESS || offset == PCI_ROM_ADDRESS1) + pos = PCI_ROM_RESOURCE; + else { + pos = (offset - PCI_BASE_ADDRESS_0) / 4; + if (pos && ((res[pos - 1].flags & (PCI_BASE_ADDRESS_SPACE | + PCI_BASE_ADDRESS_MEM_TYPE_MASK)) == + (PCI_BASE_ADDRESS_SPACE_MEMORY | + PCI_BASE_ADDRESS_MEM_TYPE_64))) { + bar_info->val = res[pos - 1].start >> 32; + bar_info->len_val = res[pos - 1].end >> 32; + return; + } + } + + bar_info->val = res[pos].start | + (res[pos].flags & PCI_REGION_FLAG_MASK); + bar_info->len_val = resource_size(&res[pos]); +} + +static void *bar_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~0); + bar->which = 0; + + return bar; +} + +static void *rom_init(struct pci_dev *dev, int offset) +{ + struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); + + if (!bar) + return ERR_PTR(-ENOMEM); + + read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); + bar->which = 0; + + return bar; +} + +static void bar_reset(struct pci_dev *dev, int offset, void *data) +{ + struct pci_bar_info *bar = data; + + bar->which = 0; +} + +static void bar_release(struct pci_dev *dev, int offset, void *data) +{ + kfree(data); +} + +static int xen_pcibk_read_vendor(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->vendor; + + return 0; +} + +static int xen_pcibk_read_device(struct pci_dev *dev, int offset, + u16 *value, void *data) +{ + *value = dev->device; + + return 0; +} + +static int interrupt_read(struct pci_dev *dev, int offset, u8 * value, + void *data) +{ + *value = (u8) dev->irq; + + return 0; +} + +static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data) +{ + u8 cur_value; + int err; + + err = pci_read_config_byte(dev, offset, &cur_value); + if (err) + goto out; + + if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START) + || value == PCI_BIST_START) + err = pci_write_config_byte(dev, offset, value); + +out: + return err; +} + +static 
const struct config_field header_common[] = { + { + .offset = PCI_VENDOR_ID, + .size = 2, + .u.w.read = xen_pcibk_read_vendor, + }, + { + .offset = PCI_DEVICE_ID, + .size = 2, + .u.w.read = xen_pcibk_read_device, + }, + { + .offset = PCI_COMMAND, + .size = 2, + .init = command_init, + .release = bar_release, + .u.w.read = command_read, + .u.w.write = command_write, + }, + { + .offset = PCI_INTERRUPT_LINE, + .size = 1, + .u.b.read = interrupt_read, + }, + { + .offset = PCI_INTERRUPT_PIN, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + /* Any side effects of letting driver domain control cache line? */ + .offset = PCI_CACHE_LINE_SIZE, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + .u.b.write = xen_pcibk_write_config_byte, + }, + { + .offset = PCI_LATENCY_TIMER, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + }, + { + .offset = PCI_BIST, + .size = 1, + .u.b.read = xen_pcibk_read_config_byte, + .u.b.write = bist_write, + }, + {} +}; + +#define CFG_FIELD_BAR(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = bar_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = bar_write, \ + } + +#define CFG_FIELD_ROM(reg_offset) \ + { \ + .offset = reg_offset, \ + .size = 4, \ + .init = rom_init, \ + .reset = bar_reset, \ + .release = bar_release, \ + .u.dw.read = bar_read, \ + .u.dw.write = rom_write, \ + } + +static const struct config_field header_0[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_2), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_3), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_4), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_5), + CFG_FIELD_ROM(PCI_ROM_ADDRESS), + {} +}; + +static const struct config_field header_1[] = { + CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), + CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), + CFG_FIELD_ROM(PCI_ROM_ADDRESS1), + {} +}; + +int xen_pcibk_config_header_add_fields(struct pci_dev *dev) +{ + int err; + + err = xen_pcibk_config_add_fields(dev, header_common); + if (err) + goto out; + + switch (dev->hdr_type) { + case PCI_HEADER_TYPE_NORMAL: + err = xen_pcibk_config_add_fields(dev, header_0); + break; + + case PCI_HEADER_TYPE_BRIDGE: + err = xen_pcibk_config_add_fields(dev, header_1); + break; + + default: + err = -EINVAL; + pr_err("%s: Unsupported header type %d!\n", + pci_name(dev), dev->hdr_type); + break; + } + +out: + return err; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c new file mode 100644 index 000000000..7476791ca --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -0,0 +1,139 @@ +/* + * PCI Backend - Handle special overlays for broken devices. 
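+ * A quirk is keyed by a pci_device_id: xen_pcibk_config_quirks_init()
+ * registers one for every device grabbed by the stub driver, and
+ * xen_pcibk_config_quirks_add_field() wires any extra fields straight
+ * through to the real configuration space accessors.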
+ * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + * Author: Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#include <linux/kernel.h> +#include <linux/pci.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +LIST_HEAD(xen_pcibk_quirks); +static inline const struct pci_device_id * +match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) +{ + if ((id->vendor == PCI_ANY_ID || id->vendor == dev->vendor) && + (id->device == PCI_ANY_ID || id->device == dev->device) && + (id->subvendor == PCI_ANY_ID || + id->subvendor == dev->subsystem_vendor) && + (id->subdevice == PCI_ANY_ID || + id->subdevice == dev->subsystem_device) && + !((id->class ^ dev->class) & id->class_mask)) + return id; + return NULL; +} + +static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *tmp_quirk; + + list_for_each_entry(tmp_quirk, &xen_pcibk_quirks, quirks_list) + if (match_one_device(&tmp_quirk->devid, dev) != NULL) + goto out; + tmp_quirk = NULL; + printk(KERN_DEBUG DRV_NAME + ": quirk didn't match any device known\n"); +out: + return tmp_quirk; +} + +static inline void register_quirk(struct xen_pcibk_config_quirk *quirk) +{ + list_add_tail(&quirk->quirks_list, &xen_pcibk_quirks); +} + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg) +{ + int ret = 0; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + struct config_field_entry *cfg_entry; + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + if (OFFSET(cfg_entry) == reg) { + ret = 1; + break; + } + } + return ret; +} + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field + *field) +{ + int err = 0; + + switch (field->size) { + case 1: + field->u.b.read = xen_pcibk_read_config_byte; + field->u.b.write = xen_pcibk_write_config_byte; + break; + case 2: + field->u.w.read = xen_pcibk_read_config_word; + field->u.w.write = xen_pcibk_write_config_word; + break; + case 4: + field->u.dw.read = xen_pcibk_read_config_dword; + field->u.dw.write = xen_pcibk_write_config_dword; + break; + default: + err = -EINVAL; + goto out; + } + + xen_pcibk_config_add_field(dev, field); + +out: + return err; +} + +int xen_pcibk_config_quirks_init(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *quirk; + int ret = 0; + + quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC); + if (!quirk) { + ret = -ENOMEM; + goto out; + } + + quirk->devid.vendor = dev->vendor; + quirk->devid.device = dev->device; + quirk->devid.subvendor = dev->subsystem_vendor; + quirk->devid.subdevice = dev->subsystem_device; + quirk->devid.class = 0; + quirk->devid.class_mask = 0; + quirk->devid.driver_data = 0UL; + + quirk->pdev = dev; + + register_quirk(quirk); +out: + return ret; +} + +void xen_pcibk_config_field_free(struct config_field *field) +{ + kfree(field); +} + +int xen_pcibk_config_quirk_release(struct pci_dev *dev) +{ + struct xen_pcibk_config_quirk *quirk; + int ret = 0; + + quirk = xen_pcibk_find_quirk(dev); + if (!quirk) { + ret = -ENXIO; + goto out; + } + + list_del(&quirk->quirks_list); + kfree(quirk); + +out: + return ret; +} diff --git a/drivers/xen/xen-pciback/conf_space_quirks.h b/drivers/xen/xen-pciback/conf_space_quirks.h new file mode 100644 index 000000000..cfcc517e4 --- /dev/null +++ b/drivers/xen/xen-pciback/conf_space_quirks.h @@ -0,0 +1,33 @@ +/* + * PCI Backend - Data structures for special overlays for broken devices. 
+ * + * Ryan Wilson <hap9@epoch.ncsc.mil> + * Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ +#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ + +#include <linux/pci.h> +#include <linux/list.h> + +struct xen_pcibk_config_quirk { + struct list_head quirks_list; + struct pci_device_id devid; + struct pci_dev *pdev; +}; + +int xen_pcibk_config_quirks_add_field(struct pci_dev *dev, struct config_field + *field); + +int xen_pcibk_config_quirks_remove_field(struct pci_dev *dev, int reg); + +int xen_pcibk_config_quirks_init(struct pci_dev *dev); + +void xen_pcibk_config_field_free(struct config_field *field); + +int xen_pcibk_config_quirk_release(struct pci_dev *dev); + +int xen_pcibk_field_is_dup(struct pci_dev *dev, unsigned int reg); + +#endif diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c new file mode 100644 index 000000000..f16a30e2a --- /dev/null +++ b/drivers/xen/xen-pciback/passthrough.c @@ -0,0 +1,196 @@ +/* + * PCI Backend - Provides restricted access to the real PCI bus topology + * to the frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/mutex.h> +#include "pciback.h" + +struct passthrough_dev_data { + /* Access to dev_list must be protected by lock */ + struct list_head dev_list; + struct mutex lock; +}; + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, + unsigned int bus, + unsigned int devfn) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + struct pci_dev *dev = NULL; + + mutex_lock(&dev_data->lock); + + list_for_each_entry(dev_entry, &dev_data->dev_list, list) { + if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) + && bus == (unsigned int)dev_entry->dev->bus->number + && devfn == dev_entry->dev->devfn) { + dev = dev_entry->dev; + break; + } + } + + mutex_unlock(&dev_data->lock); + + return dev; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry; + unsigned int domain, bus, devfn; + int err; + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); + if (!dev_entry) + return -ENOMEM; + dev_entry->dev = dev; + + mutex_lock(&dev_data->lock); + list_add_tail(&dev_entry->list, &dev_data->dev_list); + mutex_unlock(&dev_data->lock); + + /* Publish this device. 
*/ + domain = (unsigned int)pci_domain_nr(dev->bus); + bus = (unsigned int)dev->bus->number; + devfn = dev->devfn; + err = publish_cb(pdev, domain, bus, devfn, devid); + + return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, bool lock) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + struct pci_dev *found_dev = NULL; + + mutex_lock(&dev_data->lock); + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + if (dev_entry->dev == dev) { + list_del(&dev_entry->list); + found_dev = dev_entry->dev; + kfree(dev_entry); + } + } + + mutex_unlock(&dev_data->lock); + + if (found_dev) { + if (lock) + device_lock(&found_dev->dev); + pcistub_put_pci_dev(found_dev); + if (lock) + device_unlock(&found_dev->dev); + } +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + struct passthrough_dev_data *dev_data; + + dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); + if (!dev_data) + return -ENOMEM; + + mutex_init(&dev_data->lock); + + INIT_LIST_HEAD(&dev_data->dev_list); + + pdev->pci_dev_data = dev_data; + + return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb publish_root_cb) +{ + int err = 0; + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *e; + struct pci_dev *dev; + int found; + unsigned int domain, bus; + + mutex_lock(&dev_data->lock); + + list_for_each_entry(dev_entry, &dev_data->dev_list, list) { + /* Only publish this device as a root if none of its + * parent bridges are exported + */ + found = 0; + dev = dev_entry->dev->bus->self; + for (; !found && dev != NULL; dev = dev->bus->self) { + list_for_each_entry(e, &dev_data->dev_list, list) { + if (dev == e->dev) { + found = 1; + break; + } + } + } + + domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus); + bus = (unsigned int)dev_entry->dev->bus->number; + + if (!found) { + err = publish_root_cb(pdev, domain, bus); + if (err) + break; + } + } + + mutex_unlock(&dev_data->lock); + + return err; +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + struct passthrough_dev_data *dev_data = pdev->pci_dev_data; + struct pci_dev_entry *dev_entry, *t; + + list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { + struct pci_dev *dev = dev_entry->dev; + list_del(&dev_entry->list); + device_lock(&dev->dev); + pcistub_put_pci_dev(dev); + device_unlock(&dev->dev); + kfree(dev_entry); + } + + kfree(dev_data); + pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn) +{ + *domain = pci_domain_nr(pcidev->bus); + *bus = pcidev->bus->number; + *devfn = pcidev->devfn; + return 1; +} + +const struct xen_pcibk_backend xen_pcibk_passthrough_backend = { + .name = "passthrough", + .init = __xen_pcibk_init_devices, + .free = __xen_pcibk_release_devices, + .find = __xen_pcibk_get_pcifront_dev, + .publish = __xen_pcibk_publish_pci_roots, + .release = __xen_pcibk_release_pci_dev, + .add = __xen_pcibk_add_pci_dev, + .get = __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c new file mode 100644 index 000000000..258b7c325 --- /dev/null +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -0,0 +1,1610 @@ +/* + * PCI Stub Driver - Grabs devices in backend to be exported later + * + * Ryan 
Wilson <hap9@epoch.ncsc.mil> + * Chris Bookholt <hap10@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rwsem.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/kref.h> +#include <linux/pci.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/atomic.h> +#include <xen/events.h> +#include <asm/xen/pci.h> +#include <asm/xen/hypervisor.h> +#include <xen/interface/physdev.h> +#include "pciback.h" +#include "conf_space.h" +#include "conf_space_quirks.h" + +static char *pci_devs_to_hide; +wait_queue_head_t xen_pcibk_aer_wait_queue; +/*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops, +* We want to avoid in middle of AER ops, xen_pcibk devices is being removed +*/ +static DECLARE_RWSEM(pcistub_sem); +module_param_named(hide, pci_devs_to_hide, charp, 0444); + +struct pcistub_device_id { + struct list_head slot_list; + int domain; + unsigned char bus; + unsigned int devfn; +}; +static LIST_HEAD(pcistub_device_ids); +static DEFINE_SPINLOCK(device_ids_lock); + +struct pcistub_device { + struct kref kref; + struct list_head dev_list; + spinlock_t lock; + + struct pci_dev *dev; + struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +}; + +/* Access to pcistub_devices & seized_devices lists and the initialize_devices + * flag must be locked with pcistub_devices_lock + */ +static DEFINE_SPINLOCK(pcistub_devices_lock); +static LIST_HEAD(pcistub_devices); + +/* wait for device_initcall before initializing our devices + * (see pcistub_init_devices_late) + */ +static int initialize_devices; +static LIST_HEAD(seized_devices); + +static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + + dev_dbg(&dev->dev, "pcistub_device_alloc\n"); + + psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC); + if (!psdev) + return NULL; + + psdev->dev = pci_dev_get(dev); + if (!psdev->dev) { + kfree(psdev); + return NULL; + } + + kref_init(&psdev->kref); + spin_lock_init(&psdev->lock); + + return psdev; +} + +/* Don't call this directly as it's called by pcistub_device_put */ +static void pcistub_device_release(struct kref *kref) +{ + struct pcistub_device *psdev; + struct pci_dev *dev; + struct xen_pcibk_dev_data *dev_data; + + psdev = container_of(kref, struct pcistub_device, kref); + dev = psdev->dev; + dev_data = pci_get_drvdata(dev); + + dev_dbg(&dev->dev, "pcistub_device_release\n"); + + xen_unregister_device_domain_owner(dev); + + /* Call the reset function which does not take lock as this + * is called from "unbind" which takes a device_lock mutex. 
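+ * In other words the device lock is already held by the caller, which is
+ * why the *_locked variant of the reset is used here.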
+ */ + __pci_reset_function_locked(dev); + if (pci_load_and_free_saved_state(dev, &dev_data->pci_saved_state)) + dev_info(&dev->dev, "Could not reload PCI state\n"); + else + pci_restore_state(dev); + + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + int err = HYPERVISOR_physdev_op(PHYSDEVOP_release_msix, + &ppdev); + + if (err && err != -ENOSYS) + dev_warn(&dev->dev, "MSI-X release failed (%d)\n", + err); + } + + /* Disable the device */ + xen_pcibk_reset_device(dev); + + kfree(dev_data); + pci_set_drvdata(dev, NULL); + + /* Clean-up the device */ + xen_pcibk_config_free_dyn_fields(dev); + xen_pcibk_config_free_dev(dev); + + pci_clear_dev_assigned(dev); + pci_dev_put(dev); + + kfree(psdev); +} + +static inline void pcistub_device_get(struct pcistub_device *psdev) +{ + kref_get(&psdev->kref); +} + +static inline void pcistub_device_put(struct pcistub_device *psdev) +{ + kref_put(&psdev->kref, pcistub_device_release); +} + +static struct pcistub_device *pcistub_device_find(int domain, int bus, + int slot, int func) +{ + struct pcistub_device *psdev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev != NULL + && domain == pci_domain_nr(psdev->dev->bus) + && bus == psdev->dev->bus->number + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { + pcistub_device_get(psdev); + goto out; + } + } + + /* didn't find it */ + psdev = NULL; + +out: + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return psdev; +} + +static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, + struct pcistub_device *psdev) +{ + struct pci_dev *pci_dev = NULL; + unsigned long flags; + + pcistub_device_get(psdev); + + spin_lock_irqsave(&psdev->lock, flags); + if (!psdev->pdev) { + psdev->pdev = pdev; + pci_dev = psdev->dev; + } + spin_unlock_irqrestore(&psdev->lock, flags); + + if (!pci_dev) + pcistub_device_put(psdev); + + return pci_dev; +} + +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, + int domain, int bus, + int slot, int func) +{ + struct pcistub_device *psdev; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev != NULL + && domain == pci_domain_nr(psdev->dev->bus) + && bus == psdev->dev->bus->number + && slot == PCI_SLOT(psdev->dev->devfn) + && func == PCI_FUNC(psdev->dev->devfn)) { + found_dev = pcistub_device_get_pci_dev(pdev, psdev); + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return found_dev; +} + +struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev) +{ + struct pcistub_device *psdev; + struct pci_dev *found_dev = NULL; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_dev = pcistub_device_get_pci_dev(pdev, psdev); + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return found_dev; +} + +/* + * Called when: + * - XenBus state has been reconfigure (pci unplug). See xen_pcibk_remove_device + * - XenBus state has been disconnected (guest shutdown). See xen_pcibk_xenbus_remove + * - 'echo BDF > unbind' on pciback module with no guest attached. 
See pcistub_remove + * - 'echo BDF > unbind' with a guest still using it. See pcistub_remove + * + * As such we have to be careful. + * + * To make this easier, the caller has to hold the device lock. + */ +void pcistub_put_pci_dev(struct pci_dev *dev) +{ + struct pcistub_device *psdev, *found_psdev = NULL; + unsigned long flags; + struct xen_pcibk_dev_data *dev_data; + int ret; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_psdev = psdev; + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + if (WARN_ON(!found_psdev)) + return; + + /*hold this lock for avoiding breaking link between + * pcistub and xen_pcibk when AER is in processing + */ + down_write(&pcistub_sem); + /* Cleanup our device + * (so it's ready for the next domain) + */ + device_lock_assert(&dev->dev); + __pci_reset_function_locked(dev); + + dev_data = pci_get_drvdata(dev); + ret = pci_load_saved_state(dev, dev_data->pci_saved_state); + if (!ret) { + /* + * The usual sequence is pci_save_state & pci_restore_state + * but the guest might have messed the configuration space up. + * Use the initial version (when device was bound to us). + */ + pci_restore_state(dev); + } else + dev_info(&dev->dev, "Could not reload PCI state\n"); + /* This disables the device. */ + xen_pcibk_reset_device(dev); + + /* And cleanup up our emulated fields. */ + xen_pcibk_config_reset_dev(dev); + xen_pcibk_config_free_dyn_fields(dev); + + xen_unregister_device_domain_owner(dev); + + spin_lock_irqsave(&found_psdev->lock, flags); + found_psdev->pdev = NULL; + spin_unlock_irqrestore(&found_psdev->lock, flags); + + pcistub_device_put(found_psdev); + up_write(&pcistub_sem); +} + +static int pcistub_match_one(struct pci_dev *dev, + struct pcistub_device_id *pdev_id) +{ + /* Match the specified device by domain, bus, slot, func and also if + * any of the device's parent bridges match. + */ + for (; dev != NULL; dev = dev->bus->self) { + if (pci_domain_nr(dev->bus) == pdev_id->domain + && dev->bus->number == pdev_id->bus + && dev->devfn == pdev_id->devfn) + return 1; + + /* Sometimes topmost bridge links to itself. */ + if (dev == dev->bus->self) + break; + } + + return 0; +} + +static int pcistub_match(struct pci_dev *dev) +{ + struct pcistub_device_id *pdev_id; + unsigned long flags; + int found = 0; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) { + if (pcistub_match_one(dev, pdev_id)) { + found = 1; + break; + } + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return found; +} + +static int pcistub_init_device(struct pci_dev *dev) +{ + struct xen_pcibk_dev_data *dev_data; + int err = 0; + + dev_dbg(&dev->dev, "initializing...\n"); + + /* The PCI backend is not intended to be a module (or to work with + * removable PCI devices (yet). If it were, xen_pcibk_config_free() + * would need to be called somewhere to free the memory allocated + * here and then to call kfree(pci_get_drvdata(psdev->dev)). + */ + dev_data = kzalloc(sizeof(*dev_data) + strlen(DRV_NAME "[]") + + strlen(pci_name(dev)) + 1, GFP_ATOMIC); + if (!dev_data) { + err = -ENOMEM; + goto out; + } + pci_set_drvdata(dev, dev_data); + + /* + * Setup name for fake IRQ handler. It will only be enabled + * once the device is turned on by the guest. 
+ */ + sprintf(dev_data->irq_name, DRV_NAME "[%s]", pci_name(dev)); + + dev_dbg(&dev->dev, "initializing config\n"); + + init_waitqueue_head(&xen_pcibk_aer_wait_queue); + err = xen_pcibk_config_init_dev(dev); + if (err) + goto out; + + /* HACK: Force device (& ACPI) to determine what IRQ it's on - we + * must do this here because pcibios_enable_device may specify + * the pci device's true irq (and possibly its other resources) + * if they differ from what's in the configuration space. + * This makes the assumption that the device's resources won't + * change after this point (otherwise this code may break!) + */ + dev_dbg(&dev->dev, "enabling device\n"); + err = pci_enable_device(dev); + if (err) + goto config_release; + + if (dev->msix_cap) { + struct physdev_pci_device ppdev = { + .seg = pci_domain_nr(dev->bus), + .bus = dev->bus->number, + .devfn = dev->devfn + }; + + err = HYPERVISOR_physdev_op(PHYSDEVOP_prepare_msix, &ppdev); + if (err && err != -ENOSYS) + dev_err(&dev->dev, "MSI-X preparation failed (%d)\n", + err); + } + + /* We need the device active to save the state. */ + dev_dbg(&dev->dev, "save state of device\n"); + pci_save_state(dev); + dev_data->pci_saved_state = pci_store_saved_state(dev); + if (!dev_data->pci_saved_state) + dev_err(&dev->dev, "Could not store PCI conf saved state!\n"); + else { + dev_dbg(&dev->dev, "resetting (FLR, D3, etc) the device\n"); + __pci_reset_function_locked(dev); + pci_restore_state(dev); + } + /* Now disable the device (this also ensures some private device + * data is setup before we export) + */ + dev_dbg(&dev->dev, "reset device\n"); + xen_pcibk_reset_device(dev); + + pci_set_dev_assigned(dev); + return 0; + +config_release: + xen_pcibk_config_free_dev(dev); + +out: + pci_set_drvdata(dev, NULL); + kfree(dev_data); + return err; +} + +/* + * Because some initialization still happens on + * devices during fs_initcall, we need to defer + * full initialization of our devices until + * device_initcall. 
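+ *
+ * pcistub_init_devices_late() therefore drains the seized_devices list,
+ * running pcistub_init_device() on each deferred device, and then sets
+ * initialize_devices so that any later pcistub_seize() initializes the
+ * device immediately instead of deferring it.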
+ */ +static int __init pcistub_init_devices_late(void) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + while (!list_empty(&seized_devices)) { + psdev = container_of(seized_devices.next, + struct pcistub_device, dev_list); + list_del(&psdev->dev_list); + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + err = pcistub_init_device(psdev->dev); + if (err) { + dev_err(&psdev->dev->dev, + "error %d initializing device\n", err); + kfree(psdev); + psdev = NULL; + } + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (psdev) + list_add_tail(&psdev->dev_list, &pcistub_devices); + } + + initialize_devices = 1; + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + return 0; +} + +static int pcistub_seize(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + unsigned long flags; + int err = 0; + + psdev = pcistub_device_alloc(dev); + if (!psdev) + return -ENOMEM; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (initialize_devices) { + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /* don't want irqs disabled when calling pcistub_init_device */ + err = pcistub_init_device(psdev->dev); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + if (!err) + list_add(&psdev->dev_list, &pcistub_devices); + } else { + dev_dbg(&dev->dev, "deferring initialization\n"); + list_add(&psdev->dev_list, &seized_devices); + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + if (err) + pcistub_device_put(psdev); + + return err; +} + +/* Called when 'bind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static int pcistub_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + int err = 0; + + dev_dbg(&dev->dev, "probing...\n"); + + if (pcistub_match(dev)) { + + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL + && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { + dev_err(&dev->dev, "can't export pci devices that " + "don't have a normal (0) or bridge (1) " + "header type!\n"); + err = -ENODEV; + goto out; + } + + dev_info(&dev->dev, "seizing device\n"); + err = pcistub_seize(dev); + } else + /* Didn't find the device */ + err = -ENODEV; + +out: + return err; +} + +/* Called when 'unbind'. This means we must _NOT_ call pci_reset_function or + * other functions that take the sysfs lock. */ +static void pcistub_remove(struct pci_dev *dev) +{ + struct pcistub_device *psdev, *found_psdev = NULL; + unsigned long flags; + + dev_dbg(&dev->dev, "removing\n"); + + spin_lock_irqsave(&pcistub_devices_lock, flags); + + xen_pcibk_config_quirk_release(dev); + + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (psdev->dev == dev) { + found_psdev = psdev; + break; + } + } + + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + if (found_psdev) { + dev_dbg(&dev->dev, "found device to remove %s\n", + found_psdev->pdev ? "- in-use" : ""); + + if (found_psdev->pdev) { + int domid = xen_find_device_domain_owner(dev); + + pr_warn("****** removing device %s while still in-use by domain %d! ******\n", + pci_name(found_psdev->dev), domid); + pr_warn("****** driver domain may still access this device's i/o resources!\n"); + pr_warn("****** shutdown driver domain before binding device\n"); + pr_warn("****** to other drivers or domains\n"); + + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ + xen_pcibk_release_pci_dev(found_psdev->pdev, + found_psdev->dev, + false /* caller holds the lock. 
*/); + } + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_del(&found_psdev->dev_list); + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + + /* the final put for releasing from the list */ + pcistub_device_put(found_psdev); + } +} + +static const struct pci_device_id pcistub_ids[] = { + { + .vendor = PCI_ANY_ID, + .device = PCI_ANY_ID, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + }, + {0,}, +}; + +#define PCI_NODENAME_MAX 40 +static void kill_domain_by_device(struct pcistub_device *psdev) +{ + struct xenbus_transaction xbt; + int err; + char nodename[PCI_NODENAME_MAX]; + + BUG_ON(!psdev); + snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0", + psdev->pdev->xdev->otherend_id); + +again: + err = xenbus_transaction_start(&xbt); + if (err) { + dev_err(&psdev->dev->dev, + "error %d when start xenbus transaction\n", err); + return; + } + /*PV AER handlers will set this flag*/ + xenbus_printf(xbt, nodename, "aerState" , "aerfail"); + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + dev_err(&psdev->dev->dev, + "error %d when end xenbus transaction\n", err); + return; + } +} + +/* For each aer recovery step error_detected, mmio_enabled, etc, front_end and + * backend need to have cooperation. In xen_pcibk, those steps will do similar + * jobs: send service request and waiting for front_end response. +*/ +static pci_ers_result_t common_process(struct pcistub_device *psdev, + pci_channel_state_t state, int aer_cmd, + pci_ers_result_t result) +{ + pci_ers_result_t res = result; + struct xen_pcie_aer_op *aer_op; + struct xen_pcibk_device *pdev = psdev->pdev; + struct xen_pci_sharedinfo *sh_info = pdev->sh_info; + int ret; + + /*with PV AER drivers*/ + aer_op = &(sh_info->aer_op); + aer_op->cmd = aer_cmd ; + /*useful for error_detected callback*/ + aer_op->err = state; + /*pcifront_end BDF*/ + ret = xen_pcibk_get_pcifront_dev(psdev->dev, psdev->pdev, + &aer_op->domain, &aer_op->bus, &aer_op->devfn); + if (!ret) { + dev_err(&psdev->dev->dev, + DRV_NAME ": failed to get pcifront device\n"); + return PCI_ERS_RESULT_NONE; + } + wmb(); + + dev_dbg(&psdev->dev->dev, + DRV_NAME ": aer_op %x dom %x bus %x devfn %x\n", + aer_cmd, aer_op->domain, aer_op->bus, aer_op->devfn); + /*local flag to mark there's aer request, xen_pcibk callback will use + * this flag to judge whether we need to check pci-front give aer + * service ack signal + */ + set_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags); + + /*It is possible that a pcifront conf_read_write ops request invokes + * the callback which cause the spurious execution of wake_up. 
+ * Yet it is harmless and better than a spinlock here + */ + set_bit(_XEN_PCIB_active, + (unsigned long *)&sh_info->flags); + wmb(); + notify_remote_via_irq(pdev->evtchn_irq); + + ret = wait_event_timeout(xen_pcibk_aer_wait_queue, + !(test_bit(_XEN_PCIB_active, (unsigned long *) + &sh_info->flags)), 300*HZ); + + if (!ret) { + if (test_bit(_XEN_PCIB_active, + (unsigned long *)&sh_info->flags)) { + dev_err(&psdev->dev->dev, + "pcifront aer process not responding!\n"); + clear_bit(_XEN_PCIB_active, + (unsigned long *)&sh_info->flags); + aer_op->err = PCI_ERS_RESULT_NONE; + return res; + } + } + clear_bit(_PCIB_op_pending, (unsigned long *)&pdev->flags); + + if (test_bit(_XEN_PCIF_active, + (unsigned long *)&sh_info->flags)) { + dev_dbg(&psdev->dev->dev, + "schedule pci_conf service in " DRV_NAME "\n"); + xen_pcibk_test_and_schedule_op(psdev->pdev); + } + + res = (pci_ers_result_t)aer_op->err; + return res; +} + +/* +* xen_pcibk_slot_reset: it will send the slot_reset request to pcifront in case +* of the device driver could provide this service, and then wait for pcifront +* ack. +* @dev: pointer to PCI devices +* return value is used by aer_core do_recovery policy +*/ +static pci_ers_result_t xen_pcibk_slot_reset(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + pci_ers_result_t result; + + result = PCI_ERS_RESULT_RECOVERED; + dev_dbg(&dev->dev, "xen_pcibk_slot_reset(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_err(&dev->dev, + "guest with no AER driver should have been killed\n"); + goto end; + } + result = common_process(psdev, 1, XEN_PCI_OP_aer_slotreset, result); + + if (result == PCI_ERS_RESULT_NONE || + result == PCI_ERS_RESULT_DISCONNECT) { + dev_dbg(&dev->dev, + "No AER slot_reset service or disconnected!\n"); + kill_domain_by_device(psdev); + } +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return result; + +} + + +/*xen_pcibk_mmio_enabled: it will send the mmio_enabled request to pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack +* @dev: pointer to PCI devices +* return value is used by aer_core do_recovery policy +*/ + +static pci_ers_result_t xen_pcibk_mmio_enabled(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + pci_ers_result_t result; + + result = PCI_ERS_RESULT_RECOVERED; + dev_dbg(&dev->dev, "xen_pcibk_mmio_enabled(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_err(&dev->dev, + "guest with no AER 
driver should have been killed\n"); + goto end; + } + result = common_process(psdev, 1, XEN_PCI_OP_aer_mmio, result); + + if (result == PCI_ERS_RESULT_NONE || + result == PCI_ERS_RESULT_DISCONNECT) { + dev_dbg(&dev->dev, + "No AER mmio_enabled service or disconnected!\n"); + kill_domain_by_device(psdev); + } +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return result; +} + +/*xen_pcibk_error_detected: it will send the error_detected request to pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack. +* @dev: pointer to PCI devices +* @error: the current PCI connection state +* return value is used by aer_core do_recovery policy +*/ + +static pci_ers_result_t xen_pcibk_error_detected(struct pci_dev *dev, + pci_channel_state_t error) +{ + struct pcistub_device *psdev; + pci_ers_result_t result; + + result = PCI_ERS_RESULT_CAN_RECOVER; + dev_dbg(&dev->dev, "xen_pcibk_error_detected(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + /*Guest owns the device yet no aer handler regiested, kill guest*/ + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_dbg(&dev->dev, "guest may have no aer driver, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + result = common_process(psdev, error, XEN_PCI_OP_aer_detected, result); + + if (result == PCI_ERS_RESULT_NONE || + result == PCI_ERS_RESULT_DISCONNECT) { + dev_dbg(&dev->dev, + "No AER error_detected service or disconnected!\n"); + kill_domain_by_device(psdev); + } +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return result; +} + +/*xen_pcibk_error_resume: it will send the error_resume request to pcifront +* in case of the device driver could provide this service, and then wait +* for pcifront ack. 
+* @dev: pointer to PCI devices +*/ + +static void xen_pcibk_error_resume(struct pci_dev *dev) +{ + struct pcistub_device *psdev; + + dev_dbg(&dev->dev, "xen_pcibk_error_resume(bus:%x,devfn:%x)\n", + dev->bus->number, dev->devfn); + + down_write(&pcistub_sem); + psdev = pcistub_device_find(pci_domain_nr(dev->bus), + dev->bus->number, + PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn)); + + if (!psdev || !psdev->pdev) { + dev_err(&dev->dev, + DRV_NAME " device is not found/assigned\n"); + goto end; + } + + if (!psdev->pdev->sh_info) { + dev_err(&dev->dev, DRV_NAME " device is not connected or owned" + " by HVM, kill it\n"); + kill_domain_by_device(psdev); + goto end; + } + + if (!test_bit(_XEN_PCIB_AERHANDLER, + (unsigned long *)&psdev->pdev->sh_info->flags)) { + dev_err(&dev->dev, + "guest with no AER driver should have been killed\n"); + kill_domain_by_device(psdev); + goto end; + } + common_process(psdev, 1, XEN_PCI_OP_aer_resume, + PCI_ERS_RESULT_RECOVERED); +end: + if (psdev) + pcistub_device_put(psdev); + up_write(&pcistub_sem); + return; +} + +/*add xen_pcibk AER handling*/ +static const struct pci_error_handlers xen_pcibk_error_handler = { + .error_detected = xen_pcibk_error_detected, + .mmio_enabled = xen_pcibk_mmio_enabled, + .slot_reset = xen_pcibk_slot_reset, + .resume = xen_pcibk_error_resume, +}; + +/* + * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't + * for a normal device. I don't want it to be loaded automatically. + */ + +static struct pci_driver xen_pcibk_pci_driver = { + /* The name should be xen_pciback, but until the tools are updated + * we will keep it as pciback. */ + .name = "pciback", + .id_table = pcistub_ids, + .probe = pcistub_probe, + .remove = pcistub_remove, + .err_handler = &xen_pcibk_error_handler, +}; + +static inline int str_to_slot(const char *buf, int *domain, int *bus, + int *slot, int *func) +{ + int parsed = 0; + + switch (sscanf(buf, " %x:%x:%x.%x %n", domain, bus, slot, func, + &parsed)) { + case 3: + *func = -1; + sscanf(buf, " %x:%x:%x.* %n", domain, bus, slot, &parsed); + break; + case 2: + *slot = *func = -1; + sscanf(buf, " %x:%x:*.* %n", domain, bus, &parsed); + break; + } + if (parsed && !buf[parsed]) + return 0; + + /* try again without domain */ + *domain = 0; + switch (sscanf(buf, " %x:%x.%x %n", bus, slot, func, &parsed)) { + case 2: + *func = -1; + sscanf(buf, " %x:%x.* %n", bus, slot, &parsed); + break; + case 1: + *slot = *func = -1; + sscanf(buf, " %x:*.* %n", bus, &parsed); + break; + } + if (parsed && !buf[parsed]) + return 0; + + return -EINVAL; +} + +static inline int str_to_quirk(const char *buf, int *domain, int *bus, int + *slot, int *func, int *reg, int *size, int *mask) +{ + int parsed = 0; + + sscanf(buf, " %x:%x:%x.%x-%x:%x:%x %n", domain, bus, slot, func, + reg, size, mask, &parsed); + if (parsed && !buf[parsed]) + return 0; + + /* try again without domain */ + *domain = 0; + sscanf(buf, " %x:%x.%x-%x:%x:%x %n", bus, slot, func, reg, size, + mask, &parsed); + if (parsed && !buf[parsed]) + return 0; + + return -EINVAL; +} + +static int pcistub_device_id_add(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id; + unsigned long flags; + int rc = 0, devfn = PCI_DEVFN(slot, func); + + if (slot < 0) { + for (slot = 0; !rc && slot < 32; ++slot) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } + + if (func < 0) { + for (func = 0; !rc && func < 8; ++func) + rc = pcistub_device_id_add(domain, bus, slot, func); + return rc; + } + + if (( +#if !defined(MODULE) 
/* pci_domains_supported is not being exported */ \ + || !defined(CONFIG_PCI_DOMAINS) + !pci_domains_supported ? domain : +#endif + domain < 0 || domain > 0xffff) + || bus < 0 || bus > 0xff + || PCI_SLOT(devfn) != slot + || PCI_FUNC(devfn) != func) + return -EINVAL; + + pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); + if (!pci_dev_id) + return -ENOMEM; + + pci_dev_id->domain = domain; + pci_dev_id->bus = bus; + pci_dev_id->devfn = devfn; + + pr_debug("wants to seize %04x:%02x:%02x.%d\n", + domain, bus, slot, func); + + spin_lock_irqsave(&device_ids_lock, flags); + list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); + spin_unlock_irqrestore(&device_ids_lock, flags); + + return 0; +} + +static int pcistub_device_id_remove(int domain, int bus, int slot, int func) +{ + struct pcistub_device_id *pci_dev_id, *t; + int err = -ENOENT; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, + slot_list) { + if (pci_dev_id->domain == domain && pci_dev_id->bus == bus + && (slot < 0 || PCI_SLOT(pci_dev_id->devfn) == slot) + && (func < 0 || PCI_FUNC(pci_dev_id->devfn) == func)) { + /* Don't break; here because it's possible the same + * slot could be in the list more than once + */ + list_del(&pci_dev_id->slot_list); + kfree(pci_dev_id); + + err = 0; + + pr_debug("removed %04x:%02x:%02x.%d from seize list\n", + domain, bus, slot, func); + } + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return err; +} + +static int pcistub_reg_add(int domain, int bus, int slot, int func, + unsigned int reg, unsigned int size, + unsigned int mask) +{ + int err = 0; + struct pcistub_device *psdev; + struct pci_dev *dev; + struct config_field *field; + + if (reg > 0xfff || (size < 4 && (mask >> (size * 8)))) + return -EINVAL; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + dev = psdev->dev; + + field = kzalloc(sizeof(*field), GFP_ATOMIC); + if (!field) { + err = -ENOMEM; + goto out; + } + + field->offset = reg; + field->size = size; + field->mask = mask; + field->init = NULL; + field->reset = NULL; + field->release = NULL; + field->clean = xen_pcibk_config_field_free; + + err = xen_pcibk_config_quirks_add_field(dev, field); + if (err) + kfree(field); +out: + if (psdev) + pcistub_device_put(psdev); + return err; +} + +static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + err = pcistub_device_id_add(domain, bus, slot, func); + +out: + if (!err) + err = count; + return err; +} +static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); + +static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + err = pcistub_device_id_remove(domain, bus, slot, func); + +out: + if (!err) + err = count; + return err; +} +static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); + +static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device_id *pci_dev_id; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { + if (count >= PAGE_SIZE) + break; + + count += scnprintf(buf + count, PAGE_SIZE - count, 
+ "%04x:%02x:%02x.%d\n", + pci_dev_id->domain, pci_dev_id->bus, + PCI_SLOT(pci_dev_id->devfn), + PCI_FUNC(pci_dev_id->devfn)); + } + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} +static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); + +static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data) + continue; + count += + scnprintf(buf + count, PAGE_SIZE - count, + "%s:%s:%sing:%ld\n", + pci_name(psdev->dev), + dev_data->isr_on ? "on" : "off", + dev_data->ack_intr ? "ack" : "not ack", + dev_data->handled); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} +static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); + +static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, + const char *buf, + size_t count) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + int domain, bus, slot, func; + int err; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + return err; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENOENT; + goto out; + } + + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data) { + err = -ENOENT; + goto out; + } + + dev_dbg(&psdev->dev->dev, "%s fake irq handler: %d->%d\n", + dev_data->irq_name, dev_data->isr_on, + !dev_data->isr_on); + + dev_data->isr_on = !(dev_data->isr_on); + if (dev_data->isr_on) + dev_data->ack_intr = 1; +out: + if (psdev) + pcistub_device_put(psdev); + if (!err) + err = count; + return err; +} +static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, + pcistub_irq_handler_switch); + +static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func, reg, size, mask; + int err; + + err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size, + &mask); + if (err) + goto out; + + err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask); + +out: + if (!err) + err = count; + return err; +} + +static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) +{ + int count = 0; + unsigned long flags; + struct xen_pcibk_config_quirk *quirk; + struct xen_pcibk_dev_data *dev_data; + const struct config_field *field; + const struct config_field_entry *cfg_entry; + + spin_lock_irqsave(&device_ids_lock, flags); + list_for_each_entry(quirk, &xen_pcibk_quirks, quirks_list) { + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", + quirk->pdev->bus->number, + PCI_SLOT(quirk->pdev->devfn), + PCI_FUNC(quirk->pdev->devfn), + quirk->devid.vendor, quirk->devid.device, + quirk->devid.subvendor, + quirk->devid.subdevice); + + dev_data = pci_get_drvdata(quirk->pdev); + + list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { + field = cfg_entry->field; + if (count >= PAGE_SIZE) + goto out; + + count += scnprintf(buf + count, PAGE_SIZE - count, + "\t\t%08x:%01x:%08x\n", + cfg_entry->base_offset + + field->offset, field->size, + field->mask); + } + } + +out: + spin_unlock_irqrestore(&device_ids_lock, flags); + + return count; +} +static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, 
pcistub_quirk_show, + pcistub_quirk_add); + +static ssize_t permissive_add(struct device_driver *drv, const char *buf, + size_t count) +{ + int domain, bus, slot, func; + int err; + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + + err = str_to_slot(buf, &domain, &bus, &slot, &func); + if (err) + goto out; + + psdev = pcistub_device_find(domain, bus, slot, func); + if (!psdev) { + err = -ENODEV; + goto out; + } + + dev_data = pci_get_drvdata(psdev->dev); + /* the driver data for a device should never be null at this point */ + if (!dev_data) { + err = -ENXIO; + goto release; + } + if (!dev_data->permissive) { + dev_data->permissive = 1; + /* Let user know that what they're doing could be unsafe */ + dev_warn(&psdev->dev->dev, "enabling permissive mode " + "configuration space accesses!\n"); + dev_warn(&psdev->dev->dev, + "permissive mode is potentially unsafe!\n"); + } +release: + pcistub_device_put(psdev); +out: + if (!err) + err = count; + return err; +} + +static ssize_t permissive_show(struct device_driver *drv, char *buf) +{ + struct pcistub_device *psdev; + struct xen_pcibk_dev_data *dev_data; + size_t count = 0; + unsigned long flags; + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (count >= PAGE_SIZE) + break; + if (!psdev->dev) + continue; + dev_data = pci_get_drvdata(psdev->dev); + if (!dev_data || !dev_data->permissive) + continue; + count += + scnprintf(buf + count, PAGE_SIZE - count, "%s\n", + pci_name(psdev->dev)); + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + return count; +} +static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, + permissive_add); + +static void pcistub_exit(void) +{ + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_new_slot); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_remove_slot); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_slots); + driver_remove_file(&xen_pcibk_pci_driver.driver, &driver_attr_quirks); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_permissive); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handlers); + driver_remove_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handler_state); + pci_unregister_driver(&xen_pcibk_pci_driver); +} + +static int __init pcistub_init(void) +{ + int pos = 0; + int err = 0; + int domain, bus, slot, func; + int parsed; + + if (pci_devs_to_hide && *pci_devs_to_hide) { + do { + parsed = 0; + + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.%x) %n", + &domain, &bus, &slot, &func, &parsed); + switch (err) { + case 3: + func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:%x.*) %n", + &domain, &bus, &slot, &parsed); + break; + case 2: + slot = func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x:*.*) %n", + &domain, &bus, &parsed); + break; + } + + if (!parsed) { + domain = 0; + err = sscanf(pci_devs_to_hide + pos, + " (%x:%x.%x) %n", + &bus, &slot, &func, &parsed); + switch (err) { + case 2: + func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:%x.*) %n", + &bus, &slot, &parsed); + break; + case 1: + slot = func = -1; + sscanf(pci_devs_to_hide + pos, + " (%x:*.*) %n", + &bus, &parsed); + break; + } + } + + if (parsed <= 0) + goto parse_error; + + err = pcistub_device_id_add(domain, bus, slot, func); + if (err) + goto out; + + pos += parsed; + } while (pci_devs_to_hide[pos]); + } + + /* If we're the first PCI Device Driver to register, we're the + * first one to get offered PCI devices as 
they become + * available (and thus we can be the first to grab them) + */ + err = pci_register_driver(&xen_pcibk_pci_driver); + if (err < 0) + goto out; + + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_new_slot); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_remove_slot); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_slots); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_quirks); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_permissive); + + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handlers); + if (!err) + err = driver_create_file(&xen_pcibk_pci_driver.driver, + &driver_attr_irq_handler_state); + if (err) + pcistub_exit(); + +out: + return err; + +parse_error: + pr_err("Error parsing pci_devs_to_hide at \"%s\"\n", + pci_devs_to_hide + pos); + return -EINVAL; +} + +#ifndef MODULE +/* + * fs_initcall happens before device_initcall + * so xen_pcibk *should* get called first (b/c we + * want to suck up any device before other drivers + * get a chance by being the first pci device + * driver to register) + */ +fs_initcall(pcistub_init); +#endif + +#ifdef CONFIG_PCI_IOV +static struct pcistub_device *find_vfs(const struct pci_dev *pdev) +{ + struct pcistub_device *psdev = NULL; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&pcistub_devices_lock, flags); + list_for_each_entry(psdev, &pcistub_devices, dev_list) { + if (!psdev->pdev && psdev->dev != pdev + && pci_physfn(psdev->dev) == pdev) { + found = true; + break; + } + } + spin_unlock_irqrestore(&pcistub_devices_lock, flags); + if (found) + return psdev; + return NULL; +} + +static int pci_stub_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct device *dev = data; + const struct pci_dev *pdev = to_pci_dev(dev); + + if (action != BUS_NOTIFY_UNBIND_DRIVER) + return NOTIFY_DONE; + + if (!pdev->is_physfn) + return NOTIFY_DONE; + + for (;;) { + struct pcistub_device *psdev = find_vfs(pdev); + if (!psdev) + break; + device_release_driver(&psdev->dev->dev); + } + return NOTIFY_DONE; +} + +static struct notifier_block pci_stub_nb = { + .notifier_call = pci_stub_notifier, +}; +#endif + +static int __init xen_pcibk_init(void) +{ + int err; + + if (!xen_initial_domain()) + return -ENODEV; + + err = xen_pcibk_config_init(); + if (err) + return err; + +#ifdef MODULE + err = pcistub_init(); + if (err < 0) + return err; +#endif + + pcistub_init_devices_late(); + err = xen_pcibk_xenbus_register(); + if (err) + pcistub_exit(); +#ifdef CONFIG_PCI_IOV + else + bus_register_notifier(&pci_bus_type, &pci_stub_nb); +#endif + + return err; +} + +static void __exit xen_pcibk_cleanup(void) +{ +#ifdef CONFIG_PCI_IOV + bus_unregister_notifier(&pci_bus_type, &pci_stub_nb); +#endif + xen_pcibk_xenbus_unregister(); + pcistub_exit(); +} + +module_init(xen_pcibk_init); +module_exit(xen_pcibk_cleanup); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:pci"); diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h new file mode 100644 index 000000000..58e38d586 --- /dev/null +++ b/drivers/xen/xen-pciback/pciback.h @@ -0,0 +1,193 @@ +/* + * PCI Backend Common Data Structures & Function Declarations + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ +#ifndef __XEN_PCIBACK_H__ +#define __XEN_PCIBACK_H__ + +#include <linux/pci.h> +#include <linux/interrupt.h> +#include 
<xen/xenbus.h> +#include <linux/list.h> +#include <linux/spinlock.h> +#include <linux/workqueue.h> +#include <linux/atomic.h> +#include <xen/interface/io/pciif.h> + +#define DRV_NAME "xen-pciback" + +struct pci_dev_entry { + struct list_head list; + struct pci_dev *dev; +}; + +#define _PDEVF_op_active (0) +#define PDEVF_op_active (1<<(_PDEVF_op_active)) +#define _PCIB_op_pending (1) +#define PCIB_op_pending (1<<(_PCIB_op_pending)) + +struct xen_pcibk_device { + void *pci_dev_data; + struct mutex dev_lock; + struct xenbus_device *xdev; + struct xenbus_watch be_watch; + u8 be_watching; + int evtchn_irq; + struct xen_pci_sharedinfo *sh_info; + unsigned long flags; + struct work_struct op_work; +}; + +struct xen_pcibk_dev_data { + struct list_head config_fields; + struct pci_saved_state *pci_saved_state; + unsigned int permissive:1; + unsigned int warned_on_write:1; + unsigned int enable_intx:1; + unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ + unsigned int ack_intr:1; /* .. and ACK-ing */ + unsigned long handled; + unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */ + char irq_name[0]; /* xen-pcibk[000:04:00.0] */ +}; + +/* Used by XenBus and xen_pcibk_ops.c */ +extern wait_queue_head_t xen_pcibk_aer_wait_queue; +extern struct workqueue_struct *xen_pcibk_wq; +/* Used by pcistub.c and conf_space_quirks.c */ +extern struct list_head xen_pcibk_quirks; + +/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */ +struct pci_dev *pcistub_get_pci_dev_by_slot(struct xen_pcibk_device *pdev, + int domain, int bus, + int slot, int func); +struct pci_dev *pcistub_get_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev); +void pcistub_put_pci_dev(struct pci_dev *dev); + +/* Ensure a device is turned off or reset */ +void xen_pcibk_reset_device(struct pci_dev *pdev); + +/* Access a virtual configuration space for a PCI device */ +int xen_pcibk_config_init(void); +int xen_pcibk_config_init_dev(struct pci_dev *dev); +void xen_pcibk_config_free_dyn_fields(struct pci_dev *dev); +void xen_pcibk_config_reset_dev(struct pci_dev *dev); +void xen_pcibk_config_free_dev(struct pci_dev *dev); +int xen_pcibk_config_read(struct pci_dev *dev, int offset, int size, + u32 *ret_val); +int xen_pcibk_config_write(struct pci_dev *dev, int offset, int size, + u32 value); + +/* Handle requests for specific devices from the frontend */ +typedef int (*publish_pci_dev_cb) (struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn, unsigned int devid); +typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus); + +/* Backend registration for the two types of BDF representation: + * vpci - BDFs start at 00 + * passthrough - BDFs are exactly like in the host. 
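Two sets of flag bits drive the code built on this header, and it helps to keep them apart. As far as this patch shows: _PDEVF_op_active and _PCIB_op_pending live in xen_pcibk_device.flags and are purely backend-local (the former marks an op currently being serviced, the latter an outstanding AER request awaiting the frontend's ack), while _XEN_PCIF_active, _XEN_PCIB_active and _XEN_PCIB_AERHANDLER live in the shared xen_pci_sharedinfo.flags and carry the actual frontend/backend handshake: the frontend raises _XEN_PCIF_active to post an op and the backend clears it when the result is ready, the backend raises _XEN_PCIB_active for an AER request and the frontend clears it to acknowledge, and _XEN_PCIB_AERHANDLER advertises that the guest has registered an AER handler.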
+ */ +struct xen_pcibk_backend { + const char *name; + int (*init)(struct xen_pcibk_device *pdev); + void (*free)(struct xen_pcibk_device *pdev); + int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn); + int (*publish)(struct xen_pcibk_device *pdev, publish_pci_root_cb cb); + void (*release)(struct xen_pcibk_device *pdev, struct pci_dev *dev, + bool lock); + int (*add)(struct xen_pcibk_device *pdev, struct pci_dev *dev, + int devid, publish_pci_dev_cb publish_cb); + struct pci_dev *(*get)(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn); +}; + +extern const struct xen_pcibk_backend xen_pcibk_vpci_backend; +extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend; +extern const struct xen_pcibk_backend *xen_pcibk_backend; + +static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, + int devid, + publish_pci_dev_cb publish_cb) +{ + if (xen_pcibk_backend && xen_pcibk_backend->add) + return xen_pcibk_backend->add(pdev, dev, devid, publish_cb); + return -1; +} + +static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, bool lock) +{ + if (xen_pcibk_backend && xen_pcibk_backend->release) + return xen_pcibk_backend->release(pdev, dev, lock); +} + +static inline struct pci_dev * +xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain, + unsigned int bus, unsigned int devfn) +{ + if (xen_pcibk_backend && xen_pcibk_backend->get) + return xen_pcibk_backend->get(pdev, domain, bus, devfn); + return NULL; +} + +/** +* Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk +* before sending aer request to pcifront, so that guest could identify +* device, coopearte with xen_pcibk to finish aer recovery job if device driver +* has the capability +*/ +static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, + unsigned int *bus, + unsigned int *devfn) +{ + if (xen_pcibk_backend && xen_pcibk_backend->find) + return xen_pcibk_backend->find(pcidev, pdev, domain, bus, + devfn); + return -1; +} + +static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + if (xen_pcibk_backend && xen_pcibk_backend->init) + return xen_pcibk_backend->init(pdev); + return -1; +} + +static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb cb) +{ + if (xen_pcibk_backend && xen_pcibk_backend->publish) + return xen_pcibk_backend->publish(pdev, cb); + return -1; +} + +static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + if (xen_pcibk_backend && xen_pcibk_backend->free) + return xen_pcibk_backend->free(pdev); +} + +/* Handles events from front-end */ +irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id); +void xen_pcibk_do_op(struct work_struct *data); + +int xen_pcibk_xenbus_register(void); +void xen_pcibk_xenbus_unregister(void); + +extern int verbose_request; + +void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev); +#endif + +/* Handles shared IRQs that can to device domain and control domain. 
*/ +void xen_pcibk_irq_handler(struct pci_dev *dev, int reset); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c new file mode 100644 index 000000000..c4a0666de --- /dev/null +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -0,0 +1,387 @@ +/* + * PCI Backend Operations - respond to PCI requests from Frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/wait.h> +#include <linux/bitops.h> +#include <xen/events.h> +#include <linux/sched.h> +#include "pciback.h" + +int verbose_request; +module_param(verbose_request, int, 0644); + +static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id); + +/* Ensure a device is has the fake IRQ handler "turned on/off" and is + * ready to be exported. This MUST be run after xen_pcibk_reset_device + * which does the actual PCI device enable/disable. + */ +static void xen_pcibk_control_isr(struct pci_dev *dev, int reset) +{ + struct xen_pcibk_dev_data *dev_data; + int rc; + int enable = 0; + + dev_data = pci_get_drvdata(dev); + if (!dev_data) + return; + + /* We don't deal with bridges */ + if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) + return; + + if (reset) { + dev_data->enable_intx = 0; + dev_data->ack_intr = 0; + } + enable = dev_data->enable_intx; + + /* Asked to disable, but ISR isn't runnig */ + if (!enable && !dev_data->isr_on) + return; + + /* Squirrel away the IRQs in the dev_data. We need this + * b/c when device transitions to MSI, the dev->irq is + * overwritten with the MSI vector. + */ + if (enable) + dev_data->irq = dev->irq; + + /* + * SR-IOV devices in all use MSI-X and have no legacy + * interrupts, so inhibit creating a fake IRQ handler for them. + */ + if (dev_data->irq == 0) + goto out; + + dev_dbg(&dev->dev, "%s: #%d %s %s%s %s-> %s\n", + dev_data->irq_name, + dev_data->irq, + pci_is_enabled(dev) ? "on" : "off", + dev->msi_enabled ? "MSI" : "", + dev->msix_enabled ? "MSI/X" : "", + dev_data->isr_on ? "enable" : "disable", + enable ? "enable" : "disable"); + + if (enable) { + rc = request_irq(dev_data->irq, + xen_pcibk_guest_interrupt, IRQF_SHARED, + dev_data->irq_name, dev); + if (rc) { + dev_err(&dev->dev, "%s: failed to install fake IRQ " \ + "handler for IRQ %d! (rc:%d)\n", + dev_data->irq_name, dev_data->irq, rc); + goto out; + } + } else { + free_irq(dev_data->irq, dev); + dev_data->irq = 0; + } + dev_data->isr_on = enable; + dev_data->ack_intr = enable; +out: + dev_dbg(&dev->dev, "%s: #%d %s %s%s %s\n", + dev_data->irq_name, + dev_data->irq, + pci_is_enabled(dev) ? "on" : "off", + dev->msi_enabled ? "MSI" : "", + dev->msix_enabled ? "MSI/X" : "", + enable ? (dev_data->isr_on ? "enabled" : "failed to enable") : + (dev_data->isr_on ? "failed to disable" : "disabled")); +} + +/* Ensure a device is "turned off" and ready to be exported. 
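The isr_on/ack_intr state that xen_pcibk_control_isr() manages is also visible through the two stub-driver attributes defined earlier in this patch: reading irq_handlers lists each seized device as, for example, "0000:03:00.0:on:acking:1234" (device name, whether the fake handler is installed, whether it ACKs, and how many interrupts it has handled), and writing a slot string such as "0000:03:00.0" to irq_handler_state toggles that device's isr_on flag (re-enabling ACKing when it is turned on).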
+ * (Also see xen_pcibk_config_reset to ensure virtual configuration space is + * ready to be re-exported) + */ +void xen_pcibk_reset_device(struct pci_dev *dev) +{ + u16 cmd; + + xen_pcibk_control_isr(dev, 1 /* reset device */); + + /* Disable devices (but not bridges) */ + if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) { +#ifdef CONFIG_PCI_MSI + /* The guest could have been abruptly killed without + * disabling MSI/MSI-X interrupts.*/ + if (dev->msix_enabled) + pci_disable_msix(dev); + if (dev->msi_enabled) + pci_disable_msi(dev); +#endif + if (pci_is_enabled(dev)) + pci_disable_device(dev); + + pci_write_config_word(dev, PCI_COMMAND, 0); + + dev->is_busmaster = 0; + } else { + pci_read_config_word(dev, PCI_COMMAND, &cmd); + if (cmd & (PCI_COMMAND_INVALIDATE)) { + cmd &= ~(PCI_COMMAND_INVALIDATE); + pci_write_config_word(dev, PCI_COMMAND, cmd); + + dev->is_busmaster = 0; + } + } +} + +#ifdef CONFIG_PCI_MSI +static +int xen_pcibk_enable_msi(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + int status; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable MSI\n", pci_name(dev)); + + status = pci_enable_msi(dev); + + if (status) { + pr_warn_ratelimited("%s: error enabling MSI for guest %u: err %d\n", + pci_name(dev), pdev->xdev->otherend_id, + status); + op->value = 0; + return XEN_PCI_ERR_op_failed; + } + + /* The value the guest needs is actually the IDT vector, not the + * the local domain's IRQ number. */ + + op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), + op->value); + + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 0; + + return 0; +} + +static +int xen_pcibk_disable_msi(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: disable MSI\n", + pci_name(dev)); + pci_disable_msi(dev); + + op->value = dev->irq ? 
xen_pirq_from_irq(dev->irq) : 0; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: MSI: %d\n", pci_name(dev), + op->value); + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + return 0; +} + +static +int xen_pcibk_enable_msix(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + int i, result; + struct msix_entry *entries; + + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: enable MSI-X\n", + pci_name(dev)); + if (op->value > SH_INFO_MAX_VEC) + return -EINVAL; + + entries = kmalloc(op->value * sizeof(*entries), GFP_KERNEL); + if (entries == NULL) + return -ENOMEM; + + for (i = 0; i < op->value; i++) { + entries[i].entry = op->msix_entries[i].entry; + entries[i].vector = op->msix_entries[i].vector; + } + + result = pci_enable_msix_exact(dev, entries, op->value); + if (result == 0) { + for (i = 0; i < op->value; i++) { + op->msix_entries[i].entry = entries[i].entry; + if (entries[i].vector) { + op->msix_entries[i].vector = + xen_pirq_from_irq(entries[i].vector); + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: " \ + "MSI-X[%d]: %d\n", + pci_name(dev), i, + op->msix_entries[i].vector); + } + } + } else + pr_warn_ratelimited("%s: error enabling MSI-X for guest %u: err %d!\n", + pci_name(dev), pdev->xdev->otherend_id, + result); + kfree(entries); + + op->value = result; + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 0; + + return result > 0 ? 0 : result; +} + +static +int xen_pcibk_disable_msix(struct xen_pcibk_device *pdev, + struct pci_dev *dev, struct xen_pci_op *op) +{ + struct xen_pcibk_dev_data *dev_data; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: disable MSI-X\n", + pci_name(dev)); + pci_disable_msix(dev); + + /* + * SR-IOV devices (which don't have any legacy IRQ) have + * an undefined IRQ value of zero. + */ + op->value = dev->irq ? xen_pirq_from_irq(dev->irq) : 0; + if (unlikely(verbose_request)) + printk(KERN_DEBUG DRV_NAME ": %s: MSI-X: %d\n", pci_name(dev), + op->value); + dev_data = pci_get_drvdata(dev); + if (dev_data) + dev_data->ack_intr = 1; + return 0; +} +#endif +/* +* Now the same evtchn is used for both pcifront conf_read_write request +* as well as pcie aer front end ack. We use a new work_queue to schedule +* xen_pcibk conf_read_write service for avoiding confict with aer_core +* do_recovery job which also use the system default work_queue +*/ +void xen_pcibk_test_and_schedule_op(struct xen_pcibk_device *pdev) +{ + /* Check that frontend is requesting an operation and that we are not + * already processing a request */ + if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) + && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) { + queue_work(xen_pcibk_wq, &pdev->op_work); + } + /*_XEN_PCIB_active should have been cleared by pcifront. And also make + sure xen_pcibk is waiting for ack by checking _PCIB_op_pending*/ + if (!test_bit(_XEN_PCIB_active, (unsigned long *)&pdev->sh_info->flags) + && test_bit(_PCIB_op_pending, &pdev->flags)) { + wake_up(&xen_pcibk_aer_wait_queue); + } +} + +/* Performing the configuration space reads/writes must not be done in atomic + * context because some of the pci_* functions can sleep (mostly due to ACPI + * use of semaphores). 
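To make the worker below easier to follow, here is a rough frontend-side view of one request travelling through the shared page. This is only an illustration of the protocol the backend implements: the helper name is made up, it is not the pcifront code, and a real frontend sleeps on the event channel rather than spinning.

/* Illustrative sketch of the frontend's half of one config-space read,
 * assuming a mapped struct xen_pci_sharedinfo *sh and an event channel
 * 'port' already bound to the backend.  Not the real pcifront code. */
static int example_frontend_conf_read(struct xen_pci_sharedinfo *sh,
				      evtchn_port_t port,
				      unsigned int devfn, int offset,
				      int size, u32 *val)
{
	struct xen_pci_op *op = &sh->op;

	op->cmd    = XEN_PCI_OP_conf_read;
	op->domain = 0;
	op->bus    = 0;
	op->devfn  = devfn;
	op->offset = offset;
	op->size   = size;

	/* Publish the request, then kick xen_pcibk_handle_event(). */
	wmb();
	set_bit(_XEN_PCIF_active, (unsigned long *)&sh->flags);
	notify_remote_via_evtchn(port);

	/* xen_pcibk_do_op() clears _XEN_PCIF_active once op->err and
	 * op->value are valid; a real frontend blocks instead of spinning. */
	while (test_bit(_XEN_PCIF_active, (unsigned long *)&sh->flags))
		cpu_relax();

	*val = op->value;
	return op->err;
}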
This function is intended to be called from a work + * queue in process context taking a struct xen_pcibk_device as a parameter */ + +void xen_pcibk_do_op(struct work_struct *data) +{ + struct xen_pcibk_device *pdev = + container_of(data, struct xen_pcibk_device, op_work); + struct pci_dev *dev; + struct xen_pcibk_dev_data *dev_data = NULL; + struct xen_pci_op *op = &pdev->sh_info->op; + int test_intx = 0; + + dev = xen_pcibk_get_pci_dev(pdev, op->domain, op->bus, op->devfn); + + if (dev == NULL) + op->err = XEN_PCI_ERR_dev_not_found; + else { + dev_data = pci_get_drvdata(dev); + if (dev_data) + test_intx = dev_data->enable_intx; + switch (op->cmd) { + case XEN_PCI_OP_conf_read: + op->err = xen_pcibk_config_read(dev, + op->offset, op->size, &op->value); + break; + case XEN_PCI_OP_conf_write: + op->err = xen_pcibk_config_write(dev, + op->offset, op->size, op->value); + break; +#ifdef CONFIG_PCI_MSI + case XEN_PCI_OP_enable_msi: + op->err = xen_pcibk_enable_msi(pdev, dev, op); + break; + case XEN_PCI_OP_disable_msi: + op->err = xen_pcibk_disable_msi(pdev, dev, op); + break; + case XEN_PCI_OP_enable_msix: + op->err = xen_pcibk_enable_msix(pdev, dev, op); + break; + case XEN_PCI_OP_disable_msix: + op->err = xen_pcibk_disable_msix(pdev, dev, op); + break; +#endif + default: + op->err = XEN_PCI_ERR_not_implemented; + break; + } + } + if (!op->err && dev && dev_data) { + /* Transition detected */ + if ((dev_data->enable_intx != test_intx)) + xen_pcibk_control_isr(dev, 0 /* no reset */); + } + /* Tell the driver domain that we're done. */ + wmb(); + clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); + notify_remote_via_irq(pdev->evtchn_irq); + + /* Mark that we're done. */ + smp_mb__before_atomic(); /* /after/ clearing PCIF_active */ + clear_bit(_PDEVF_op_active, &pdev->flags); + smp_mb__after_atomic(); /* /before/ final check for work */ + + /* Check to see if the driver domain tried to start another request in + * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. + */ + xen_pcibk_test_and_schedule_op(pdev); +} + +irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id) +{ + struct xen_pcibk_device *pdev = dev_id; + + xen_pcibk_test_and_schedule_op(pdev); + + return IRQ_HANDLED; +} +static irqreturn_t xen_pcibk_guest_interrupt(int irq, void *dev_id) +{ + struct pci_dev *dev = (struct pci_dev *)dev_id; + struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); + + if (dev_data->isr_on && dev_data->ack_intr) { + dev_data->handled++; + if ((dev_data->handled % 1000) == 0) { + if (xen_test_irq_shared(irq)) { + pr_info("%s IRQ line is not shared " + "with other domains. 
Turning ISR off\n", + dev_data->irq_name); + dev_data->ack_intr = 0; + } + } + return IRQ_HANDLED; + } + return IRQ_NONE; +} diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c new file mode 100644 index 000000000..c99f8bb1c --- /dev/null +++ b/drivers/xen/xen-pciback/vpci.c @@ -0,0 +1,270 @@ +/* + * PCI Backend - Provides a Virtual PCI bus (with real devices) + * to the frontend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/pci.h> +#include <linux/mutex.h> +#include "pciback.h" + +#define PCI_SLOT_MAX 32 + +struct vpci_dev_data { + /* Access to dev_list must be protected by lock */ + struct list_head dev_list[PCI_SLOT_MAX]; + struct mutex lock; +}; + +static inline struct list_head *list_first(struct list_head *head) +{ + return head->next; +} + +static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, + unsigned int bus, + unsigned int devfn) +{ + struct pci_dev_entry *entry; + struct pci_dev *dev = NULL; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + if (domain != 0 || bus != 0) + return NULL; + + if (PCI_SLOT(devfn) < PCI_SLOT_MAX) { + mutex_lock(&vpci_dev->lock); + + list_for_each_entry(entry, + &vpci_dev->dev_list[PCI_SLOT(devfn)], + list) { + if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) { + dev = entry->dev; + break; + } + } + + mutex_unlock(&vpci_dev->lock); + } + return dev; +} + +static inline int match_slot(struct pci_dev *l, struct pci_dev *r) +{ + if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus) + && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn)) + return 1; + + return 0; +} + +static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, int devid, + publish_pci_dev_cb publish_cb) +{ + int err = 0, slot, func = -1; + struct pci_dev_entry *t, *dev_entry; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { + err = -EFAULT; + xenbus_dev_fatal(pdev->xdev, err, + "Can't export bridges on the virtual PCI bus"); + goto out; + } + + dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); + if (!dev_entry) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "Error adding entry to virtual PCI bus"); + goto out; + } + + dev_entry->dev = dev; + + mutex_lock(&vpci_dev->lock); + + /* + * Keep multi-function devices together on the virtual PCI bus, except + * virtual functions. + */ + if (!dev->is_virtfn) { + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) + continue; + + t = list_entry(list_first(&vpci_dev->dev_list[slot]), + struct pci_dev_entry, list); + + if (match_slot(dev, t->dev)) { + pr_info("vpci: %s: assign to virtual slot %d func %d\n", + pci_name(dev), slot, + PCI_FUNC(dev->devfn)); + list_add_tail(&dev_entry->list, + &vpci_dev->dev_list[slot]); + func = PCI_FUNC(dev->devfn); + goto unlock; + } + } + } + + /* Assign to a new slot on the virtual PCI bus */ + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + if (list_empty(&vpci_dev->dev_list[slot])) { + pr_info("vpci: %s: assign to virtual slot %d\n", + pci_name(dev), slot); + list_add_tail(&dev_entry->list, + &vpci_dev->dev_list[slot]); + func = dev->is_virtfn ? 0 : PCI_FUNC(dev->devfn); + goto unlock; + } + } + + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "No more space on root virtual PCI bus"); + +unlock: + mutex_unlock(&vpci_dev->lock); + + /* Publish this device. 
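In other words, the slot-assignment policy just above keeps the functions of one physical device together: if 0000:03:00.0 has already been exported and claimed, say, virtual slot 2, exporting 0000:03:00.1 makes match_slot() succeed and the guest sees it as 00:02.1, whereas a virtual function always gets a fresh slot of its own and is presented to the guest as function 0 of that slot.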
*/ + if (!err) + err = publish_cb(pdev, 0, 0, PCI_DEVFN(slot, func), devid); + else + kfree(dev_entry); + +out: + return err; +} + +static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, + struct pci_dev *dev, bool lock) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + struct pci_dev *found_dev = NULL; + + mutex_lock(&vpci_dev->lock); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e; + + list_for_each_entry(e, &vpci_dev->dev_list[slot], list) { + if (e->dev == dev) { + list_del(&e->list); + found_dev = e->dev; + kfree(e); + goto out; + } + } + } + +out: + mutex_unlock(&vpci_dev->lock); + + if (found_dev) { + if (lock) + device_lock(&found_dev->dev); + pcistub_put_pci_dev(found_dev); + if (lock) + device_unlock(&found_dev->dev); + } +} + +static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev; + + vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL); + if (!vpci_dev) + return -ENOMEM; + + mutex_init(&vpci_dev->lock); + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) + INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); + + pdev->pci_dev_data = vpci_dev; + + return 0; +} + +static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, + publish_pci_root_cb publish_cb) +{ + /* The Virtual PCI bus has only one root */ + return publish_cb(pdev, 0, 0); +} + +static void __xen_pcibk_release_devices(struct xen_pcibk_device *pdev) +{ + int slot; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + struct pci_dev_entry *e, *tmp; + list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], + list) { + struct pci_dev *dev = e->dev; + list_del(&e->list); + device_lock(&dev->dev); + pcistub_put_pci_dev(dev); + device_unlock(&dev->dev); + kfree(e); + } + } + + kfree(vpci_dev); + pdev->pci_dev_data = NULL; +} + +static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, + struct xen_pcibk_device *pdev, + unsigned int *domain, unsigned int *bus, + unsigned int *devfn) +{ + struct pci_dev_entry *entry; + struct pci_dev *dev = NULL; + struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; + int found = 0, slot; + + mutex_lock(&vpci_dev->lock); + for (slot = 0; slot < PCI_SLOT_MAX; slot++) { + list_for_each_entry(entry, + &vpci_dev->dev_list[slot], + list) { + dev = entry->dev; + if (dev && dev->bus->number == pcidev->bus->number + && pci_domain_nr(dev->bus) == + pci_domain_nr(pcidev->bus) + && dev->devfn == pcidev->devfn) { + found = 1; + *domain = 0; + *bus = 0; + *devfn = PCI_DEVFN(slot, + PCI_FUNC(pcidev->devfn)); + } + } + } + mutex_unlock(&vpci_dev->lock); + return found; +} + +const struct xen_pcibk_backend xen_pcibk_vpci_backend = { + .name = "vpci", + .init = __xen_pcibk_init_devices, + .free = __xen_pcibk_release_devices, + .find = __xen_pcibk_get_pcifront_dev, + .publish = __xen_pcibk_publish_pci_roots, + .release = __xen_pcibk_release_pci_dev, + .add = __xen_pcibk_add_pci_dev, + .get = __xen_pcibk_get_pci_dev, +}; diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c new file mode 100644 index 000000000..98bc345f2 --- /dev/null +++ b/drivers/xen/xen-pciback/xenbus.c @@ -0,0 +1,750 @@ +/* + * PCI Backend Xenbus Setup - handles setup with frontend and xend + * + * Author: Ryan Wilson <hap9@epoch.ncsc.mil> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/vmalloc.h> +#include <linux/workqueue.h> 
+#include <xen/xenbus.h> +#include <xen/events.h> +#include <asm/xen/pci.h> +#include "pciback.h" + +#define INVALID_EVTCHN_IRQ (-1) +struct workqueue_struct *xen_pcibk_wq; + +static bool __read_mostly passthrough; +module_param(passthrough, bool, S_IRUGO); +MODULE_PARM_DESC(passthrough, + "Option to specify how to export PCI topology to guest:\n"\ + " 0 - (default) Hide the true PCI topology and makes the frontend\n"\ + " there is a single PCI bus with only the exported devices on it.\n"\ + " For example, a device at 03:05.0 will be re-assigned to 00:00.0\n"\ + " while second device at 02:1a.1 will be re-assigned to 00:01.1.\n"\ + " 1 - Passthrough provides a real view of the PCI topology to the\n"\ + " frontend (for example, a device at 06:01.b will still appear at\n"\ + " 06:01.b to the frontend). This is similar to how Xen 2.0.x\n"\ + " exposed PCI devices to its driver domains. This may be required\n"\ + " for drivers which depend on finding their hardward in certain\n"\ + " bus/slot locations."); + +static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) +{ + struct xen_pcibk_device *pdev; + + pdev = kzalloc(sizeof(struct xen_pcibk_device), GFP_KERNEL); + if (pdev == NULL) + goto out; + dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev); + + pdev->xdev = xdev; + dev_set_drvdata(&xdev->dev, pdev); + + mutex_init(&pdev->dev_lock); + + pdev->sh_info = NULL; + pdev->evtchn_irq = INVALID_EVTCHN_IRQ; + pdev->be_watching = 0; + + INIT_WORK(&pdev->op_work, xen_pcibk_do_op); + + if (xen_pcibk_init_devices(pdev)) { + kfree(pdev); + pdev = NULL; + } +out: + return pdev; +} + +static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev) +{ + mutex_lock(&pdev->dev_lock); + /* Ensure the guest can't trigger our handler before removing devices */ + if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) { + unbind_from_irqhandler(pdev->evtchn_irq, pdev); + pdev->evtchn_irq = INVALID_EVTCHN_IRQ; + } + + /* If the driver domain started an op, make sure we complete it + * before releasing the shared memory */ + + /* Note, the workqueue does not use spinlocks at all.*/ + flush_workqueue(xen_pcibk_wq); + + if (pdev->sh_info != NULL) { + xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info); + pdev->sh_info = NULL; + } + mutex_unlock(&pdev->dev_lock); +} + +static void free_pdev(struct xen_pcibk_device *pdev) +{ + if (pdev->be_watching) { + unregister_xenbus_watch(&pdev->be_watch); + pdev->be_watching = 0; + } + + xen_pcibk_disconnect(pdev); + + /* N.B. This calls pcistub_put_pci_dev which does the FLR on all + * of the PCIe devices. 
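The passthrough parameter above is the only knob the xenbus half exposes; it can be set as "modprobe xen-pciback passthrough=1" when built as a module, or as "xen-pciback.passthrough=1" on the kernel command line when built in. Somewhere in the registration path it decides which of the two struct xen_pcibk_backend implementations is used. A plausible sketch of that selection follows, assuming it is keyed directly off the parameter; the actual assignment lives in the xenbus registration code, outside this excerpt.

/* Illustrative sketch only: pick the BDF representation once, based on
 * the "passthrough" module parameter declared above. */
static void example_pick_backend(void)
{
	if (passthrough)
		xen_pcibk_backend = &xen_pcibk_passthrough_backend;
	else
		xen_pcibk_backend = &xen_pcibk_vpci_backend;
}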
*/ + xen_pcibk_release_devices(pdev); + + dev_set_drvdata(&pdev->xdev->dev, NULL); + pdev->xdev = NULL; + + kfree(pdev); +} + +static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, + int remote_evtchn) +{ + int err = 0; + void *vaddr; + + dev_dbg(&pdev->xdev->dev, + "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n", + gnt_ref, remote_evtchn); + + err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error mapping other domain page in ours."); + goto out; + } + + pdev->sh_info = vaddr; + + err = bind_interdomain_evtchn_to_irqhandler( + pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, + 0, DRV_NAME, pdev); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error binding event channel to IRQ"); + goto out; + } + pdev->evtchn_irq = err; + err = 0; + + dev_dbg(&pdev->xdev->dev, "Attached!\n"); +out: + return err; +} + +static int xen_pcibk_attach(struct xen_pcibk_device *pdev) +{ + int err = 0; + int gnt_ref, remote_evtchn; + char *magic = NULL; + + + mutex_lock(&pdev->dev_lock); + /* Make sure we only do this setup once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitialised) + goto out; + + /* Wait for frontend to state that it has published the configuration */ + if (xenbus_read_driver_state(pdev->xdev->otherend) != + XenbusStateInitialised) + goto out; + + dev_dbg(&pdev->xdev->dev, "Reading frontend config\n"); + + err = xenbus_gather(XBT_NIL, pdev->xdev->otherend, + "pci-op-ref", "%u", &gnt_ref, + "event-channel", "%u", &remote_evtchn, + "magic", NULL, &magic, NULL); + if (err) { + /* If configuration didn't get read correctly, wait longer */ + xenbus_dev_fatal(pdev->xdev, err, + "Error reading configuration from frontend"); + goto out; + } + + if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) { + xenbus_dev_fatal(pdev->xdev, -EFAULT, + "version mismatch (%s/%s) with pcifront - " + "halting " DRV_NAME, + magic, XEN_PCI_MAGIC); + err = -EFAULT; + goto out; + } + + err = xen_pcibk_do_attach(pdev, gnt_ref, remote_evtchn); + if (err) + goto out; + + dev_dbg(&pdev->xdev->dev, "Connecting...\n"); + + err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to connected state!"); + + dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err); +out: + mutex_unlock(&pdev->dev_lock); + + kfree(magic); + + return err; +} + +static int xen_pcibk_publish_pci_dev(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus, + unsigned int devfn, unsigned int devid) +{ + int err; + int len; + char str[64]; + + len = snprintf(str, sizeof(str), "vdev-%d", devid); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + /* Note: The PV protocol uses %02x, don't change it */ + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x:%02x.%02x", domain, bus, + PCI_SLOT(devfn), PCI_FUNC(devfn)); + +out: + return err; +} + +static int xen_pcibk_export_device(struct xen_pcibk_device *pdev, + int domain, int bus, int slot, int func, + int devid) +{ + struct pci_dev *dev; + int err = 0; + + dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func); + if (!dev) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Couldn't locate PCI device " + "(%04x:%02x:%02x.%d)! 
" + "perhaps already in-use?", + domain, bus, slot, func); + goto out; + } + + err = xen_pcibk_add_pci_dev(pdev, dev, devid, + xen_pcibk_publish_pci_dev); + if (err) + goto out; + + dev_info(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); + if (xen_register_device_domain_owner(dev, + pdev->xdev->otherend_id) != 0) { + dev_err(&dev->dev, "Stealing ownership from dom%d.\n", + xen_find_device_domain_owner(dev)); + xen_unregister_device_domain_owner(dev); + xen_register_device_domain_owner(dev, pdev->xdev->otherend_id); + } + + /* TODO: It'd be nice to export a bridge and have all of its children + * get exported with it. This may be best done in xend (which will + * have to calculate resource usage anyway) but we probably want to + * put something in here to ensure that if a bridge gets given to a + * driver domain, that all devices under that bridge are not given + * to other driver domains (as he who controls the bridge can disable + * it and stop the other devices from working). + */ +out: + return err; +} + +static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, + int domain, int bus, int slot, int func) +{ + int err = 0; + struct pci_dev *dev; + + dev_dbg(&pdev->xdev->dev, "removing dom %x bus %x slot %x func %x\n", + domain, bus, slot, func); + + dev = xen_pcibk_get_pci_dev(pdev, domain, bus, PCI_DEVFN(slot, func)); + if (!dev) { + err = -EINVAL; + dev_dbg(&pdev->xdev->dev, "Couldn't locate PCI device " + "(%04x:%02x:%02x.%d)! not owned by this domain\n", + domain, bus, slot, func); + goto out; + } + + dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); + xen_unregister_device_domain_owner(dev); + + /* N.B. This ends up calling pcistub_put_pci_dev which ends up + * doing the FLR. */ + xen_pcibk_release_pci_dev(pdev, dev, true /* use the lock. 
*/); + +out: + return err; +} + +static int xen_pcibk_publish_pci_root(struct xen_pcibk_device *pdev, + unsigned int domain, unsigned int bus) +{ + unsigned int d, b; + int i, root_num, len, err; + char str[64]; + + dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", &root_num); + if (err == 0 || err == -ENOENT) + root_num = 0; + else if (err < 0) + goto out; + + /* Verify that we haven't already published this pci root */ + for (i = 0; i < root_num; i++) { + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + str, "%x:%x", &d, &b); + if (err < 0) + goto out; + if (err != 2) { + err = -EINVAL; + goto out; + } + + if (d == domain && b == bus) { + err = 0; + goto out; + } + } + + len = snprintf(str, sizeof(str), "root-%d", root_num); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n", + root_num, domain, bus); + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, + "%04x:%02x", domain, bus); + if (err) + goto out; + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + "root_num", "%d", (root_num + 1)); + +out: + return err; +} + +static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) +{ + int err = 0; + int num_devs; + int domain, bus, slot, func; + int substate; + int i, len; + char state_str[64]; + char dev_str[64]; + + + dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); + + mutex_lock(&pdev->dev_lock); + /* Make sure we only reconfigure once */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateReconfiguring) + goto out; + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + len = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(len >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", &substate); + if (err != 1) + substate = XenbusStateUnknown; + + switch (substate) { + case XenbusStateInitialising: + dev_dbg(&pdev->xdev->dev, "Attaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_export_device(pdev, domain, bus, slot, + func, i); + if (err) + goto out; + + /* Publish pci roots. 
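Taken together, the publish/export helpers above and the attach path earlier leave a xenstore layout roughly like the following for a guest (domain 1 here, with a single exported device; every concrete value is illustrative):

/local/domain/0/backend/pci/1/0/
    num_devs  = "1"
    dev-0     = "0000:03:00.0"     (requested BDF, written by the toolstack)
    state-0   = "3"                (XenbusStateInitialised, written back by the backend)
    vdev-0    = "0000:00:00.00"    (virtual BDF published by xen_pcibk_publish_pci_dev)
    root_num  = "1"
    root-0    = "0000:00"          (published by xen_pcibk_publish_pci_root)
    aerState  = "aerfail"          (only written when AER recovery gives up)

/local/domain/1/device/pci/0/      (frontend side, read by xen_pcibk_attach)
    pci-op-ref    = "<grant ref of the shared xen_pci_sharedinfo page>"
    event-channel = "<event channel port>"
    magic         = XEN_PCI_MAGIC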
*/ + err = xen_pcibk_publish_pci_roots(pdev, + xen_pcibk_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root" + "buses for frontend"); + goto out; + } + + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, + state_str, "%d", + XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching substate of " + "dev-%d\n", i); + goto out; + } + break; + + case XenbusStateClosing: + dev_dbg(&pdev->xdev->dev, "Detaching dev-%d ...\n", i); + + len = snprintf(dev_str, sizeof(dev_str), "vdev-%d", i); + if (unlikely(len >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while " + "reading configuration"); + goto out; + } + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, + dev_str, "%x:%x:%x.%x", + &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device " + "configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_remove_device(pdev, domain, bus, slot, + func); + if (err) + goto out; + + /* TODO: If at some point we implement support for pci + * root hot-remove on pcifront side, we'll need to + * remove unnecessary xenstore nodes of pci roots here. + */ + + break; + + default: + break; + } + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateReconfigured); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to reconfigured state!"); + goto out; + } + +out: + mutex_unlock(&pdev->dev_lock); + return 0; +} + +static void xen_pcibk_frontend_changed(struct xenbus_device *xdev, + enum xenbus_state fe_state) +{ + struct xen_pcibk_device *pdev = dev_get_drvdata(&xdev->dev); + + dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); + + switch (fe_state) { + case XenbusStateInitialised: + xen_pcibk_attach(pdev); + break; + + case XenbusStateReconfiguring: + xen_pcibk_reconfigure(pdev); + break; + + case XenbusStateConnected: + /* pcifront switched its state from reconfiguring to connected. + * Then switch to connected state. + */ + xenbus_switch_state(xdev, XenbusStateConnected); + break; + + case XenbusStateClosing: + xen_pcibk_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xen_pcibk_disconnect(pdev); + xenbus_switch_state(xdev, XenbusStateClosed); + if (xenbus_dev_is_online(xdev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); + device_unregister(&xdev->dev); + break; + + default: + break; + } +} + +static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev) +{ + /* Get configuration from xend (if available now) */ + int domain, bus, slot, func; + int err = 0; + int i, num_devs; + char dev_str[64]; + char state_str[64]; + + mutex_lock(&pdev->dev_lock); + /* It's possible we could get the call to setup twice, so make sure + * we're not already connected. 
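+	 * The check below reads our own backend state node and bails out
+	 * unless it is still XenbusStateInitWait.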
+ */ + if (xenbus_read_driver_state(pdev->xdev->nodename) != + XenbusStateInitWait) + goto out; + + dev_dbg(&pdev->xdev->dev, "getting be setup\n"); + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", + &num_devs); + if (err != 1) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading number of devices"); + goto out; + } + + for (i = 0; i < num_devs; i++) { + int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); + if (unlikely(l >= (sizeof(dev_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + + err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str, + "%x:%x:%x.%x", &domain, &bus, &slot, &func); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error reading device configuration"); + goto out; + } + if (err != 4) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error parsing pci device " + "configuration"); + goto out; + } + + err = xen_pcibk_export_device(pdev, domain, bus, slot, func, i); + if (err) + goto out; + + /* Switch substate of this device. */ + l = snprintf(state_str, sizeof(state_str), "state-%d", i); + if (unlikely(l >= (sizeof(state_str) - 1))) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "String overflow while reading " + "configuration"); + goto out; + } + err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, state_str, + "%d", XenbusStateInitialised); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, "Error switching " + "substate of dev-%d\n", i); + goto out; + } + } + + err = xen_pcibk_publish_pci_roots(pdev, xen_pcibk_publish_pci_root); + if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error while publish PCI root buses " + "for frontend"); + goto out; + } + + err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised); + if (err) + xenbus_dev_fatal(pdev->xdev, err, + "Error switching to initialised state!"); + +out: + mutex_unlock(&pdev->dev_lock); + if (!err) + /* see if pcifront is already configured (if not, we'll wait) */ + xen_pcibk_attach(pdev); + return err; +} + +static void xen_pcibk_be_watch(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + struct xen_pcibk_device *pdev = + container_of(watch, struct xen_pcibk_device, be_watch); + + switch (xenbus_read_driver_state(pdev->xdev->nodename)) { + case XenbusStateInitWait: + xen_pcibk_setup_backend(pdev); + break; + + default: + break; + } +} + +static int xen_pcibk_xenbus_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err = 0; + struct xen_pcibk_device *pdev = alloc_pdev(dev); + + if (pdev == NULL) { + err = -ENOMEM; + xenbus_dev_fatal(dev, err, + "Error allocating xen_pcibk_device struct"); + goto out; + } + + /* wait for xend to configure us */ + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto out; + + /* watch the backend node for backend configuration information */ + err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, + xen_pcibk_be_watch); + if (err) + goto out; + + pdev->be_watching = 1; + + /* We need to force a call to our callback here in case + * xend already configured us! 
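+	 * The callback is invoked by hand once, with an empty path vector,
+	 * so that any num_devs/dev-N nodes the toolstack has already
+	 * written are processed right away.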
+ */ + xen_pcibk_be_watch(&pdev->be_watch, NULL, 0); + +out: + return err; +} + +static int xen_pcibk_xenbus_remove(struct xenbus_device *dev) +{ + struct xen_pcibk_device *pdev = dev_get_drvdata(&dev->dev); + + if (pdev != NULL) + free_pdev(pdev); + + return 0; +} + +static const struct xenbus_device_id xen_pcibk_ids[] = { + {"pci"}, + {""}, +}; + +static struct xenbus_driver xen_pcibk_driver = { + .name = DRV_NAME, + .ids = xen_pcibk_ids, + .probe = xen_pcibk_xenbus_probe, + .remove = xen_pcibk_xenbus_remove, + .otherend_changed = xen_pcibk_frontend_changed, +}; + +const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; + +int __init xen_pcibk_xenbus_register(void) +{ + xen_pcibk_wq = create_workqueue("xen_pciback_workqueue"); + if (!xen_pcibk_wq) { + pr_err("%s: create xen_pciback_workqueue failed\n", __func__); + return -EFAULT; + } + xen_pcibk_backend = &xen_pcibk_vpci_backend; + if (passthrough) + xen_pcibk_backend = &xen_pcibk_passthrough_backend; + pr_info("backend is %s\n", xen_pcibk_backend->name); + return xenbus_register_backend(&xen_pcibk_driver); +} + +void __exit xen_pcibk_xenbus_unregister(void) +{ + destroy_workqueue(xen_pcibk_wq); + xenbus_unregister_driver(&xen_pcibk_driver); +} diff --git a/drivers/xen/xen-scsiback.c b/drivers/xen/xen-scsiback.c new file mode 100644 index 000000000..b7f51504f --- /dev/null +++ b/drivers/xen/xen-scsiback.c @@ -0,0 +1,2076 @@ +/* + * Xen SCSI backend driver + * + * Copyright (c) 2008, FUJITSU Limited + * + * Based on the blkback driver code. + * Adaption to kernel taget core infrastructure taken from vhost/scsi.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#define pr_fmt(fmt) "xen-pvscsi: " fmt + +#include <stdarg.h> + +#include <linux/module.h> +#include <linux/utsname.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/list.h> +#include <linux/gfp.h> +#include <linux/delay.h> +#include <linux/spinlock.h> +#include <linux/configfs.h> + +#include <generated/utsrelease.h> + +#include <scsi/scsi.h> +#include <scsi/scsi_dbg.h> +#include <scsi/scsi_eh.h> +#include <scsi/scsi_tcq.h> + +#include <target/target_core_base.h> +#include <target/target_core_fabric.h> +#include <target/target_core_configfs.h> +#include <target/target_core_fabric_configfs.h> + +#include <asm/hypervisor.h> + +#include <xen/xen.h> +#include <xen/balloon.h> +#include <xen/events.h> +#include <xen/xenbus.h> +#include <xen/grant_table.h> +#include <xen/page.h> + +#include <xen/interface/grant_table.h> +#include <xen/interface/io/vscsiif.h> + +#define VSCSI_VERSION "v0.1" +#define VSCSI_NAMELEN 32 + +struct ids_tuple { + unsigned int hst; /* host */ + unsigned int chn; /* channel */ + unsigned int tgt; /* target */ + unsigned int lun; /* LUN */ +}; + +struct v2p_entry { + struct ids_tuple v; /* translate from */ + struct scsiback_tpg *tpg; /* translate to */ + unsigned int lun; + struct kref kref; + struct list_head l; +}; + +struct vscsibk_info { + struct xenbus_device *dev; + + domid_t domid; + unsigned int irq; + + struct vscsiif_back_ring ring; + int ring_error; + + spinlock_t ring_lock; + atomic_t nr_unreplied_reqs; + + spinlock_t v2p_lock; + struct list_head v2p_entry_lists; + + wait_queue_head_t waiting_to_free; +}; + +/* theoretical maximum of grants for one request */ +#define VSCSI_MAX_GRANTS (SG_ALL + VSCSIIF_SG_TABLESIZE) + +/* + * VSCSI_GRANT_BATCH is the maximum number of grants to be processed in one + * call to map/unmap grants. Don't choose it too large, as there are arrays + * with VSCSI_GRANT_BATCH elements allocated on the stack. 
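+ * (scsiback_fast_flush_area() and scsiback_gnttab_data_map_list()
+ * below each keep VSCSI_GRANT_BATCH-sized arrays of grant ops and
+ * pages on the stack.)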
+ */ +#define VSCSI_GRANT_BATCH 16 + +struct vscsibk_pend { + uint16_t rqid; + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; + uint8_t cmd_len; + + uint8_t sc_data_direction; + uint16_t n_sg; /* real length of SG list */ + uint16_t n_grants; /* SG pages and potentially SG list */ + uint32_t data_len; + uint32_t result; + + struct vscsibk_info *info; + struct v2p_entry *v2p; + struct scatterlist *sgl; + + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + + grant_handle_t grant_handles[VSCSI_MAX_GRANTS]; + struct page *pages[VSCSI_MAX_GRANTS]; + + struct se_cmd se_cmd; +}; + +struct scsiback_tmr { + atomic_t tmr_complete; + wait_queue_head_t tmr_wait; +}; + +struct scsiback_nexus { + /* Pointer to TCM session for I_T Nexus */ + struct se_session *tvn_se_sess; +}; + +struct scsiback_tport { + /* SCSI protocol the tport is providing */ + u8 tport_proto_id; + /* Binary World Wide unique Port Name for pvscsi Target port */ + u64 tport_wwpn; + /* ASCII formatted WWPN for pvscsi Target port */ + char tport_name[VSCSI_NAMELEN]; + /* Returned by scsiback_make_tport() */ + struct se_wwn tport_wwn; +}; + +struct scsiback_tpg { + /* scsiback port target portal group tag for TCM */ + u16 tport_tpgt; + /* track number of TPG Port/Lun Links wrt explicit I_T Nexus shutdown */ + int tv_tpg_port_count; + /* xen-pvscsi references to tpg_nexus, protected by tv_tpg_mutex */ + int tv_tpg_fe_count; + /* list for scsiback_list */ + struct list_head tv_tpg_list; + /* Used to protect access for tpg_nexus */ + struct mutex tv_tpg_mutex; + /* Pointer to the TCM pvscsi I_T Nexus for this TPG endpoint */ + struct scsiback_nexus *tpg_nexus; + /* Pointer back to scsiback_tport */ + struct scsiback_tport *tport; + /* Returned by scsiback_make_tpg() */ + struct se_portal_group se_tpg; + /* alias used in xenstore */ + char param_alias[VSCSI_NAMELEN]; + /* list of info structures related to this target portal group */ + struct list_head info_list; +}; + +#define SCSIBACK_INVALID_HANDLE (~0) + +static bool log_print_stat; +module_param(log_print_stat, bool, 0644); + +static int scsiback_max_buffer_pages = 1024; +module_param_named(max_buffer_pages, scsiback_max_buffer_pages, int, 0644); +MODULE_PARM_DESC(max_buffer_pages, +"Maximum number of free pages to keep in backend buffer"); + +static struct kmem_cache *scsiback_cachep; +static DEFINE_SPINLOCK(free_pages_lock); +static int free_pages_num; +static LIST_HEAD(scsiback_free_pages); + +/* Global spinlock to protect scsiback TPG list */ +static DEFINE_MUTEX(scsiback_mutex); +static LIST_HEAD(scsiback_list); + +static const struct target_core_fabric_ops scsiback_ops; + +static void scsiback_get(struct vscsibk_info *info) +{ + atomic_inc(&info->nr_unreplied_reqs); +} + +static void scsiback_put(struct vscsibk_info *info) +{ + if (atomic_dec_and_test(&info->nr_unreplied_reqs)) + wake_up(&info->waiting_to_free); +} + +static void put_free_pages(struct page **page, int num) +{ + unsigned long flags; + int i = free_pages_num + num, n = num; + + if (num == 0) + return; + if (i > scsiback_max_buffer_pages) { + n = min(num, i - scsiback_max_buffer_pages); + gnttab_free_pages(n, page + num - n); + n = num - n; + } + spin_lock_irqsave(&free_pages_lock, flags); + for (i = 0; i < n; i++) + list_add(&page[i]->lru, &scsiback_free_pages); + free_pages_num += n; + spin_unlock_irqrestore(&free_pages_lock, flags); +} + +static int get_free_page(struct page **page) +{ + unsigned long flags; + + spin_lock_irqsave(&free_pages_lock, flags); + if (list_empty(&scsiback_free_pages)) { + 
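+		/* Pool is empty: drop the lock and get a fresh page from the grant allocator. */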
spin_unlock_irqrestore(&free_pages_lock, flags); + return gnttab_alloc_pages(1, page); + } + page[0] = list_first_entry(&scsiback_free_pages, struct page, lru); + list_del(&page[0]->lru); + free_pages_num--; + spin_unlock_irqrestore(&free_pages_lock, flags); + return 0; +} + +static unsigned long vaddr_page(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + + return (unsigned long)pfn_to_kaddr(pfn); +} + +static unsigned long vaddr(struct vscsibk_pend *req, int seg) +{ + return vaddr_page(req->pages[seg]); +} + +static void scsiback_print_status(char *sense_buffer, int errors, + struct vscsibk_pend *pending_req) +{ + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + + pr_err("[%s:%d] cmnd[0]=%02x -> st=%02x msg=%02x host=%02x drv=%02x\n", + tpg->tport->tport_name, pending_req->v2p->lun, + pending_req->cmnd[0], status_byte(errors), msg_byte(errors), + host_byte(errors), driver_byte(errors)); +} + +static void scsiback_fast_flush_area(struct vscsibk_pend *req) +{ + struct gnttab_unmap_grant_ref unmap[VSCSI_GRANT_BATCH]; + struct page *pages[VSCSI_GRANT_BATCH]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int err; + + kfree(req->sgl); + req->sgl = NULL; + req->n_sg = 0; + + if (!req->n_grants) + return; + + for (i = 0; i < req->n_grants; i++) { + handle = req->grant_handles[i]; + if (handle == SCSIBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + req->grant_handles[i] = SCSIBACK_INVALID_HANDLE; + pages[invcount] = req->pages[i]; + put_page(pages[invcount]); + invcount++; + if (invcount < VSCSI_GRANT_BATCH) + continue; + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + invcount = 0; + } + + if (invcount) { + err = gnttab_unmap_refs(unmap, NULL, pages, invcount); + BUG_ON(err); + } + + put_free_pages(req->pages, req->n_grants); + req->n_grants = 0; +} + +static void scsiback_free_translation_entry(struct kref *kref) +{ + struct v2p_entry *entry = container_of(kref, struct v2p_entry, kref); + struct scsiback_tpg *tpg = entry->tpg; + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(entry); +} + +static void scsiback_do_resp_with_sense(char *sense_buffer, int32_t result, + uint32_t resid, struct vscsibk_pend *pending_req) +{ + struct vscsiif_response *ring_res; + struct vscsibk_info *info = pending_req->info; + int notify; + struct scsi_sense_hdr sshdr; + unsigned long flags; + unsigned len; + + spin_lock_irqsave(&info->ring_lock, flags); + + ring_res = RING_GET_RESPONSE(&info->ring, info->ring.rsp_prod_pvt); + info->ring.rsp_prod_pvt++; + + ring_res->rslt = result; + ring_res->rqid = pending_req->rqid; + + if (sense_buffer != NULL && + scsi_normalize_sense(sense_buffer, VSCSIIF_SENSE_BUFFERSIZE, + &sshdr)) { + len = min_t(unsigned, 8 + sense_buffer[7], + VSCSIIF_SENSE_BUFFERSIZE); + memcpy(ring_res->sense_buffer, sense_buffer, len); + ring_res->sense_len = len; + } else { + ring_res->sense_len = 0; + } + + ring_res->residual_len = resid; + + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info->ring, notify); + spin_unlock_irqrestore(&info->ring_lock, flags); + + if (notify) + notify_remote_via_irq(info->irq); + + if (pending_req->v2p) + kref_put(&pending_req->v2p->kref, + scsiback_free_translation_entry); +} + +static void scsiback_cmd_done(struct vscsibk_pend *pending_req) +{ + struct vscsibk_info *info = pending_req->info; + unsigned char *sense_buffer; + unsigned int resid; + int errors; + + sense_buffer = 
pending_req->sense_buffer; + resid = pending_req->se_cmd.residual_count; + errors = pending_req->result; + + if (errors && log_print_stat) + scsiback_print_status(sense_buffer, errors, pending_req); + + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(sense_buffer, errors, resid, pending_req); + scsiback_put(info); +} + +static void scsiback_cmd_exec(struct vscsibk_pend *pending_req) +{ + struct se_cmd *se_cmd = &pending_req->se_cmd; + struct se_session *sess = pending_req->v2p->tpg->tpg_nexus->tvn_se_sess; + int rc; + + memset(pending_req->sense_buffer, 0, VSCSIIF_SENSE_BUFFERSIZE); + + memset(se_cmd, 0, sizeof(*se_cmd)); + + scsiback_get(pending_req->info); + rc = target_submit_cmd_map_sgls(se_cmd, sess, pending_req->cmnd, + pending_req->sense_buffer, pending_req->v2p->lun, + pending_req->data_len, 0, + pending_req->sc_data_direction, 0, + pending_req->sgl, pending_req->n_sg, + NULL, 0, NULL, 0); + if (rc < 0) { + transport_send_check_condition_and_sense(se_cmd, + TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE, 0); + transport_generic_free_cmd(se_cmd, 0); + } +} + +static int scsiback_gnttab_data_map_batch(struct gnttab_map_grant_ref *map, + struct page **pg, grant_handle_t *grant, int cnt) +{ + int err, i; + + if (!cnt) + return 0; + + err = gnttab_map_refs(map, NULL, pg, cnt); + BUG_ON(err); + for (i = 0; i < cnt; i++) { + if (unlikely(map[i].status != GNTST_okay)) { + pr_err("invalid buffer -- could not remap it\n"); + map[i].handle = SCSIBACK_INVALID_HANDLE; + err = -ENOMEM; + } else { + get_page(pg[i]); + } + grant[i] = map[i].handle; + } + return err; +} + +static int scsiback_gnttab_data_map_list(struct vscsibk_pend *pending_req, + struct scsiif_request_segment *seg, struct page **pg, + grant_handle_t *grant, int cnt, u32 flags) +{ + int mapcount = 0, i, err = 0; + struct gnttab_map_grant_ref map[VSCSI_GRANT_BATCH]; + struct vscsibk_info *info = pending_req->info; + + for (i = 0; i < cnt; i++) { + if (get_free_page(pg + mapcount)) { + put_free_pages(pg, mapcount); + pr_err("no grant page\n"); + return -ENOMEM; + } + gnttab_set_map_op(&map[mapcount], vaddr_page(pg[mapcount]), + flags, seg[i].gref, info->domid); + mapcount++; + if (mapcount < VSCSI_GRANT_BATCH) + continue; + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pg += mapcount; + grant += mapcount; + pending_req->n_grants += mapcount; + if (err) + return err; + mapcount = 0; + } + err = scsiback_gnttab_data_map_batch(map, pg, grant, mapcount); + pending_req->n_grants += mapcount; + return err; +} + +static int scsiback_gnttab_data_map(struct vscsiif_request *ring_req, + struct vscsibk_pend *pending_req) +{ + u32 flags; + int i, err, n_segs, i_seg = 0; + struct page **pg; + struct scsiif_request_segment *seg; + unsigned long end_seg = 0; + unsigned int nr_segments = (unsigned int)ring_req->nr_segments; + unsigned int nr_sgl = 0; + struct scatterlist *sg; + grant_handle_t *grant; + + pending_req->n_sg = 0; + pending_req->n_grants = 0; + pending_req->data_len = 0; + + nr_segments &= ~VSCSIIF_SG_GRANT; + if (!nr_segments) + return 0; + + if (nr_segments > VSCSIIF_SG_TABLESIZE) { + pr_debug("invalid parameter nr_seg = %d\n", + ring_req->nr_segments); + return -EINVAL; + } + + if (ring_req->nr_segments & VSCSIIF_SG_GRANT) { + err = scsiback_gnttab_data_map_list(pending_req, ring_req->seg, + pending_req->pages, pending_req->grant_handles, + nr_segments, GNTMAP_host_map | GNTMAP_readonly); + if (err) + return err; + nr_sgl = nr_segments; + nr_segments = 0; + for (i = 0; i < nr_sgl; i++) { + n_segs = 
ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + if ((unsigned)ring_req->seg[i].offset + + (unsigned)ring_req->seg[i].length > PAGE_SIZE || + n_segs * sizeof(struct scsiif_request_segment) != + ring_req->seg[i].length) + return -EINVAL; + nr_segments += n_segs; + } + if (nr_segments > SG_ALL) { + pr_debug("invalid nr_seg = %d\n", nr_segments); + return -EINVAL; + } + } + + /* free of (sgl) in fast_flush_area() */ + pending_req->sgl = kmalloc_array(nr_segments, + sizeof(struct scatterlist), GFP_KERNEL); + if (!pending_req->sgl) + return -ENOMEM; + + sg_init_table(pending_req->sgl, nr_segments); + pending_req->n_sg = nr_segments; + + flags = GNTMAP_host_map; + if (pending_req->sc_data_direction == DMA_TO_DEVICE) + flags |= GNTMAP_readonly; + + pg = pending_req->pages + nr_sgl; + grant = pending_req->grant_handles + nr_sgl; + if (!nr_sgl) { + seg = ring_req->seg; + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, nr_segments, flags); + if (err) + return err; + } else { + for (i = 0; i < nr_sgl; i++) { + seg = (struct scsiif_request_segment *)( + vaddr(pending_req, i) + ring_req->seg[i].offset); + n_segs = ring_req->seg[i].length / + sizeof(struct scsiif_request_segment); + err = scsiback_gnttab_data_map_list(pending_req, seg, + pg, grant, n_segs, flags); + if (err) + return err; + pg += n_segs; + grant += n_segs; + } + end_seg = vaddr(pending_req, 0) + ring_req->seg[0].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[0].length; + pg = pending_req->pages + nr_sgl; + } + + for_each_sg(pending_req->sgl, sg, nr_segments, i) { + sg_set_page(sg, pg[i], seg->length, seg->offset); + pending_req->data_len += seg->length; + seg++; + if (nr_sgl && (unsigned long)seg >= end_seg) { + i_seg++; + end_seg = vaddr(pending_req, i_seg) + + ring_req->seg[i_seg].offset; + seg = (struct scsiif_request_segment *)end_seg; + end_seg += ring_req->seg[i_seg].length; + } + if (sg->offset >= PAGE_SIZE || + sg->length > PAGE_SIZE || + sg->offset + sg->length > PAGE_SIZE) + return -EINVAL; + } + + return 0; +} + +static void scsiback_disconnect(struct vscsibk_info *info) +{ + wait_event(info->waiting_to_free, + atomic_read(&info->nr_unreplied_reqs) == 0); + + unbind_from_irqhandler(info->irq, info); + info->irq = 0; + xenbus_unmap_ring_vfree(info->dev, info->ring.sring); +} + +static void scsiback_device_action(struct vscsibk_pend *pending_req, + enum tcm_tmreq_table act, int tag) +{ + int rc, err = FAILED; + struct scsiback_tpg *tpg = pending_req->v2p->tpg; + struct se_cmd *se_cmd = &pending_req->se_cmd; + struct scsiback_tmr *tmr; + + tmr = kzalloc(sizeof(struct scsiback_tmr), GFP_KERNEL); + if (!tmr) + goto out; + + init_waitqueue_head(&tmr->tmr_wait); + + transport_init_se_cmd(se_cmd, tpg->se_tpg.se_tpg_tfo, + tpg->tpg_nexus->tvn_se_sess, 0, DMA_NONE, TCM_SIMPLE_TAG, + &pending_req->sense_buffer[0]); + + rc = core_tmr_alloc_req(se_cmd, tmr, act, GFP_KERNEL); + if (rc < 0) + goto out; + + se_cmd->se_tmr_req->ref_task_tag = tag; + + if (transport_lookup_tmr_lun(se_cmd, pending_req->v2p->lun) < 0) + goto out; + + transport_generic_handle_tmr(se_cmd); + wait_event(tmr->tmr_wait, atomic_read(&tmr->tmr_complete)); + + err = (se_cmd->se_tmr_req->response == TMR_FUNCTION_COMPLETE) ? 
+ SUCCESS : FAILED; + +out: + if (tmr) { + transport_generic_free_cmd(&pending_req->se_cmd, 1); + kfree(tmr); + } + + scsiback_do_resp_with_sense(NULL, err, 0, pending_req); + + kmem_cache_free(scsiback_cachep, pending_req); +} + +/* + Perform virtual to physical translation +*/ +static struct v2p_entry *scsiback_do_translation(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + kref_get(&entry->kref); + goto out; + } + } + entry = NULL; + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + return entry; +} + +static int prepare_pending_reqs(struct vscsibk_info *info, + struct vscsiif_request *ring_req, + struct vscsibk_pend *pending_req) +{ + struct v2p_entry *v2p; + struct ids_tuple vir; + + pending_req->rqid = ring_req->rqid; + pending_req->info = info; + + vir.chn = ring_req->channel; + vir.tgt = ring_req->id; + vir.lun = ring_req->lun; + + v2p = scsiback_do_translation(info, &vir); + if (!v2p) { + pending_req->v2p = NULL; + pr_debug("the v2p of (chn:%d, tgt:%d, lun:%d) doesn't exist.\n", + vir.chn, vir.tgt, vir.lun); + return -ENODEV; + } + pending_req->v2p = v2p; + + /* request range check from frontend */ + pending_req->sc_data_direction = ring_req->sc_data_direction; + if ((pending_req->sc_data_direction != DMA_BIDIRECTIONAL) && + (pending_req->sc_data_direction != DMA_TO_DEVICE) && + (pending_req->sc_data_direction != DMA_FROM_DEVICE) && + (pending_req->sc_data_direction != DMA_NONE)) { + pr_debug("invalid parameter data_dir = %d\n", + pending_req->sc_data_direction); + return -EINVAL; + } + + pending_req->cmd_len = ring_req->cmd_len; + if (pending_req->cmd_len > VSCSIIF_MAX_COMMAND_SIZE) { + pr_debug("invalid parameter cmd_len = %d\n", + pending_req->cmd_len); + return -EINVAL; + } + memcpy(pending_req->cmnd, ring_req->cmnd, pending_req->cmd_len); + + return 0; +} + +static int scsiback_do_cmd_fn(struct vscsibk_info *info) +{ + struct vscsiif_back_ring *ring = &info->ring; + struct vscsiif_request ring_req; + struct vscsibk_pend *pending_req; + RING_IDX rc, rp; + int err, more_to_do; + uint32_t result; + + rc = ring->req_cons; + rp = ring->sring->req_prod; + rmb(); /* guest system is accessing ring, too */ + + if (RING_REQUEST_PROD_OVERFLOW(ring, rp)) { + rc = ring->rsp_prod_pvt; + pr_warn("Dom%d provided bogus ring requests (%#x - %#x = %u). 
Halting ring processing\n", + info->domid, rp, rc, rp - rc); + info->ring_error = 1; + return 0; + } + + while ((rc != rp)) { + if (RING_REQUEST_CONS_OVERFLOW(ring, rc)) + break; + pending_req = kmem_cache_alloc(scsiback_cachep, GFP_KERNEL); + if (!pending_req) + return 1; + + ring_req = *RING_GET_REQUEST(ring, rc); + ring->req_cons = ++rc; + + err = prepare_pending_reqs(info, &ring_req, pending_req); + if (err) { + switch (err) { + case -ENODEV: + result = DID_NO_CONNECT; + break; + default: + result = DRIVER_ERROR; + break; + } + scsiback_do_resp_with_sense(NULL, result << 24, 0, + pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + return 1; + } + + switch (ring_req.act) { + case VSCSIIF_ACT_SCSI_CDB: + if (scsiback_gnttab_data_map(&ring_req, pending_req)) { + scsiback_fast_flush_area(pending_req); + scsiback_do_resp_with_sense(NULL, + DRIVER_ERROR << 24, 0, pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + } else { + scsiback_cmd_exec(pending_req); + } + break; + case VSCSIIF_ACT_SCSI_ABORT: + scsiback_device_action(pending_req, TMR_ABORT_TASK, + ring_req.ref_rqid); + break; + case VSCSIIF_ACT_SCSI_RESET: + scsiback_device_action(pending_req, TMR_LUN_RESET, 0); + break; + default: + pr_err_ratelimited("invalid request\n"); + scsiback_do_resp_with_sense(NULL, DRIVER_ERROR << 24, + 0, pending_req); + kmem_cache_free(scsiback_cachep, pending_req); + break; + } + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + RING_FINAL_CHECK_FOR_REQUESTS(&info->ring, more_to_do); + return more_to_do; +} + +static irqreturn_t scsiback_irq_fn(int irq, void *dev_id) +{ + struct vscsibk_info *info = dev_id; + + if (info->ring_error) + return IRQ_HANDLED; + + while (scsiback_do_cmd_fn(info)) + cond_resched(); + + return IRQ_HANDLED; +} + +static int scsiback_init_sring(struct vscsibk_info *info, grant_ref_t ring_ref, + evtchn_port_t evtchn) +{ + void *area; + struct vscsiif_sring *sring; + int err; + + if (info->irq) + return -1; + + err = xenbus_map_ring_valloc(info->dev, &ring_ref, 1, &area); + if (err) + return err; + + sring = (struct vscsiif_sring *)area; + BACK_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = bind_interdomain_evtchn_to_irq(info->domid, evtchn); + if (err < 0) + goto unmap_page; + + info->irq = err; + + err = request_threaded_irq(info->irq, NULL, scsiback_irq_fn, + IRQF_ONESHOT, "vscsiif-backend", info); + if (err) + goto free_irq; + + return 0; + +free_irq: + unbind_from_irqhandler(info->irq, info); + info->irq = 0; +unmap_page: + xenbus_unmap_ring_vfree(info->dev, area); + + return err; +} + +static int scsiback_map(struct vscsibk_info *info) +{ + struct xenbus_device *dev = info->dev; + unsigned int ring_ref, evtchn; + int err; + + err = xenbus_gather(XBT_NIL, dev->otherend, + "ring-ref", "%u", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, "reading %s ring", dev->otherend); + return err; + } + + return scsiback_init_sring(info, ring_ref, evtchn); +} + +/* + Add a new translation entry +*/ +static int scsiback_add_translation_entry(struct vscsibk_info *info, + char *phy, struct ids_tuple *v) +{ + int err = 0; + struct v2p_entry *entry; + struct v2p_entry *new; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + char *lunp; + unsigned int lun; + struct scsiback_tpg *tpg_entry, *tpg = NULL; + char *error = "doesn't exist"; + + lunp = strrchr(phy, ':'); + if (!lunp) { + pr_err("illegal format of physical device %s\n", phy); + return -EINVAL; + } + *lunp = 0; + 
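+	/* Terminate phy at the ':'; the LUN number string starts right after it. */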
lunp++; + if (kstrtouint(lunp, 10, &lun) || lun >= TRANSPORT_MAX_LUNS_PER_TPG) { + pr_err("lun number not valid: %s\n", lunp); + return -EINVAL; + } + + mutex_lock(&scsiback_mutex); + list_for_each_entry(tpg_entry, &scsiback_list, tv_tpg_list) { + if (!strcmp(phy, tpg_entry->tport->tport_name) || + !strcmp(phy, tpg_entry->param_alias)) { + spin_lock(&tpg_entry->se_tpg.tpg_lun_lock); + if (tpg_entry->se_tpg.tpg_lun_list[lun]->lun_status == + TRANSPORT_LUN_STATUS_ACTIVE) { + if (!tpg_entry->tpg_nexus) + error = "nexus undefined"; + else + tpg = tpg_entry; + } + spin_unlock(&tpg_entry->se_tpg.tpg_lun_lock); + break; + } + } + if (tpg) { + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + } + mutex_unlock(&scsiback_mutex); + + if (!tpg) { + pr_err("%s:%d %s\n", phy, lun, error); + return -ENODEV; + } + + new = kmalloc(sizeof(struct v2p_entry), GFP_KERNEL); + if (new == NULL) { + err = -ENOMEM; + goto out_free; + } + + spin_lock_irqsave(&info->v2p_lock, flags); + + /* Check double assignment to identical virtual ID */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + pr_warn("Virtual ID is already used. Assignment was not performed.\n"); + err = -EEXIST; + goto out; + } + + } + + /* Create a new translation entry and add to the list */ + kref_init(&new->kref); + new->v = *v; + new->tpg = tpg; + new->lun = lun; + list_add_tail(&new->l, head); + +out: + spin_unlock_irqrestore(&info->v2p_lock, flags); + +out_free: + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_fe_count--; + mutex_unlock(&tpg->tv_tpg_mutex); + + if (err) + kfree(new); + + return err; +} + +static void __scsiback_del_translation_entry(struct v2p_entry *entry) +{ + list_del(&entry->l); + kref_put(&entry->kref, scsiback_free_translation_entry); +} + +/* + Delete the translation entry specfied +*/ +static int scsiback_del_translation_entry(struct vscsibk_info *info, + struct ids_tuple *v) +{ + struct v2p_entry *entry; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + /* Find out the translation entry specified */ + list_for_each_entry(entry, head, l) { + if ((entry->v.chn == v->chn) && + (entry->v.tgt == v->tgt) && + (entry->v.lun == v->lun)) { + goto found; + } + } + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 1; + +found: + /* Delete the translation entry specfied */ + __scsiback_del_translation_entry(entry); + + spin_unlock_irqrestore(&info->v2p_lock, flags); + return 0; +} + +static void scsiback_do_add_lun(struct vscsibk_info *info, const char *state, + char *phy, struct ids_tuple *vir, int try) +{ + if (!scsiback_add_translation_entry(info, phy, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateInitialised)) { + pr_err("xenbus_printf error %s\n", state); + scsiback_del_translation_entry(info, vir); + } + } else if (!try) { + xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed); + } +} + +static void scsiback_do_del_lun(struct vscsibk_info *info, const char *state, + struct ids_tuple *vir) +{ + if (!scsiback_del_translation_entry(info, vir)) { + if (xenbus_printf(XBT_NIL, info->dev->nodename, state, + "%d", XenbusStateClosed)) + pr_err("xenbus_printf error %s\n", state); + } +} + +#define VSCSIBACK_OP_ADD_OR_DEL_LUN 1 +#define VSCSIBACK_OP_UPDATEDEV_STATE 2 + +static void scsiback_do_1lun_hotplug(struct vscsibk_info *info, int op, + char *ent) +{ + int 
err; + struct ids_tuple vir; + char *val; + int device_state; + char phy[VSCSI_NAMELEN]; + char str[64]; + char state[64]; + struct xenbus_device *dev = info->dev; + + /* read status */ + snprintf(state, sizeof(state), "vscsi-devs/%s/state", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, state, "%u", &device_state); + if (XENBUS_EXIST_ERR(err)) + return; + + /* physical SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/p-dev", ent); + val = xenbus_read(XBT_NIL, dev->nodename, str, NULL); + if (IS_ERR(val)) { + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + return; + } + strlcpy(phy, val, VSCSI_NAMELEN); + kfree(val); + + /* virtual SCSI device */ + snprintf(str, sizeof(str), "vscsi-devs/%s/v-dev", ent); + err = xenbus_scanf(XBT_NIL, dev->nodename, str, "%u:%u:%u:%u", + &vir.hst, &vir.chn, &vir.tgt, &vir.lun); + if (XENBUS_EXIST_ERR(err)) { + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + return; + } + + switch (op) { + case VSCSIBACK_OP_ADD_OR_DEL_LUN: + switch (device_state) { + case XenbusStateInitialising: + scsiback_do_add_lun(info, state, phy, &vir, 0); + break; + case XenbusStateConnected: + scsiback_do_add_lun(info, state, phy, &vir, 1); + break; + case XenbusStateClosing: + scsiback_do_del_lun(info, state, &vir); + break; + default: + break; + } + break; + + case VSCSIBACK_OP_UPDATEDEV_STATE: + if (device_state == XenbusStateInitialised) { + /* modify vscsi-devs/dev-x/state */ + if (xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateConnected)) { + pr_err("xenbus_printf error %s\n", str); + scsiback_del_translation_entry(info, &vir); + xenbus_printf(XBT_NIL, dev->nodename, state, + "%d", XenbusStateClosed); + } + } + break; + /* When it is necessary, processing is added here. 
*/ + default: + break; + } +} + +static void scsiback_do_lun_hotplug(struct vscsibk_info *info, int op) +{ + int i; + char **dir; + unsigned int ndir = 0; + + dir = xenbus_directory(XBT_NIL, info->dev->nodename, "vscsi-devs", + &ndir); + if (IS_ERR(dir)) + return; + + for (i = 0; i < ndir; i++) + scsiback_do_1lun_hotplug(info, op, dir[i]); + + kfree(dir); +} + +static void scsiback_frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + switch (frontend_state) { + case XenbusStateInitialising: + break; + + case XenbusStateInitialised: + if (scsiback_map(info)) + break; + + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateConnected: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_UPDATEDEV_STATE); + + if (dev->state == XenbusStateConnected) + break; + + xenbus_switch_state(dev, XenbusStateConnected); + break; + + case XenbusStateClosing: + if (info->irq) + scsiback_disconnect(info); + + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + device_unregister(&dev->dev); + break; + + case XenbusStateReconfiguring: + scsiback_do_lun_hotplug(info, VSCSIBACK_OP_ADD_OR_DEL_LUN); + xenbus_switch_state(dev, XenbusStateReconfigured); + + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +/* + Release the translation entry specfied +*/ +static void scsiback_release_translation_entry(struct vscsibk_info *info) +{ + struct v2p_entry *entry, *tmp; + struct list_head *head = &(info->v2p_entry_lists); + unsigned long flags; + + spin_lock_irqsave(&info->v2p_lock, flags); + + list_for_each_entry_safe(entry, tmp, head, l) + __scsiback_del_translation_entry(entry); + + spin_unlock_irqrestore(&info->v2p_lock, flags); +} + +static int scsiback_remove(struct xenbus_device *dev) +{ + struct vscsibk_info *info = dev_get_drvdata(&dev->dev); + + if (info->irq) + scsiback_disconnect(info); + + scsiback_release_translation_entry(info); + + dev_set_drvdata(&dev->dev, NULL); + + return 0; +} + +static int scsiback_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + + struct vscsibk_info *info = kzalloc(sizeof(struct vscsibk_info), + GFP_KERNEL); + + pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id); + + if (!info) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating backend structure"); + return -ENOMEM; + } + info->dev = dev; + dev_set_drvdata(&dev->dev, info); + + info->domid = dev->otherend_id; + spin_lock_init(&info->ring_lock); + info->ring_error = 0; + atomic_set(&info->nr_unreplied_reqs, 0); + init_waitqueue_head(&info->waiting_to_free); + info->dev = dev; + info->irq = 0; + INIT_LIST_HEAD(&info->v2p_entry_lists); + spin_lock_init(&info->v2p_lock); + + err = xenbus_printf(XBT_NIL, dev->nodename, "feature-sg-grant", "%u", + SG_ALL); + if (err) + xenbus_dev_error(dev, err, "writing feature-sg-grant"); + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + pr_warn("%s failed\n", __func__); + scsiback_remove(dev); + + return err; +} + +static char *scsiback_dump_proto_id(struct scsiback_tport *tport) +{ + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return "SAS"; + case SCSI_PROTOCOL_FCP: + return 
"FCP"; + case SCSI_PROTOCOL_ISCSI: + return "iSCSI"; + default: + break; + } + + return "Unknown"; +} + +static u8 scsiback_get_fabric_proto_ident(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_fabric_proto_ident(se_tpg); + case SCSI_PROTOCOL_FCP: + return fc_get_fabric_proto_ident(se_tpg); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_fabric_proto_ident(se_tpg); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_fabric_proto_ident(se_tpg); +} + +static char *scsiback_get_fabric_wwn(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + return &tport->tport_name[0]; +} + +static u16 scsiback_get_tag(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + return tpg->tport_tpgt; +} + +static u32 scsiback_get_default_depth(struct se_portal_group *se_tpg) +{ + return 1; +} + +static u32 +scsiback_get_pr_transport_id(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code, + unsigned char *buf) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + case SCSI_PROTOCOL_FCP: + return fc_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_pr_transport_id(se_tpg, se_nacl, pr_reg, + format_code, buf); +} + +static u32 +scsiback_get_pr_transport_id_len(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl, + struct t10_pr_registration *pr_reg, + int *format_code) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + case SCSI_PROTOCOL_FCP: + return fc_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + case SCSI_PROTOCOL_ISCSI: + return iscsi_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_get_pr_transport_id_len(se_tpg, se_nacl, pr_reg, + format_code); +} + +static char * +scsiback_parse_pr_out_transport_id(struct se_portal_group *se_tpg, + const char *buf, + u32 *out_tid_len, + char **port_nexus_ptr) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport = tpg->tport; + + switch (tport->tport_proto_id) { + case SCSI_PROTOCOL_SAS: + return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + case SCSI_PROTOCOL_FCP: + return fc_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + case SCSI_PROTOCOL_ISCSI: + return 
iscsi_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); + default: + pr_err("Unknown tport_proto_id: 0x%02x, using SAS emulation\n", + tport->tport_proto_id); + break; + } + + return sas_parse_pr_out_transport_id(se_tpg, buf, out_tid_len, + port_nexus_ptr); +} + +static struct se_wwn * +scsiback_make_tport(struct target_fabric_configfs *tf, + struct config_group *group, + const char *name) +{ + struct scsiback_tport *tport; + char *ptr; + u64 wwpn = 0; + int off = 0; + + tport = kzalloc(sizeof(struct scsiback_tport), GFP_KERNEL); + if (!tport) + return ERR_PTR(-ENOMEM); + + tport->tport_wwpn = wwpn; + /* + * Determine the emulated Protocol Identifier and Target Port Name + * based on the incoming configfs directory name. + */ + ptr = strstr(name, "naa."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_SAS; + goto check_len; + } + ptr = strstr(name, "fc."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_FCP; + off = 3; /* Skip over "fc." */ + goto check_len; + } + ptr = strstr(name, "iqn."); + if (ptr) { + tport->tport_proto_id = SCSI_PROTOCOL_ISCSI; + goto check_len; + } + + pr_err("Unable to locate prefix for emulated Target Port: %s\n", name); + kfree(tport); + return ERR_PTR(-EINVAL); + +check_len: + if (strlen(name) >= VSCSI_NAMELEN) { + pr_err("Emulated %s Address: %s, exceeds max: %d\n", name, + scsiback_dump_proto_id(tport), VSCSI_NAMELEN); + kfree(tport); + return ERR_PTR(-EINVAL); + } + snprintf(&tport->tport_name[0], VSCSI_NAMELEN, "%s", &name[off]); + + pr_debug("Allocated emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), name); + + return &tport->tport_wwn; +} + +static void scsiback_drop_tport(struct se_wwn *wwn) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + pr_debug("Deallocating emulated Target %s Address: %s\n", + scsiback_dump_proto_id(tport), tport->tport_name); + + kfree(tport); +} + +static struct se_node_acl * +scsiback_alloc_fabric_acl(struct se_portal_group *se_tpg) +{ + return kzalloc(sizeof(struct se_node_acl), GFP_KERNEL); +} + +static void +scsiback_release_fabric_acl(struct se_portal_group *se_tpg, + struct se_node_acl *se_nacl) +{ + kfree(se_nacl); +} + +static u32 scsiback_tpg_get_inst_index(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int scsiback_check_stop_free(struct se_cmd *se_cmd) +{ + /* + * Do not release struct se_cmd's containing a valid TMR pointer. + * These will be released directly in scsiback_device_action() + * with transport_generic_free_cmd(). 
+ */ + if (se_cmd->se_cmd_flags & SCF_SCSI_TMR_CDB) + return 0; + + transport_generic_free_cmd(se_cmd, 0); + return 1; +} + +static void scsiback_release_cmd(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + kmem_cache_free(scsiback_cachep, pending_req); +} + +static int scsiback_shutdown_session(struct se_session *se_sess) +{ + return 0; +} + +static void scsiback_close_session(struct se_session *se_sess) +{ +} + +static u32 scsiback_sess_get_index(struct se_session *se_sess) +{ + return 0; +} + +static int scsiback_write_pending(struct se_cmd *se_cmd) +{ + /* Go ahead and process the write immediately */ + target_execute_cmd(se_cmd); + + return 0; +} + +static int scsiback_write_pending_status(struct se_cmd *se_cmd) +{ + return 0; +} + +static void scsiback_set_default_node_attrs(struct se_node_acl *nacl) +{ +} + +static u32 scsiback_get_task_tag(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + return pending_req->rqid; +} + +static int scsiback_get_cmd_state(struct se_cmd *se_cmd) +{ + return 0; +} + +static int scsiback_queue_data_in(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + pending_req->result = SAM_STAT_GOOD; + scsiback_cmd_done(pending_req); + return 0; +} + +static int scsiback_queue_status(struct se_cmd *se_cmd) +{ + struct vscsibk_pend *pending_req = container_of(se_cmd, + struct vscsibk_pend, se_cmd); + + if (se_cmd->sense_buffer && + ((se_cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) || + (se_cmd->se_cmd_flags & SCF_EMULATED_TASK_SENSE))) + pending_req->result = (DRIVER_SENSE << 24) | + SAM_STAT_CHECK_CONDITION; + else + pending_req->result = se_cmd->scsi_status; + + scsiback_cmd_done(pending_req); + return 0; +} + +static void scsiback_queue_tm_rsp(struct se_cmd *se_cmd) +{ + struct se_tmr_req *se_tmr = se_cmd->se_tmr_req; + struct scsiback_tmr *tmr = se_tmr->fabric_tmr_ptr; + + atomic_set(&tmr->tmr_complete, 1); + wake_up(&tmr->tmr_wait); +} + +static void scsiback_aborted_task(struct se_cmd *se_cmd) +{ +} + +static ssize_t scsiback_tpg_param_show_alias(struct se_portal_group *se_tpg, + char *page) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + ssize_t rb; + + mutex_lock(&tpg->tv_tpg_mutex); + rb = snprintf(page, PAGE_SIZE, "%s\n", tpg->param_alias); + mutex_unlock(&tpg->tv_tpg_mutex); + + return rb; +} + +static ssize_t scsiback_tpg_param_store_alias(struct se_portal_group *se_tpg, + const char *page, size_t count) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, struct scsiback_tpg, + se_tpg); + int len; + + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("param alias: %s, exceeds max: %d\n", page, + VSCSI_NAMELEN); + return -EINVAL; + } + + mutex_lock(&tpg->tv_tpg_mutex); + len = snprintf(tpg->param_alias, VSCSI_NAMELEN, "%s", page); + if (tpg->param_alias[len - 1] == '\n') + tpg->param_alias[len - 1] = '\0'; + mutex_unlock(&tpg->tv_tpg_mutex); + + return count; +} + +TF_TPG_PARAM_ATTR(scsiback, alias, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *scsiback_param_attrs[] = { + &scsiback_tpg_param_alias.attr, + NULL, +}; + +static int scsiback_make_nexus(struct scsiback_tpg *tpg, + const char *name) +{ + struct se_portal_group *se_tpg; + struct se_session *se_sess; + struct scsiback_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + if (tpg->tpg_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + 
pr_debug("tpg->tpg_nexus already exists\n"); + return -EEXIST; + } + se_tpg = &tpg->se_tpg; + + tv_nexus = kzalloc(sizeof(struct scsiback_nexus), GFP_KERNEL); + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENOMEM; + } + /* + * Initialize the struct se_session pointer + */ + tv_nexus->tvn_se_sess = transport_init_session(TARGET_PROT_NORMAL); + if (IS_ERR(tv_nexus->tvn_se_sess)) { + mutex_unlock(&tpg->tv_tpg_mutex); + kfree(tv_nexus); + return -ENOMEM; + } + se_sess = tv_nexus->tvn_se_sess; + /* + * Since we are running in 'demo mode' this call with generate a + * struct se_node_acl for the scsiback struct se_portal_group with + * the SCSI Initiator port name of the passed configfs group 'name'. + */ + tv_nexus->tvn_se_sess->se_node_acl = core_tpg_check_initiator_node_acl( + se_tpg, (unsigned char *)name); + if (!tv_nexus->tvn_se_sess->se_node_acl) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_debug("core_tpg_check_initiator_node_acl() failed for %s\n", + name); + goto out; + } + /* Now register the TCM pvscsi virtual I_T Nexus as active. */ + transport_register_session(se_tpg, tv_nexus->tvn_se_sess->se_node_acl, + tv_nexus->tvn_se_sess, tv_nexus); + tpg->tpg_nexus = tv_nexus; + + mutex_unlock(&tpg->tv_tpg_mutex); + return 0; + +out: + transport_free_session(se_sess); + kfree(tv_nexus); + return -ENOMEM; +} + +static int scsiback_drop_nexus(struct scsiback_tpg *tpg) +{ + struct se_session *se_sess; + struct scsiback_nexus *tv_nexus; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + se_sess = tv_nexus->tvn_se_sess; + if (!se_sess) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + + if (tpg->tv_tpg_port_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG port count: %d\n", + tpg->tv_tpg_port_count); + return -EBUSY; + } + + if (tpg->tv_tpg_fe_count != 0) { + mutex_unlock(&tpg->tv_tpg_mutex); + pr_err("Unable to remove xen-pvscsi I_T Nexus with active TPG frontend count: %d\n", + tpg->tv_tpg_fe_count); + return -EBUSY; + } + + pr_debug("Removing I_T Nexus to emulated %s Initiator Port: %s\n", + scsiback_dump_proto_id(tpg->tport), + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + + /* + * Release the SCSI I_T Nexus to the emulated xen-pvscsi Target Port + */ + transport_deregister_session(tv_nexus->tvn_se_sess); + tpg->tpg_nexus = NULL; + mutex_unlock(&tpg->tv_tpg_mutex); + + kfree(tv_nexus); + return 0; +} + +static ssize_t scsiback_tpg_show_nexus(struct se_portal_group *se_tpg, + char *page) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_nexus *tv_nexus; + ssize_t ret; + + mutex_lock(&tpg->tv_tpg_mutex); + tv_nexus = tpg->tpg_nexus; + if (!tv_nexus) { + mutex_unlock(&tpg->tv_tpg_mutex); + return -ENODEV; + } + ret = snprintf(page, PAGE_SIZE, "%s\n", + tv_nexus->tvn_se_sess->se_node_acl->initiatorname); + mutex_unlock(&tpg->tv_tpg_mutex); + + return ret; +} + +static ssize_t scsiback_tpg_store_nexus(struct se_portal_group *se_tpg, + const char *page, + size_t count) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + struct scsiback_tport *tport_wwn = tpg->tport; + unsigned char i_port[VSCSI_NAMELEN], *ptr, *port_ptr; + int ret; + /* + * Shutdown the active I_T nexus if 'NULL' is passed. + */ + if (!strncmp(page, "NULL", 4)) { + ret = scsiback_drop_nexus(tpg); + return (!ret) ? 
count : ret; + } + /* + * Otherwise make sure the passed virtual Initiator port WWN matches + * the fabric protocol_id set in scsiback_make_tport(), and call + * scsiback_make_nexus(). + */ + if (strlen(page) >= VSCSI_NAMELEN) { + pr_err("Emulated NAA Sas Address: %s, exceeds max: %d\n", + page, VSCSI_NAMELEN); + return -EINVAL; + } + snprintf(&i_port[0], VSCSI_NAMELEN, "%s", page); + + ptr = strstr(i_port, "naa."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_SAS) { + pr_err("Passed SAS Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + ptr = strstr(i_port, "fc."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_FCP) { + pr_err("Passed FCP Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[3]; /* Skip over "fc." */ + goto check_newline; + } + ptr = strstr(i_port, "iqn."); + if (ptr) { + if (tport_wwn->tport_proto_id != SCSI_PROTOCOL_ISCSI) { + pr_err("Passed iSCSI Initiator Port %s does not match target port protoid: %s\n", + i_port, scsiback_dump_proto_id(tport_wwn)); + return -EINVAL; + } + port_ptr = &i_port[0]; + goto check_newline; + } + pr_err("Unable to locate prefix for emulated Initiator Port: %s\n", + i_port); + return -EINVAL; + /* + * Clear any trailing newline for the NAA WWN + */ +check_newline: + if (i_port[strlen(i_port) - 1] == '\n') + i_port[strlen(i_port) - 1] = '\0'; + + ret = scsiback_make_nexus(tpg, port_ptr); + if (ret < 0) + return ret; + + return count; +} + +TF_TPG_BASE_ATTR(scsiback, nexus, S_IRUGO | S_IWUSR); + +static struct configfs_attribute *scsiback_tpg_attrs[] = { + &scsiback_tpg_nexus.attr, + NULL, +}; + +static ssize_t +scsiback_wwn_show_attr_version(struct target_fabric_configfs *tf, + char *page) +{ + return sprintf(page, "xen-pvscsi fabric module %s on %s/%s on " + UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); +} + +TF_WWN_ATTR_RO(scsiback, version); + +static struct configfs_attribute *scsiback_wwn_attrs[] = { + &scsiback_wwn_version.attr, + NULL, +}; + +static char *scsiback_get_fabric_name(void) +{ + return "xen-pvscsi"; +} + +static int scsiback_port_link(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count++; + mutex_unlock(&tpg->tv_tpg_mutex); + + return 0; +} + +static void scsiback_port_unlink(struct se_portal_group *se_tpg, + struct se_lun *lun) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&tpg->tv_tpg_mutex); + tpg->tv_tpg_port_count--; + mutex_unlock(&tpg->tv_tpg_mutex); +} + +static struct se_portal_group * +scsiback_make_tpg(struct se_wwn *wwn, + struct config_group *group, + const char *name) +{ + struct scsiback_tport *tport = container_of(wwn, + struct scsiback_tport, tport_wwn); + + struct scsiback_tpg *tpg; + u16 tpgt; + int ret; + + if (strstr(name, "tpgt_") != name) + return ERR_PTR(-EINVAL); + ret = kstrtou16(name + 5, 10, &tpgt); + if (ret) + return ERR_PTR(ret); + + tpg = kzalloc(sizeof(struct scsiback_tpg), GFP_KERNEL); + if (!tpg) + return ERR_PTR(-ENOMEM); + + mutex_init(&tpg->tv_tpg_mutex); + INIT_LIST_HEAD(&tpg->tv_tpg_list); + INIT_LIST_HEAD(&tpg->info_list); + tpg->tport = tport; + tpg->tport_tpgt = tpgt; + + ret = 
core_tpg_register(&scsiback_ops, wwn, + &tpg->se_tpg, tpg, TRANSPORT_TPG_TYPE_NORMAL); + if (ret < 0) { + kfree(tpg); + return NULL; + } + mutex_lock(&scsiback_mutex); + list_add_tail(&tpg->tv_tpg_list, &scsiback_list); + mutex_unlock(&scsiback_mutex); + + return &tpg->se_tpg; +} + +static void scsiback_drop_tpg(struct se_portal_group *se_tpg) +{ + struct scsiback_tpg *tpg = container_of(se_tpg, + struct scsiback_tpg, se_tpg); + + mutex_lock(&scsiback_mutex); + list_del(&tpg->tv_tpg_list); + mutex_unlock(&scsiback_mutex); + /* + * Release the virtual I_T Nexus for this xen-pvscsi TPG + */ + scsiback_drop_nexus(tpg); + /* + * Deregister the se_tpg from TCM. + */ + core_tpg_deregister(se_tpg); + kfree(tpg); +} + +static int scsiback_check_true(struct se_portal_group *se_tpg) +{ + return 1; +} + +static int scsiback_check_false(struct se_portal_group *se_tpg) +{ + return 0; +} + +static const struct target_core_fabric_ops scsiback_ops = { + .module = THIS_MODULE, + .name = "xen-pvscsi", + .get_fabric_name = scsiback_get_fabric_name, + .get_fabric_proto_ident = scsiback_get_fabric_proto_ident, + .tpg_get_wwn = scsiback_get_fabric_wwn, + .tpg_get_tag = scsiback_get_tag, + .tpg_get_default_depth = scsiback_get_default_depth, + .tpg_get_pr_transport_id = scsiback_get_pr_transport_id, + .tpg_get_pr_transport_id_len = scsiback_get_pr_transport_id_len, + .tpg_parse_pr_out_transport_id = scsiback_parse_pr_out_transport_id, + .tpg_check_demo_mode = scsiback_check_true, + .tpg_check_demo_mode_cache = scsiback_check_true, + .tpg_check_demo_mode_write_protect = scsiback_check_false, + .tpg_check_prod_mode_write_protect = scsiback_check_false, + .tpg_alloc_fabric_acl = scsiback_alloc_fabric_acl, + .tpg_release_fabric_acl = scsiback_release_fabric_acl, + .tpg_get_inst_index = scsiback_tpg_get_inst_index, + .check_stop_free = scsiback_check_stop_free, + .release_cmd = scsiback_release_cmd, + .put_session = NULL, + .shutdown_session = scsiback_shutdown_session, + .close_session = scsiback_close_session, + .sess_get_index = scsiback_sess_get_index, + .sess_get_initiator_sid = NULL, + .write_pending = scsiback_write_pending, + .write_pending_status = scsiback_write_pending_status, + .set_default_node_attributes = scsiback_set_default_node_attrs, + .get_task_tag = scsiback_get_task_tag, + .get_cmd_state = scsiback_get_cmd_state, + .queue_data_in = scsiback_queue_data_in, + .queue_status = scsiback_queue_status, + .queue_tm_rsp = scsiback_queue_tm_rsp, + .aborted_task = scsiback_aborted_task, + /* + * Setup callers for generic logic in target_core_fabric_configfs.c + */ + .fabric_make_wwn = scsiback_make_tport, + .fabric_drop_wwn = scsiback_drop_tport, + .fabric_make_tpg = scsiback_make_tpg, + .fabric_drop_tpg = scsiback_drop_tpg, + .fabric_post_link = scsiback_port_link, + .fabric_pre_unlink = scsiback_port_unlink, + .fabric_make_np = NULL, + .fabric_drop_np = NULL, +#if 0 + .fabric_make_nodeacl = scsiback_make_nodeacl, + .fabric_drop_nodeacl = scsiback_drop_nodeacl, +#endif + + .tfc_wwn_attrs = scsiback_wwn_attrs, + .tfc_tpg_base_attrs = scsiback_tpg_attrs, + .tfc_tpg_param_attrs = scsiback_param_attrs, +}; + +static const struct xenbus_device_id scsiback_ids[] = { + { "vscsi" }, + { "" } +}; + +static struct xenbus_driver scsiback_driver = { + .ids = scsiback_ids, + .probe = scsiback_probe, + .remove = scsiback_remove, + .otherend_changed = scsiback_frontend_changed +}; + +static void scsiback_init_pend(void *p) +{ + struct vscsibk_pend *pend = p; + int i; + + memset(pend, 0, sizeof(*pend)); + for (i = 0; i < 
VSCSI_MAX_GRANTS; i++) + pend->grant_handles[i] = SCSIBACK_INVALID_HANDLE; +} + +static int __init scsiback_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + pr_debug("xen-pvscsi: fabric module %s on %s/%s on "UTS_RELEASE"\n", + VSCSI_VERSION, utsname()->sysname, utsname()->machine); + + scsiback_cachep = kmem_cache_create("vscsiif_cache", + sizeof(struct vscsibk_pend), 0, 0, scsiback_init_pend); + if (!scsiback_cachep) + return -ENOMEM; + + ret = xenbus_register_backend(&scsiback_driver); + if (ret) + goto out_cache_destroy; + + ret = target_register_template(&scsiback_ops); + if (ret) + goto out_unregister_xenbus; + + return 0; + +out_unregister_xenbus: + xenbus_unregister_driver(&scsiback_driver); +out_cache_destroy: + kmem_cache_destroy(scsiback_cachep); + pr_err("%s: error %d\n", __func__, ret); + return ret; +} + +static void __exit scsiback_exit(void) +{ + struct page *page; + + while (free_pages_num) { + if (get_free_page(&page)) + BUG(); + gnttab_free_pages(1, &page); + } + target_unregister_template(&scsiback_ops); + xenbus_unregister_driver(&scsiback_driver); + kmem_cache_destroy(scsiback_cachep); +} + +module_init(scsiback_init); +module_exit(scsiback_exit); + +MODULE_DESCRIPTION("Xen SCSI backend driver"); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_ALIAS("xen-backend:vscsi"); +MODULE_AUTHOR("Juergen Gross <jgross@suse.com>"); diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c new file mode 100644 index 000000000..3b2bffde5 --- /dev/null +++ b/drivers/xen/xen-selfballoon.c @@ -0,0 +1,579 @@ +/****************************************************************************** + * Xen selfballoon driver (and optional frontswap self-shrinking driver) + * + * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp. + * + * This code complements the cleancache and frontswap patchsets to optimize + * support for Xen Transcendent Memory ("tmem"). The policy it implements + * is rudimentary and will likely improve over time, but it does work well + * enough today. + * + * Two functionalities are implemented here which both use "control theory" + * (feedback) to optimize memory utilization. In a virtualized environment + * such as Xen, RAM is often a scarce resource and we would like to ensure + * that each of a possibly large number of virtual machines is using RAM + * efficiently, i.e. using as little as possible when under light load + * and obtaining as much as possible when memory demands are high. + * Since RAM needs vary highly dynamically and sometimes dramatically, + * "hysteresis" is used, that is, the memory target is determined not just + * by current data but also by past data stored in the system. + * + * "Selfballooning" creates memory pressure by managing the Xen balloon + * driver to decrease and increase available kernel memory, driven + * largely by the target value of "Committed_AS" (see /proc/meminfo). + * Since Committed_AS does not account for clean mapped pages (i.e. pages + * in RAM that are identical to pages on disk), selfballooning has the + * effect of pushing less frequently used clean pagecache pages out of + * kernel RAM and, presumably using cleancache, into Xen tmem where + * Xen can more efficiently optimize RAM utilization for such pages. + * + * When kernel memory demand unexpectedly increases faster than Xen, via + * the selfballoon driver, is able to (or chooses to) provide usable RAM, + * the kernel may invoke swapping. In most cases, frontswap is able + * to absorb this swapping into Xen tmem.
However, due to the fact + * that the kernel swap subsystem assumes swapping occurs to a disk, + * swapped pages may sit on the disk for a very long time; even if + * the kernel knows the page will never be used again. This is because + * the disk space costs very little and can be overwritten when + * necessary. When such stale pages are in frontswap, however, they + * are taking up valuable real estate. "Frontswap selfshrinking" works + * to resolve this: When frontswap activity is otherwise stable + * and the guest kernel is not under memory pressure, the "frontswap + * selfshrinking" accounts for this by providing pressure to remove some + * pages from frontswap and return them to kernel memory. + * + * For both "selfballooning" and "frontswap-selfshrinking", a worker + * thread is used and sysfs tunables are provided to adjust the frequency + * and rate of adjustments to achieve the goal, as well as to disable one + * or both functions independently. + * + * While some argue that this functionality can and should be implemented + * in userspace, it has been observed that bad things happen (e.g. OOMs). + * + * System configuration note: Selfballooning should not be enabled on + * systems without a sufficiently large swap device configured; for best + * results, it is recommended that total swap be increased by the size + * of the guest memory. Note, that selfballooning should be disabled by default + * if frontswap is not configured. Similarly selfballooning should be enabled + * by default if frontswap is configured and can be disabled with the + * "tmem.selfballooning=0" kernel boot option. Finally, when frontswap is + * configured, frontswap-selfshrinking can be disabled with the + * "tmem.selfshrink=0" kernel boot option. + * + * Selfballooning is disallowed in domain0 and force-disabled. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/bootmem.h> +#include <linux/swap.h> +#include <linux/mm.h> +#include <linux/mman.h> +#include <linux/module.h> +#include <linux/workqueue.h> +#include <linux/device.h> +#include <xen/balloon.h> +#include <xen/tmem.h> +#include <xen/xen.h> + +/* Enable/disable with sysfs. */ +static int xen_selfballooning_enabled __read_mostly; + +/* + * Controls rate at which memory target (this iteration) approaches + * ultimate goal when memory need is increasing (up-hysteresis) or + * decreasing (down-hysteresis). Higher values of hysteresis cause + * slower increases/decreases. The default values for the various + * parameters were deemed reasonable by experimentation, may be + * workload-dependent, and can all be adjusted via sysfs. + */ +static unsigned int selfballoon_downhysteresis __read_mostly = 8; +static unsigned int selfballoon_uphysteresis __read_mostly = 1; + +/* In HZ, controls frequency of worker invocation. */ +static unsigned int selfballoon_interval __read_mostly = 5; + +/* + * Minimum usable RAM in MB for selfballooning target for balloon. + * If non-zero, it is added to totalreserve_pages and self-ballooning + * will not balloon below the sum. If zero, a piecewise linear function + * is calculated as a minimum and added to totalreserve_pages. Note that + * setting this value indiscriminately may cause OOMs and crashes. + */ +static unsigned int selfballoon_min_usable_mb; + +/* + * Amount of RAM in MB to add to the target number of pages. + * Can be used to reserve some more room for caches and the like. 
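+ * For example, with 4 KiB pages the MB2PAGES() conversion used below turns
+ * a value of 100 into an extra 25600 pages added to the balloon target.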
+ */ +static unsigned int selfballoon_reserved_mb; + +static void selfballoon_process(struct work_struct *work); +static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); + +#ifdef CONFIG_FRONTSWAP +#include <linux/frontswap.h> + +/* Enable/disable with sysfs. */ +static bool frontswap_selfshrinking __read_mostly; + +/* + * The default values for the following parameters were deemed reasonable + * by experimentation, may be workload-dependent, and can all be + * adjusted via sysfs. + */ + +/* Control rate for frontswap shrinking. Higher hysteresis is slower. */ +static unsigned int frontswap_hysteresis __read_mostly = 20; + +/* + * Number of selfballoon worker invocations to wait before observing that + * frontswap selfshrinking should commence. Note that selfshrinking does + * not use a separate worker thread. + */ +static unsigned int frontswap_inertia __read_mostly = 3; + +/* Countdown to next invocation of frontswap_shrink() */ +static unsigned long frontswap_inertia_counter; + +/* + * Invoked by the selfballoon worker thread, uses current number of pages + * in frontswap (frontswap_curr_pages()), previous status, and control + * values (hysteresis and inertia) to determine if frontswap should be + * shrunk and what the new frontswap size should be. Note that + * frontswap_shrink is essentially a partial swapoff that immediately + * transfers pages from the "swap device" (frontswap) back into kernel + * RAM; despite the name, frontswap "shrinking" is very different from + * the "shrinker" interface used by the kernel MM subsystem to reclaim + * memory. + */ +static void frontswap_selfshrink(void) +{ + static unsigned long cur_frontswap_pages; + static unsigned long last_frontswap_pages; + static unsigned long tgt_frontswap_pages; + + last_frontswap_pages = cur_frontswap_pages; + cur_frontswap_pages = frontswap_curr_pages(); + if (!cur_frontswap_pages || + (cur_frontswap_pages > last_frontswap_pages)) { + frontswap_inertia_counter = frontswap_inertia; + return; + } + if (frontswap_inertia_counter && --frontswap_inertia_counter) + return; + if (cur_frontswap_pages <= frontswap_hysteresis) + tgt_frontswap_pages = 0; + else + tgt_frontswap_pages = cur_frontswap_pages - + (cur_frontswap_pages / frontswap_hysteresis); + frontswap_shrink(tgt_frontswap_pages); + frontswap_inertia_counter = frontswap_inertia; +} + +#endif /* CONFIG_FRONTSWAP */ + +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) +#define PAGES2MB(pages) ((pages) >> (20 - PAGE_SHIFT)) + +/* + * Use current balloon size, the goal (vm_committed_as), and hysteresis + * parameters to set a new target balloon size + */ +static void selfballoon_process(struct work_struct *work) +{ + unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; + unsigned long useful_pages; + bool reset_timer = false; + + if (xen_selfballooning_enabled) { + cur_pages = totalram_pages; + tgt_pages = cur_pages; /* default is no change */ + goal_pages = vm_memory_committed() + + totalreserve_pages + + MB2PAGES(selfballoon_reserved_mb); +#ifdef CONFIG_FRONTSWAP + /* allow space for frontswap pages to be repatriated */ + if (frontswap_selfshrinking && frontswap_enabled) + goal_pages += frontswap_curr_pages(); +#endif + if (cur_pages > goal_pages) + tgt_pages = cur_pages - + ((cur_pages - goal_pages) / + selfballoon_downhysteresis); + else if (cur_pages < goal_pages) + tgt_pages = cur_pages + + ((goal_pages - cur_pages) / + selfballoon_uphysteresis); + /* else if cur_pages == goal_pages, no change */ + useful_pages = max_pfn - totalreserve_pages; 
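+ /*
+ * With the default tunables, a guest sitting 800 MiB above its goal
+ * gives back only 100 MiB per interval (selfballoon_downhysteresis == 8),
+ * while a guest below its goal grows straight to the goal in a single
+ * step (selfballoon_uphysteresis == 1). The floor computed below keeps
+ * very small guests from ballooning at all and gives larger guests a
+ * floor that tracks usable RAM with roughly a 50%, 12.5% and finally
+ * ~3% slope.
+ */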
+ if (selfballoon_min_usable_mb != 0) + floor_pages = totalreserve_pages + + MB2PAGES(selfballoon_min_usable_mb); + /* piecewise linear function ending in ~3% slope */ + else if (useful_pages < MB2PAGES(16)) + floor_pages = max_pfn; /* not worth ballooning */ + else if (useful_pages < MB2PAGES(64)) + floor_pages = totalreserve_pages + MB2PAGES(16) + + ((useful_pages - MB2PAGES(16)) >> 1); + else if (useful_pages < MB2PAGES(512)) + floor_pages = totalreserve_pages + MB2PAGES(40) + + ((useful_pages - MB2PAGES(40)) >> 3); + else /* useful_pages >= MB2PAGES(512) */ + floor_pages = totalreserve_pages + MB2PAGES(99) + + ((useful_pages - MB2PAGES(99)) >> 5); + if (tgt_pages < floor_pages) + tgt_pages = floor_pages; + balloon_set_new_target(tgt_pages + + balloon_stats.current_pages - totalram_pages); + reset_timer = true; + } +#ifdef CONFIG_FRONTSWAP + if (frontswap_selfshrinking && frontswap_enabled) { + frontswap_selfshrink(); + reset_timer = true; + } +#endif + if (reset_timer) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); +} + +#ifdef CONFIG_SYSFS + +#include <linux/capability.h> + +#define SELFBALLOON_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } + +SELFBALLOON_SHOW(selfballooning, "%d\n", xen_selfballooning_enabled); + +static ssize_t store_selfballooning(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + bool was_enabled = xen_selfballooning_enabled; + unsigned long tmp; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) + return -EINVAL; + + xen_selfballooning_enabled = !!tmp; + if (!was_enabled && xen_selfballooning_enabled) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); + + return count; +} + +static DEVICE_ATTR(selfballooning, S_IRUGO | S_IWUSR, + show_selfballooning, store_selfballooning); + +SELFBALLOON_SHOW(selfballoon_interval, "%d\n", selfballoon_interval); + +static ssize_t store_selfballoon_interval(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_interval = val; + return count; +} + +static DEVICE_ATTR(selfballoon_interval, S_IRUGO | S_IWUSR, + show_selfballoon_interval, store_selfballoon_interval); + +SELFBALLOON_SHOW(selfballoon_downhys, "%d\n", selfballoon_downhysteresis); + +static ssize_t store_selfballoon_downhys(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_downhysteresis = val; + return count; +} + +static DEVICE_ATTR(selfballoon_downhysteresis, S_IRUGO | S_IWUSR, + show_selfballoon_downhys, store_selfballoon_downhys); + + +SELFBALLOON_SHOW(selfballoon_uphys, "%d\n", selfballoon_uphysteresis); + +static ssize_t store_selfballoon_uphys(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return 
-EINVAL; + selfballoon_uphysteresis = val; + return count; +} + +static DEVICE_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, + show_selfballoon_uphys, store_selfballoon_uphys); + +SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", + selfballoon_min_usable_mb); + +static ssize_t store_selfballoon_min_usable_mb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_min_usable_mb = val; + return count; +} + +static DEVICE_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, + show_selfballoon_min_usable_mb, + store_selfballoon_min_usable_mb); + +SELFBALLOON_SHOW(selfballoon_reserved_mb, "%d\n", + selfballoon_reserved_mb); + +static ssize_t store_selfballoon_reserved_mb(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + selfballoon_reserved_mb = val; + return count; +} + +static DEVICE_ATTR(selfballoon_reserved_mb, S_IRUGO | S_IWUSR, + show_selfballoon_reserved_mb, + store_selfballoon_reserved_mb); + + +#ifdef CONFIG_FRONTSWAP +SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); + +static ssize_t store_frontswap_selfshrinking(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + bool was_enabled = frontswap_selfshrinking; + unsigned long tmp; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &tmp); + if (err) + return err; + if ((tmp != 0) && (tmp != 1)) + return -EINVAL; + frontswap_selfshrinking = !!tmp; + if (!was_enabled && !xen_selfballooning_enabled && + frontswap_selfshrinking) + schedule_delayed_work(&selfballoon_worker, + selfballoon_interval * HZ); + + return count; +} + +static DEVICE_ATTR(frontswap_selfshrinking, S_IRUGO | S_IWUSR, + show_frontswap_selfshrinking, store_frontswap_selfshrinking); + +SELFBALLOON_SHOW(frontswap_inertia, "%d\n", frontswap_inertia); + +static ssize_t store_frontswap_inertia(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + frontswap_inertia = val; + frontswap_inertia_counter = val; + return count; +} + +static DEVICE_ATTR(frontswap_inertia, S_IRUGO | S_IWUSR, + show_frontswap_inertia, store_frontswap_inertia); + +SELFBALLOON_SHOW(frontswap_hysteresis, "%d\n", frontswap_hysteresis); + +static ssize_t store_frontswap_hysteresis(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = kstrtoul(buf, 10, &val); + if (err) + return err; + if (val == 0) + return -EINVAL; + frontswap_hysteresis = val; + return count; +} + +static DEVICE_ATTR(frontswap_hysteresis, S_IRUGO | S_IWUSR, + show_frontswap_hysteresis, store_frontswap_hysteresis); + +#endif /* CONFIG_FRONTSWAP */ + +static struct attribute *selfballoon_attrs[] = { + &dev_attr_selfballooning.attr, + &dev_attr_selfballoon_interval.attr, + &dev_attr_selfballoon_downhysteresis.attr, + &dev_attr_selfballoon_uphysteresis.attr, + 
&dev_attr_selfballoon_min_usable_mb.attr, + &dev_attr_selfballoon_reserved_mb.attr, +#ifdef CONFIG_FRONTSWAP + &dev_attr_frontswap_selfshrinking.attr, + &dev_attr_frontswap_hysteresis.attr, + &dev_attr_frontswap_inertia.attr, +#endif + NULL +}; + +static const struct attribute_group selfballoon_group = { + .name = "selfballoon", + .attrs = selfballoon_attrs +}; +#endif + +int register_xen_selfballooning(struct device *dev) +{ + int error = -1; + +#ifdef CONFIG_SYSFS + error = sysfs_create_group(&dev->kobj, &selfballoon_group); +#endif + return error; +} +EXPORT_SYMBOL(register_xen_selfballooning); + +int xen_selfballoon_init(bool use_selfballooning, bool use_frontswap_selfshrink) +{ + bool enable = false; + unsigned long reserve_pages; + + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain()) { + pr_info("Xen selfballooning driver disabled for domain0\n"); + return -ENODEV; + } + + xen_selfballooning_enabled = tmem_enabled && use_selfballooning; + if (xen_selfballooning_enabled) { + pr_info("Initializing Xen selfballooning driver\n"); + enable = true; + } +#ifdef CONFIG_FRONTSWAP + frontswap_selfshrinking = tmem_enabled && use_frontswap_selfshrink; + if (frontswap_selfshrinking) { + pr_info("Initializing frontswap selfshrinking driver\n"); + enable = true; + } +#endif + if (!enable) + return -ENODEV; + + /* + * Give selfballoon_reserved_mb a default value (10% of total ram pages) + * to make selfballoon not so aggressive. + * + * There are mainly two reasons: + * 1) The original goal_pages didn't consider some pages used by kernel + * space, like slab pages and memory used by device drivers. + * + * 2) The balloon driver may not give back memory to guest OS fast + * enough when the workload suddenly acquires a lot of physical memory. + * + * In both cases, the guest OS will suffer from memory pressure and + * OOM killer may be triggered. + * By reserving an extra 10% of total ram pages, we can keep the system + * much more reliable and make it respond faster in some cases. + */ + if (!selfballoon_reserved_mb) { + reserve_pages = totalram_pages / 10; + selfballoon_reserved_mb = PAGES2MB(reserve_pages); + } + schedule_delayed_work(&selfballoon_worker, selfballoon_interval * HZ); + + return 0; +} +EXPORT_SYMBOL(xen_selfballoon_init); diff --git a/drivers/xen/xen-stub.c b/drivers/xen/xen-stub.c new file mode 100644 index 000000000..bbef194c5 --- /dev/null +++ b/drivers/xen/xen-stub.c @@ -0,0 +1,100 @@ +/* + * xen-stub.c - stub drivers to reserve space for Xen + * + * Copyright (C) 2012 Intel Corporation + * Author: Liu Jinsong <jinsong.liu@intel.com> + * Author: Jiang Yunhong <yunhong.jiang@intel.com> + * + * Copyright (C) 2012 Oracle Inc + * Author: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details.
+ */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/export.h> +#include <linux/types.h> +#include <linux/acpi.h> +#include <xen/acpi.h> + +#ifdef CONFIG_ACPI + +/*-------------------------------------------- + stub driver for Xen memory hotplug +--------------------------------------------*/ + +static const struct acpi_device_id memory_device_ids[] = { + {ACPI_MEMORY_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_memory_device_driver = { + /* same name as native memory driver to block native loaded */ + .name = "acpi_memhotplug", + .class = ACPI_MEMORY_DEVICE_CLASS, + .ids = memory_device_ids, +}; + +int xen_stub_memory_device_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_init); +subsys_initcall(xen_stub_memory_device_init); + +void xen_stub_memory_device_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_memory_device_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_memory_device_exit); + + +/*-------------------------------------------- + stub driver for Xen cpu hotplug +--------------------------------------------*/ + +static const struct acpi_device_id processor_device_ids[] = { + {ACPI_PROCESSOR_OBJECT_HID, 0}, + {ACPI_PROCESSOR_DEVICE_HID, 0}, + {"", 0}, +}; + +static struct acpi_driver xen_stub_processor_driver = { + /* same name as native processor driver to block native loaded */ + .name = "processor", + .class = ACPI_PROCESSOR_CLASS, + .ids = processor_device_ids, +}; + +int xen_stub_processor_init(void) +{ + if (!xen_initial_domain()) + return -ENODEV; + + /* just reserve space for Xen, block native driver loaded */ + return acpi_bus_register_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_init); +subsys_initcall(xen_stub_processor_init); + +void xen_stub_processor_exit(void) +{ + acpi_bus_unregister_driver(&xen_stub_processor_driver); +} +EXPORT_SYMBOL_GPL(xen_stub_processor_exit); + +#endif diff --git a/drivers/xen/xenbus/Makefile b/drivers/xen/xenbus/Makefile new file mode 100644 index 000000000..31e2e9050 --- /dev/null +++ b/drivers/xen/xenbus/Makefile @@ -0,0 +1,14 @@ +obj-y += xenbus.o +obj-y += xenbus_dev_frontend.o + +xenbus-objs = +xenbus-objs += xenbus_client.o +xenbus-objs += xenbus_comms.o +xenbus-objs += xenbus_xs.o +xenbus-objs += xenbus_probe.o + +xenbus-be-objs-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o +xenbus-objs += $(xenbus-be-objs-y) + +obj-$(CONFIG_XEN_BACKEND) += xenbus_dev_backend.o +obj-$(CONFIG_XEN_XENBUS_FRONTEND) += xenbus_probe_frontend.o diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c new file mode 100644 index 000000000..96b2011d2 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_client.c @@ -0,0 +1,906 @@ +/****************************************************************************** + * Client-facing interface for the Xenbus driver. In other words, the + * interface between the Xenbus and the device-specific code, be it the + * frontend or the backend of that driver. 
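+ * It provides helpers for registering watches, switching device state,
+ * reporting errors into the store, granting and mapping shared rings, and
+ * allocating event channels.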
+ * + * Copyright (C) 2005 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/export.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/event_channel.h> +#include <xen/balloon.h> +#include <xen/events.h> +#include <xen/grant_table.h> +#include <xen/xenbus.h> +#include <xen/xen.h> +#include <xen/features.h> + +#include "xenbus_probe.h" + +struct xenbus_map_node { + struct list_head next; + union { + struct { + struct vm_struct *area; + } pv; + struct { + struct page *pages[XENBUS_MAX_RING_PAGES]; + void *addr; + } hvm; + }; + grant_handle_t handles[XENBUS_MAX_RING_PAGES]; + unsigned int nr_handles; +}; + +static DEFINE_SPINLOCK(xenbus_valloc_lock); +static LIST_HEAD(xenbus_valloc_pages); + +struct xenbus_ring_ops { + int (*map)(struct xenbus_device *dev, + grant_ref_t *gnt_refs, unsigned int nr_grefs, + void **vaddr); + int (*unmap)(struct xenbus_device *dev, void *vaddr); +}; + +static const struct xenbus_ring_ops *ring_ops __read_mostly; + +const char *xenbus_strstate(enum xenbus_state state) +{ + static const char *const name[] = { + [ XenbusStateUnknown ] = "Unknown", + [ XenbusStateInitialising ] = "Initialising", + [ XenbusStateInitWait ] = "InitWait", + [ XenbusStateInitialised ] = "Initialised", + [ XenbusStateConnected ] = "Connected", + [ XenbusStateClosing ] = "Closing", + [ XenbusStateClosed ] = "Closed", + [XenbusStateReconfiguring] = "Reconfiguring", + [XenbusStateReconfigured] = "Reconfigured", + }; + return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID"; +} +EXPORT_SYMBOL_GPL(xenbus_strstate); + +/** + * xenbus_watch_path - register a watch + * @dev: xenbus device + * @path: path to watch + * @watch: watch to register + * @callback: callback to register + * + * Register a @watch on the given path, using the given xenbus_watch structure + * for storage, and the given @callback function as the callback. Return 0 on + * success, or -errno on error. 
On success, the given @path will be saved as + * @watch->node, and remains the caller's to free. On error, @watch->node will + * be NULL, the device will switch to %XenbusStateClosing, and the error will + * be saved in the store. + */ +int xenbus_watch_path(struct xenbus_device *dev, const char *path, + struct xenbus_watch *watch, + void (*callback)(struct xenbus_watch *, + const char **, unsigned int)) +{ + int err; + + watch->node = path; + watch->callback = callback; + + err = register_xenbus_watch(watch); + + if (err) { + watch->node = NULL; + watch->callback = NULL; + xenbus_dev_fatal(dev, err, "adding watch on %s", path); + } + + return err; +} +EXPORT_SYMBOL_GPL(xenbus_watch_path); + + +/** + * xenbus_watch_pathfmt - register a watch on a sprintf-formatted path + * @dev: xenbus device + * @watch: watch to register + * @callback: callback to register + * @pathfmt: format of path to watch + * + * Register a watch on the given @path, using the given xenbus_watch + * structure for storage, and the given @callback function as the callback. + * Return 0 on success, or -errno on error. On success, the watched path + * (@path/@path2) will be saved as @watch->node, and becomes the caller's to + * kfree(). On error, watch->node will be NULL, so the caller has nothing to + * free, the device will switch to %XenbusStateClosing, and the error will be + * saved in the store. + */ +int xenbus_watch_pathfmt(struct xenbus_device *dev, + struct xenbus_watch *watch, + void (*callback)(struct xenbus_watch *, + const char **, unsigned int), + const char *pathfmt, ...) +{ + int err; + va_list ap; + char *path; + + va_start(ap, pathfmt); + path = kvasprintf(GFP_NOIO | __GFP_HIGH, pathfmt, ap); + va_end(ap); + + if (!path) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); + return -ENOMEM; + } + err = xenbus_watch_path(dev, path, watch, callback); + + if (err) + kfree(path); + return err; +} +EXPORT_SYMBOL_GPL(xenbus_watch_pathfmt); + +static void xenbus_switch_fatal(struct xenbus_device *, int, int, + const char *, ...); + +static int +__xenbus_switch_state(struct xenbus_device *dev, + enum xenbus_state state, int depth) +{ + /* We check whether the state is currently set to the given value, and + if not, then the state is set. We don't want to unconditionally + write the given state, because we don't want to fire watches + unnecessarily. Furthermore, if the node has gone, we don't write + to it, as the device will be tearing down, and we don't want to + resurrect that directory. + + Note that, because of this cached value of our state, this + function will not take a caller's Xenstore transaction + (something it was trying to in the past) because dev->state + would not get reset if the transaction was aborted. 
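+ The transaction used below is therefore purely local to this function
+ and is simply restarted when it ends with -EAGAIN.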
+ */ + + struct xenbus_transaction xbt; + int current_state; + int err, abort; + + if (state == dev->state) + return 0; + +again: + abort = 1; + + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_switch_fatal(dev, depth, err, "starting transaction"); + return 0; + } + + err = xenbus_scanf(xbt, dev->nodename, "state", "%d", ¤t_state); + if (err != 1) + goto abort; + + err = xenbus_printf(xbt, dev->nodename, "state", "%d", state); + if (err) { + xenbus_switch_fatal(dev, depth, err, "writing new state"); + goto abort; + } + + abort = 0; +abort: + err = xenbus_transaction_end(xbt, abort); + if (err) { + if (err == -EAGAIN && !abort) + goto again; + xenbus_switch_fatal(dev, depth, err, "ending transaction"); + } else + dev->state = state; + + return 0; +} + +/** + * xenbus_switch_state + * @dev: xenbus device + * @state: new state + * + * Advertise in the store a change of the given driver to the given new_state. + * Return 0 on success, or -errno on error. On error, the device will switch + * to XenbusStateClosing, and the error will be saved in the store. + */ +int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) +{ + return __xenbus_switch_state(dev, state, 0); +} + +EXPORT_SYMBOL_GPL(xenbus_switch_state); + +int xenbus_frontend_closed(struct xenbus_device *dev) +{ + xenbus_switch_state(dev, XenbusStateClosed); + complete(&dev->down); + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_frontend_closed); + +/** + * Return the path to the error node for the given device, or NULL on failure. + * If the value returned is non-NULL, then it is the caller's to kfree. + */ +static char *error_path(struct xenbus_device *dev) +{ + return kasprintf(GFP_KERNEL, "error/%s", dev->nodename); +} + + +static void xenbus_va_dev_error(struct xenbus_device *dev, int err, + const char *fmt, va_list ap) +{ + unsigned int len; + char *printf_buffer = NULL; + char *path_buffer = NULL; + +#define PRINTF_BUFFER_SIZE 4096 + printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); + if (printf_buffer == NULL) + goto fail; + + len = sprintf(printf_buffer, "%i ", -err); + vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap); + + dev_err(&dev->dev, "%s\n", printf_buffer); + + path_buffer = error_path(dev); + + if (path_buffer == NULL) { + dev_err(&dev->dev, "failed to write error node for %s (%s)\n", + dev->nodename, printf_buffer); + goto fail; + } + + if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) { + dev_err(&dev->dev, "failed to write error node for %s (%s)\n", + dev->nodename, printf_buffer); + goto fail; + } + +fail: + kfree(printf_buffer); + kfree(path_buffer); +} + + +/** + * xenbus_dev_error + * @dev: xenbus device + * @err: error to report + * @fmt: error message format + * + * Report the given negative errno into the store, along with the given + * formatted message. + */ +void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + xenbus_va_dev_error(dev, err, fmt, ap); + va_end(ap); +} +EXPORT_SYMBOL_GPL(xenbus_dev_error); + +/** + * xenbus_dev_fatal + * @dev: xenbus device + * @err: error to report + * @fmt: error message format + * + * Equivalent to xenbus_dev_error(dev, err, fmt, args), followed by + * xenbus_switch_state(dev, XenbusStateClosing) to schedule an orderly + * closedown of this driver and its peer. + */ + +void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + xenbus_va_dev_error(dev, err, fmt, ap); + va_end(ap); + + xenbus_switch_state(dev, XenbusStateClosing); +} +EXPORT_SYMBOL_GPL(xenbus_dev_fatal); + +/** + * Equivalent to xenbus_dev_fatal(dev, err, fmt, args), but helps + * avoiding recursion within xenbus_switch_state. + */ +static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + xenbus_va_dev_error(dev, err, fmt, ap); + va_end(ap); + + if (!depth) + __xenbus_switch_state(dev, XenbusStateClosing, 1); +} + +/** + * xenbus_grant_ring + * @dev: xenbus device + * @vaddr: starting virtual address of the ring + * @nr_pages: number of pages to be granted + * @grefs: grant reference array to be filled in + * + * Grant access to the given @vaddr to the peer of the given device. + * Then fill in @grefs with grant references. Return 0 on success, or + * -errno on error. On error, the device will switch to + * XenbusStateClosing, and the error will be saved in the store. + */ +int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr, + unsigned int nr_pages, grant_ref_t *grefs) +{ + int err; + int i, j; + + for (i = 0; i < nr_pages; i++) { + unsigned long addr = (unsigned long)vaddr + + (PAGE_SIZE * i); + err = gnttab_grant_foreign_access(dev->otherend_id, + virt_to_mfn(addr), 0); + if (err < 0) { + xenbus_dev_fatal(dev, err, + "granting access to ring page"); + goto fail; + } + grefs[i] = err; + } + + return 0; + +fail: + for (j = 0; j < i; j++) + gnttab_end_foreign_access_ref(grefs[j], 0); + return err; +} +EXPORT_SYMBOL_GPL(xenbus_grant_ring); + + +/** + * Allocate an event channel for the given xenbus_device, assigning the newly + * created local port to *port. Return 0 on success, or -errno on error. On + * error, the device will switch to XenbusStateClosing, and the error will be + * saved in the store. + */ +int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port) +{ + struct evtchn_alloc_unbound alloc_unbound; + int err; + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = dev->otherend_id; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (err) + xenbus_dev_fatal(dev, err, "allocating event channel"); + else + *port = alloc_unbound.port; + + return err; +} +EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn); + + +/** + * Free an existing event channel. Returns 0 on success or -errno on error. + */ +int xenbus_free_evtchn(struct xenbus_device *dev, int port) +{ + struct evtchn_close close; + int err; + + close.port = port; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); + if (err) + xenbus_dev_error(dev, err, "freeing event channel %d", port); + + return err; +} +EXPORT_SYMBOL_GPL(xenbus_free_evtchn); + + +/** + * xenbus_map_ring_valloc + * @dev: xenbus device + * @gnt_refs: grant reference array + * @nr_grefs: number of grant references + * @vaddr: pointer to address to be filled out by mapping + * + * Map @nr_grefs pages of memory into this domain from another + * domain's grant table. xenbus_map_ring_valloc allocates @nr_grefs + * pages of virtual address space, maps the pages to that address, and + * sets *vaddr to that address. Returns 0 on success, and GNTST_* + * (see xen/include/interface/grant_table.h) or -ENOMEM / -EINVAL on + * error. If an error is returned, device will switch to + * XenbusStateClosing and the error message will be saved in XenStore. 
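+ *
+ * A minimal, purely illustrative caller for a single-page ring whose grant
+ * reference has already been read from the other end might look like:
+ *
+ *	grant_ref_t gref;	(obtained e.g. via xenbus_scanf())
+ *	void *ring;
+ *	int err = xenbus_map_ring_valloc(dev, &gref, 1, &ring);
+ *	if (err)
+ *		return err;
+ *
+ * and the mapping is later released with xenbus_unmap_ring_vfree(dev, ring).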
+ */ +int xenbus_map_ring_valloc(struct xenbus_device *dev, grant_ref_t *gnt_refs, + unsigned int nr_grefs, void **vaddr) +{ + return ring_ops->map(dev, gnt_refs, nr_grefs, vaddr); +} +EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); + +/* N.B. sizeof(phys_addr_t) doesn't always equal to sizeof(unsigned + * long), e.g. 32-on-64. Caller is responsible for preparing the + * right array to feed into this function */ +static int __xenbus_map_ring(struct xenbus_device *dev, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + grant_handle_t *handles, + phys_addr_t *addrs, + unsigned int flags, + bool *leaked) +{ + struct gnttab_map_grant_ref map[XENBUS_MAX_RING_PAGES]; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + int i, j; + int err = GNTST_okay; + + if (nr_grefs > XENBUS_MAX_RING_PAGES) + return -EINVAL; + + for (i = 0; i < nr_grefs; i++) { + memset(&map[i], 0, sizeof(map[i])); + gnttab_set_map_op(&map[i], addrs[i], flags, gnt_refs[i], + dev->otherend_id); + handles[i] = INVALID_GRANT_HANDLE; + } + + gnttab_batch_map(map, i); + + for (i = 0; i < nr_grefs; i++) { + if (map[i].status != GNTST_okay) { + err = map[i].status; + xenbus_dev_fatal(dev, map[i].status, + "mapping in shared page %d from domain %d", + gnt_refs[i], dev->otherend_id); + goto fail; + } else + handles[i] = map[i].handle; + } + + return GNTST_okay; + + fail: + for (i = j = 0; i < nr_grefs; i++) { + if (handles[i] != INVALID_GRANT_HANDLE) { + memset(&unmap[j], 0, sizeof(unmap[j])); + gnttab_set_unmap_op(&unmap[j], (phys_addr_t)addrs[i], + GNTMAP_host_map, handles[i]); + j++; + } + } + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, j)) + BUG(); + + *leaked = false; + for (i = 0; i < j; i++) { + if (unmap[i].status != GNTST_okay) { + *leaked = true; + break; + } + } + + return err; +} + +static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev, + grant_ref_t *gnt_refs, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node; + struct vm_struct *area; + pte_t *ptes[XENBUS_MAX_RING_PAGES]; + phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; + int err = GNTST_okay; + int i; + bool leaked; + + *vaddr = NULL; + + if (nr_grefs > XENBUS_MAX_RING_PAGES) + return -EINVAL; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + area = alloc_vm_area(PAGE_SIZE * nr_grefs, ptes); + if (!area) { + kfree(node); + return -ENOMEM; + } + + for (i = 0; i < nr_grefs; i++) + phys_addrs[i] = arbitrary_virt_to_machine(ptes[i]).maddr; + + err = __xenbus_map_ring(dev, gnt_refs, nr_grefs, node->handles, + phys_addrs, + GNTMAP_host_map | GNTMAP_contains_pte, + &leaked); + if (err) + goto failed; + + node->nr_handles = nr_grefs; + node->pv.area = area; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); + + *vaddr = area->addr; + return 0; + +failed: + if (!leaked) + free_vm_area(area); + else + pr_alert("leaking VM area %p size %u page(s)", area, nr_grefs); + + kfree(node); + return err; +} + +static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev, + grant_ref_t *gnt_ref, + unsigned int nr_grefs, + void **vaddr) +{ + struct xenbus_map_node *node; + int i; + int err; + void *addr; + bool leaked = false; + /* Why do we need two arrays? 
See comment of __xenbus_map_ring */ + phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; + unsigned long addrs[XENBUS_MAX_RING_PAGES]; + + if (nr_grefs > XENBUS_MAX_RING_PAGES) + return -EINVAL; + + *vaddr = NULL; + + node = kzalloc(sizeof(*node), GFP_KERNEL); + if (!node) + return -ENOMEM; + + err = alloc_xenballooned_pages(nr_grefs, node->hvm.pages, + false /* lowmem */); + if (err) + goto out_err; + + for (i = 0; i < nr_grefs; i++) { + unsigned long pfn = page_to_pfn(node->hvm.pages[i]); + phys_addrs[i] = (unsigned long)pfn_to_kaddr(pfn); + addrs[i] = (unsigned long)pfn_to_kaddr(pfn); + } + + err = __xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handles, + phys_addrs, GNTMAP_host_map, &leaked); + node->nr_handles = nr_grefs; + + if (err) + goto out_free_ballooned_pages; + + addr = vmap(node->hvm.pages, nr_grefs, VM_MAP | VM_IOREMAP, + PAGE_KERNEL); + if (!addr) { + err = -ENOMEM; + goto out_xenbus_unmap_ring; + } + + node->hvm.addr = addr; + + spin_lock(&xenbus_valloc_lock); + list_add(&node->next, &xenbus_valloc_pages); + spin_unlock(&xenbus_valloc_lock); + + *vaddr = addr; + return 0; + + out_xenbus_unmap_ring: + if (!leaked) + xenbus_unmap_ring(dev, node->handles, node->nr_handles, + addrs); + else + pr_alert("leaking %p size %u page(s)", + addr, nr_grefs); + out_free_ballooned_pages: + if (!leaked) + free_xenballooned_pages(nr_grefs, node->hvm.pages); + out_err: + kfree(node); + return err; +} + + +/** + * xenbus_map_ring + * @dev: xenbus device + * @gnt_refs: grant reference array + * @nr_grefs: number of grant reference + * @handles: pointer to grant handle to be filled + * @vaddrs: addresses to be mapped to + * @leaked: fail to clean up a failed map, caller should not free vaddr + * + * Map pages of memory into this domain from another domain's grant table. + * xenbus_map_ring does not allocate the virtual address space (you must do + * this yourself!). It only maps in the pages to the specified address. + * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h) + * or -ENOMEM / -EINVAL on error. If an error is returned, device will switch to + * XenbusStateClosing and the first error message will be saved in XenStore. + * Further more if we fail to map the ring, caller should check @leaked. + * If @leaked is not zero it means xenbus_map_ring fails to clean up, caller + * should not free the address space of @vaddr. + */ +int xenbus_map_ring(struct xenbus_device *dev, grant_ref_t *gnt_refs, + unsigned int nr_grefs, grant_handle_t *handles, + unsigned long *vaddrs, bool *leaked) +{ + phys_addr_t phys_addrs[XENBUS_MAX_RING_PAGES]; + int i; + + if (nr_grefs > XENBUS_MAX_RING_PAGES) + return -EINVAL; + + for (i = 0; i < nr_grefs; i++) + phys_addrs[i] = (unsigned long)vaddrs[i]; + + return __xenbus_map_ring(dev, gnt_refs, nr_grefs, handles, + phys_addrs, GNTMAP_host_map, leaked); +} +EXPORT_SYMBOL_GPL(xenbus_map_ring); + + +/** + * xenbus_unmap_ring_vfree + * @dev: xenbus device + * @vaddr: addr to unmap + * + * Based on Rusty Russell's skeleton driver's unmap_page. + * Unmap a page of memory in this domain that was imported from another domain. + * Use xenbus_unmap_ring_vfree if you mapped in your memory with + * xenbus_map_ring_valloc (it will free the virtual address space). + * Returns 0 on success and returns GNTST_* on error + * (see xen/include/interface/grant_table.h). 
+ */ +int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) +{ + return ring_ops->unmap(dev, vaddr); +} +EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); + +static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr) +{ + struct xenbus_map_node *node; + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + unsigned int level; + int i; + bool leaked = false; + int err; + + spin_lock(&xenbus_valloc_lock); + list_for_each_entry(node, &xenbus_valloc_pages, next) { + if (node->pv.area->addr == vaddr) { + list_del(&node->next); + goto found; + } + } + node = NULL; + found: + spin_unlock(&xenbus_valloc_lock); + + if (!node) { + xenbus_dev_error(dev, -ENOENT, + "can't find mapped virtual address %p", vaddr); + return GNTST_bad_virt_addr; + } + + for (i = 0; i < node->nr_handles; i++) { + unsigned long addr; + + memset(&unmap[i], 0, sizeof(unmap[i])); + addr = (unsigned long)vaddr + (PAGE_SIZE * i); + unmap[i].host_addr = arbitrary_virt_to_machine( + lookup_address(addr, &level)).maddr; + unmap[i].dev_bus_addr = 0; + unmap[i].handle = node->handles[i]; + } + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)) + BUG(); + + err = GNTST_okay; + leaked = false; + for (i = 0; i < node->nr_handles; i++) { + if (unmap[i].status != GNTST_okay) { + leaked = true; + xenbus_dev_error(dev, unmap[i].status, + "unmapping page at handle %d error %d", + node->handles[i], unmap[i].status); + err = unmap[i].status; + break; + } + } + + if (!leaked) + free_vm_area(node->pv.area); + else + pr_alert("leaking VM area %p size %u page(s)", + node->pv.area, node->nr_handles); + + kfree(node); + return err; +} + +static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr) +{ + int rv; + struct xenbus_map_node *node; + void *addr; + unsigned long addrs[XENBUS_MAX_RING_PAGES]; + int i; + + spin_lock(&xenbus_valloc_lock); + list_for_each_entry(node, &xenbus_valloc_pages, next) { + addr = node->hvm.addr; + if (addr == vaddr) { + list_del(&node->next); + goto found; + } + } + node = addr = NULL; + found: + spin_unlock(&xenbus_valloc_lock); + + if (!node) { + xenbus_dev_error(dev, -ENOENT, + "can't find mapped virtual address %p", vaddr); + return GNTST_bad_virt_addr; + } + + for (i = 0; i < node->nr_handles; i++) + addrs[i] = (unsigned long)pfn_to_kaddr(page_to_pfn(node->hvm.pages[i])); + + rv = xenbus_unmap_ring(dev, node->handles, node->nr_handles, + addrs); + if (!rv) + vunmap(vaddr); + else + WARN(1, "Leaking %p, size %u page(s)\n", vaddr, + node->nr_handles); + + kfree(node); + return rv; +} + +/** + * xenbus_unmap_ring + * @dev: xenbus device + * @handles: grant handle array + * @nr_handles: number of handles in the array + * @vaddrs: addresses to unmap + * + * Unmap memory in this domain that was imported from another domain. + * Returns 0 on success and returns GNTST_* on error + * (see xen/include/interface/grant_table.h). 
+ */ +int xenbus_unmap_ring(struct xenbus_device *dev, + grant_handle_t *handles, unsigned int nr_handles, + unsigned long *vaddrs) +{ + struct gnttab_unmap_grant_ref unmap[XENBUS_MAX_RING_PAGES]; + int i; + int err; + + if (nr_handles > XENBUS_MAX_RING_PAGES) + return -EINVAL; + + for (i = 0; i < nr_handles; i++) + gnttab_set_unmap_op(&unmap[i], vaddrs[i], + GNTMAP_host_map, handles[i]); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, unmap, i)) + BUG(); + + err = GNTST_okay; + for (i = 0; i < nr_handles; i++) { + if (unmap[i].status != GNTST_okay) { + xenbus_dev_error(dev, unmap[i].status, + "unmapping page at handle %d error %d", + handles[i], unmap[i].status); + err = unmap[i].status; + break; + } + } + + return err; +} +EXPORT_SYMBOL_GPL(xenbus_unmap_ring); + + +/** + * xenbus_read_driver_state + * @path: path for driver + * + * Return the state of the driver rooted at the given store path, or + * XenbusStateUnknown if no state can be read. + */ +enum xenbus_state xenbus_read_driver_state(const char *path) +{ + enum xenbus_state result; + int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL); + if (err) + result = XenbusStateUnknown; + + return result; +} +EXPORT_SYMBOL_GPL(xenbus_read_driver_state); + +static const struct xenbus_ring_ops ring_ops_pv = { + .map = xenbus_map_ring_valloc_pv, + .unmap = xenbus_unmap_ring_vfree_pv, +}; + +static const struct xenbus_ring_ops ring_ops_hvm = { + .map = xenbus_map_ring_valloc_hvm, + .unmap = xenbus_unmap_ring_vfree_hvm, +}; + +void __init xenbus_ring_ops_init(void) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + ring_ops = &ring_ops_pv; + else + ring_ops = &ring_ops_hvm; +} diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c new file mode 100644 index 000000000..fdb0f339d --- /dev/null +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -0,0 +1,243 @@ +/****************************************************************************** + * xenbus_comms.c + * + * Low level code to talks to Xen Store: ringbuffer and event channel. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
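+ *
+ * The shared page referenced by xen_store_interface carries two byte rings
+ * of XENSTORE_RING_SIZE each (requests and responses) together with their
+ * producer/consumer indices; xb_write() below sleeps while the request ring
+ * is full (req_prod - req_cons == XENSTORE_RING_SIZE) and xb_read() sleeps
+ * until rsp_prod moves past rsp_cons.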
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/wait.h> +#include <linux/interrupt.h> +#include <linux/sched.h> +#include <linux/err.h> +#include <xen/xenbus.h> +#include <asm/xen/hypervisor.h> +#include <xen/events.h> +#include <xen/page.h> +#include "xenbus_comms.h" + +static int xenbus_irq; + +static DECLARE_WORK(probe_work, xenbus_probe); + +static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); + +static irqreturn_t wake_waiting(int irq, void *unused) +{ + if (unlikely(xenstored_ready == 0)) { + xenstored_ready = 1; + schedule_work(&probe_work); + } + + wake_up(&xb_waitq); + return IRQ_HANDLED; +} + +static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) +{ + return ((prod - cons) <= XENSTORE_RING_SIZE); +} + +static void *get_output_chunk(XENSTORE_RING_IDX cons, + XENSTORE_RING_IDX prod, + char *buf, uint32_t *len) +{ + *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); + if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) + *len = XENSTORE_RING_SIZE - (prod - cons); + return buf + MASK_XENSTORE_IDX(prod); +} + +static const void *get_input_chunk(XENSTORE_RING_IDX cons, + XENSTORE_RING_IDX prod, + const char *buf, uint32_t *len) +{ + *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); + if ((prod - cons) < *len) + *len = prod - cons; + return buf + MASK_XENSTORE_IDX(cons); +} + +/** + * xb_write - low level write + * @data: buffer to send + * @len: length of buffer + * + * Returns 0 on success, error otherwise. + */ +int xb_write(const void *data, unsigned len) +{ + struct xenstore_domain_interface *intf = xen_store_interface; + XENSTORE_RING_IDX cons, prod; + int rc; + + while (len != 0) { + void *dst; + unsigned int avail; + + rc = wait_event_interruptible( + xb_waitq, + (intf->req_prod - intf->req_cons) != + XENSTORE_RING_SIZE); + if (rc < 0) + return rc; + + /* Read indexes, then verify. */ + cons = intf->req_cons; + prod = intf->req_prod; + if (!check_indexes(cons, prod)) { + intf->req_cons = intf->req_prod = 0; + return -EIO; + } + + dst = get_output_chunk(cons, prod, intf->req, &avail); + if (avail == 0) + continue; + if (avail > len) + avail = len; + + /* Must write data /after/ reading the consumer index. */ + mb(); + + memcpy(dst, data, avail); + data += avail; + len -= avail; + + /* Other side must not see new producer until data is there. */ + wmb(); + intf->req_prod += avail; + + /* Implies mb(): other side will see the updated producer. */ + notify_remote_via_evtchn(xen_store_evtchn); + } + + return 0; +} + +int xb_data_to_read(void) +{ + struct xenstore_domain_interface *intf = xen_store_interface; + return (intf->rsp_cons != intf->rsp_prod); +} + +int xb_wait_for_data_to_read(void) +{ + return wait_event_interruptible(xb_waitq, xb_data_to_read()); +} + +int xb_read(void *data, unsigned len) +{ + struct xenstore_domain_interface *intf = xen_store_interface; + XENSTORE_RING_IDX cons, prod; + int rc; + + while (len != 0) { + unsigned int avail; + const char *src; + + rc = xb_wait_for_data_to_read(); + if (rc < 0) + return rc; + + /* Read indexes, then verify. */ + cons = intf->rsp_cons; + prod = intf->rsp_prod; + if (!check_indexes(cons, prod)) { + intf->rsp_cons = intf->rsp_prod = 0; + return -EIO; + } + + src = get_input_chunk(cons, prod, intf->rsp, &avail); + if (avail == 0) + continue; + if (avail > len) + avail = len; + + /* Must read data /after/ reading the producer index. 
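+ The rmb() below pairs with the write barrier the remote side issues
+ between filling the response ring and advancing rsp_prod, so the copy
+ can never see stale ring contents.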
*/ + rmb(); + + memcpy(data, src, avail); + data += avail; + len -= avail; + + /* Other side must not see free space until we've copied out */ + mb(); + intf->rsp_cons += avail; + + pr_debug("Finished read of %i bytes (%i to go)\n", avail, len); + + /* Implies mb(): other side will see the updated consumer. */ + notify_remote_via_evtchn(xen_store_evtchn); + } + + return 0; +} + +/** + * xb_init_comms - Set up interrupt handler off store event channel. + */ +int xb_init_comms(void) +{ + struct xenstore_domain_interface *intf = xen_store_interface; + + if (intf->req_prod != intf->req_cons) + pr_err("request ring is not quiescent (%08x:%08x)!\n", + intf->req_cons, intf->req_prod); + + if (intf->rsp_prod != intf->rsp_cons) { + pr_warn("response ring is not quiescent (%08x:%08x): fixing up\n", + intf->rsp_cons, intf->rsp_prod); + /* breaks kdump */ + if (!reset_devices) + intf->rsp_cons = intf->rsp_prod; + } + + if (xenbus_irq) { + /* Already have an irq; assume we're resuming */ + rebind_evtchn_irq(xen_store_evtchn, xenbus_irq); + } else { + int err; + err = bind_evtchn_to_irqhandler(xen_store_evtchn, wake_waiting, + 0, "xenbus", &xb_waitq); + if (err < 0) { + pr_err("request irq failed %i\n", err); + return err; + } + + xenbus_irq = err; + } + + return 0; +} + +void xb_deinit_comms(void) +{ + unbind_from_irqhandler(xenbus_irq, &xb_waitq); + xenbus_irq = 0; +} diff --git a/drivers/xen/xenbus/xenbus_comms.h b/drivers/xen/xenbus/xenbus_comms.h new file mode 100644 index 000000000..e74f9c1fb --- /dev/null +++ b/drivers/xen/xenbus/xenbus_comms.h @@ -0,0 +1,52 @@ +/* + * Private include for xenbus communications. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef _XENBUS_COMMS_H +#define _XENBUS_COMMS_H + +#include <linux/fs.h> + +int xs_init(void); +int xb_init_comms(void); +void xb_deinit_comms(void); + +/* Low level routines. 
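+ xb_write()/xb_read() move raw bytes over the shared xenstore ring; the
+ message framing built on top of them lives in xenbus_xs.c.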
*/ +int xb_write(const void *data, unsigned len); +int xb_read(void *data, unsigned len); +int xb_data_to_read(void); +int xb_wait_for_data_to_read(void); +int xs_input_avail(void); +extern struct xenstore_domain_interface *xen_store_interface; +extern int xen_store_evtchn; +extern enum xenstore_init xen_store_domain_type; + +extern const struct file_operations xen_xenbus_fops; + +#endif /* _XENBUS_COMMS_H */ diff --git a/drivers/xen/xenbus/xenbus_dev_backend.c b/drivers/xen/xenbus/xenbus_dev_backend.c new file mode 100644 index 000000000..b17707ee0 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_dev_backend.c @@ -0,0 +1,142 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/miscdevice.h> +#include <linux/module.h> +#include <linux/capability.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/xenbus.h> +#include <xen/xenbus_dev.h> +#include <xen/grant_table.h> +#include <xen/events.h> +#include <asm/xen/hypervisor.h> + +#include "xenbus_comms.h" + +MODULE_LICENSE("GPL"); + +static int xenbus_backend_open(struct inode *inode, struct file *filp) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return nonseekable_open(inode, filp); +} + +static long xenbus_alloc(domid_t domid) +{ + struct evtchn_alloc_unbound arg; + int err = -EEXIST; + + xs_suspend(); + + /* If xenstored_ready is nonzero, that means we have already talked to + * xenstore and set up watches. These watches will be restored by + * xs_resume, but that requires communication over the port established + * below that is not visible to anyone until the ioctl returns. + * + * This can be resolved by splitting the ioctl into two parts + * (postponing the resume until xenstored is active) but this is + * unnecessarily complex for the intended use where xenstored is only + * started once - so return -EEXIST if it's already running. 
+ */ + if (xenstored_ready) + goto out_err; + + gnttab_grant_foreign_access_ref(GNTTAB_RESERVED_XENSTORE, domid, + virt_to_mfn(xen_store_interface), 0 /* writable */); + + arg.dom = DOMID_SELF; + arg.remote_dom = domid; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &arg); + if (err) + goto out_err; + + if (xen_store_evtchn > 0) + xb_deinit_comms(); + + xen_store_evtchn = arg.port; + + xs_resume(); + + return arg.port; + + out_err: + xs_suspend_cancel(); + return err; +} + +static long xenbus_backend_ioctl(struct file *file, unsigned int cmd, + unsigned long data) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case IOCTL_XENBUS_BACKEND_EVTCHN: + if (xen_store_evtchn > 0) + return xen_store_evtchn; + return -ENODEV; + case IOCTL_XENBUS_BACKEND_SETUP: + return xenbus_alloc(data); + default: + return -ENOTTY; + } +} + +static int xenbus_backend_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) + return -EINVAL; + + if (remap_pfn_range(vma, vma->vm_start, + virt_to_pfn(xen_store_interface), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +static const struct file_operations xenbus_backend_fops = { + .open = xenbus_backend_open, + .mmap = xenbus_backend_mmap, + .unlocked_ioctl = xenbus_backend_ioctl, +}; + +static struct miscdevice xenbus_backend_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/xenbus_backend", + .fops = &xenbus_backend_fops, +}; + +static int __init xenbus_backend_init(void) +{ + int err; + + if (!xen_initial_domain()) + return -ENODEV; + + err = misc_register(&xenbus_backend_dev); + if (err) + pr_err("Could not register xenbus backend device\n"); + return err; +} + +static void __exit xenbus_backend_exit(void) +{ + misc_deregister(&xenbus_backend_dev); +} + +module_init(xenbus_backend_init); +module_exit(xenbus_backend_exit); diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c new file mode 100644 index 000000000..9433e4651 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -0,0 +1,634 @@ +/* + * Driver giving user-space access to the kernel's xenbus connection + * to xenstore. + * + * Copyright (c) 2005, Christian Limpach + * Copyright (c) 2005, Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * Changes: + * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem + * and /proc/xen compatibility mount point. + * Turned xenfs into a loadable module. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/uio.h> +#include <linux/notifier.h> +#include <linux/wait.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/mount.h> +#include <linux/pagemap.h> +#include <linux/uaccess.h> +#include <linux/init.h> +#include <linux/namei.h> +#include <linux/string.h> +#include <linux/slab.h> +#include <linux/miscdevice.h> +#include <linux/module.h> + +#include "xenbus_comms.h" + +#include <xen/xenbus.h> +#include <xen/xen.h> +#include <asm/xen/hypervisor.h> + +MODULE_LICENSE("GPL"); + +/* + * An element of a list of outstanding transactions, for which we're + * still waiting a reply. + */ +struct xenbus_transaction_holder { + struct list_head list; + struct xenbus_transaction handle; +}; + +/* + * A buffer of data on the queue. + */ +struct read_buffer { + struct list_head list; + unsigned int cons; + unsigned int len; + char msg[]; +}; + +struct xenbus_file_priv { + /* + * msgbuffer_mutex is held while partial requests are built up + * and complete requests are acted on. It therefore protects + * the "transactions" and "watches" lists, and the partial + * request length and buffer. + * + * reply_mutex protects the reply being built up to return to + * usermode. It nests inside msgbuffer_mutex but may be held + * alone during a watch callback. + */ + struct mutex msgbuffer_mutex; + + /* In-progress transactions */ + struct list_head transactions; + + /* Active watches. */ + struct list_head watches; + + /* Partial request. */ + unsigned int len; + union { + struct xsd_sockmsg msg; + char buffer[XENSTORE_PAYLOAD_MAX]; + } u; + + /* Response queue. */ + struct mutex reply_mutex; + struct list_head read_buffers; + wait_queue_head_t read_waitq; + +}; + +/* Read out any raw xenbus messages queued up. 
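+ * Replies and watch events are queued on u->read_buffers (one message
+ * may span several read_buffers); we copy out at most len bytes and,
+ * unless O_NONBLOCK is set, sleep on read_waitq while the queue is empty.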
*/ +static ssize_t xenbus_file_read(struct file *filp, + char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct xenbus_file_priv *u = filp->private_data; + struct read_buffer *rb; + unsigned i; + int ret; + + mutex_lock(&u->reply_mutex); +again: + while (list_empty(&u->read_buffers)) { + mutex_unlock(&u->reply_mutex); + if (filp->f_flags & O_NONBLOCK) + return -EAGAIN; + + ret = wait_event_interruptible(u->read_waitq, + !list_empty(&u->read_buffers)); + if (ret) + return ret; + mutex_lock(&u->reply_mutex); + } + + rb = list_entry(u->read_buffers.next, struct read_buffer, list); + i = 0; + while (i < len) { + unsigned sz = min((unsigned)len - i, rb->len - rb->cons); + + ret = copy_to_user(ubuf + i, &rb->msg[rb->cons], sz); + + i += sz - ret; + rb->cons += sz - ret; + + if (ret != 0) { + if (i == 0) + i = -EFAULT; + goto out; + } + + /* Clear out buffer if it has been consumed */ + if (rb->cons == rb->len) { + list_del(&rb->list); + kfree(rb); + if (list_empty(&u->read_buffers)) + break; + rb = list_entry(u->read_buffers.next, + struct read_buffer, list); + } + } + if (i == 0) + goto again; + +out: + mutex_unlock(&u->reply_mutex); + return i; +} + +/* + * Add a buffer to the queue. Caller must hold the appropriate lock + * if the queue is not local. (Commonly the caller will build up + * multiple queued buffers on a temporary local list, and then add it + * to the appropriate list under lock once all the buffers have een + * successfully allocated.) + */ +static int queue_reply(struct list_head *queue, const void *data, size_t len) +{ + struct read_buffer *rb; + + if (len == 0) + return 0; + + rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL); + if (rb == NULL) + return -ENOMEM; + + rb->cons = 0; + rb->len = len; + + memcpy(rb->msg, data, len); + + list_add_tail(&rb->list, queue); + return 0; +} + +/* + * Free all the read_buffer s on a list. + * Caller must have sole reference to list. 
+ */ +static void queue_cleanup(struct list_head *list) +{ + struct read_buffer *rb; + + while (!list_empty(list)) { + rb = list_entry(list->next, struct read_buffer, list); + list_del(list->next); + kfree(rb); + } +} + +struct watch_adapter { + struct list_head list; + struct xenbus_watch watch; + struct xenbus_file_priv *dev_data; + char *token; +}; + +static void free_watch_adapter(struct watch_adapter *watch) +{ + kfree(watch->watch.node); + kfree(watch->token); + kfree(watch); +} + +static struct watch_adapter *alloc_watch_adapter(const char *path, + const char *token) +{ + struct watch_adapter *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (watch == NULL) + goto out_fail; + + watch->watch.node = kstrdup(path, GFP_KERNEL); + if (watch->watch.node == NULL) + goto out_free; + + watch->token = kstrdup(token, GFP_KERNEL); + if (watch->token == NULL) + goto out_free; + + return watch; + +out_free: + free_watch_adapter(watch); + +out_fail: + return NULL; +} + +static void watch_fired(struct xenbus_watch *watch, + const char **vec, + unsigned int len) +{ + struct watch_adapter *adap; + struct xsd_sockmsg hdr; + const char *path, *token; + int path_len, tok_len, body_len, data_len = 0; + int ret; + LIST_HEAD(staging_q); + + adap = container_of(watch, struct watch_adapter, watch); + + path = vec[XS_WATCH_PATH]; + token = adap->token; + + path_len = strlen(path) + 1; + tok_len = strlen(token) + 1; + if (len > 2) + data_len = vec[len] - vec[2] + 1; + body_len = path_len + tok_len + data_len; + + hdr.type = XS_WATCH_EVENT; + hdr.len = body_len; + + mutex_lock(&adap->dev_data->reply_mutex); + + ret = queue_reply(&staging_q, &hdr, sizeof(hdr)); + if (!ret) + ret = queue_reply(&staging_q, path, path_len); + if (!ret) + ret = queue_reply(&staging_q, token, tok_len); + if (!ret && len > 2) + ret = queue_reply(&staging_q, vec[2], data_len); + + if (!ret) { + /* success: pass reply list onto watcher */ + list_splice_tail(&staging_q, &adap->dev_data->read_buffers); + wake_up(&adap->dev_data->read_waitq); + } else + queue_cleanup(&staging_q); + + mutex_unlock(&adap->dev_data->reply_mutex); +} + +static int xenbus_write_transaction(unsigned msg_type, + struct xenbus_file_priv *u) +{ + int rc; + void *reply; + struct xenbus_transaction_holder *trans = NULL; + LIST_HEAD(staging_q); + + if (msg_type == XS_TRANSACTION_START) { + trans = kmalloc(sizeof(*trans), GFP_KERNEL); + if (!trans) { + rc = -ENOMEM; + goto out; + } + } + + reply = xenbus_dev_request_and_reply(&u->u.msg); + if (IS_ERR(reply)) { + kfree(trans); + rc = PTR_ERR(reply); + goto out; + } + + if (msg_type == XS_TRANSACTION_START) { + if (u->u.msg.type == XS_ERROR) + kfree(trans); + else { + trans->handle.id = simple_strtoul(reply, NULL, 0); + list_add(&trans->list, &u->transactions); + } + } else if (u->u.msg.type == XS_TRANSACTION_END) { + list_for_each_entry(trans, &u->transactions, list) + if (trans->handle.id == u->u.msg.tx_id) + break; + BUG_ON(&trans->list == &u->transactions); + list_del(&trans->list); + + kfree(trans); + } + + mutex_lock(&u->reply_mutex); + rc = queue_reply(&staging_q, &u->u.msg, sizeof(u->u.msg)); + if (!rc) + rc = queue_reply(&staging_q, reply, u->u.msg.len); + if (!rc) { + list_splice_tail(&staging_q, &u->read_buffers); + wake_up(&u->read_waitq); + } else { + queue_cleanup(&staging_q); + } + mutex_unlock(&u->reply_mutex); + + kfree(reply); + +out: + return rc; +} + +static int xenbus_write_watch(unsigned msg_type, struct xenbus_file_priv *u) +{ + struct watch_adapter *watch, *tmp_watch; + char *path, 
*token; + int err, rc; + LIST_HEAD(staging_q); + + path = u->u.buffer + sizeof(u->u.msg); + token = memchr(path, 0, u->u.msg.len); + if (token == NULL) { + rc = -EILSEQ; + goto out; + } + token++; + if (memchr(token, 0, u->u.msg.len - (token - path)) == NULL) { + rc = -EILSEQ; + goto out; + } + + if (msg_type == XS_WATCH) { + watch = alloc_watch_adapter(path, token); + if (watch == NULL) { + rc = -ENOMEM; + goto out; + } + + watch->watch.callback = watch_fired; + watch->dev_data = u; + + err = register_xenbus_watch(&watch->watch); + if (err) { + free_watch_adapter(watch); + rc = err; + goto out; + } + list_add(&watch->list, &u->watches); + } else { + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + if (!strcmp(watch->token, token) && + !strcmp(watch->watch.node, path)) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + break; + } + } + } + + /* Success. Synthesize a reply to say all is OK. */ + { + struct { + struct xsd_sockmsg hdr; + char body[3]; + } __packed reply = { + { + .type = msg_type, + .len = sizeof(reply.body) + }, + "OK" + }; + + mutex_lock(&u->reply_mutex); + rc = queue_reply(&u->read_buffers, &reply, sizeof(reply)); + wake_up(&u->read_waitq); + mutex_unlock(&u->reply_mutex); + } + +out: + return rc; +} + +static ssize_t xenbus_file_write(struct file *filp, + const char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct xenbus_file_priv *u = filp->private_data; + uint32_t msg_type; + int rc = len; + int ret; + LIST_HEAD(staging_q); + + /* + * We're expecting usermode to be writing properly formed + * xenbus messages. If they write an incomplete message we + * buffer it up. Once it is complete, we act on it. + */ + + /* + * Make sure concurrent writers can't stomp all over each + * other's messages and make a mess of our partial message + * buffer. We don't make any attemppt to stop multiple + * writers from making a mess of each other's incomplete + * messages; we're just trying to guarantee our own internal + * consistency and make sure that single writes are handled + * atomically. + */ + mutex_lock(&u->msgbuffer_mutex); + + /* Get this out of the way early to avoid confusion */ + if (len == 0) + goto out; + + /* Can't write a xenbus message larger we can buffer */ + if (len > sizeof(u->u.buffer) - u->len) { + /* On error, dump existing buffer */ + u->len = 0; + rc = -EINVAL; + goto out; + } + + ret = copy_from_user(u->u.buffer + u->len, ubuf, len); + + if (ret != 0) { + rc = -EFAULT; + goto out; + } + + /* Deal with a partial copy. */ + len -= ret; + rc = len; + + u->len += len; + + /* Return if we haven't got a full message yet */ + if (u->len < sizeof(u->u.msg)) + goto out; /* not even the header yet */ + + /* If we're expecting a message that's larger than we can + possibly send, dump what we have and return an error. */ + if ((sizeof(u->u.msg) + u->u.msg.len) > sizeof(u->u.buffer)) { + rc = -E2BIG; + u->len = 0; + goto out; + } + + if (u->len < (sizeof(u->u.msg) + u->u.msg.len)) + goto out; /* incomplete data portion */ + + /* + * OK, now we have a complete message. Do something with it. 
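+	 * For example, a user-space XS_READ of the path "name" arrives here
+	 * as a struct xsd_sockmsg header (.type = XS_READ, .tx_id = 0 for no
+	 * transaction, .len = 5) followed by the payload bytes "name\0"; the
+	 * reply is queued back to the reader in the same header + payload
+	 * wire format.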
+ */ + + msg_type = u->u.msg.type; + + switch (msg_type) { + case XS_WATCH: + case XS_UNWATCH: + /* (Un)Ask for some path to be watched for changes */ + ret = xenbus_write_watch(msg_type, u); + break; + + default: + /* Send out a transaction */ + ret = xenbus_write_transaction(msg_type, u); + break; + } + if (ret != 0) + rc = ret; + + /* Buffered message consumed */ + u->len = 0; + + out: + mutex_unlock(&u->msgbuffer_mutex); + return rc; +} + +static int xenbus_file_open(struct inode *inode, struct file *filp) +{ + struct xenbus_file_priv *u; + + if (xen_store_evtchn == 0) + return -ENOENT; + + nonseekable_open(inode, filp); + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (u == NULL) + return -ENOMEM; + + INIT_LIST_HEAD(&u->transactions); + INIT_LIST_HEAD(&u->watches); + INIT_LIST_HEAD(&u->read_buffers); + init_waitqueue_head(&u->read_waitq); + + mutex_init(&u->reply_mutex); + mutex_init(&u->msgbuffer_mutex); + + filp->private_data = u; + + return 0; +} + +static int xenbus_file_release(struct inode *inode, struct file *filp) +{ + struct xenbus_file_priv *u = filp->private_data; + struct xenbus_transaction_holder *trans, *tmp; + struct watch_adapter *watch, *tmp_watch; + struct read_buffer *rb, *tmp_rb; + + /* + * No need for locking here because there are no other users, + * by definition. + */ + + list_for_each_entry_safe(trans, tmp, &u->transactions, list) { + xenbus_transaction_end(trans->handle, 1); + list_del(&trans->list); + kfree(trans); + } + + list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { + unregister_xenbus_watch(&watch->watch); + list_del(&watch->list); + free_watch_adapter(watch); + } + + list_for_each_entry_safe(rb, tmp_rb, &u->read_buffers, list) { + list_del(&rb->list); + kfree(rb); + } + kfree(u); + + return 0; +} + +static unsigned int xenbus_file_poll(struct file *file, poll_table *wait) +{ + struct xenbus_file_priv *u = file->private_data; + + poll_wait(file, &u->read_waitq, wait); + if (!list_empty(&u->read_buffers)) + return POLLIN | POLLRDNORM; + return 0; +} + +const struct file_operations xen_xenbus_fops = { + .read = xenbus_file_read, + .write = xenbus_file_write, + .open = xenbus_file_open, + .release = xenbus_file_release, + .poll = xenbus_file_poll, + .llseek = no_llseek, +}; +EXPORT_SYMBOL_GPL(xen_xenbus_fops); + +static struct miscdevice xenbus_dev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "xen/xenbus", + .fops = &xen_xenbus_fops, +}; + +static int __init xenbus_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + err = misc_register(&xenbus_dev); + if (err) + pr_err("Could not register xenbus frontend device\n"); + return err; +} + +static void __exit xenbus_exit(void) +{ + misc_deregister(&xenbus_dev); +} + +module_init(xenbus_init); +module_exit(xenbus_exit); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c new file mode 100644 index 000000000..5390a674b --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -0,0 +1,839 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have. 
+ * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, args...) \ + pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ + __func__, __LINE__, ##args) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <linux/slab.h> +#include <linux/module.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/xen-ops.h> +#include <xen/page.h> + +#include <xen/hvm.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + + +int xen_store_evtchn; +EXPORT_SYMBOL_GPL(xen_store_evtchn); + +struct xenstore_domain_interface *xen_store_interface; +EXPORT_SYMBOL_GPL(xen_store_interface); + +enum xenstore_init xen_store_domain_type; +EXPORT_SYMBOL_GPL(xen_store_domain_type); + +static unsigned long xen_store_mfn; + +static BLOCKING_NOTIFIER_HEAD(xenstore_chain); + +/* If something in array of ids matches this device, return it. 
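+ * Matching is purely on the devicetype string (e.g. "vif" or "vbd")
+ * taken from the device's xenstore path.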
*/ +static const struct xenbus_device_id * +match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) +{ + for (; *arr->devicetype != '\0'; arr++) { + if (!strcmp(arr->devicetype, dev->devicetype)) + return arr; + } + return NULL; +} + +int xenbus_match(struct device *_dev, struct device_driver *_drv) +{ + struct xenbus_driver *drv = to_xenbus_driver(_drv); + + if (!drv->ids) + return 0; + + return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; +} +EXPORT_SYMBOL_GPL(xenbus_match); + + +static void free_otherend_details(struct xenbus_device *dev) +{ + kfree(dev->otherend); + dev->otherend = NULL; +} + + +static void free_otherend_watch(struct xenbus_device *dev) +{ + if (dev->otherend_watch.node) { + unregister_xenbus_watch(&dev->otherend_watch); + kfree(dev->otherend_watch.node); + dev->otherend_watch.node = NULL; + } +} + + +static int talk_to_otherend(struct xenbus_device *dev) +{ + struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); + + free_otherend_watch(dev); + free_otherend_details(dev); + + return drv->read_otherend_details(dev); +} + + + +static int watch_otherend(struct xenbus_device *dev) +{ + struct xen_bus_type *bus = + container_of(dev->dev.bus, struct xen_bus_type, bus); + + return xenbus_watch_pathfmt(dev, &dev->otherend_watch, + bus->otherend_changed, + "%s/%s", dev->otherend, "state"); +} + + +int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node) +{ + int err = xenbus_gather(XBT_NIL, xendev->nodename, + id_node, "%i", &xendev->otherend_id, + path_node, NULL, &xendev->otherend, + NULL); + if (err) { + xenbus_dev_fatal(xendev, err, + "reading other end details from %s", + xendev->nodename); + return err; + } + if (strlen(xendev->otherend) == 0 || + !xenbus_exists(XBT_NIL, xendev->otherend, "")) { + xenbus_dev_fatal(xendev, -ENOENT, + "unable to read other end from %s. " + "missing or inaccessible.", + xendev->nodename); + free_otherend_details(xendev); + return -ENOENT; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_read_otherend_details); + +void xenbus_otherend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len, + int ignore_on_shutdown) +{ + struct xenbus_device *dev = + container_of(watch, struct xenbus_device, otherend_watch); + struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); + enum xenbus_state state; + + /* Protect us against watches firing on old details when the otherend + details change, say immediately after a resume. */ + if (!dev->otherend || + strncmp(dev->otherend, vec[XS_WATCH_PATH], + strlen(dev->otherend))) { + dev_dbg(&dev->dev, "Ignoring watch at %s\n", + vec[XS_WATCH_PATH]); + return; + } + + state = xenbus_read_driver_state(dev->otherend); + + dev_dbg(&dev->dev, "state is %d, (%s), %s, %s\n", + state, xenbus_strstate(state), dev->otherend_watch.node, + vec[XS_WATCH_PATH]); + + /* + * Ignore xenbus transitions during shutdown. This prevents us doing + * work that can fail e.g., when the rootfs is gone. 
+ */ + if (system_state > SYSTEM_RUNNING) { + if (ignore_on_shutdown && (state == XenbusStateClosing)) + xenbus_frontend_closed(dev); + return; + } + + if (drv->otherend_changed) + drv->otherend_changed(dev, state); +} +EXPORT_SYMBOL_GPL(xenbus_otherend_changed); + +int xenbus_dev_probe(struct device *_dev) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); + const struct xenbus_device_id *id; + int err; + + DPRINTK("%s", dev->nodename); + + if (!drv->probe) { + err = -ENODEV; + goto fail; + } + + id = match_device(drv->ids, dev); + if (!id) { + err = -ENODEV; + goto fail; + } + + err = talk_to_otherend(dev); + if (err) { + dev_warn(&dev->dev, "talk_to_otherend on %s failed.\n", + dev->nodename); + return err; + } + + err = drv->probe(dev, id); + if (err) + goto fail; + + err = watch_otherend(dev); + if (err) { + dev_warn(&dev->dev, "watch_otherend on %s failed.\n", + dev->nodename); + return err; + } + + return 0; +fail: + xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); + xenbus_switch_state(dev, XenbusStateClosed); + return err; +} +EXPORT_SYMBOL_GPL(xenbus_dev_probe); + +int xenbus_dev_remove(struct device *_dev) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); + + DPRINTK("%s", dev->nodename); + + free_otherend_watch(dev); + + if (drv->remove) + drv->remove(dev); + + free_otherend_details(dev); + + xenbus_switch_state(dev, XenbusStateClosed); + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_dev_remove); + +void xenbus_dev_shutdown(struct device *_dev) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + unsigned long timeout = 5*HZ; + + DPRINTK("%s", dev->nodename); + + get_device(&dev->dev); + if (dev->state != XenbusStateConnected) { + pr_info("%s: %s: %s != Connected, skipping\n", + __func__, dev->nodename, xenbus_strstate(dev->state)); + goto out; + } + xenbus_switch_state(dev, XenbusStateClosing); + timeout = wait_for_completion_timeout(&dev->down, timeout); + if (!timeout) + pr_info("%s: %s timeout closing device\n", + __func__, dev->nodename); + out: + put_device(&dev->dev); +} +EXPORT_SYMBOL_GPL(xenbus_dev_shutdown); + +int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus, + struct module *owner, const char *mod_name) +{ + drv->driver.name = drv->name ? 
drv->name : drv->ids[0].devicetype; + drv->driver.bus = &bus->bus; + drv->driver.owner = owner; + drv->driver.mod_name = mod_name; + + return driver_register(&drv->driver); +} +EXPORT_SYMBOL_GPL(xenbus_register_driver_common); + +void xenbus_unregister_driver(struct xenbus_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL_GPL(xenbus_unregister_driver); + +struct xb_find_info { + struct xenbus_device *dev; + const char *nodename; +}; + +static int cmp_dev(struct device *dev, void *data) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct xb_find_info *info = data; + + if (!strcmp(xendev->nodename, info->nodename)) { + info->dev = xendev; + get_device(dev); + return 1; + } + return 0; +} + +static struct xenbus_device *xenbus_device_find(const char *nodename, + struct bus_type *bus) +{ + struct xb_find_info info = { .dev = NULL, .nodename = nodename }; + + bus_for_each_dev(bus, NULL, &info, cmp_dev); + return info.dev; +} + +static int cleanup_dev(struct device *dev, void *data) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct xb_find_info *info = data; + int len = strlen(info->nodename); + + DPRINTK("%s", info->nodename); + + /* Match the info->nodename path, or any subdirectory of that path. */ + if (strncmp(xendev->nodename, info->nodename, len)) + return 0; + + /* If the node name is longer, ensure it really is a subdirectory. */ + if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/')) + return 0; + + info->dev = xendev; + get_device(dev); + return 1; +} + +static void xenbus_cleanup_devices(const char *path, struct bus_type *bus) +{ + struct xb_find_info info = { .nodename = path }; + + do { + info.dev = NULL; + bus_for_each_dev(bus, NULL, &info, cleanup_dev); + if (info.dev) { + device_unregister(&info.dev->dev); + put_device(&info.dev->dev); + } + } while (info.dev); +} + +static void xenbus_dev_release(struct device *dev) +{ + if (dev) + kfree(to_xenbus_device(dev)); +} + +static ssize_t nodename_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); +} +static DEVICE_ATTR_RO(nodename); + +static ssize_t devtype_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); +} +static DEVICE_ATTR_RO(devtype); + +static ssize_t modalias_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s:%s\n", dev->bus->name, + to_xenbus_device(dev)->devicetype); +} +static DEVICE_ATTR_RO(modalias); + +static struct attribute *xenbus_dev_attrs[] = { + &dev_attr_nodename.attr, + &dev_attr_devtype.attr, + &dev_attr_modalias.attr, + NULL, +}; + +static const struct attribute_group xenbus_dev_group = { + .attrs = xenbus_dev_attrs, +}; + +const struct attribute_group *xenbus_dev_groups[] = { + &xenbus_dev_group, + NULL, +}; +EXPORT_SYMBOL_GPL(xenbus_dev_groups); + +int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename) +{ + char devname[XEN_BUS_ID_SIZE]; + int err; + struct xenbus_device *xendev; + size_t stringlen; + char *tmpstring; + + enum xenbus_state state = xenbus_read_driver_state(nodename); + + if (state != XenbusStateInitialising) { + /* Device is not new, so ignore it. This can happen if a + device is going away after switching to Closed. 
*/ + return 0; + } + + stringlen = strlen(nodename) + 1 + strlen(type) + 1; + xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL); + if (!xendev) + return -ENOMEM; + + xendev->state = XenbusStateInitialising; + + /* Copy the strings into the extra space. */ + + tmpstring = (char *)(xendev + 1); + strcpy(tmpstring, nodename); + xendev->nodename = tmpstring; + + tmpstring += strlen(tmpstring) + 1; + strcpy(tmpstring, type); + xendev->devicetype = tmpstring; + init_completion(&xendev->down); + + xendev->dev.bus = &bus->bus; + xendev->dev.release = xenbus_dev_release; + + err = bus->get_bus_id(devname, xendev->nodename); + if (err) + goto fail; + + dev_set_name(&xendev->dev, "%s", devname); + + /* Register with generic device framework. */ + err = device_register(&xendev->dev); + if (err) + goto fail; + + return 0; +fail: + kfree(xendev); + return err; +} +EXPORT_SYMBOL_GPL(xenbus_probe_node); + +static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type) +{ + int err = 0; + char **dir; + unsigned int dir_n = 0; + int i; + + dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + for (i = 0; i < dir_n; i++) { + err = bus->probe(bus, type, dir[i]); + if (err) + break; + } + + kfree(dir); + return err; +} + +int xenbus_probe_devices(struct xen_bus_type *bus) +{ + int err = 0; + char **dir; + unsigned int i, dir_n; + + dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_device_type(bus, dir[i]); + if (err) + break; + } + + kfree(dir); + return err; +} +EXPORT_SYMBOL_GPL(xenbus_probe_devices); + +static unsigned int char_count(const char *str, char c) +{ + unsigned int i, ret = 0; + + for (i = 0; str[i]; i++) + if (str[i] == c) + ret++; + return ret; +} + +static int strsep_len(const char *str, char c, unsigned int len) +{ + unsigned int i; + + for (i = 0; str[i]; i++) + if (str[i] == c) { + if (len == 0) + return i; + len--; + } + return (len == 0) ? i : -ERANGE; +} + +void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) +{ + int exists, rootlen; + struct xenbus_device *dev; + char type[XEN_BUS_ID_SIZE]; + const char *p, *root; + + if (char_count(node, '/') < 2) + return; + + exists = xenbus_exists(XBT_NIL, node, ""); + if (!exists) { + xenbus_cleanup_devices(node, &bus->bus); + return; + } + + /* backend/<type>/... or device/<type>/... 
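+	 * e.g. a watch firing for "device/vif/0/state" yields type "vif"
+	 * and, with the frontend bus's levels == 2, root "device/vif/0",
+	 * which is probed below if it is not already registered.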
*/ + p = strchr(node, '/') + 1; + snprintf(type, XEN_BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p); + type[XEN_BUS_ID_SIZE-1] = '\0'; + + rootlen = strsep_len(node, '/', bus->levels); + if (rootlen < 0) + return; + root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node); + if (!root) + return; + + dev = xenbus_device_find(root, &bus->bus); + if (!dev) + xenbus_probe_node(bus, type, root); + else + put_device(&dev->dev); + + kfree(root); +} +EXPORT_SYMBOL_GPL(xenbus_dev_changed); + +int xenbus_dev_suspend(struct device *dev) +{ + int err = 0; + struct xenbus_driver *drv; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); + + DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + if (drv->suspend) + err = drv->suspend(xdev); + if (err) + pr_warn("suspend %s failed: %i\n", dev_name(dev), err); + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_dev_suspend); + +int xenbus_dev_resume(struct device *dev) +{ + int err; + struct xenbus_driver *drv; + struct xenbus_device *xdev + = container_of(dev, struct xenbus_device, dev); + + DPRINTK("%s", xdev->nodename); + + if (dev->driver == NULL) + return 0; + drv = to_xenbus_driver(dev->driver); + err = talk_to_otherend(xdev); + if (err) { + pr_warn("resume (talk_to_otherend) %s failed: %i\n", + dev_name(dev), err); + return err; + } + + xdev->state = XenbusStateInitialising; + + if (drv->resume) { + err = drv->resume(xdev); + if (err) { + pr_warn("resume %s failed: %i\n", dev_name(dev), err); + return err; + } + } + + err = watch_otherend(xdev); + if (err) { + pr_warn("resume (watch_otherend) %s failed: %d.\n", + dev_name(dev), err); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_dev_resume); + +int xenbus_dev_cancel(struct device *dev) +{ + /* Do nothing */ + DPRINTK("cancel"); + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_dev_cancel); + +/* A flag to determine if xenstored is 'ready' (i.e. 
has started) */ +int xenstored_ready; + + +int register_xenstore_notifier(struct notifier_block *nb) +{ + int ret = 0; + + if (xenstored_ready > 0) + ret = nb->notifier_call(nb, 0, NULL); + else + blocking_notifier_chain_register(&xenstore_chain, nb); + + return ret; +} +EXPORT_SYMBOL_GPL(register_xenstore_notifier); + +void unregister_xenstore_notifier(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&xenstore_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); + +void xenbus_probe(struct work_struct *unused) +{ + xenstored_ready = 1; + + /* Notify others that xenstore is up */ + blocking_notifier_call_chain(&xenstore_chain, 0, NULL); +} +EXPORT_SYMBOL_GPL(xenbus_probe); + +static int __init xenbus_probe_initcall(void) +{ + if (!xen_domain()) + return -ENODEV; + + if (xen_initial_domain() || xen_hvm_domain()) + return 0; + + xenbus_probe(NULL); + return 0; +} + +device_initcall(xenbus_probe_initcall); + +/* Set up event channel for xenstored which is run as a local process + * (this is normally used only in dom0) + */ +static int __init xenstored_local_init(void) +{ + int err = 0; + unsigned long page = 0; + struct evtchn_alloc_unbound alloc_unbound; + + /* Allocate Xenstore page */ + page = get_zeroed_page(GFP_KERNEL); + if (!page) + goto out_err; + + xen_store_mfn = xen_start_info->store_mfn = + pfn_to_mfn(virt_to_phys((void *)page) >> + PAGE_SHIFT); + + /* Next allocate a local port which xenstored can bind to */ + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = DOMID_SELF; + + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (err == -ENOSYS) + goto out_err; + + BUG_ON(err); + xen_store_evtchn = xen_start_info->store_evtchn = + alloc_unbound.port; + + return 0; + + out_err: + if (page != 0) + free_page(page); + return err; +} + +static int xenbus_resume_cb(struct notifier_block *nb, + unsigned long action, void *data) +{ + int err = 0; + + if (xen_hvm_domain()) { + uint64_t v; + + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (!err && v) + xen_store_evtchn = v; + else + pr_warn("Cannot update xenstore event channel: %d\n", + err); + } else + xen_store_evtchn = xen_start_info->store_evtchn; + + return err; +} + +static struct notifier_block xenbus_resume_nb = { + .notifier_call = xenbus_resume_cb, +}; + +static int __init xenbus_init(void) +{ + int err = 0; + uint64_t v = 0; + xen_store_domain_type = XS_UNKNOWN; + + if (!xen_domain()) + return -ENODEV; + + xenbus_ring_ops_init(); + + if (xen_pv_domain()) + xen_store_domain_type = XS_PV; + if (xen_hvm_domain()) + xen_store_domain_type = XS_HVM; + if (xen_hvm_domain() && xen_initial_domain()) + xen_store_domain_type = XS_LOCAL; + if (xen_pv_domain() && !xen_start_info->store_evtchn) + xen_store_domain_type = XS_LOCAL; + if (xen_pv_domain() && xen_start_info->store_evtchn) + xenstored_ready = 1; + + switch (xen_store_domain_type) { + case XS_LOCAL: + err = xenstored_local_init(); + if (err) + goto out_error; + xen_store_interface = mfn_to_virt(xen_store_mfn); + break; + case XS_PV: + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + xen_store_interface = mfn_to_virt(xen_store_mfn); + break; + case XS_HVM: + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err) + goto out_error; + xen_store_mfn = (unsigned long)v; + xen_store_interface = + xen_remap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); 
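+		/*
+		 * Unlike the PV cases above, the HVM store page is not
+		 * covered by mfn_to_virt(), so it is mapped into the kernel
+		 * address space with xen_remap() instead.
+		 */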
+ break; + default: + pr_warn("Xenstore state unknown\n"); + break; + } + + /* Initialize the interface to xenstore. */ + err = xs_init(); + if (err) { + pr_warn("Error initializing xenstore comms: %i\n", err); + goto out_error; + } + + if ((xen_store_domain_type != XS_LOCAL) && + (xen_store_domain_type != XS_UNKNOWN)) + xen_resume_notifier_register(&xenbus_resume_nb); + +#ifdef CONFIG_XEN_COMPAT_XENFS + /* + * Create xenfs mountpoint in /proc for compatibility with + * utilities that expect to find "xenbus" under "/proc/xen". + */ + proc_mkdir("xen", NULL); +#endif + +out_error: + return err; +} + +postcore_initcall(xenbus_init); + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_probe.h b/drivers/xen/xenbus/xenbus_probe.h new file mode 100644 index 000000000..c9ec7ca1f --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe.h @@ -0,0 +1,88 @@ +/****************************************************************************** + * xenbus_probe.h + * + * Talks to Xen Store to figure out what devices we have. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 XenSource Ltd. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef _XENBUS_PROBE_H +#define _XENBUS_PROBE_H + +#define XEN_BUS_ID_SIZE 20 + +struct xen_bus_type { + char *root; + unsigned int levels; + int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); + int (*probe)(struct xen_bus_type *bus, const char *type, + const char *dir); + void (*otherend_changed)(struct xenbus_watch *watch, const char **vec, + unsigned int len); + struct bus_type bus; +}; + +enum xenstore_init { + XS_UNKNOWN, + XS_PV, + XS_HVM, + XS_LOCAL, +}; + +extern const struct attribute_group *xenbus_dev_groups[]; + +extern int xenbus_match(struct device *_dev, struct device_driver *_drv); +extern int xenbus_dev_probe(struct device *_dev); +extern int xenbus_dev_remove(struct device *_dev); +extern int xenbus_register_driver_common(struct xenbus_driver *drv, + struct xen_bus_type *bus, + struct module *owner, + const char *mod_name); +extern int xenbus_probe_node(struct xen_bus_type *bus, + const char *type, + const char *nodename); +extern int xenbus_probe_devices(struct xen_bus_type *bus); + +extern void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); + +extern void xenbus_dev_shutdown(struct device *_dev); + +extern int xenbus_dev_suspend(struct device *dev); +extern int xenbus_dev_resume(struct device *dev); +extern int xenbus_dev_cancel(struct device *dev); + +extern void xenbus_otherend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len, + int ignore_on_shutdown); + +extern int xenbus_read_otherend_details(struct xenbus_device *xendev, + char *id_node, char *path_node); + +void xenbus_ring_ops_init(void); + +#endif diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c new file mode 100644 index 000000000..04f7f85a5 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -0,0 +1,276 @@ +/****************************************************************************** + * Talks to Xen Store to figure out what devices we have (backend half). + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * Copyright (C) 2005 Mike Wray, Hewlett-Packard + * Copyright (C) 2005, 2006 XenSource Ltd + * Copyright (C) 2007 Solarflare Communications, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) \ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/notifier.h> +#include <linux/export.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/xen/hypervisor.h> +#include <asm/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/features.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ +static int backend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +{ + int domid, err; + const char *devid, *type, *frontend; + unsigned int typelen; + + type = strchr(nodename, '/'); + if (!type) + return -EINVAL; + type++; + typelen = strcspn(type, "/"); + if (!typelen || type[typelen] != '/') + return -EINVAL; + + devid = strrchr(nodename, '/') + 1; + + err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, + "frontend", NULL, &frontend, + NULL); + if (err) + return err; + if (strlen(frontend) == 0) + err = -ERANGE; + if (!err && !xenbus_exists(XBT_NIL, frontend, "")) + err = -ENOENT; + kfree(frontend); + + if (err) + return err; + + if (snprintf(bus_id, XEN_BUS_ID_SIZE, "%.*s-%i-%s", + typelen, type, domid, devid) >= XEN_BUS_ID_SIZE) + return -ENOSPC; + return 0; +} + +static int xenbus_uevent_backend(struct device *dev, + struct kobj_uevent_env *env) +{ + struct xenbus_device *xdev; + struct xenbus_driver *drv; + struct xen_bus_type *bus; + + DPRINTK(""); + + if (dev == NULL) + return -ENODEV; + + xdev = to_xenbus_device(dev); + bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); + + if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype)) + return -ENOMEM; + + /* stuff we want to pass to /sbin/hotplug */ + if (add_uevent_var(env, "XENBUS_TYPE=%s", xdev->devicetype)) + return -ENOMEM; + + if (add_uevent_var(env, "XENBUS_PATH=%s", xdev->nodename)) + return -ENOMEM; + + if (add_uevent_var(env, "XENBUS_BASE_PATH=%s", bus->root)) + return -ENOMEM; + + if (dev->driver) { + drv = to_xenbus_driver(dev->driver); + if (drv && drv->uevent) + return drv->uevent(xdev, env); + } + + return 0; +} + +/* backend/<typename>/<frontend-uuid>/<name> */ +static int xenbus_probe_backend_unit(struct xen_bus_type *bus, + const char *dir, + const char *type, + const char *name) +{ + char *nodename; + int err; + + nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s\n", nodename); + + err = xenbus_probe_node(bus, type, nodename); + kfree(nodename); + return err; +} + +/* backend/<typename>/<frontend-domid> */ +static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, + const char *domid) +{ + char *nodename; + int err = 0; + char **dir; + unsigned int i, dir_n = 0; + + DPRINTK(""); + + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, domid); + if (!nodename) + return -ENOMEM; + + dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); + if (IS_ERR(dir)) { + kfree(nodename); + return PTR_ERR(dir); + } + + for (i = 0; i < dir_n; i++) { + err = xenbus_probe_backend_unit(bus, 
nodename, type, dir[i]); + if (err) + break; + } + kfree(dir); + kfree(nodename); + return err; +} + +static void frontend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + xenbus_otherend_changed(watch, vec, len, 0); +} + +static struct xen_bus_type xenbus_backend = { + .root = "backend", + .levels = 3, /* backend/type/<frontend>/<id> */ + .get_bus_id = backend_bus_id, + .probe = xenbus_probe_backend, + .otherend_changed = frontend_changed, + .bus = { + .name = "xen-backend", + .match = xenbus_match, + .uevent = xenbus_uevent_backend, + .probe = xenbus_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, + .dev_groups = xenbus_dev_groups, + }, +}; + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); +} + +static struct xenbus_watch be_watch = { + .node = "backend", + .callback = backend_changed, +}; + +static int read_frontend_details(struct xenbus_device *xendev) +{ + return xenbus_read_otherend_details(xendev, "frontend-id", "frontend"); +} + +int xenbus_dev_is_online(struct xenbus_device *dev) +{ + int rc, val; + + rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val); + if (rc != 1) + val = 0; /* no online node present */ + + return val; +} +EXPORT_SYMBOL_GPL(xenbus_dev_is_online); + +int __xenbus_register_backend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) +{ + drv->read_otherend_details = read_frontend_details; + + return xenbus_register_driver_common(drv, &xenbus_backend, + owner, mod_name); +} +EXPORT_SYMBOL_GPL(__xenbus_register_backend); + +static int backend_probe_and_watch(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + /* Enumerate devices in xenstore and watch for changes. */ + xenbus_probe_devices(&xenbus_backend); + register_xenbus_watch(&be_watch); + + return NOTIFY_DONE; +} + +static int __init xenbus_probe_backend_init(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = backend_probe_and_watch + }; + int err; + + DPRINTK(""); + + /* Register ourselves with the kernel bus subsystem */ + err = bus_register(&xenbus_backend.bus); + if (err) + return err; + + register_xenstore_notifier(&xenstore_notifier); + + return 0; +} +subsys_initcall(xenbus_probe_backend_init); diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c new file mode 100644 index 000000000..bcb53bdc4 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -0,0 +1,512 @@ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define DPRINTK(fmt, ...) 
\ + pr_debug("(%s:%d) " fmt "\n", \ + __func__, __LINE__, ##__VA_ARGS__) + +#include <linux/kernel.h> +#include <linux/err.h> +#include <linux/string.h> +#include <linux/ctype.h> +#include <linux/fcntl.h> +#include <linux/mm.h> +#include <linux/proc_fs.h> +#include <linux/notifier.h> +#include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/io.h> +#include <linux/module.h> + +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/xen/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/events.h> +#include <xen/page.h> +#include <xen/xen.h> + +#include <xen/platform_pci.h> + +#include "xenbus_comms.h" +#include "xenbus_probe.h" + + +static struct workqueue_struct *xenbus_frontend_wq; + +/* device/<type>/<id> => <type>-<id> */ +static int frontend_bus_id(char bus_id[XEN_BUS_ID_SIZE], const char *nodename) +{ + nodename = strchr(nodename, '/'); + if (!nodename || strlen(nodename + 1) >= XEN_BUS_ID_SIZE) { + pr_warn("bad frontend %s\n", nodename); + return -EINVAL; + } + + strlcpy(bus_id, nodename + 1, XEN_BUS_ID_SIZE); + if (!strchr(bus_id, '/')) { + pr_warn("bus_id %s no slash\n", bus_id); + return -EINVAL; + } + *strchr(bus_id, '/') = '-'; + return 0; +} + +/* device/<typename>/<name> */ +static int xenbus_probe_frontend(struct xen_bus_type *bus, const char *type, + const char *name) +{ + char *nodename; + int err; + + /* ignore console/0 */ + if (!strncmp(type, "console", 7) && !strncmp(name, "0", 1)) { + DPRINTK("Ignoring buggy device entry console/0"); + return 0; + } + + nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", bus->root, type, name); + if (!nodename) + return -ENOMEM; + + DPRINTK("%s", nodename); + + err = xenbus_probe_node(bus, type, nodename); + kfree(nodename); + return err; +} + +static int xenbus_uevent_frontend(struct device *_dev, + struct kobj_uevent_env *env) +{ + struct xenbus_device *dev = to_xenbus_device(_dev); + + if (add_uevent_var(env, "MODALIAS=xen:%s", dev->devicetype)) + return -ENOMEM; + + return 0; +} + + +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + xenbus_otherend_changed(watch, vec, len, 1); +} + +static void xenbus_frontend_delayed_resume(struct work_struct *w) +{ + struct xenbus_device *xdev = container_of(w, struct xenbus_device, work); + + xenbus_dev_resume(&xdev->dev); +} + +static int xenbus_frontend_dev_resume(struct device *dev) +{ + /* + * If xenstored is running in this domain, we cannot access the backend + * state at the moment, so we need to defer xenbus_dev_resume + */ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + + if (!xenbus_frontend_wq) { + pr_err("%s: no workqueue to process delayed resume\n", + xdev->nodename); + return -EFAULT; + } + + queue_work(xenbus_frontend_wq, &xdev->work); + + return 0; + } + + return xenbus_dev_resume(dev); +} + +static int xenbus_frontend_dev_probe(struct device *dev) +{ + if (xen_store_domain_type == XS_LOCAL) { + struct xenbus_device *xdev = to_xenbus_device(dev); + INIT_WORK(&xdev->work, xenbus_frontend_delayed_resume); + } + + return xenbus_dev_probe(dev); +} + +static const struct dev_pm_ops xenbus_pm_ops = { + .suspend = xenbus_dev_suspend, + .resume = xenbus_frontend_dev_resume, + .freeze = xenbus_dev_suspend, + .thaw = xenbus_dev_cancel, + .restore = xenbus_dev_resume, +}; + +static struct xen_bus_type xenbus_frontend = { + .root = "device", + .levels = 2, /* device/type/<id> */ + .get_bus_id = frontend_bus_id, + .probe = xenbus_probe_frontend, + .otherend_changed = 
backend_changed, + .bus = { + .name = "xen", + .match = xenbus_match, + .uevent = xenbus_uevent_frontend, + .probe = xenbus_frontend_dev_probe, + .remove = xenbus_dev_remove, + .shutdown = xenbus_dev_shutdown, + .dev_groups = xenbus_dev_groups, + + .pm = &xenbus_pm_ops, + }, +}; + +static void frontend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + DPRINTK(""); + + xenbus_dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); +} + + +/* We watch for devices appearing and vanishing. */ +static struct xenbus_watch fe_watch = { + .node = "device", + .callback = frontend_changed, +}; + +static int read_backend_details(struct xenbus_device *xendev) +{ + return xenbus_read_otherend_details(xendev, "backend-id", "backend"); +} + +static int is_device_connecting(struct device *dev, void *data, bool ignore_nonessential) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct device_driver *drv = data; + struct xenbus_driver *xendrv; + + /* + * A device with no driver will never connect. We care only about + * devices which should currently be in the process of connecting. + */ + if (!dev->driver) + return 0; + + /* Is this search limited to a particular driver? */ + if (drv && (dev->driver != drv)) + return 0; + + if (ignore_nonessential) { + /* With older QEMU, for PVonHVM guests the guest config files + * could contain: vfb = [ 'vnc=1, vnclisten=0.0.0.0'] + * which is nonsensical as there is no PV FB (there can be + * a PVKB) running as HVM guest. */ + + if ((strncmp(xendev->nodename, "device/vkbd", 11) == 0)) + return 0; + + if ((strncmp(xendev->nodename, "device/vfb", 10) == 0)) + return 0; + } + xendrv = to_xenbus_driver(dev->driver); + return (xendev->state < XenbusStateConnected || + (xendev->state == XenbusStateConnected && + xendrv->is_ready && !xendrv->is_ready(xendev))); +} +static int essential_device_connecting(struct device *dev, void *data) +{ + return is_device_connecting(dev, data, true /* ignore PV[KBB+FB] */); +} +static int non_essential_device_connecting(struct device *dev, void *data) +{ + return is_device_connecting(dev, data, false); +} + +static int exists_essential_connecting_device(struct device_driver *drv) +{ + return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, + essential_device_connecting); +} +static int exists_non_essential_connecting_device(struct device_driver *drv) +{ + return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, + non_essential_device_connecting); +} + +static int print_device_status(struct device *dev, void *data) +{ + struct xenbus_device *xendev = to_xenbus_device(dev); + struct device_driver *drv = data; + + /* Is this operation limited to a particular driver? */ + if (drv && (dev->driver != drv)) + return 0; + + if (!dev->driver) { + /* Information only: is this too noisy? */ + pr_info("Device with no driver: %s\n", xendev->nodename); + } else if (xendev->state < XenbusStateConnected) { + enum xenbus_state rstate = XenbusStateUnknown; + if (xendev->otherend) + rstate = xenbus_read_driver_state(xendev->otherend); + pr_warn("Timeout connecting to device: %s (local state %d, remote state %d)\n", + xendev->nodename, xendev->state, rstate); + } + + return 0; +} + +/* We only wait for device setup after most initcalls have run. 
*/ +static int ready_to_wait_for_devices; + +static bool wait_loop(unsigned long start, unsigned int max_delay, + unsigned int *seconds_waited) +{ + if (time_after(jiffies, start + (*seconds_waited+5)*HZ)) { + if (!*seconds_waited) + pr_warn("Waiting for devices to initialise: "); + *seconds_waited += 5; + pr_cont("%us...", max_delay - *seconds_waited); + if (*seconds_waited == max_delay) { + pr_cont("\n"); + return true; + } + } + + schedule_timeout_interruptible(HZ/10); + + return false; +} +/* + * On a 5-minute timeout, wait for all devices currently configured. We need + * to do this to guarantee that the filesystems and / or network devices + * needed for boot are available, before we can allow the boot to proceed. + * + * This needs to be on a late_initcall, to happen after the frontend device + * drivers have been initialised, but before the root fs is mounted. + * + * A possible improvement here would be to have the tools add a per-device + * flag to the store entry, indicating whether it is needed at boot time. + * This would allow people who knew what they were doing to accelerate their + * boot slightly, but of course needs tools or manual intervention to set up + * those flags correctly. + */ +static void wait_for_devices(struct xenbus_driver *xendrv) +{ + unsigned long start = jiffies; + struct device_driver *drv = xendrv ? &xendrv->driver : NULL; + unsigned int seconds_waited = 0; + + if (!ready_to_wait_for_devices || !xen_domain()) + return; + + while (exists_non_essential_connecting_device(drv)) + if (wait_loop(start, 30, &seconds_waited)) + break; + + /* Skips PVKB and PVFB check.*/ + while (exists_essential_connecting_device(drv)) + if (wait_loop(start, 270, &seconds_waited)) + break; + + if (seconds_waited) + printk("\n"); + + bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, + print_device_status); +} + +int __xenbus_register_frontend(struct xenbus_driver *drv, struct module *owner, + const char *mod_name) +{ + int ret; + + drv->read_otherend_details = read_backend_details; + + ret = xenbus_register_driver_common(drv, &xenbus_frontend, + owner, mod_name); + if (ret) + return ret; + + /* If this driver is loaded as a module wait for devices to attach. */ + wait_for_devices(drv); + + return 0; +} +EXPORT_SYMBOL_GPL(__xenbus_register_frontend); + +static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); +static int backend_state; + +static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, + const char **v, unsigned int l) +{ + xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state); + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + v[XS_WATCH_PATH], xenbus_strstate(backend_state)); + wake_up(&backend_state_wq); +} + +static void xenbus_reset_wait_for_backend(char *be, int expected) +{ + long timeout; + timeout = wait_event_interruptible_timeout(backend_state_wq, + backend_state == expected, 5 * HZ); + if (timeout <= 0) + pr_info("backend %s timed out\n", be); +} + +/* + * Reset frontend if it is in Connected or Closed state. + * Wait for backend to catch up. + * State Connected happens during kdump, Closed after kexec. 
+ */ +static void xenbus_reset_frontend(char *fe, char *be, int be_state) +{ + struct xenbus_watch be_watch; + + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + be, xenbus_strstate(be_state)); + + memset(&be_watch, 0, sizeof(be_watch)); + be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be); + if (!be_watch.node) + return; + + be_watch.callback = xenbus_reset_backend_state_changed; + backend_state = XenbusStateUnknown; + + pr_info("triggering reconnect on %s\n", be); + register_xenbus_watch(&be_watch); + + /* fall through to forward backend to state XenbusStateInitialising */ + switch (be_state) { + case XenbusStateConnected: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); + xenbus_reset_wait_for_backend(be, XenbusStateClosing); + + case XenbusStateClosing: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); + xenbus_reset_wait_for_backend(be, XenbusStateClosed); + + case XenbusStateClosed: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); + xenbus_reset_wait_for_backend(be, XenbusStateInitWait); + } + + unregister_xenbus_watch(&be_watch); + pr_info("reconnect done on %s\n", be); + kfree(be_watch.node); +} + +static void xenbus_check_frontend(char *class, char *dev) +{ + int be_state, fe_state, err; + char *backend, *frontend; + + frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev); + if (!frontend) + return; + + err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state); + if (err != 1) + goto out; + + switch (fe_state) { + case XenbusStateConnected: + case XenbusStateClosed: + printk(KERN_DEBUG "XENBUS: frontend %s %s\n", + frontend, xenbus_strstate(fe_state)); + backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); + if (!backend || IS_ERR(backend)) + goto out; + err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); + if (err == 1) + xenbus_reset_frontend(frontend, backend, be_state); + kfree(backend); + break; + default: + break; + } +out: + kfree(frontend); +} + +static void xenbus_reset_state(void) +{ + char **devclass, **dev; + int devclass_n, dev_n; + int i, j; + + devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n); + if (IS_ERR(devclass)) + return; + + for (i = 0; i < devclass_n; i++) { + dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n); + if (IS_ERR(dev)) + continue; + for (j = 0; j < dev_n; j++) + xenbus_check_frontend(devclass[i], dev[j]); + kfree(dev); + } + kfree(devclass); +} + +static int frontend_probe_and_watch(struct notifier_block *notifier, + unsigned long event, + void *data) +{ + /* reset devices in Connected or Closed state */ + if (xen_hvm_domain()) + xenbus_reset_state(); + /* Enumerate devices in xenstore and watch for changes. 
*/ + xenbus_probe_devices(&xenbus_frontend); + register_xenbus_watch(&fe_watch); + + return NOTIFY_DONE; +} + + +static int __init xenbus_probe_frontend_init(void) +{ + static struct notifier_block xenstore_notifier = { + .notifier_call = frontend_probe_and_watch + }; + int err; + + DPRINTK(""); + + /* Register ourselves with the kernel bus subsystem */ + err = bus_register(&xenbus_frontend.bus); + if (err) + return err; + + register_xenstore_notifier(&xenstore_notifier); + + if (xen_store_domain_type == XS_LOCAL) { + xenbus_frontend_wq = create_workqueue("xenbus_frontend"); + if (!xenbus_frontend_wq) + pr_warn("create xenbus frontend workqueue failed, S3 resume is likely to fail\n"); + } + + return 0; +} +subsys_initcall(xenbus_probe_frontend_init); + +#ifndef MODULE +static int __init boot_wait_for_devices(void) +{ + if (!xen_has_pv_devices()) + return -ENODEV; + + ready_to_wait_for_devices = 1; + wait_for_devices(NULL); + return 0; +} + +late_initcall(boot_wait_for_devices); +#endif + +MODULE_LICENSE("GPL"); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c new file mode 100644 index 000000000..ba804f3d8 --- /dev/null +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -0,0 +1,981 @@ +/****************************************************************************** + * xenbus_xs.c + * + * This is the kernel equivalent of the "xs" library. We don't need everything + * and we use xenbus_comms for communication. + * + * Copyright (C) 2005 Rusty Russell, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/unistd.h> +#include <linux/errno.h> +#include <linux/types.h> +#include <linux/uio.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/fcntl.h> +#include <linux/kthread.h> +#include <linux/rwsem.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <asm/xen/hypervisor.h> +#include <xen/xenbus.h> +#include <xen/xen.h> +#include "xenbus_comms.h" +#include "xenbus_probe.h" + +struct xs_stored_msg { + struct list_head list; + + struct xsd_sockmsg hdr; + + union { + /* Queued replies. 
*/ + struct { + char *body; + } reply; + + /* Queued watch events. */ + struct { + struct xenbus_watch *handle; + char **vec; + unsigned int vec_size; + } watch; + } u; +}; + +struct xs_handle { + /* A list of replies. Currently only one will ever be outstanding. */ + struct list_head reply_list; + spinlock_t reply_lock; + wait_queue_head_t reply_waitq; + + /* + * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. + * response_mutex is never taken simultaneously with the other three. + * + * transaction_mutex must be held before incrementing + * transaction_count. The mutex is held when a suspend is in + * progress to prevent new transactions starting. + * + * When decrementing transaction_count to zero the wait queue + * should be woken up, the suspend code waits for count to + * reach zero. + */ + + /* One request at a time. */ + struct mutex request_mutex; + + /* Protect xenbus reader thread against save/restore. */ + struct mutex response_mutex; + + /* Protect transactions against save/restore. */ + struct mutex transaction_mutex; + atomic_t transaction_count; + wait_queue_head_t transaction_wq; + + /* Protect watch (de)register against save/restore. */ + struct rw_semaphore watch_mutex; +}; + +static struct xs_handle xs_state; + +/* List of registered watches, and a lock to protect it. */ +static LIST_HEAD(watches); +static DEFINE_SPINLOCK(watches_lock); + +/* List of pending watch callback events, and a lock to protect it. */ +static LIST_HEAD(watch_events); +static DEFINE_SPINLOCK(watch_events_lock); + +/* + * Details of the xenwatch callback kernel thread. The thread waits on the + * watch_events_waitq for work to do (queued on watch_events list). When it + * wakes up it acquires the xenwatch_mutex before reading the list and + * carrying out work. + */ +static pid_t xenwatch_pid; +static DEFINE_MUTEX(xenwatch_mutex); +static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq); + +static int get_error(const char *errorstring) +{ + unsigned int i; + + for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) { + if (i == ARRAY_SIZE(xsd_errors) - 1) { + pr_warn("xen store gave: unknown error %s\n", + errorstring); + return EINVAL; + } + } + return xsd_errors[i].errnum; +} + +static bool xenbus_ok(void) +{ + switch (xen_store_domain_type) { + case XS_LOCAL: + switch (system_state) { + case SYSTEM_POWER_OFF: + case SYSTEM_RESTART: + case SYSTEM_HALT: + return false; + default: + break; + } + return true; + case XS_PV: + case XS_HVM: + /* FIXME: Could check that the remote domain is alive, + * but it is normally initial domain. */ + return true; + default: + break; + } + return false; +} +static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) +{ + struct xs_stored_msg *msg; + char *body; + + spin_lock(&xs_state.reply_lock); + + while (list_empty(&xs_state.reply_list)) { + spin_unlock(&xs_state.reply_lock); + if (xenbus_ok()) + /* XXX FIXME: Avoid synchronous wait for response here. */ + wait_event_timeout(xs_state.reply_waitq, + !list_empty(&xs_state.reply_list), + msecs_to_jiffies(500)); + else { + /* + * If we are in the process of being shut-down there is + * no point of trying to contact XenBus - it is either + * killed (xenstored application) or the other domain + * has been killed or is unreachable. 
+ */ + return ERR_PTR(-EIO); + } + spin_lock(&xs_state.reply_lock); + } + + msg = list_entry(xs_state.reply_list.next, + struct xs_stored_msg, list); + list_del(&msg->list); + + spin_unlock(&xs_state.reply_lock); + + *type = msg->hdr.type; + if (len) + *len = msg->hdr.len; + body = msg->u.reply.body; + + kfree(msg); + + return body; +} + +static void transaction_start(void) +{ + mutex_lock(&xs_state.transaction_mutex); + atomic_inc(&xs_state.transaction_count); + mutex_unlock(&xs_state.transaction_mutex); +} + +static void transaction_end(void) +{ + if (atomic_dec_and_test(&xs_state.transaction_count)) + wake_up(&xs_state.transaction_wq); +} + +static void transaction_suspend(void) +{ + mutex_lock(&xs_state.transaction_mutex); + wait_event(xs_state.transaction_wq, + atomic_read(&xs_state.transaction_count) == 0); +} + +static void transaction_resume(void) +{ + mutex_unlock(&xs_state.transaction_mutex); +} + +void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) +{ + void *ret; + struct xsd_sockmsg req_msg = *msg; + int err; + + if (req_msg.type == XS_TRANSACTION_START) + transaction_start(); + + mutex_lock(&xs_state.request_mutex); + + err = xb_write(msg, sizeof(*msg) + msg->len); + if (err) { + msg->type = XS_ERROR; + ret = ERR_PTR(err); + } else + ret = read_reply(&msg->type, &msg->len); + + mutex_unlock(&xs_state.request_mutex); + + if (IS_ERR(ret)) + return ret; + + if ((msg->type == XS_TRANSACTION_END) || + ((req_msg.type == XS_TRANSACTION_START) && + (msg->type == XS_ERROR))) + transaction_end(); + + return ret; +} +EXPORT_SYMBOL(xenbus_dev_request_and_reply); + +/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. */ +static void *xs_talkv(struct xenbus_transaction t, + enum xsd_sockmsg_type type, + const struct kvec *iovec, + unsigned int num_vecs, + unsigned int *len) +{ + struct xsd_sockmsg msg; + void *ret = NULL; + unsigned int i; + int err; + + msg.tx_id = t.id; + msg.req_id = 0; + msg.type = type; + msg.len = 0; + for (i = 0; i < num_vecs; i++) + msg.len += iovec[i].iov_len; + + mutex_lock(&xs_state.request_mutex); + + err = xb_write(&msg, sizeof(msg)); + if (err) { + mutex_unlock(&xs_state.request_mutex); + return ERR_PTR(err); + } + + for (i = 0; i < num_vecs; i++) { + err = xb_write(iovec[i].iov_base, iovec[i].iov_len); + if (err) { + mutex_unlock(&xs_state.request_mutex); + return ERR_PTR(err); + } + } + + ret = read_reply(&msg.type, len); + + mutex_unlock(&xs_state.request_mutex); + + if (IS_ERR(ret)) + return ret; + + if (msg.type == XS_ERROR) { + err = get_error(ret); + kfree(ret); + return ERR_PTR(-err); + } + + if (msg.type != type) { + pr_warn_ratelimited("unexpected type [%d], expected [%d]\n", + msg.type, type); + kfree(ret); + return ERR_PTR(-EINVAL); + } + return ret; +} + +/* Simplified version of xs_talkv: single message. */ +static void *xs_single(struct xenbus_transaction t, + enum xsd_sockmsg_type type, + const char *string, + unsigned int *len) +{ + struct kvec iovec; + + iovec.iov_base = (void *)string; + iovec.iov_len = strlen(string) + 1; + return xs_talkv(t, type, &iovec, 1, len); +} + +/* Many commands only need an ack, don't care what it says. */ +static int xs_error(char *reply) +{ + if (IS_ERR(reply)) + return PTR_ERR(reply); + kfree(reply); + return 0; +} + +static unsigned int count_strings(const char *strings, unsigned int len) +{ + unsigned int num; + const char *p; + + for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1) + num++; + + return num; +} + +/* Return the path to dir with /name appended. 
Buffer must be kfree()'ed. */ +static char *join(const char *dir, const char *name) +{ + char *buffer; + + if (strlen(name) == 0) + buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s", dir); + else + buffer = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/%s", dir, name); + return (!buffer) ? ERR_PTR(-ENOMEM) : buffer; +} + +static char **split(char *strings, unsigned int len, unsigned int *num) +{ + char *p, **ret; + + /* Count the strings. */ + *num = count_strings(strings, len); + + /* Transfer to one big alloc for easy freeing. */ + ret = kmalloc(*num * sizeof(char *) + len, GFP_NOIO | __GFP_HIGH); + if (!ret) { + kfree(strings); + return ERR_PTR(-ENOMEM); + } + memcpy(&ret[*num], strings, len); + kfree(strings); + + strings = (char *)&ret[*num]; + for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1) + ret[(*num)++] = p; + + return ret; +} + +char **xenbus_directory(struct xenbus_transaction t, + const char *dir, const char *node, unsigned int *num) +{ + char *strings, *path; + unsigned int len; + + path = join(dir, node); + if (IS_ERR(path)) + return (char **)path; + + strings = xs_single(t, XS_DIRECTORY, path, &len); + kfree(path); + if (IS_ERR(strings)) + return (char **)strings; + + return split(strings, len, num); +} +EXPORT_SYMBOL_GPL(xenbus_directory); + +/* Check if a path exists. Return 1 if it does. */ +int xenbus_exists(struct xenbus_transaction t, + const char *dir, const char *node) +{ + char **d; + int dir_n; + + d = xenbus_directory(t, dir, node, &dir_n); + if (IS_ERR(d)) + return 0; + kfree(d); + return 1; +} +EXPORT_SYMBOL_GPL(xenbus_exists); + +/* Get the value of a single file. + * Returns a kmalloced value: call free() on it after use. + * len indicates length in bytes. + */ +void *xenbus_read(struct xenbus_transaction t, + const char *dir, const char *node, unsigned int *len) +{ + char *path; + void *ret; + + path = join(dir, node); + if (IS_ERR(path)) + return (void *)path; + + ret = xs_single(t, XS_READ, path, len); + kfree(path); + return ret; +} +EXPORT_SYMBOL_GPL(xenbus_read); + +/* Write the value of a single file. + * Returns -err on failure. + */ +int xenbus_write(struct xenbus_transaction t, + const char *dir, const char *node, const char *string) +{ + const char *path; + struct kvec iovec[2]; + int ret; + + path = join(dir, node); + if (IS_ERR(path)) + return PTR_ERR(path); + + iovec[0].iov_base = (void *)path; + iovec[0].iov_len = strlen(path) + 1; + iovec[1].iov_base = (void *)string; + iovec[1].iov_len = strlen(string); + + ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL)); + kfree(path); + return ret; +} +EXPORT_SYMBOL_GPL(xenbus_write); + +/* Create a new directory. */ +int xenbus_mkdir(struct xenbus_transaction t, + const char *dir, const char *node) +{ + char *path; + int ret; + + path = join(dir, node); + if (IS_ERR(path)) + return PTR_ERR(path); + + ret = xs_error(xs_single(t, XS_MKDIR, path, NULL)); + kfree(path); + return ret; +} +EXPORT_SYMBOL_GPL(xenbus_mkdir); + +/* Destroy a file or directory (directories must be empty). */ +int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node) +{ + char *path; + int ret; + + path = join(dir, node); + if (IS_ERR(path)) + return PTR_ERR(path); + + ret = xs_error(xs_single(t, XS_RM, path, NULL)); + kfree(path); + return ret; +} +EXPORT_SYMBOL_GPL(xenbus_rm); + +/* Start a transaction: changes by others will not be seen during this + * transaction, and changes will not be visible to others until end. 
+ */ +int xenbus_transaction_start(struct xenbus_transaction *t) +{ + char *id_str; + + transaction_start(); + + id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); + if (IS_ERR(id_str)) { + transaction_end(); + return PTR_ERR(id_str); + } + + t->id = simple_strtoul(id_str, NULL, 0); + kfree(id_str); + return 0; +} +EXPORT_SYMBOL_GPL(xenbus_transaction_start); + +/* End a transaction. + * If abandon is true, transaction is discarded instead of committed. + */ +int xenbus_transaction_end(struct xenbus_transaction t, int abort) +{ + char abortstr[2]; + int err; + + if (abort) + strcpy(abortstr, "F"); + else + strcpy(abortstr, "T"); + + err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); + + transaction_end(); + + return err; +} +EXPORT_SYMBOL_GPL(xenbus_transaction_end); + +/* Single read and scanf: returns -errno or num scanned. */ +int xenbus_scanf(struct xenbus_transaction t, + const char *dir, const char *node, const char *fmt, ...) +{ + va_list ap; + int ret; + char *val; + + val = xenbus_read(t, dir, node, NULL); + if (IS_ERR(val)) + return PTR_ERR(val); + + va_start(ap, fmt); + ret = vsscanf(val, fmt, ap); + va_end(ap); + kfree(val); + /* Distinctive errno. */ + if (ret == 0) + return -ERANGE; + return ret; +} +EXPORT_SYMBOL_GPL(xenbus_scanf); + +/* Single printf and write: returns -errno or 0. */ +int xenbus_printf(struct xenbus_transaction t, + const char *dir, const char *node, const char *fmt, ...) +{ + va_list ap; + int ret; + char *buf; + + va_start(ap, fmt); + buf = kvasprintf(GFP_NOIO | __GFP_HIGH, fmt, ap); + va_end(ap); + + if (!buf) + return -ENOMEM; + + ret = xenbus_write(t, dir, node, buf); + + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(xenbus_printf); + +/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */ +int xenbus_gather(struct xenbus_transaction t, const char *dir, ...) +{ + va_list ap; + const char *name; + int ret = 0; + + va_start(ap, dir); + while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { + const char *fmt = va_arg(ap, char *); + void *result = va_arg(ap, void *); + char *p; + + p = xenbus_read(t, dir, name, NULL); + if (IS_ERR(p)) { + ret = PTR_ERR(p); + break; + } + if (fmt) { + if (sscanf(p, fmt, result) == 0) + ret = -EINVAL; + kfree(p); + } else + *(char **)result = p; + } + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(xenbus_gather); + +static int xs_watch(const char *path, const char *token) +{ + struct kvec iov[2]; + + iov[0].iov_base = (void *)path; + iov[0].iov_len = strlen(path) + 1; + iov[1].iov_base = (void *)token; + iov[1].iov_len = strlen(token) + 1; + + return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov, + ARRAY_SIZE(iov), NULL)); +} + +static int xs_unwatch(const char *path, const char *token) +{ + struct kvec iov[2]; + + iov[0].iov_base = (char *)path; + iov[0].iov_len = strlen(path) + 1; + iov[1].iov_base = (char *)token; + iov[1].iov_len = strlen(token) + 1; + + return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov, + ARRAY_SIZE(iov), NULL)); +} + +static struct xenbus_watch *find_watch(const char *token) +{ + struct xenbus_watch *i, *cmp; + + cmp = (void *)simple_strtoul(token, NULL, 16); + + list_for_each_entry(i, &watches, list) + if (i == cmp) + return i; + + return NULL; +} +/* + * Certain older XenBus toolstack cannot handle reading values that are + * not populated. Some Xen 3.4 installation are incapable of doing this + * so if we are running on anything older than 4 do not attempt to read + * control/platform-feature-xs_reset_watches. 
+ */ +static bool xen_strict_xenbus_quirk(void) +{ +#ifdef CONFIG_X86 + uint32_t eax, ebx, ecx, edx, base; + + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + if ((eax >> 16) < 4) + return true; +#endif + return false; + +} +static void xs_reset_watches(void) +{ + int err, supported = 0; + + if (!xen_hvm_domain() || xen_initial_domain()) + return; + + if (xen_strict_xenbus_quirk()) + return; + + err = xenbus_scanf(XBT_NIL, "control", + "platform-feature-xs_reset_watches", "%d", &supported); + if (err != 1 || !supported) + return; + + err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); + if (err && err != -EEXIST) + pr_warn("xs_reset_watches failed: %d\n", err); +} + +/* Register callback to watch this node. */ +int register_xenbus_watch(struct xenbus_watch *watch) +{ + /* Pointer in ascii is the token. */ + char token[sizeof(watch) * 2 + 1]; + int err; + + sprintf(token, "%lX", (long)watch); + + down_read(&xs_state.watch_mutex); + + spin_lock(&watches_lock); + BUG_ON(find_watch(token)); + list_add(&watch->list, &watches); + spin_unlock(&watches_lock); + + err = xs_watch(watch->node, token); + + if (err) { + spin_lock(&watches_lock); + list_del(&watch->list); + spin_unlock(&watches_lock); + } + + up_read(&xs_state.watch_mutex); + + return err; +} +EXPORT_SYMBOL_GPL(register_xenbus_watch); + +void unregister_xenbus_watch(struct xenbus_watch *watch) +{ + struct xs_stored_msg *msg, *tmp; + char token[sizeof(watch) * 2 + 1]; + int err; + + sprintf(token, "%lX", (long)watch); + + down_read(&xs_state.watch_mutex); + + spin_lock(&watches_lock); + BUG_ON(!find_watch(token)); + list_del(&watch->list); + spin_unlock(&watches_lock); + + err = xs_unwatch(watch->node, token); + if (err) + pr_warn("Failed to release watch %s: %i\n", watch->node, err); + + up_read(&xs_state.watch_mutex); + + /* Make sure there are no callbacks running currently (unless + its us) */ + if (current->pid != xenwatch_pid) + mutex_lock(&xenwatch_mutex); + + /* Cancel pending watch events. */ + spin_lock(&watch_events_lock); + list_for_each_entry_safe(msg, tmp, &watch_events, list) { + if (msg->u.watch.handle != watch) + continue; + list_del(&msg->list); + kfree(msg->u.watch.vec); + kfree(msg); + } + spin_unlock(&watch_events_lock); + + if (current->pid != xenwatch_pid) + mutex_unlock(&xenwatch_mutex); +} +EXPORT_SYMBOL_GPL(unregister_xenbus_watch); + +void xs_suspend(void) +{ + transaction_suspend(); + down_write(&xs_state.watch_mutex); + mutex_lock(&xs_state.request_mutex); + mutex_lock(&xs_state.response_mutex); +} + +void xs_resume(void) +{ + struct xenbus_watch *watch; + char token[sizeof(watch) * 2 + 1]; + + xb_init_comms(); + + mutex_unlock(&xs_state.response_mutex); + mutex_unlock(&xs_state.request_mutex); + transaction_resume(); + + /* No need for watches_lock: the watch_mutex is sufficient. 
*/ + list_for_each_entry(watch, &watches, list) { + sprintf(token, "%lX", (long)watch); + xs_watch(watch->node, token); + } + + up_write(&xs_state.watch_mutex); +} + +void xs_suspend_cancel(void) +{ + mutex_unlock(&xs_state.response_mutex); + mutex_unlock(&xs_state.request_mutex); + up_write(&xs_state.watch_mutex); + mutex_unlock(&xs_state.transaction_mutex); +} + +static int xenwatch_thread(void *unused) +{ + struct list_head *ent; + struct xs_stored_msg *msg; + + for (;;) { + wait_event_interruptible(watch_events_waitq, + !list_empty(&watch_events)); + + if (kthread_should_stop()) + break; + + mutex_lock(&xenwatch_mutex); + + spin_lock(&watch_events_lock); + ent = watch_events.next; + if (ent != &watch_events) + list_del(ent); + spin_unlock(&watch_events_lock); + + if (ent != &watch_events) { + msg = list_entry(ent, struct xs_stored_msg, list); + msg->u.watch.handle->callback( + msg->u.watch.handle, + (const char **)msg->u.watch.vec, + msg->u.watch.vec_size); + kfree(msg->u.watch.vec); + kfree(msg); + } + + mutex_unlock(&xenwatch_mutex); + } + + return 0; +} + +static int process_msg(void) +{ + struct xs_stored_msg *msg; + char *body; + int err; + + /* + * We must disallow save/restore while reading a xenstore message. + * A partial read across s/r leaves us out of sync with xenstored. + */ + for (;;) { + err = xb_wait_for_data_to_read(); + if (err) + return err; + mutex_lock(&xs_state.response_mutex); + if (xb_data_to_read()) + break; + /* We raced with save/restore: pending data 'disappeared'. */ + mutex_unlock(&xs_state.response_mutex); + } + + + msg = kmalloc(sizeof(*msg), GFP_NOIO | __GFP_HIGH); + if (msg == NULL) { + err = -ENOMEM; + goto out; + } + + err = xb_read(&msg->hdr, sizeof(msg->hdr)); + if (err) { + kfree(msg); + goto out; + } + + if (msg->hdr.len > XENSTORE_PAYLOAD_MAX) { + kfree(msg); + err = -EINVAL; + goto out; + } + + body = kmalloc(msg->hdr.len + 1, GFP_NOIO | __GFP_HIGH); + if (body == NULL) { + kfree(msg); + err = -ENOMEM; + goto out; + } + + err = xb_read(body, msg->hdr.len); + if (err) { + kfree(body); + kfree(msg); + goto out; + } + body[msg->hdr.len] = '\0'; + + if (msg->hdr.type == XS_WATCH_EVENT) { + msg->u.watch.vec = split(body, msg->hdr.len, + &msg->u.watch.vec_size); + if (IS_ERR(msg->u.watch.vec)) { + err = PTR_ERR(msg->u.watch.vec); + kfree(msg); + goto out; + } + + spin_lock(&watches_lock); + msg->u.watch.handle = find_watch( + msg->u.watch.vec[XS_WATCH_TOKEN]); + if (msg->u.watch.handle != NULL) { + spin_lock(&watch_events_lock); + list_add_tail(&msg->list, &watch_events); + wake_up(&watch_events_waitq); + spin_unlock(&watch_events_lock); + } else { + kfree(msg->u.watch.vec); + kfree(msg); + } + spin_unlock(&watches_lock); + } else { + msg->u.reply.body = body; + spin_lock(&xs_state.reply_lock); + list_add_tail(&msg->list, &xs_state.reply_list); + spin_unlock(&xs_state.reply_lock); + wake_up(&xs_state.reply_waitq); + } + + out: + mutex_unlock(&xs_state.response_mutex); + return err; +} + +static int xenbus_thread(void *unused) +{ + int err; + + for (;;) { + err = process_msg(); + if (err) + pr_warn("error %d while reading message\n", err); + if (kthread_should_stop()) + break; + } + + return 0; +} + +int xs_init(void) +{ + int err; + struct task_struct *task; + + INIT_LIST_HEAD(&xs_state.reply_list); + spin_lock_init(&xs_state.reply_lock); + init_waitqueue_head(&xs_state.reply_waitq); + + mutex_init(&xs_state.request_mutex); + mutex_init(&xs_state.response_mutex); + mutex_init(&xs_state.transaction_mutex); + init_rwsem(&xs_state.watch_mutex); + 
atomic_set(&xs_state.transaction_count, 0); + init_waitqueue_head(&xs_state.transaction_wq); + + /* Initialize the shared memory rings to talk to xenstored */ + err = xb_init_comms(); + if (err) + return err; + + task = kthread_run(xenwatch_thread, NULL, "xenwatch"); + if (IS_ERR(task)) + return PTR_ERR(task); + xenwatch_pid = task->pid; + + task = kthread_run(xenbus_thread, NULL, "xenbus"); + if (IS_ERR(task)) + return PTR_ERR(task); + + /* shutdown watches for kexec boot */ + xs_reset_watches(); + + return 0; +} diff --git a/drivers/xen/xenfs/Makefile b/drivers/xen/xenfs/Makefile new file mode 100644 index 000000000..b019865fc --- /dev/null +++ b/drivers/xen/xenfs/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_XENFS) += xenfs.o + +xenfs-y = super.o +xenfs-$(CONFIG_XEN_DOM0) += xenstored.o diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c new file mode 100644 index 000000000..06092e0fe --- /dev/null +++ b/drivers/xen/xenfs/super.c @@ -0,0 +1,99 @@ +/* + * xenfs.c - a filesystem for passing info between the a domain and + * the hypervisor. + * + * 2008-10-07 Alex Zeffertt Replaced /proc/xen/xenbus with xenfs filesystem + * and /proc/xen compatibility mount point. + * Turned xenfs into a loadable module. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/magic.h> + +#include <xen/xen.h> + +#include "xenfs.h" +#include "../privcmd.h" +#include "../xenbus/xenbus_comms.h" + +#include <asm/xen/hypervisor.h> + +MODULE_DESCRIPTION("Xen filesystem"); +MODULE_LICENSE("GPL"); + +static ssize_t capabilities_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + char *tmp = ""; + + if (xen_initial_domain()) + tmp = "control_d\n"; + + return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp)); +} + +static const struct file_operations capabilities_file_ops = { + .read = capabilities_read, + .llseek = default_llseek, +}; + +static int xenfs_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr xenfs_files[] = { + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, + {""}, + }; + + static struct tree_descr xenfs_init_files[] = { + [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, + { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, + { "xsd_kva", &xsd_kva_file_ops, S_IRUSR|S_IWUSR}, + { "xsd_port", &xsd_port_file_ops, S_IRUSR|S_IWUSR}, + {""}, + }; + + return simple_fill_super(sb, XENFS_SUPER_MAGIC, + xen_initial_domain() ? 
xenfs_init_files : xenfs_files); +} + +static struct dentry *xenfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_single(fs_type, flags, data, xenfs_fill_super); +} + +static struct file_system_type xenfs_type = { + .owner = THIS_MODULE, + .name = "xenfs", + .mount = xenfs_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("xenfs"); + +static int __init xenfs_init(void) +{ + if (xen_domain()) + return register_filesystem(&xenfs_type); + + pr_info("not registering filesystem on non-xen platform\n"); + return 0; +} + +static void __exit xenfs_exit(void) +{ + if (xen_domain()) + unregister_filesystem(&xenfs_type); +} + +module_init(xenfs_init); +module_exit(xenfs_exit); + diff --git a/drivers/xen/xenfs/xenfs.h b/drivers/xen/xenfs/xenfs.h new file mode 100644 index 000000000..6b80c7779 --- /dev/null +++ b/drivers/xen/xenfs/xenfs.h @@ -0,0 +1,7 @@ +#ifndef _XENFS_XENBUS_H +#define _XENFS_XENBUS_H + +extern const struct file_operations xsd_kva_file_ops; +extern const struct file_operations xsd_port_file_ops; + +#endif /* _XENFS_XENBUS_H */ diff --git a/drivers/xen/xenfs/xenstored.c b/drivers/xen/xenfs/xenstored.c new file mode 100644 index 000000000..fef20dbc6 --- /dev/null +++ b/drivers/xen/xenfs/xenstored.c @@ -0,0 +1,68 @@ +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/fs.h> + +#include <xen/page.h> + +#include "xenfs.h" +#include "../xenbus/xenbus_comms.h" + +static ssize_t xsd_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + const char *str = (const char *)file->private_data; + return simple_read_from_buffer(buf, size, off, str, strlen(str)); +} + +static int xsd_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static int xsd_kva_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "0x%p", + xen_store_interface); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size = vma->vm_end - vma->vm_start; + + if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) + return -EINVAL; + + if (remap_pfn_range(vma, vma->vm_start, + virt_to_pfn(xen_store_interface), + size, vma->vm_page_prot)) + return -EAGAIN; + + return 0; +} + +const struct file_operations xsd_kva_file_ops = { + .open = xsd_kva_open, + .mmap = xsd_kva_mmap, + .read = xsd_read, + .release = xsd_release, +}; + +static int xsd_port_open(struct inode *inode, struct file *file) +{ + file->private_data = (void *)kasprintf(GFP_KERNEL, "%d", + xen_store_evtchn); + if (!file->private_data) + return -ENOMEM; + return 0; +} + +const struct file_operations xsd_port_file_ops = { + .open = xsd_port_open, + .read = xsd_read, + .release = xsd_release, +}; diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c new file mode 100644 index 000000000..58a5389ae --- /dev/null +++ b/drivers/xen/xlate_mmu.c @@ -0,0 +1,143 @@ +/* + * MMU operations common to all auto-translated physmap guests. + * + * Copyright (C) 2015 Citrix Systems R&D Ltd. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include <linux/kernel.h> +#include <linux/mm.h> + +#include <asm/xen/hypercall.h> +#include <asm/xen/hypervisor.h> + +#include <xen/xen.h> +#include <xen/page.h> +#include <xen/interface/xen.h> +#include <xen/interface/memory.h> + +/* map fgmfn of domid to lpfn in the current domain */ +static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn, + unsigned int domid) +{ + int rc; + struct xen_add_to_physmap_range xatp = { + .domid = DOMID_SELF, + .foreign_domid = domid, + .size = 1, + .space = XENMAPSPACE_gmfn_foreign, + }; + xen_ulong_t idx = fgmfn; + xen_pfn_t gpfn = lpfn; + int err = 0; + + set_xen_guest_handle(xatp.idxs, &idx); + set_xen_guest_handle(xatp.gpfns, &gpfn); + set_xen_guest_handle(xatp.errs, &err); + + rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp); + return rc < 0 ? 
rc : err;
+}
+
+struct remap_data {
+ xen_pfn_t *fgmfn; /* foreign domain's gmfn */
+ pgprot_t prot;
+ domid_t domid;
+ struct vm_area_struct *vma;
+ int index;
+ struct page **pages;
+ struct xen_remap_mfn_info *info;
+ int *err_ptr;
+ int mapped;
+};
+
+static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+ void *data)
+{
+ struct remap_data *info = data;
+ struct page *page = info->pages[info->index++];
+ unsigned long pfn = page_to_pfn(page);
+ pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
+ int rc;
+
+ rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
+ *info->err_ptr++ = rc;
+ if (!rc) {
+ set_pte_at(info->vma->vm_mm, addr, ptep, pte);
+ info->mapped++;
+ }
+ info->fgmfn++;
+
+ return 0;
+}
+
+int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
+ unsigned long addr,
+ xen_pfn_t *mfn, int nr,
+ int *err_ptr, pgprot_t prot,
+ unsigned domid,
+ struct page **pages)
+{
+ int err;
+ struct remap_data data;
+ unsigned long range = nr << PAGE_SHIFT;
+
+ /* Kept here for the purpose of making sure code doesn't break
+ x86 PVOPS */
+ BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
+
+ data.fgmfn = mfn;
+ data.prot = prot;
+ data.domid = domid;
+ data.vma = vma;
+ data.pages = pages;
+ data.index = 0;
+ data.err_ptr = err_ptr;
+ data.mapped = 0;
+
+ err = apply_to_page_range(vma->vm_mm, addr, range,
+ remap_pte_fn, &data);
+ return err < 0 ? err : data.mapped;
+}
+EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
+
+int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
+ int nr, struct page **pages)
+{
+ int i;
+
+ for (i = 0; i < nr; i++) {
+ struct xen_remove_from_physmap xrp;
+ unsigned long pfn;
+
+ pfn = page_to_pfn(pages[i]);
+
+ xrp.domid = DOMID_SELF;
+ xrp.gpfn = pfn;
+ (void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range);
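
Editor's note: the xenbus_xs.c helpers introduced in this diff (xenbus_transaction_start/end, xenbus_printf, xenbus_scanf, xenbus_read) are the kernel-side equivalent of the userspace xs library. The sketch below is illustrative only and not part of the commit above: it shows the usual pattern by which a frontend publishes its ring details inside a xenstore transaction using only those helpers. The node names "ring-ref" and "event-channel", the example_* identifiers, and the retry on -EAGAIN (xenstored's usual report of a transaction conflict) are assumptions made for this example.

#include <xen/xenbus.h>

/* Hedged example: publish ring details under the device's own node. */
static int example_publish_ring(struct xenbus_device *dev,
                                unsigned int ring_ref, unsigned int evtchn)
{
        struct xenbus_transaction xbt;
        int err;

again:
        err = xenbus_transaction_start(&xbt);
        if (err)
                return err;

        /* "ring-ref" / "event-channel" are conventional names, assumed here. */
        err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u", ring_ref);
        if (err)
                goto abort;

        err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", evtchn);
        if (err)
                goto abort;

        err = xenbus_transaction_end(xbt, 0);
        if (err == -EAGAIN)
                goto again;     /* someone else touched the store: retry */
        return err;

abort:
        xenbus_transaction_end(xbt, 1); /* abort: discard the partial writes */
        return err;
}

Both writes become visible atomically when the transaction commits, which is why xenbus_reset_frontend() above can rely on reading a consistent backend state while it walks the handshake.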
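
A second illustrative sketch, again not part of the diff: the watch plumbing in xenbus_xs.c (register_xenbus_watch(), xenwatch_thread() and the watch_events list) delivers each firing to the callback as a string vector whose XS_WATCH_PATH entry is the path that changed and whose XS_WATCH_TOKEN entry is the watch pointer in ASCII, exactly as built by register_xenbus_watch() above. The watched node "device/vif" and the example_* names are assumptions; a real caller would also unregister the watch on teardown with unregister_xenbus_watch().

#include <linux/kernel.h>
#include <linux/init.h>
#include <xen/xenbus.h>

/* Runs from the xenwatch thread whenever anything under the node changes. */
static void example_changed(struct xenbus_watch *watch,
                            const char **vec, unsigned int len)
{
        pr_info("xenstore path %s changed\n", vec[XS_WATCH_PATH]);
}

static struct xenbus_watch example_watch = {
        .node = "device/vif",           /* assumed path, for illustration */
        .callback = example_changed,
};

static int __init example_watch_init(void)
{
        /* Registers the token with xenstored and adds us to the watches list. */
        return register_xenbus_watch(&example_watch);
}

This is the same mechanism the frontend bus uses internally: fe_watch on "device" feeds xenbus_dev_changed(), and the per-device be_watch in xenbus_reset_frontend() feeds the backend-state wait queue.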