Diffstat (limited to 'arch/s390')
-rw-r--r--arch/s390/Kconfig31
-rw-r--r--arch/s390/appldata/appldata_mem.c2
-rw-r--r--arch/s390/boot/compressed/Makefile8
-rw-r--r--arch/s390/boot/compressed/head.S11
-rw-r--r--arch/s390/configs/default_defconfig2
-rw-r--r--arch/s390/configs/gcov_defconfig2
-rw-r--r--arch/s390/configs/performance_defconfig2
-rw-r--r--arch/s390/crypto/Makefile3
-rw-r--r--arch/s390/crypto/aes_s390.c113
-rw-r--r--arch/s390/crypto/crc32-vx.c310
-rw-r--r--arch/s390/crypto/crc32be-vx.S207
-rw-r--r--arch/s390/crypto/crc32le-vx.S268
-rw-r--r--arch/s390/defconfig5
-rw-r--r--arch/s390/hypfs/hypfs_diag.c375
-rw-r--r--arch/s390/hypfs/hypfs_vm.c2
-rw-r--r--arch/s390/include/asm/atomic.h40
-rw-r--r--arch/s390/include/asm/cache.h5
-rw-r--r--arch/s390/include/asm/cio.h2
-rw-r--r--arch/s390/include/asm/cpacf.h10
-rw-r--r--arch/s390/include/asm/cpu_mf.h17
-rw-r--r--arch/s390/include/asm/diag.h151
-rw-r--r--arch/s390/include/asm/dma-mapping.h1
-rw-r--r--arch/s390/include/asm/elf.h1
-rw-r--r--arch/s390/include/asm/fcx.h2
-rw-r--r--arch/s390/include/asm/fpu/api.h75
-rw-r--r--arch/s390/include/asm/fpu/types.h10
-rw-r--r--arch/s390/include/asm/gmap.h82
-rw-r--r--arch/s390/include/asm/hugetlb.h5
-rw-r--r--arch/s390/include/asm/ipl.h10
-rw-r--r--arch/s390/include/asm/irq.h7
-rw-r--r--arch/s390/include/asm/jump_label.h1
-rw-r--r--arch/s390/include/asm/kprobes.h4
-rw-r--r--arch/s390/include/asm/kvm_host.h33
-rw-r--r--arch/s390/include/asm/mmu.h13
-rw-r--r--arch/s390/include/asm/mmu_context.h18
-rw-r--r--arch/s390/include/asm/page.h17
-rw-r--r--arch/s390/include/asm/perf_event.h12
-rw-r--r--arch/s390/include/asm/pgalloc.h2
-rw-r--r--arch/s390/include/asm/pgtable.h309
-rw-r--r--arch/s390/include/asm/processor.h19
-rw-r--r--arch/s390/include/asm/rwsem.h37
-rw-r--r--arch/s390/include/asm/sclp.h23
-rw-r--r--arch/s390/include/asm/sections.h1
-rw-r--r--arch/s390/include/asm/setup.h4
-rw-r--r--arch/s390/include/asm/sigp.h17
-rw-r--r--arch/s390/include/asm/spinlock.h3
-rw-r--r--arch/s390/include/asm/stp.h51
-rw-r--r--arch/s390/include/asm/timex.h66
-rw-r--r--arch/s390/include/asm/tlb.h22
-rw-r--r--arch/s390/include/asm/tlbflush.h28
-rw-r--r--arch/s390/include/asm/topology.h4
-rw-r--r--arch/s390/include/asm/uaccess.h84
-rw-r--r--arch/s390/include/uapi/asm/auxvec.h2
-rw-r--r--arch/s390/include/uapi/asm/kvm.h41
-rw-r--r--arch/s390/include/uapi/asm/ptrace.h6
-rw-r--r--arch/s390/include/uapi/asm/sie.h1
-rw-r--r--arch/s390/kernel/Makefile16
-rw-r--r--arch/s390/kernel/als.c124
-rw-r--r--arch/s390/kernel/cache.c7
-rw-r--r--arch/s390/kernel/diag.c39
-rw-r--r--arch/s390/kernel/dis.c1
-rw-r--r--arch/s390/kernel/dumpstack.c12
-rw-r--r--arch/s390/kernel/early.c22
-rw-r--r--arch/s390/kernel/entry.S11
-rw-r--r--arch/s390/kernel/entry.h2
-rw-r--r--arch/s390/kernel/fpu.c249
-rw-r--r--arch/s390/kernel/head.S47
-rw-r--r--arch/s390/kernel/ipl.c25
-rw-r--r--arch/s390/kernel/irq.c7
-rw-r--r--arch/s390/kernel/kprobes.c12
-rw-r--r--arch/s390/kernel/machine_kexec.c55
-rw-r--r--arch/s390/kernel/nmi.c13
-rw-r--r--arch/s390/kernel/perf_cpum_cf.c46
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c59
-rw-r--r--arch/s390/kernel/perf_event.c30
-rw-r--r--arch/s390/kernel/processor.c114
-rw-r--r--arch/s390/kernel/ptrace.c30
-rw-r--r--arch/s390/kernel/sclp.c5
-rw-r--r--arch/s390/kernel/setup.c38
-rw-r--r--arch/s390/kernel/smp.c18
-rw-r--r--arch/s390/kernel/sysinfo.c63
-rw-r--r--arch/s390/kernel/time.c1053
-rw-r--r--arch/s390/kernel/topology.c89
-rw-r--r--arch/s390/kernel/vdso32/Makefile2
-rw-r--r--arch/s390/kernel/vdso64/Makefile2
-rw-r--r--arch/s390/kernel/vmlinux.lds.S21
-rw-r--r--arch/s390/kvm/Makefile2
-rw-r--r--arch/s390/kvm/diag.c5
-rw-r--r--arch/s390/kvm/gaccess.c387
-rw-r--r--arch/s390/kvm/gaccess.h3
-rw-r--r--arch/s390/kvm/guestdbg.c19
-rw-r--r--arch/s390/kvm/intercept.c33
-rw-r--r--arch/s390/kvm/interrupt.c34
-rw-r--r--arch/s390/kvm/kvm-s390.c411
-rw-r--r--arch/s390/kvm/kvm-s390.h22
-rw-r--r--arch/s390/kvm/priv.c226
-rw-r--r--arch/s390/kvm/sigp.c10
-rw-r--r--arch/s390/kvm/sthyi.c471
-rw-r--r--arch/s390/kvm/trace.h49
-rw-r--r--arch/s390/kvm/vsie.c1091
-rw-r--r--arch/s390/lib/string.c42
-rw-r--r--arch/s390/lib/uaccess.c8
-rw-r--r--arch/s390/mm/dump_pagetables.c2
-rw-r--r--arch/s390/mm/fault.c6
-rw-r--r--arch/s390/mm/gmap.c1577
-rw-r--r--arch/s390/mm/gup.c45
-rw-r--r--arch/s390/mm/hugetlbpage.c153
-rw-r--r--arch/s390/mm/init.c13
-rw-r--r--arch/s390/mm/page-states.c13
-rw-r--r--arch/s390/mm/pageattr.c269
-rw-r--r--arch/s390/mm/pgalloc.c39
-rw-r--r--arch/s390/mm/pgtable.c302
-rw-r--r--arch/s390/mm/vmem.c73
-rw-r--r--arch/s390/numa/mode_emu.c29
-rw-r--r--arch/s390/numa/numa.c8
-rw-r--r--arch/s390/oprofile/Makefile1
-rw-r--r--arch/s390/oprofile/init.c489
-rw-r--r--arch/s390/pci/pci_dma.c27
-rw-r--r--arch/s390/pci/pci_event.c3
-rw-r--r--arch/s390/pci/pci_insn.c12
-rw-r--r--arch/s390/tools/gen_facilities.c1
121 files changed, 7754 insertions, 2910 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index a8c259059..c109f073d 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -68,10 +68,10 @@ config DEBUG_RODATA
config S390
def_bool y
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
- select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ELF_RANDOMIZE
select ARCH_HAS_GCOV_PROFILE_ALL
+ select ARCH_HAS_KCOV
select ARCH_HAS_SG_CHAIN
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_INLINE_READ_LOCK
@@ -122,6 +122,7 @@ config S390
select HAVE_ALIGNED_STRUCT_PAGE if SLUB
select HAVE_ARCH_AUDITSYSCALL
select HAVE_ARCH_EARLY_PFN_TO_NID
+ select HAVE_ARCH_HARDENED_USERCOPY
select HAVE_ARCH_JUMP_LABEL
select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
select HAVE_ARCH_SECCOMP_FILTER
@@ -163,6 +164,7 @@ config S390
select NO_BOOTMEM
select OLD_SIGACTION
select OLD_SIGSUSPEND3
+ select SPARSE_IRQ
select SYSCTL_EXCEPTION_TRACE
select TTY
select VIRT_CPU_ACCOUNTING
@@ -477,6 +479,9 @@ config SCHED_MC
config SCHED_BOOK
def_bool n
+config SCHED_DRAWER
+ def_bool n
+
config SCHED_TOPOLOGY
def_bool y
prompt "Topology scheduler support"
@@ -484,6 +489,7 @@ config SCHED_TOPOLOGY
select SCHED_SMT
select SCHED_MC
select SCHED_BOOK
+ select SCHED_DRAWER
help
Topology scheduler support improves the CPU scheduler's decision
making when dealing with machines that have multi-threading,
@@ -605,16 +611,6 @@ config PCI_NR_FUNCTIONS
This allows you to specify the maximum number of PCI functions which
this kernel will support.
-config PCI_NR_MSI
- int "Maximum number of MSI interrupts (64-32768)"
- range 64 32768
- default "256"
- help
- This defines the number of virtual interrupts the kernel will
- provide for MSI interrupts. If you configure your system to have
- too few drivers will fail to allocate MSI interrupts for all
- PCI devices.
-
source "drivers/pci/Kconfig"
endif # PCI
@@ -875,4 +871,17 @@ config S390_GUEST
Select this option if you want to run the kernel as a guest under
the KVM hypervisor.
+config S390_GUEST_OLD_TRANSPORT
+ def_bool y
+ prompt "Guest support for old s390 virtio transport (DEPRECATED)"
+ depends on S390_GUEST
+ help
+ Enable this option to add support for the old s390-virtio
+ transport (i.e. virtio devices NOT based on virtio-ccw). This
+ type of virtio devices is only available on the experimental
+ kuli userspace or with old (< 2.6) qemu. If you are running
+ with a modern version of qemu (which supports virtio-ccw since
+ 1.4 and uses it by default since version 2.4), you probably won't
+ need this.
+
endmenu
diff --git a/arch/s390/appldata/appldata_mem.c b/arch/s390/appldata/appldata_mem.c
index edcf2a706..598df5708 100644
--- a/arch/s390/appldata/appldata_mem.c
+++ b/arch/s390/appldata/appldata_mem.c
@@ -102,7 +102,7 @@ static void appldata_get_mem_data(void *data)
mem_data->totalhigh = P2K(val.totalhigh);
mem_data->freehigh = P2K(val.freehigh);
mem_data->bufferram = P2K(val.bufferram);
- mem_data->cached = P2K(global_page_state(NR_FILE_PAGES)
+ mem_data->cached = P2K(global_node_page_state(NR_FILE_PAGES)
- val.bufferram);
si_swapinfo(&val);
diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile
index 1dd210347..33ba697c7 100644
--- a/arch/s390/boot/compressed/Makefile
+++ b/arch/s390/boot/compressed/Makefile
@@ -4,6 +4,8 @@
# create a compressed vmlinux image from the original vmlinux
#
+KCOV_INSTRUMENT := n
+
targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2
targets += vmlinux.bin.xz vmlinux.bin.lzma vmlinux.bin.lzo vmlinux.bin.lz4
targets += misc.o piggy.o sizes.h head.o
@@ -16,7 +18,7 @@ KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
GCOV_PROFILE := n
-OBJECTS := $(addprefix $(objtree)/arch/s390/kernel/, head.o sclp.o ebcdic.o)
+OBJECTS := $(addprefix $(objtree)/arch/s390/kernel/, head.o sclp.o ebcdic.o als.o)
OBJECTS += $(obj)/head.o $(obj)/misc.o $(obj)/piggy.o
LDFLAGS_vmlinux := --oformat $(LD_BFD) -e startup -T
@@ -31,10 +33,10 @@ quiet_cmd_sizes = GEN $@
$(obj)/sizes.h: vmlinux
$(call if_changed,sizes)
-AFLAGS_head.o += -I$(obj)
+AFLAGS_head.o += -I$(objtree)/$(obj)
$(obj)/head.o: $(obj)/sizes.h
-CFLAGS_misc.o += -I$(obj)
+CFLAGS_misc.o += -I$(objtree)/$(obj)
$(obj)/misc.o: $(obj)/sizes.h
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
diff --git a/arch/s390/boot/compressed/head.S b/arch/s390/boot/compressed/head.S
index f86a4eef2..28c4f96a2 100644
--- a/arch/s390/boot/compressed/head.S
+++ b/arch/s390/boot/compressed/head.S
@@ -21,16 +21,21 @@ ENTRY(startup_continue)
lg %r15,.Lstack-.LPG1(%r13)
aghi %r15,-160
brasl %r14,decompress_kernel
- # setup registers for memory mover & branch to target
+ # Set up registers for memory mover. We move the decompressed image to
+ # 0x11000, starting at offset 0x11000 in the decompressed image so
+ # that code living at 0x11000 in the image will end up at 0x11000 in
+ # memory.
lgr %r4,%r2
lg %r2,.Loffset-.LPG1(%r13)
la %r4,0(%r2,%r4)
lg %r3,.Lmvsize-.LPG1(%r13)
lgr %r5,%r3
- # move the memory mover someplace safe
+ # Move the memory mover someplace safe so it doesn't overwrite itself.
la %r1,0x200
mvc 0(mover_end-mover,%r1),mover-.LPG1(%r13)
- # decompress image is started at 0x11000
+ # When the memory mover is done we pass control to
+ # arch/s390/kernel/head64.S:startup_continue which lives at 0x11000 in
+ # the decompressed image.
lgr %r6,%r2
br %r1
mover:
diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig
index d5ec71b2e..412b1bd21 100644
--- a/arch/s390/configs/default_defconfig
+++ b/arch/s390/configs/default_defconfig
@@ -602,7 +602,6 @@ CONFIG_FAIL_FUTEX=y
CONFIG_FAULT_INJECTION_DEBUG_FS=y
CONFIG_FAULT_INJECTION_STACKTRACE_FILTER=y
CONFIG_LATENCYTOP=y
-CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y
CONFIG_IRQSOFF_TRACER=y
CONFIG_PREEMPT_TRACER=y
CONFIG_SCHED_TRACER=y
@@ -678,6 +677,7 @@ CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_DES_S390=m
CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_GHASH_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
CONFIG_ASYMMETRIC_KEY_TYPE=y
CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
CONFIG_X509_CERTIFICATE_PARSER=m
diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig
index f46a35115..bec279eb4 100644
--- a/arch/s390/configs/gcov_defconfig
+++ b/arch/s390/configs/gcov_defconfig
@@ -552,7 +552,6 @@ CONFIG_NOTIFIER_ERROR_INJECTION=m
CONFIG_CPU_NOTIFIER_ERROR_INJECT=m
CONFIG_PM_NOTIFIER_ERROR_INJECT=m
CONFIG_LATENCYTOP=y
-CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y
CONFIG_BLK_DEV_IO_TRACE=y
# CONFIG_KPROBE_EVENT is not set
CONFIG_TRACE_ENUM_MAP_FILE=y
@@ -616,6 +615,7 @@ CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_DES_S390=m
CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_GHASH_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
CONFIG_ASYMMETRIC_KEY_TYPE=y
CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
CONFIG_X509_CERTIFICATE_PARSER=m
diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig
index ba0f2a58b..1751446a5 100644
--- a/arch/s390/configs/performance_defconfig
+++ b/arch/s390/configs/performance_defconfig
@@ -549,7 +549,6 @@ CONFIG_TIMER_STATS=y
CONFIG_RCU_TORTURE_TEST=m
CONFIG_RCU_CPU_STALL_TIMEOUT=60
CONFIG_LATENCYTOP=y
-CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y
CONFIG_SCHED_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_STACK_TRACER=y
@@ -615,6 +614,7 @@ CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_DES_S390=m
CONFIG_CRYPTO_AES_S390=m
CONFIG_CRYPTO_GHASH_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
CONFIG_ASYMMETRIC_KEY_TYPE=y
CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=m
CONFIG_X509_CERTIFICATE_PARSER=m
diff --git a/arch/s390/crypto/Makefile b/arch/s390/crypto/Makefile
index 7f0b7cda6..d1033de4c 100644
--- a/arch/s390/crypto/Makefile
+++ b/arch/s390/crypto/Makefile
@@ -9,3 +9,6 @@ obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o
obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o
obj-$(CONFIG_S390_PRNG) += prng.o
obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o
+obj-$(CONFIG_CRYPTO_CRC32_S390) += crc32-vx_s390.o
+
+crc32-vx_s390-y := crc32-vx.o crc32le-vx.o crc32be-vx.o
diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c
index 7554a8bb2..2ea18b050 100644
--- a/arch/s390/crypto/aes_s390.c
+++ b/arch/s390/crypto/aes_s390.c
@@ -22,6 +22,7 @@
#include <crypto/aes.h>
#include <crypto/algapi.h>
+#include <crypto/internal/skcipher.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/cpufeature.h>
@@ -44,7 +45,7 @@ struct s390_aes_ctx {
long dec;
int key_len;
union {
- struct crypto_blkcipher *blk;
+ struct crypto_skcipher *blk;
struct crypto_cipher *cip;
} fallback;
};
@@ -63,7 +64,7 @@ struct s390_xts_ctx {
long enc;
long dec;
int key_len;
- struct crypto_blkcipher *fallback;
+ struct crypto_skcipher *fallback;
};
/*
@@ -237,16 +238,16 @@ static int setkey_fallback_blk(struct crypto_tfm *tfm, const u8 *key,
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
unsigned int ret;
- sctx->fallback.blk->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
- sctx->fallback.blk->base.crt_flags |= (tfm->crt_flags &
- CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_clear_flags(sctx->fallback.blk, CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_set_flags(sctx->fallback.blk, tfm->crt_flags &
+ CRYPTO_TFM_REQ_MASK);
+
+ ret = crypto_skcipher_setkey(sctx->fallback.blk, key, len);
+
+ tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+ tfm->crt_flags |= crypto_skcipher_get_flags(sctx->fallback.blk) &
+ CRYPTO_TFM_RES_MASK;
- ret = crypto_blkcipher_setkey(sctx->fallback.blk, key, len);
- if (ret) {
- tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
- tfm->crt_flags |= (sctx->fallback.blk->base.crt_flags &
- CRYPTO_TFM_RES_MASK);
- }
return ret;
}
@@ -255,15 +256,17 @@ static int fallback_blk_dec(struct blkcipher_desc *desc,
unsigned int nbytes)
{
unsigned int ret;
- struct crypto_blkcipher *tfm;
- struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm);
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
- tfm = desc->tfm;
- desc->tfm = sctx->fallback.blk;
+ skcipher_request_set_tfm(req, sctx->fallback.blk);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes);
+ ret = crypto_skcipher_decrypt(req);
- desc->tfm = tfm;
+ skcipher_request_zero(req);
return ret;
}
@@ -272,15 +275,15 @@ static int fallback_blk_enc(struct blkcipher_desc *desc,
unsigned int nbytes)
{
unsigned int ret;
- struct crypto_blkcipher *tfm;
- struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm);
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, sctx->fallback.blk);
- tfm = desc->tfm;
- desc->tfm = sctx->fallback.blk;
+ skcipher_request_set_tfm(req, sctx->fallback.blk);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes);
-
- desc->tfm = tfm;
+ ret = crypto_skcipher_encrypt(req);
return ret;
}
@@ -370,8 +373,9 @@ static int fallback_init_blk(struct crypto_tfm *tfm)
const char *name = tfm->__crt_alg->cra_name;
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
- sctx->fallback.blk = crypto_alloc_blkcipher(name, 0,
- CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+ sctx->fallback.blk = crypto_alloc_skcipher(name, 0,
+ CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(sctx->fallback.blk)) {
pr_err("Allocating AES fallback algorithm %s failed\n",
@@ -386,8 +390,7 @@ static void fallback_exit_blk(struct crypto_tfm *tfm)
{
struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm);
- crypto_free_blkcipher(sctx->fallback.blk);
- sctx->fallback.blk = NULL;
+ crypto_free_skcipher(sctx->fallback.blk);
}
static struct crypto_alg ecb_aes_alg = {
@@ -536,16 +539,16 @@ static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key,
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
unsigned int ret;
- xts_ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK;
- xts_ctx->fallback->base.crt_flags |= (tfm->crt_flags &
- CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_clear_flags(xts_ctx->fallback, CRYPTO_TFM_REQ_MASK);
+ crypto_skcipher_set_flags(xts_ctx->fallback, tfm->crt_flags &
+ CRYPTO_TFM_REQ_MASK);
+
+ ret = crypto_skcipher_setkey(xts_ctx->fallback, key, len);
+
+ tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
+ tfm->crt_flags |= crypto_skcipher_get_flags(xts_ctx->fallback) &
+ CRYPTO_TFM_RES_MASK;
- ret = crypto_blkcipher_setkey(xts_ctx->fallback, key, len);
- if (ret) {
- tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK;
- tfm->crt_flags |= (xts_ctx->fallback->base.crt_flags &
- CRYPTO_TFM_RES_MASK);
- }
return ret;
}
@@ -553,16 +556,18 @@ static int xts_fallback_decrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
- struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm);
- struct crypto_blkcipher *tfm;
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
unsigned int ret;
- tfm = desc->tfm;
- desc->tfm = xts_ctx->fallback;
+ skcipher_request_set_tfm(req, xts_ctx->fallback);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes);
+ ret = crypto_skcipher_decrypt(req);
- desc->tfm = tfm;
+ skcipher_request_zero(req);
return ret;
}
@@ -570,16 +575,18 @@ static int xts_fallback_encrypt(struct blkcipher_desc *desc,
struct scatterlist *dst, struct scatterlist *src,
unsigned int nbytes)
{
- struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm);
- struct crypto_blkcipher *tfm;
+ struct crypto_blkcipher *tfm = desc->tfm;
+ struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(tfm);
+ SKCIPHER_REQUEST_ON_STACK(req, xts_ctx->fallback);
unsigned int ret;
- tfm = desc->tfm;
- desc->tfm = xts_ctx->fallback;
+ skcipher_request_set_tfm(req, xts_ctx->fallback);
+ skcipher_request_set_callback(req, desc->flags, NULL, NULL);
+ skcipher_request_set_crypt(req, src, dst, nbytes, desc->info);
- ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes);
+ ret = crypto_skcipher_encrypt(req);
- desc->tfm = tfm;
+ skcipher_request_zero(req);
return ret;
}
@@ -700,8 +707,9 @@ static int xts_fallback_init(struct crypto_tfm *tfm)
const char *name = tfm->__crt_alg->cra_name;
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
- xts_ctx->fallback = crypto_alloc_blkcipher(name, 0,
- CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK);
+ xts_ctx->fallback = crypto_alloc_skcipher(name, 0,
+ CRYPTO_ALG_ASYNC |
+ CRYPTO_ALG_NEED_FALLBACK);
if (IS_ERR(xts_ctx->fallback)) {
pr_err("Allocating XTS fallback algorithm %s failed\n",
@@ -715,8 +723,7 @@ static void xts_fallback_exit(struct crypto_tfm *tfm)
{
struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm);
- crypto_free_blkcipher(xts_ctx->fallback);
- xts_ctx->fallback = NULL;
+ crypto_free_skcipher(xts_ctx->fallback);
}
static struct crypto_alg xts_aes_alg = {
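The hunks above convert the AES fallback path from the legacy blkcipher API to the skcipher API: instead of temporarily swapping desc->tfm, an skcipher request is built on the stack and handed to the fallback transform. Distilled into one sketch (the fallback_encrypt() wrapper and its parameter list are illustrative, not part of the patch):

#include <crypto/internal/skcipher.h>

static int fallback_encrypt(struct crypto_skcipher *fallback,
			    struct scatterlist *dst, struct scatterlist *src,
			    unsigned int nbytes, u32 flags, void *iv)
{
	SKCIPHER_REQUEST_ON_STACK(req, fallback);	/* no heap allocation */
	int ret;

	skcipher_request_set_tfm(req, fallback);
	skcipher_request_set_callback(req, flags, NULL, NULL);	/* synchronous */
	skcipher_request_set_crypt(req, src, dst, nbytes, iv);
	ret = crypto_skcipher_encrypt(req);
	skcipher_request_zero(req);	/* wipe request state, as the decrypt helpers do */
	return ret;
}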
diff --git a/arch/s390/crypto/crc32-vx.c b/arch/s390/crypto/crc32-vx.c
new file mode 100644
index 000000000..2bad9d837
--- /dev/null
+++ b/arch/s390/crypto/crc32-vx.c
@@ -0,0 +1,310 @@
+/*
+ * Crypto-API module for CRC-32 algorithms implemented with the
+ * z/Architecture Vector Extension Facility.
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+#define KMSG_COMPONENT "crc32-vx"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/cpufeature.h>
+#include <linux/crc32.h>
+#include <crypto/internal/hash.h>
+#include <asm/fpu/api.h>
+
+
+#define CRC32_BLOCK_SIZE 1
+#define CRC32_DIGEST_SIZE 4
+
+#define VX_MIN_LEN 64
+#define VX_ALIGNMENT 16L
+#define VX_ALIGN_MASK (VX_ALIGNMENT - 1)
+
+struct crc_ctx {
+ u32 key;
+};
+
+struct crc_desc_ctx {
+ u32 crc;
+};
+
+/* Prototypes for functions in assembly files */
+u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size);
+
+/*
+ * DEFINE_CRC32_VX() - Define a CRC-32 function using the vector extension
+ *
+ * Creates a function to perform a particular CRC-32 computation. Depending
+ * on the message buffer, the hardware-accelerated or software implementation
+ * is used. Note that the message buffer is aligned to improve fetch
+ * operations of VECTOR LOAD MULTIPLE instructions.
+ *
+ */
+#define DEFINE_CRC32_VX(___fname, ___crc32_vx, ___crc32_sw) \
+ static u32 __pure ___fname(u32 crc, \
+ unsigned char const *data, size_t datalen) \
+ { \
+ struct kernel_fpu vxstate; \
+ unsigned long prealign, aligned, remaining; \
+ \
+ if (datalen < VX_MIN_LEN + VX_ALIGN_MASK) \
+ return ___crc32_sw(crc, data, datalen); \
+ \
+ if ((unsigned long)data & VX_ALIGN_MASK) { \
+ prealign = VX_ALIGNMENT - \
+ ((unsigned long)data & VX_ALIGN_MASK); \
+ datalen -= prealign; \
+ crc = ___crc32_sw(crc, data, prealign); \
+ data = (void *)((unsigned long)data + prealign); \
+ } \
+ \
+ aligned = datalen & ~VX_ALIGN_MASK; \
+ remaining = datalen & VX_ALIGN_MASK; \
+ \
+ kernel_fpu_begin(&vxstate, KERNEL_VXR_LOW); \
+ crc = ___crc32_vx(crc, data, aligned); \
+ kernel_fpu_end(&vxstate); \
+ \
+ if (remaining) \
+ crc = ___crc32_sw(crc, data + aligned, remaining); \
+ \
+ return crc; \
+ }
+
+DEFINE_CRC32_VX(crc32_le_vx, crc32_le_vgfm_16, crc32_le)
+DEFINE_CRC32_VX(crc32_be_vx, crc32_be_vgfm_16, crc32_be)
+DEFINE_CRC32_VX(crc32c_le_vx, crc32c_le_vgfm_16, __crc32c_le)
+
+
+static int crc32_vx_cra_init_zero(struct crypto_tfm *tfm)
+{
+ struct crc_ctx *mctx = crypto_tfm_ctx(tfm);
+
+ mctx->key = 0;
+ return 0;
+}
+
+static int crc32_vx_cra_init_invert(struct crypto_tfm *tfm)
+{
+ struct crc_ctx *mctx = crypto_tfm_ctx(tfm);
+
+ mctx->key = ~0;
+ return 0;
+}
+
+static int crc32_vx_init(struct shash_desc *desc)
+{
+ struct crc_ctx *mctx = crypto_shash_ctx(desc->tfm);
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = mctx->key;
+ return 0;
+}
+
+static int crc32_vx_setkey(struct crypto_shash *tfm, const u8 *newkey,
+ unsigned int newkeylen)
+{
+ struct crc_ctx *mctx = crypto_shash_ctx(tfm);
+
+ if (newkeylen != sizeof(mctx->key)) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ mctx->key = le32_to_cpu(*(__le32 *)newkey);
+ return 0;
+}
+
+static int crc32be_vx_setkey(struct crypto_shash *tfm, const u8 *newkey,
+ unsigned int newkeylen)
+{
+ struct crc_ctx *mctx = crypto_shash_ctx(tfm);
+
+ if (newkeylen != sizeof(mctx->key)) {
+ crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
+ return -EINVAL;
+ }
+ mctx->key = be32_to_cpu(*(__be32 *)newkey);
+ return 0;
+}
+
+static int crc32le_vx_final(struct shash_desc *desc, u8 *out)
+{
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__le32 *)out = cpu_to_le32p(&ctx->crc);
+ return 0;
+}
+
+static int crc32be_vx_final(struct shash_desc *desc, u8 *out)
+{
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(__be32 *)out = cpu_to_be32p(&ctx->crc);
+ return 0;
+}
+
+static int crc32c_vx_final(struct shash_desc *desc, u8 *out)
+{
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ /*
+ * Perform a final XOR with 0xFFFFFFFF to be in sync
+ * with the generic crc32c shash implementation.
+ */
+ *(__le32 *)out = ~cpu_to_le32p(&ctx->crc);
+ return 0;
+}
+
+static int __crc32le_vx_finup(u32 *crc, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__le32 *)out = cpu_to_le32(crc32_le_vx(*crc, data, len));
+ return 0;
+}
+
+static int __crc32be_vx_finup(u32 *crc, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ *(__be32 *)out = cpu_to_be32(crc32_be_vx(*crc, data, len));
+ return 0;
+}
+
+static int __crc32c_vx_finup(u32 *crc, const u8 *data, unsigned int len,
+ u8 *out)
+{
+ /*
+ * Perform a final XOR with 0xFFFFFFFF to be in sync
+ * with the generic crc32c shash implementation.
+ */
+ *(__le32 *)out = ~cpu_to_le32(crc32c_le_vx(*crc, data, len));
+ return 0;
+}
+
+
+#define CRC32_VX_FINUP(alg, func) \
+ static int alg ## _vx_finup(struct shash_desc *desc, const u8 *data, \
+ unsigned int datalen, u8 *out) \
+ { \
+ return __ ## alg ## _vx_finup(shash_desc_ctx(desc), \
+ data, datalen, out); \
+ }
+
+CRC32_VX_FINUP(crc32le, crc32_le_vx)
+CRC32_VX_FINUP(crc32be, crc32_be_vx)
+CRC32_VX_FINUP(crc32c, crc32c_le_vx)
+
+#define CRC32_VX_DIGEST(alg, func) \
+ static int alg ## _vx_digest(struct shash_desc *desc, const u8 *data, \
+ unsigned int len, u8 *out) \
+ { \
+ return __ ## alg ## _vx_finup(crypto_shash_ctx(desc->tfm), \
+ data, len, out); \
+ }
+
+CRC32_VX_DIGEST(crc32le, crc32_le_vx)
+CRC32_VX_DIGEST(crc32be, crc32_be_vx)
+CRC32_VX_DIGEST(crc32c, crc32c_le_vx)
+
+#define CRC32_VX_UPDATE(alg, func) \
+ static int alg ## _vx_update(struct shash_desc *desc, const u8 *data, \
+ unsigned int datalen) \
+ { \
+ struct crc_desc_ctx *ctx = shash_desc_ctx(desc); \
+ ctx->crc = func(ctx->crc, data, datalen); \
+ return 0; \
+ }
+
+CRC32_VX_UPDATE(crc32le, crc32_le_vx)
+CRC32_VX_UPDATE(crc32be, crc32_be_vx)
+CRC32_VX_UPDATE(crc32c, crc32c_le_vx)
+
+
+static struct shash_alg crc32_vx_algs[] = {
+ /* CRC-32 LE */
+ {
+ .init = crc32_vx_init,
+ .setkey = crc32_vx_setkey,
+ .update = crc32le_vx_update,
+ .final = crc32le_vx_final,
+ .finup = crc32le_vx_finup,
+ .digest = crc32le_vx_digest,
+ .descsize = sizeof(struct crc_desc_ctx),
+ .digestsize = CRC32_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32",
+ .cra_driver_name = "crc32-vx",
+ .cra_priority = 200,
+ .cra_blocksize = CRC32_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crc_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_vx_cra_init_zero,
+ },
+ },
+ /* CRC-32 BE */
+ {
+ .init = crc32_vx_init,
+ .setkey = crc32be_vx_setkey,
+ .update = crc32be_vx_update,
+ .final = crc32be_vx_final,
+ .finup = crc32be_vx_finup,
+ .digest = crc32be_vx_digest,
+ .descsize = sizeof(struct crc_desc_ctx),
+ .digestsize = CRC32_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32be",
+ .cra_driver_name = "crc32be-vx",
+ .cra_priority = 200,
+ .cra_blocksize = CRC32_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crc_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_vx_cra_init_zero,
+ },
+ },
+ /* CRC-32C LE */
+ {
+ .init = crc32_vx_init,
+ .setkey = crc32_vx_setkey,
+ .update = crc32c_vx_update,
+ .final = crc32c_vx_final,
+ .finup = crc32c_vx_finup,
+ .digest = crc32c_vx_digest,
+ .descsize = sizeof(struct crc_desc_ctx),
+ .digestsize = CRC32_DIGEST_SIZE,
+ .base = {
+ .cra_name = "crc32c",
+ .cra_driver_name = "crc32c-vx",
+ .cra_priority = 200,
+ .cra_blocksize = CRC32_BLOCK_SIZE,
+ .cra_ctxsize = sizeof(struct crc_ctx),
+ .cra_module = THIS_MODULE,
+ .cra_init = crc32_vx_cra_init_invert,
+ },
+ },
+};
+
+
+static int __init crc_vx_mod_init(void)
+{
+ return crypto_register_shashes(crc32_vx_algs,
+ ARRAY_SIZE(crc32_vx_algs));
+}
+
+static void __exit crc_vx_mod_exit(void)
+{
+ crypto_unregister_shashes(crc32_vx_algs, ARRAY_SIZE(crc32_vx_algs));
+}
+
+module_cpu_feature_match(VXRS, crc_vx_mod_init);
+module_exit(crc_vx_mod_exit);
+
+MODULE_AUTHOR("Hendrik Brueckner <brueckner@linux.vnet.ibm.com>");
+MODULE_LICENSE("GPL");
+
+MODULE_ALIAS_CRYPTO("crc32");
+MODULE_ALIAS_CRYPTO("crc32-vx");
+MODULE_ALIAS_CRYPTO("crc32c");
+MODULE_ALIAS_CRYPTO("crc32c-vx");
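Once registered, the new transforms are reachable through the generic shash interface; with cra_priority 200 they take precedence over the software drivers. A hypothetical caller (crc32_vx_demo() is illustrative only) might look like:

#include <crypto/hash.h>

static int crc32_vx_demo(const u8 *data, unsigned int len, u32 *result)
{
	struct crypto_shash *tfm;
	int ret;

	tfm = crypto_alloc_shash("crc32", 0, 0);	/* picks crc32-vx if loaded */
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		desc->flags = 0;
		ret = crypto_shash_digest(desc, data, len, (u8 *)result);
	}

	crypto_free_shash(tfm);
	return ret;
}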
diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S
new file mode 100644
index 000000000..8013989cd
--- /dev/null
+++ b/arch/s390/crypto/crc32be-vx.S
@@ -0,0 +1,207 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of CRC-32 checksums.
+ *
+ * This CRC-32 implementation algorithm processes the most-significant
+ * bit first (BE).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/vx-insn.h>
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_R1R2 %v9
+#define CONST_R3R4 %v10
+#define CONST_R5 %v11
+#define CONST_R6 %v12
+#define CONST_RU_POLY %v13
+#define CONST_CRC_POLY %v14
+
+.data
+.align 8
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = x4*128+64 mod P(x)
+ * R2 = x4*128 mod P(x)
+ * R3 = x128+64 mod P(x)
+ * R4 = x128 mod P(x)
+ * R5 = x96 mod P(x)
+ * R6 = x64 mod P(x)
+ *
+ * Barret reduction constant, u, is defined as floor(x**64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and the P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * Note that the constant definitions below are extended in order to compute
+ * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
+ * The rightmost doubleword can be 0 to prevent contribution to the result or
+ * can be multiplied by 1 to perform an XOR without the need for a separate
+ * VECTOR EXCLUSIVE OR instruction.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ */
+
+.Lconstants_CRC_32_BE:
+ .quad 0x08833794c, 0x0e6228b11 # R1, R2
+ .quad 0x0c5b9cd4c, 0x0e8a45605 # R3, R4
+ .quad 0x0f200aa66, 1 << 32 # R5, x32
+ .quad 0x0490d678d, 1 # R6, 1
+ .quad 0x104d101df, 0 # u
+ .quad 0x104C11DB7, 0 # P(x)
+
+.previous
+
+.text
+/*
+ * The CRC-32 function(s) use these calling conventions:
+ *
+ * Parameters:
+ *
+ * %r2: Initial CRC value, typically ~0; and final CRC (return) value.
+ * %r3: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * %r4: Length of the buffer, must be 64 bytes or greater.
+ *
+ * Register usage:
+ *
+ * %r5: CRC-32 constant pool base pointer.
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ *
+ * V9..V14: CRC-32 constants.
+ */
+ENTRY(crc32_be_vgfm_16)
+ /* Load CRC-32 constants */
+ larl %r5,.Lconstants_CRC_32_BE
+ VLM CONST_R1R2,CONST_CRC_POLY,0,%r5
+
+ /* Load the initial CRC value into the leftmost word of V0. */
+ VZERO %v0
+ VLVGF %v0,%r2,0
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
+ VX %v1,%v0,%v1 /* V1 ^= CRC */
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ /* Check remaining buffer size and jump to proper folding method */
+ cghi %r4,64
+ jl .Lless_than_64bytes
+
+.Lfold_64bytes_loop:
+ /* Load the next 64-byte data chunk into V5 to V8 */
+ VLM %v5,%v8,0,%r3
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the reduction constants in V0. The intermediate result is
+ * then folded (accumulated) with the next data chunk in V5 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ VGFMAG %v1,CONST_R1R2,%v1,%v5
+ VGFMAG %v2,CONST_R1R2,%v2,%v6
+ VGFMAG %v3,CONST_R1R2,%v3,%v7
+ VGFMAG %v4,CONST_R1R2,%v4,%v8
+
+ /* Adjust buffer pointer and length for next loop */
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ cghi %r4,64
+ jnl .Lfold_64bytes_loop
+
+.Lless_than_64bytes:
+ /* Fold V1 to V4 into a single 128-bit value in V1 */
+ VGFMAG %v1,CONST_R3R4,%v1,%v2
+ VGFMAG %v1,CONST_R3R4,%v1,%v3
+ VGFMAG %v1,CONST_R3R4,%v1,%v4
+
+ /* Check whether to continue with 64-bit folding */
+ cghi %r4,16
+ jl .Lfinal_fold
+
+.Lfold_16bytes_loop:
+
+ VL %v2,0,,%r3 /* Load next data chunk */
+ VGFMAG %v1,CONST_R3R4,%v1,%v2 /* Fold next data chunk */
+
+ /* Adjust buffer pointer and size for folding next data chunk */
+ aghi %r3,16
+ aghi %r4,-16
+
+ /* Process remaining data chunks */
+ cghi %r4,16
+ jnl .Lfold_16bytes_loop
+
+.Lfinal_fold:
+ /*
+	 * The R5 constant is used to fold a 128-bit value into a 96-bit value
+ * that is XORed with the next 96-bit input data chunk. To use a single
+ * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to
+ * form an intermediate 96-bit value (with appended zeros) which is then
+ * XORed with the intermediate reduction result.
+ */
+ VGFMG %v1,CONST_R5,%v1
+
+ /*
+ * Further reduce the remaining 96-bit value to a 64-bit value using a
+ * single VGFMG, the rightmost doubleword is multiplied with 0x1. The
+ * intermediate result is then XORed with the product of the leftmost
+ * doubleword with R6. The result is a 64-bit value and is subject to
+ * the Barret reduction.
+ */
+ VGFMG %v1,CONST_R6,%v1
+
+ /*
+ * The input values to the Barret reduction are the degree-63 polynomial
+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+ * constant u. The Barret reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barret reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: To compensate the division by x^32, use the vector unpack
+ * instruction to move the leftmost word into the leftmost doubleword
+ * of the vector register. The rightmost doubleword is multiplied
+	 * with zero to not contribute to the intermediate results.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ VUPLLF %v2,%v1
+ VGFMG %v2,CONST_RU_POLY,%v2
+
+ /*
+	 * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is in the rightmost word of V2.
+ */
+ VUPLLF %v2,%v2
+ VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
+
+.Ldone:
+ VLGVF %r2,%v2,3
+ br %r14
+
+.previous
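For reference, the Barret reduction performed by the final VGFMG/VGFMAG steps can be modeled in plain C with a bit-at-a-time carry-less multiply. This sketch is illustrative only (clmul64() and barrett_crc32_be() are not part of the patch); u and P(x) are the values from .Lconstants_CRC_32_BE:

#include <stdint.h>

/* Carry-less (GF(2)) multiply; 64 bits suffice here because both
 * operands stay at or below degree 32. */
static uint64_t clmul64(uint64_t a, uint64_t b)
{
	uint64_t r = 0;
	int i;

	for (i = 0; i < 64; i++)
		if (b & (1ULL << i))
			r ^= a << i;
	return r;
}

/* Reduce the degree-63 remainder R(x) to the 32-bit CRC using the three
 * steps documented above. */
static uint32_t barrett_crc32_be(uint64_t rx)
{
	const uint64_t u = 0x104d101dfULL;	/* floor(x^64 / P(x)) */
	const uint64_t p = 0x104c11db7ULL;	/* P(x) */
	uint64_t t1, t2;

	t1 = clmul64(rx >> 32, u);	/* 1. T1(x) = floor(R(x)/x^32) GF2MUL u */
	t2 = clmul64(t1 >> 32, p);	/* 2. T2(x) = floor(T1(x)/x^32) GF2MUL P(x) */
	return (uint32_t)(rx ^ t2);	/* 3. C(x) = R(x) XOR T2(x) mod x^32 */
}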
diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S
new file mode 100644
index 000000000..17f2504c2
--- /dev/null
+++ b/arch/s390/crypto/crc32le-vx.S
@@ -0,0 +1,268 @@
+/*
+ * Hardware-accelerated CRC-32 variants for Linux on z Systems
+ *
+ * Use the z/Architecture Vector Extension Facility to accelerate the
+ * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet
+ * and Castagnoli.
+ *
+ * This CRC-32 implementation algorithm is bitreflected and processes
+ * the least-significant bit first (Little-Endian).
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#include <linux/linkage.h>
+#include <asm/vx-insn.h>
+
+/* Vector register range containing CRC-32 constants */
+#define CONST_PERM_LE2BE %v9
+#define CONST_R2R1 %v10
+#define CONST_R4R3 %v11
+#define CONST_R5 %v12
+#define CONST_RU_POLY %v13
+#define CONST_CRC_POLY %v14
+
+.data
+.align 8
+
+/*
+ * The CRC-32 constant block contains reduction constants to fold and
+ * process particular chunks of the input data stream in parallel.
+ *
+ * For the CRC-32 variants, the constants are precomputed according to
+ * these definitions:
+ *
+ * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1
+ * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1
+ * R3 = [(x128+32 mod P'(x) << 32)]' << 1
+ * R4 = [(x128-32 mod P'(x) << 32)]' << 1
+ * R5 = [(x64 mod P'(x) << 32)]' << 1
+ * R6 = [(x32 mod P'(x) << 32)]' << 1
+ *
+ * The bitreflected Barret reduction constant, u', is defined as
+ * the bit reversal of floor(x**64 / P(x)).
+ *
+ * where P(x) is the polynomial in the normal domain and the P'(x) is the
+ * polynomial in the reversed (bitreflected) domain.
+ *
+ * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials:
+ *
+ * P(x) = 0x04C11DB7
+ * P'(x) = 0xEDB88320
+ *
+ * CRC-32C (Castagnoli) polynomials:
+ *
+ * P(x) = 0x1EDC6F41
+ * P'(x) = 0x82F63B78
+ */
+
+.Lconstants_CRC_32_LE:
+ .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
+ .quad 0x1c6e41596, 0x154442bd4 # R2, R1
+ .quad 0x0ccaa009e, 0x1751997d0 # R4, R3
+ .octa 0x163cd6124 # R5
+ .octa 0x1F7011641 # u'
+ .octa 0x1DB710641 # P'(x) << 1
+
+.Lconstants_CRC_32C_LE:
+ .octa 0x0F0E0D0C0B0A09080706050403020100 # BE->LE mask
+ .quad 0x09e4addf8, 0x740eef02 # R2, R1
+ .quad 0x14cd00bd6, 0xf20c0dfe # R4, R3
+ .octa 0x0dd45aab8 # R5
+ .octa 0x0dea713f1 # u'
+ .octa 0x105ec76f0 # P'(x) << 1
+
+.previous
+
+
+.text
+
+/*
+ * The CRC-32 functions use these calling conventions:
+ *
+ * Parameters:
+ *
+ * %r2: Initial CRC value, typically ~0; and final CRC (return) value.
+ * %r3: Input buffer pointer, performance might be improved if the
+ * buffer is on a doubleword boundary.
+ * %r4: Length of the buffer, must be 64 bytes or greater.
+ *
+ * Register usage:
+ *
+ * %r5: CRC-32 constant pool base pointer.
+ * V0: Initial CRC value and intermediate constants and results.
+ * V1..V4: Data for CRC computation.
+ * V5..V8: Next data chunks that are fetched from the input buffer.
+ * V9: Constant for BE->LE conversion and shift operations
+ *
+ * V10..V14: CRC-32 constants.
+ */
+
+ENTRY(crc32_le_vgfm_16)
+ larl %r5,.Lconstants_CRC_32_LE
+ j crc32_le_vgfm_generic
+
+ENTRY(crc32c_le_vgfm_16)
+ larl %r5,.Lconstants_CRC_32C_LE
+ j crc32_le_vgfm_generic
+
+
+crc32_le_vgfm_generic:
+ /* Load CRC-32 constants */
+ VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5
+
+ /*
+ * Load the initial CRC value.
+ *
+ * The CRC value is loaded into the rightmost word of the
+ * vector register and is later XORed with the LSB portion
+ * of the loaded input data.
+ */
+ VZERO %v0 /* Clear V0 */
+ VLVGF %v0,%r2,3 /* Load CRC into rightmost word */
+
+ /* Load a 64-byte data chunk and XOR with CRC */
+ VLM %v1,%v4,0,%r3 /* 64-bytes into V1..V4 */
+ VPERM %v1,%v1,%v1,CONST_PERM_LE2BE
+ VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
+ VPERM %v3,%v3,%v3,CONST_PERM_LE2BE
+ VPERM %v4,%v4,%v4,CONST_PERM_LE2BE
+
+ VX %v1,%v0,%v1 /* V1 ^= CRC */
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ cghi %r4,64
+ jl .Lless_than_64bytes
+
+.Lfold_64bytes_loop:
+ /* Load the next 64-byte data chunk into V5 to V8 */
+ VLM %v5,%v8,0,%r3
+ VPERM %v5,%v5,%v5,CONST_PERM_LE2BE
+ VPERM %v6,%v6,%v6,CONST_PERM_LE2BE
+ VPERM %v7,%v7,%v7,CONST_PERM_LE2BE
+ VPERM %v8,%v8,%v8,CONST_PERM_LE2BE
+
+ /*
+ * Perform a GF(2) multiplication of the doublewords in V1 with
+ * the R1 and R2 reduction constants in V0. The intermediate result
+ * is then folded (accumulated) with the next data chunk in V5 and
+ * stored in V1. Repeat this step for the register contents
+ * in V2, V3, and V4 respectively.
+ */
+ VGFMAG %v1,CONST_R2R1,%v1,%v5
+ VGFMAG %v2,CONST_R2R1,%v2,%v6
+ VGFMAG %v3,CONST_R2R1,%v3,%v7
+ VGFMAG %v4,CONST_R2R1,%v4,%v8
+
+ aghi %r3,64 /* BUF = BUF + 64 */
+ aghi %r4,-64 /* LEN = LEN - 64 */
+
+ cghi %r4,64
+ jnl .Lfold_64bytes_loop
+
+.Lless_than_64bytes:
+ /*
+ * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3
+	 * and R4, accumulating the next 128-bit chunk, until a single 128-bit
+ * value remains.
+ */
+ VGFMAG %v1,CONST_R4R3,%v1,%v2
+ VGFMAG %v1,CONST_R4R3,%v1,%v3
+ VGFMAG %v1,CONST_R4R3,%v1,%v4
+
+ cghi %r4,16
+ jl .Lfinal_fold
+
+.Lfold_16bytes_loop:
+
+ VL %v2,0,,%r3 /* Load next data chunk */
+ VPERM %v2,%v2,%v2,CONST_PERM_LE2BE
+ VGFMAG %v1,CONST_R4R3,%v1,%v2 /* Fold next data chunk */
+
+ aghi %r3,16
+ aghi %r4,-16
+
+ cghi %r4,16
+ jnl .Lfold_16bytes_loop
+
+.Lfinal_fold:
+ /*
+ * Set up a vector register for byte shifts. The shift value must
+ * be loaded in bits 1-4 in byte element 7 of a vector register.
+ * Shift by 8 bytes: 0x40
+ * Shift by 4 bytes: 0x20
+ */
+ VLEIB %v9,0x40,7
+
+ /*
+ * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes
+ * to move R4 into the rightmost doubleword and set the leftmost
+ * doubleword to 0x1.
+ */
+ VSRLB %v0,CONST_R4R3,%v9
+ VLEIG %v0,1,0
+
+ /*
+ * Compute GF(2) product of V1 and V0. The rightmost doubleword
+ * of V1 is multiplied with R4. The leftmost doubleword of V1 is
+ * multiplied by 0x1 and is then XORed with rightmost product.
+	 * Implicitly, the intermediate leftmost product becomes padded with zeroes.
+ */
+ VGFMG %v1,%v0,%v1
+
+ /*
+ * Now do the final 32-bit fold by multiplying the rightmost word
+ * in V1 with R5 and XOR the result with the remaining bits in V1.
+ *
+ * To achieve this by a single VGFMAG, right shift V1 by a word
+ * and store the result in V2 which is then accumulated. Use the
+ * vector unpack instruction to load the rightmost half of the
+ * doubleword into the rightmost doubleword element of V1; the other
+ * half is loaded in the leftmost doubleword.
+ * The vector register with CONST_R5 contains the R5 constant in the
+ * rightmost doubleword and the leftmost doubleword is zero to ignore
+ * the leftmost product of V1.
+ */
+ VLEIB %v9,0x20,7 /* Shift by words */
+ VSRLB %v2,%v1,%v9 /* Store remaining bits in V2 */
+ VUPLLF %v1,%v1 /* Split rightmost doubleword */
+ VGFMAG %v1,CONST_R5,%v1,%v2 /* V1 = (V1 * R5) XOR V2 */
+
+ /*
+ * Apply a Barret reduction to compute the final 32-bit CRC value.
+ *
+ * The input values to the Barret reduction are the degree-63 polynomial
+ * in V1 (R(x)), degree-32 generator polynomial, and the reduction
+ * constant u. The Barret reduction result is the CRC value of R(x) mod
+ * P(x).
+ *
+ * The Barret reduction algorithm is defined as:
+ *
+ * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
+ * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
+ * 3. C(x) = R(x) XOR T2(x) mod x^32
+ *
+ * Note: The leftmost doubleword of vector register containing
+ * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product
+ * is zero and does not contribute to the final result.
+ */
+
+ /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */
+ VUPLLF %v2,%v1
+ VGFMG %v2,CONST_RU_POLY,%v2
+
+ /*
+ * Compute the GF(2) product of the CRC polynomial with T1(x) in
+ * V2 and XOR the intermediate result, T2(x), with the value in V1.
+ * The final result is stored in word element 2 of V2.
+ */
+ VUPLLF %v2,%v2
+ VGFMAG %v2,CONST_CRC_POLY,%v2,%v1
+
+.Ldone:
+ VLGVF %r2,%v2,2
+ br %r14
+
+.previous
diff --git a/arch/s390/defconfig b/arch/s390/defconfig
index 3f571ea89..2d40ef0a6 100644
--- a/arch/s390/defconfig
+++ b/arch/s390/defconfig
@@ -172,7 +172,6 @@ CONFIG_DEBUG_NOTIFIERS=y
CONFIG_RCU_CPU_STALL_TIMEOUT=60
CONFIG_RCU_TRACE=y
CONFIG_LATENCYTOP=y
-CONFIG_DEBUG_STRICT_USER_COPY_CHECKS=y
CONFIG_SCHED_TRACER=y
CONFIG_FTRACE_SYSCALLS=y
CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y
@@ -225,12 +224,16 @@ CONFIG_CRYPTO_DEFLATE=m
CONFIG_CRYPTO_LZ4=m
CONFIG_CRYPTO_LZ4HC=m
CONFIG_CRYPTO_ANSI_CPRNG=m
+CONFIG_CRYPTO_USER_API_HASH=m
+CONFIG_CRYPTO_USER_API_SKCIPHER=m
+CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_ZCRYPT=m
CONFIG_CRYPTO_SHA1_S390=m
CONFIG_CRYPTO_SHA256_S390=m
CONFIG_CRYPTO_SHA512_S390=m
CONFIG_CRYPTO_DES_S390=m
CONFIG_CRYPTO_AES_S390=m
+CONFIG_CRYPTO_CRC32_S390=y
CONFIG_CRC7=m
# CONFIG_XZ_DEC_X86 is not set
# CONFIG_XZ_DEC_POWERPC is not set
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 045035796..28f03ca60 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -19,29 +19,10 @@
#include <asm/ebcdic.h>
#include "hypfs.h"
-#define LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */
-#define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */
#define TMP_SIZE 64 /* size of temporary buffers */
#define DBFS_D204_HDR_VERSION 0
-/* diag 204 subcodes */
-enum diag204_sc {
- SUBC_STIB4 = 4,
- SUBC_RSI = 5,
- SUBC_STIB6 = 6,
- SUBC_STIB7 = 7
-};
-
-/* The two available diag 204 data formats */
-enum diag204_format {
- INFO_SIMPLE = 0,
- INFO_EXT = 0x00010000
-};
-
-/* bit is set in flags, when physical cpu info is included in diag 204 data */
-#define LPAR_PHYS_FLG 0x80
-
static char *diag224_cpu_names; /* diag 224 name table */
static enum diag204_sc diag204_store_sc; /* used subcode for store */
static enum diag204_format diag204_info_type; /* used diag 204 data format */
@@ -53,7 +34,7 @@ static int diag204_buf_pages; /* number of pages for diag204 data */
static struct dentry *dbfs_d204_file;
/*
- * DIAG 204 data structures and member access functions.
+ * DIAG 204 member access functions.
*
* Since we have two different diag 204 data formats for old and new s390
* machines, we do not access the structs directly, but use getter functions for
@@ -62,302 +43,173 @@ static struct dentry *dbfs_d204_file;
/* Time information block */
-struct info_blk_hdr {
- __u8 npar;
- __u8 flags;
- __u16 tslice;
- __u16 phys_cpus;
- __u16 this_part;
- __u64 curtod;
-} __attribute__ ((packed));
-
-struct x_info_blk_hdr {
- __u8 npar;
- __u8 flags;
- __u16 tslice;
- __u16 phys_cpus;
- __u16 this_part;
- __u64 curtod1;
- __u64 curtod2;
- char reserved[40];
-} __attribute__ ((packed));
-
static inline int info_blk_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct info_blk_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_info_blk_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_info_blk_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_info_blk_hdr);
}
static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->npar;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->npar;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->npar;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
}
static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->flags;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->flags;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->flags;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
}
static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->phys_cpus;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->phys_cpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus;
}
/* Partition header */
-struct part_hdr {
- __u8 pn;
- __u8 cpus;
- char reserved[6];
- char part_name[LPAR_NAME_LEN];
-} __attribute__ ((packed));
-
-struct x_part_hdr {
- __u8 pn;
- __u8 cpus;
- __u8 rcpus;
- __u8 pflag;
- __u32 mlu;
- char part_name[LPAR_NAME_LEN];
- char lpc_name[8];
- char os_name[8];
- __u64 online_cs;
- __u64 online_es;
- __u8 upid;
- char reserved1[3];
- __u32 group_mlu;
- char group_name[8];
- char reserved2[32];
-} __attribute__ ((packed));
-
static inline int part_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct part_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_part_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_part_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_part_hdr);
}
static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct part_hdr *)hdr)->cpus;
- else /* INFO_EXT */
- return ((struct x_part_hdr *)hdr)->rcpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_part_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_part_hdr *)hdr)->rcpus;
}
static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
char *name)
{
- if (type == INFO_SIMPLE)
- memcpy(name, ((struct part_hdr *)hdr)->part_name,
- LPAR_NAME_LEN);
- else /* INFO_EXT */
- memcpy(name, ((struct x_part_hdr *)hdr)->part_name,
- LPAR_NAME_LEN);
- EBCASC(name, LPAR_NAME_LEN);
- name[LPAR_NAME_LEN] = 0;
+ if (type == DIAG204_INFO_SIMPLE)
+ memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ else /* DIAG204_INFO_EXT */
+ memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ EBCASC(name, DIAG204_LPAR_NAME_LEN);
+ name[DIAG204_LPAR_NAME_LEN] = 0;
strim(name);
}
-struct cpu_info {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- __u8 cflag;
- __u16 weight;
- __u64 acc_time;
- __u64 lp_time;
-} __attribute__ ((packed));
-
-struct x_cpu_info {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- __u8 cflag;
- __u16 weight;
- __u64 acc_time;
- __u64 lp_time;
- __u16 min_weight;
- __u16 cur_weight;
- __u16 max_weight;
- char reseved2[2];
- __u64 online_time;
- __u64 wait_time;
- __u32 pma_weight;
- __u32 polar_weight;
- char reserved3[40];
-} __attribute__ ((packed));
-
/* CPU info block */
static inline int cpu_info__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct cpu_info);
- else /* INFO_EXT */
- return sizeof(struct x_cpu_info);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_cpu_info);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_cpu_info);
}
static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->ctidx;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->ctidx;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->ctidx;
}
static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->cpu_addr;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->cpu_addr;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
}
static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->acc_time;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->acc_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->acc_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->acc_time;
}
static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->lp_time;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->lp_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->lp_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->lp_time;
}
static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
+ if (type == DIAG204_INFO_SIMPLE)
return 0; /* online_time not available in simple info */
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->online_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->online_time;
}
/* Physical header */
-struct phys_hdr {
- char reserved1[1];
- __u8 cpus;
- char reserved2[6];
- char mgm_name[8];
-} __attribute__ ((packed));
-
-struct x_phys_hdr {
- char reserved1[1];
- __u8 cpus;
- char reserved2[6];
- char mgm_name[8];
- char reserved3[80];
-} __attribute__ ((packed));
-
static inline int phys_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct phys_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_phys_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_hdr);
}
static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_hdr *)hdr)->cpus;
- else /* INFO_EXT */
- return ((struct x_phys_hdr *)hdr)->cpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_hdr *)hdr)->cpus;
}
/* Physical CPU info block */
-struct phys_cpu {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- char reserved2[3];
- __u64 mgm_time;
- char reserved3[8];
-} __attribute__ ((packed));
-
-struct x_phys_cpu {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- char reserved2[3];
- __u64 mgm_time;
- char reserved3[80];
-} __attribute__ ((packed));
-
static inline int phys_cpu__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct phys_cpu);
- else /* INFO_EXT */
- return sizeof(struct x_phys_cpu);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_cpu);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_cpu);
}
static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->cpu_addr;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->cpu_addr;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
}
static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->mgm_time;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->mgm_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
}
static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->ctidx;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->ctidx;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
}
/* Diagnose 204 functions */
-
-static inline int __diag204(unsigned long subcode, unsigned long size, void *addr)
-{
- register unsigned long _subcode asm("0") = subcode;
- register unsigned long _size asm("1") = size;
-
- asm volatile(
- " diag %2,%0,0x204\n"
- "0:\n"
- EX_TABLE(0b,0b)
- : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
- if (_subcode)
- return -1;
- return _size;
-}
-
-static int diag204(unsigned long subcode, unsigned long size, void *addr)
-{
- diag_stat_inc(DIAG_STAT_X204);
- return __diag204(subcode, size, addr);
-}
-
/*
* For the old diag subcode 4 with simple data format we have to use real
* memory. If we use subcode 6 or 7 with extended data format, we can (and
@@ -409,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
*pages = diag204_buf_pages;
return diag204_buf;
}
- if (fmt == INFO_SIMPLE) {
+ if (fmt == DIAG204_INFO_SIMPLE) {
*pages = 1;
return diag204_alloc_rbuf();
- } else {/* INFO_EXT */
- *pages = diag204((unsigned long)SUBC_RSI |
- (unsigned long)INFO_EXT, 0, NULL);
+ } else {/* DIAG204_INFO_EXT */
+ *pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+ (unsigned long)DIAG204_INFO_EXT, 0, NULL);
if (*pages <= 0)
return ERR_PTR(-ENOSYS);
else
@@ -441,18 +293,18 @@ static int diag204_probe(void)
void *buf;
int pages, rc;
- buf = diag204_get_buffer(INFO_EXT, &pages);
+ buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages);
if (!IS_ERR(buf)) {
- if (diag204((unsigned long)SUBC_STIB7 |
- (unsigned long)INFO_EXT, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB7;
- diag204_info_type = INFO_EXT;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
+ (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB7;
+ diag204_info_type = DIAG204_INFO_EXT;
goto out;
}
- if (diag204((unsigned long)SUBC_STIB6 |
- (unsigned long)INFO_EXT, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB6;
- diag204_info_type = INFO_EXT;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
+ (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB6;
+ diag204_info_type = DIAG204_INFO_EXT;
goto out;
}
diag204_free_buffer();
@@ -460,15 +312,15 @@ static int diag204_probe(void)
/* subcodes 6 and 7 failed, now try subcode 4 */
- buf = diag204_get_buffer(INFO_SIMPLE, &pages);
+ buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages);
if (IS_ERR(buf)) {
rc = PTR_ERR(buf);
goto fail_alloc;
}
- if (diag204((unsigned long)SUBC_STIB4 |
- (unsigned long)INFO_SIMPLE, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB4;
- diag204_info_type = INFO_SIMPLE;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
+ (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB4;
+ diag204_info_type = DIAG204_INFO_SIMPLE;
goto out;
} else {
rc = -ENOSYS;
@@ -508,20 +360,6 @@ out:
/* Diagnose 224 functions */
-static int diag224(void *ptr)
-{
- int rc = -EOPNOTSUPP;
-
- diag_stat_inc(DIAG_STAT_X224);
- asm volatile(
- " diag %1,%2,0x224\n"
- "0: lhi %0,0x0\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
- return rc;
-}
-
static int diag224_get_name_table(void)
{
/* memory must be below 2GB */
@@ -543,9 +381,9 @@ static void diag224_delete_name_table(void)
static int diag224_idx2name(int index, char *name)
{
- memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN),
- CPU_NAME_LEN);
- name[CPU_NAME_LEN] = 0;
+ memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+ DIAG204_CPU_NAME_LEN);
+ name[DIAG204_CPU_NAME_LEN] = 0;
strim(name);
return 0;
}
@@ -601,7 +439,7 @@ __init int hypfs_diag_init(void)
pr_err("The hardware system does not support hypfs\n");
return -ENODATA;
}
- if (diag204_info_type == INFO_EXT) {
+ if (diag204_info_type == DIAG204_INFO_EXT) {
rc = hypfs_dbfs_create_file(&dbfs_file_d204);
if (rc)
return rc;
@@ -649,7 +487,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
cpu_info__lp_time(diag204_info_type, cpu_info));
if (IS_ERR(rc))
return PTR_ERR(rc);
- if (diag204_info_type == INFO_EXT) {
+ if (diag204_info_type == DIAG204_INFO_EXT) {
rc = hypfs_create_u64(cpu_dir, "onlinetime",
cpu_info__online_time(diag204_info_type,
cpu_info));
@@ -665,12 +503,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
{
struct dentry *cpus_dir;
struct dentry *lpar_dir;
- char lpar_name[LPAR_NAME_LEN + 1];
+ char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
void *cpu_info;
int i;
part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
- lpar_name[LPAR_NAME_LEN] = 0;
+ lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
if (IS_ERR(lpar_dir))
return lpar_dir;
@@ -753,7 +591,8 @@ int hypfs_diag_create_files(struct dentry *root)
goto err_out;
}
}
- if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) {
+ if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
+ DIAG204_LPAR_PHYS_FLG) {
ptr = hypfs_create_phys_files(root, part_hdr);
if (IS_ERR(ptr)) {
rc = PTR_ERR(ptr);
diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c
index 44feac38c..012919d98 100644
--- a/arch/s390/hypfs/hypfs_vm.c
+++ b/arch/s390/hypfs/hypfs_vm.c
@@ -70,7 +70,7 @@ static int diag2fc(int size, char* query, void *addr)
diag_stat_inc(DIAG_STAT_X2FC);
asm volatile(
" diag %0,%1,0x2fc\n"
- "0:\n"
+ "0: nopr %%r7\n"
EX_TABLE(0b,0b)
: "=d" (residual_cnt), "+d" (rc) : "0" (&parm_list) : "memory");
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 911064aa5..d28cc2f5b 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -93,6 +93,11 @@ static inline int atomic_add_return(int i, atomic_t *v)
return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER) + i;
}
+static inline int atomic_fetch_add(int i, atomic_t *v)
+{
+ return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER);
+}
+
static inline void atomic_add(int i, atomic_t *v)
{
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
@@ -114,22 +119,27 @@ static inline void atomic_add(int i, atomic_t *v)
#define atomic_inc_and_test(_v) (atomic_add_return(1, _v) == 0)
#define atomic_sub(_i, _v) atomic_add(-(int)(_i), _v)
#define atomic_sub_return(_i, _v) atomic_add_return(-(int)(_i), _v)
+#define atomic_fetch_sub(_i, _v) atomic_fetch_add(-(int)(_i), _v)
#define atomic_sub_and_test(_i, _v) (atomic_sub_return(_i, _v) == 0)
#define atomic_dec(_v) atomic_sub(1, _v)
#define atomic_dec_return(_v) atomic_sub_return(1, _v)
#define atomic_dec_and_test(_v) (atomic_sub_return(1, _v) == 0)
-#define ATOMIC_OP(op, OP) \
+#define ATOMIC_OPS(op, OP) \
static inline void atomic_##op(int i, atomic_t *v) \
{ \
__ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_NO_BARRIER); \
+} \
+static inline int atomic_fetch_##op(int i, atomic_t *v) \
+{ \
+ return __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_BARRIER); \
}
-ATOMIC_OP(and, AND)
-ATOMIC_OP(or, OR)
-ATOMIC_OP(xor, XOR)
+ATOMIC_OPS(and, AND)
+ATOMIC_OPS(or, OR)
+ATOMIC_OPS(xor, XOR)
-#undef ATOMIC_OP
+#undef ATOMIC_OPS
#define atomic_xchg(v, new) (xchg(&((v)->counter), new))
@@ -236,6 +246,11 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v)
return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER) + i;
}
+static inline long long atomic64_fetch_add(long long i, atomic64_t *v)
+{
+ return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER);
+}
+
static inline void atomic64_add(long long i, atomic64_t *v)
{
#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
@@ -264,17 +279,21 @@ static inline long long atomic64_cmpxchg(atomic64_t *v,
return old;
}
-#define ATOMIC64_OP(op, OP) \
+#define ATOMIC64_OPS(op, OP) \
static inline void atomic64_##op(long i, atomic64_t *v) \
{ \
__ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_NO_BARRIER); \
+} \
+static inline long atomic64_fetch_##op(long i, atomic64_t *v) \
+{ \
+ return __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_BARRIER); \
}
-ATOMIC64_OP(and, AND)
-ATOMIC64_OP(or, OR)
-ATOMIC64_OP(xor, XOR)
+ATOMIC64_OPS(and, AND)
+ATOMIC64_OPS(or, OR)
+ATOMIC64_OPS(xor, XOR)
-#undef ATOMIC64_OP
+#undef ATOMIC64_OPS
#undef __ATOMIC64_LOOP
static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u)
@@ -315,6 +334,7 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
#define atomic64_inc_return(_v) atomic64_add_return(1, _v)
#define atomic64_inc_and_test(_v) (atomic64_add_return(1, _v) == 0)
#define atomic64_sub_return(_i, _v) atomic64_add_return(-(long long)(_i), _v)
+#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(long long)(_i), _v)
#define atomic64_sub(_i, _v) atomic64_add(-(long long)(_i), _v)
#define atomic64_sub_and_test(_i, _v) (atomic64_sub_return(_i, _v) == 0)
#define atomic64_dec(_v) atomic64_sub(1, _v)
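
The fetch variants added above return the counter value from before the operation, while the existing *_return variants return the value afterwards. A minimal kernel-context sketch (the function name is hypothetical):

#include <linux/atomic.h>

static void fetch_vs_return_demo(void)
{
	atomic_t v = ATOMIC_INIT(5);
	int old, new;

	old = atomic_fetch_add(3, &v);	/* returns 5, v is now 8 */
	new = atomic_add_return(3, &v);	/* returns 11, v is now 11 */
	old = atomic_fetch_sub(1, &v);	/* returns 11, v is now 10 */
}
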
diff --git a/arch/s390/include/asm/cache.h b/arch/s390/include/asm/cache.h
index 22da3b34c..05219a5e0 100644
--- a/arch/s390/include/asm/cache.h
+++ b/arch/s390/include/asm/cache.h
@@ -13,9 +13,6 @@
#define L1_CACHE_SHIFT 8
#define NET_SKB_PAD 32
-#define __read_mostly __attribute__((__section__(".data..read_mostly")))
-
-/* Read-only memory is marked before mark_rodata_ro() is called. */
-#define __ro_after_init __read_mostly
+#define __read_mostly __section(.data..read_mostly)
#endif
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index d1e7b0a0f..f7ed88cc0 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -320,7 +320,7 @@ struct cio_iplinfo {
extern int cio_get_iplinfo(struct cio_iplinfo *iplinfo);
/* Function from drivers/s390/cio/chsc.c */
-int chsc_sstpc(void *page, unsigned int op, u16 ctrl);
+int chsc_sstpc(void *page, unsigned int op, u16 ctrl, u64 *clock_delta);
int chsc_sstpi(void *page, void *result, size_t size);
#endif
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 1a82cf26e..d28621de8 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -20,6 +20,9 @@
#define CPACF_KMC 0xb92f /* MSA */
#define CPACF_KIMD 0xb93e /* MSA */
#define CPACF_KLMD 0xb93f /* MSA */
+#define CPACF_PCKMO 0xb928 /* MSA3 */
+#define CPACF_KMF 0xb92a /* MSA4 */
+#define CPACF_KMO 0xb92b /* MSA4 */
#define CPACF_PCC 0xb92c /* MSA4 */
#define CPACF_KMCTR 0xb92d /* MSA4 */
#define CPACF_PPNO 0xb93c /* MSA5 */
@@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status)
register unsigned long r1 asm("1") = (unsigned long) status;
asm volatile(
+ " spm 0\n" /* pckmo doesn't change the cc */
/* Parameter registers are ignored, but may not be 0 */
"0: .insn rrf,%[opc] << 16,2,2,2,0\n"
" brc 1,0b\n" /* handle partial completion */
@@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func)
if (!test_facility(17)) /* check for MSA */
return 0;
break;
+ case CPACF_PCKMO:
+ if (!test_facility(76)) /* check for MSA3 */
+ return 0;
+ break;
+ case CPACF_KMF:
+ case CPACF_KMO:
case CPACF_PCC:
case CPACF_KMCTR:
if (!test_facility(77)) /* check for MSA4 */
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 9dd04b9e9..035164761 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -169,16 +169,27 @@ static inline int lcctl(u64 ctl)
}
/* Extract CPU counter */
-static inline int ecctr(u64 ctr, u64 *val)
+static inline int __ecctr(u64 ctr, u64 *content)
{
- register u64 content asm("4") = 0;
+ register u64 _content asm("4") = 0;
int cc;
asm volatile (
" .insn rre,0xb2e40000,%0,%2\n"
" ipm %1\n"
" srl %1,28\n"
- : "=d" (content), "=d" (cc) : "d" (ctr) : "cc");
+ : "=d" (_content), "=d" (cc) : "d" (ctr) : "cc");
+ *content = _content;
+ return cc;
+}
+
+/* Extract CPU counter */
+static inline int ecctr(u64 ctr, u64 *val)
+{
+ u64 content;
+ int cc;
+
+ cc = __ecctr(ctr, &content);
if (!cc)
*val = content;
return cc;
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 5fac921c1..8acf48216 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -49,7 +49,7 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn)
diag_stat_inc(DIAG_STAT_X010);
asm volatile(
"0: diag %0,%1,0x10\n"
- "1:\n"
+ "1: nopr %%r7\n"
EX_TABLE(0b, 1b)
EX_TABLE(1b, 1b)
: : "a" (start_addr), "a" (end_addr));
@@ -78,4 +78,153 @@ struct diag210 {
extern int diag210(struct diag210 *addr);
+/* bit is set in flags when physical cpu info is included in diag 204 data */
+#define DIAG204_LPAR_PHYS_FLG 0x80
+#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */
+#define DIAG204_CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */
+
+/* diag 204 subcodes */
+enum diag204_sc {
+ DIAG204_SUBC_STIB4 = 4,
+ DIAG204_SUBC_RSI = 5,
+ DIAG204_SUBC_STIB6 = 6,
+ DIAG204_SUBC_STIB7 = 7
+};
+
+/* The two available diag 204 data formats */
+enum diag204_format {
+ DIAG204_INFO_SIMPLE = 0,
+ DIAG204_INFO_EXT = 0x00010000
+};
+
+enum diag204_cpu_flags {
+ DIAG204_CPU_ONLINE = 0x20,
+ DIAG204_CPU_CAPPED = 0x40,
+};
+
+struct diag204_info_blk_hdr {
+ __u8 npar;
+ __u8 flags;
+ __u16 tslice;
+ __u16 phys_cpus;
+ __u16 this_part;
+ __u64 curtod;
+} __packed;
+
+struct diag204_x_info_blk_hdr {
+ __u8 npar;
+ __u8 flags;
+ __u16 tslice;
+ __u16 phys_cpus;
+ __u16 this_part;
+ __u64 curtod1;
+ __u64 curtod2;
+ char reserved[40];
+} __packed;
+
+struct diag204_part_hdr {
+ __u8 pn;
+ __u8 cpus;
+ char reserved[6];
+ char part_name[DIAG204_LPAR_NAME_LEN];
+} __packed;
+
+struct diag204_x_part_hdr {
+ __u8 pn;
+ __u8 cpus;
+ __u8 rcpus;
+ __u8 pflag;
+ __u32 mlu;
+ char part_name[DIAG204_LPAR_NAME_LEN];
+ char lpc_name[8];
+ char os_name[8];
+ __u64 online_cs;
+ __u64 online_es;
+ __u8 upid;
+ __u8 reserved:3;
+ __u8 mtid:5;
+ char reserved1[2];
+ __u32 group_mlu;
+ char group_name[8];
+ char hardware_group_name[8];
+ char reserved2[24];
+} __packed;
+
+struct diag204_cpu_info {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ __u8 cflag;
+ __u16 weight;
+ __u64 acc_time;
+ __u64 lp_time;
+} __packed;
+
+struct diag204_x_cpu_info {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ __u8 cflag;
+ __u16 weight;
+ __u64 acc_time;
+ __u64 lp_time;
+ __u16 min_weight;
+ __u16 cur_weight;
+ __u16 max_weight;
+ char reseved2[2];
+ __u64 online_time;
+ __u64 wait_time;
+ __u32 pma_weight;
+ __u32 polar_weight;
+ __u32 cpu_type_cap;
+ __u32 group_cpu_type_cap;
+ char reserved3[32];
+} __packed;
+
+struct diag204_phys_hdr {
+ char reserved1[1];
+ __u8 cpus;
+ char reserved2[6];
+ char mgm_name[8];
+} __packed;
+
+struct diag204_x_phys_hdr {
+ char reserved1[1];
+ __u8 cpus;
+ char reserved2[6];
+ char mgm_name[8];
+ char reserved3[80];
+} __packed;
+
+struct diag204_phys_cpu {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ char reserved2[3];
+ __u64 mgm_time;
+ char reserved3[8];
+} __packed;
+
+struct diag204_x_phys_cpu {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ char reserved2[1];
+ __u16 weight;
+ __u64 mgm_time;
+ char reserved3[80];
+} __packed;
+
+struct diag204_x_part_block {
+ struct diag204_x_part_hdr hdr;
+ struct diag204_x_cpu_info cpus[];
+} __packed;
+
+struct diag204_x_phys_block {
+ struct diag204_x_phys_hdr hdr;
+ struct diag204_x_phys_cpu cpus[];
+} __packed;
+
+int diag204(unsigned long subcode, unsigned long size, void *addr);
+int diag224(void *ptr);
#endif /* _ASM_S390_DIAG_H */
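
With diag204() and diag224() now exported from asm/diag.h, a caller combines a subcode with a data format in the first argument, as the hypfs code above does. A hedged sketch of the two-step query (buffer allocation and error handling are condensed; the function name is hypothetical):

#include <linux/errno.h>
#include <asm/diag.h>

/* Ask how many pages the extended-format data needs, then store it. */
static int diag204_fetch_ext(void *buf)
{
	int pages;

	pages = diag204((unsigned long)DIAG204_SUBC_RSI |
			(unsigned long)DIAG204_INFO_EXT, 0, NULL);
	if (pages <= 0)
		return -EOPNOTSUPP;
	return diag204((unsigned long)DIAG204_SUBC_STIB7 |
		       (unsigned long)DIAG204_INFO_EXT, pages, buf);
}
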
diff --git a/arch/s390/include/asm/dma-mapping.h b/arch/s390/include/asm/dma-mapping.h
index 3249b7464..ffaba07f5 100644
--- a/arch/s390/include/asm/dma-mapping.h
+++ b/arch/s390/include/asm/dma-mapping.h
@@ -5,7 +5,6 @@
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
-#include <linux/dma-attrs.h>
#include <linux/dma-debug.h>
#include <linux/io.h>
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index 563ab9f44..1736c7d3c 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -225,6 +225,7 @@ do { \
#define MMAP_ALIGN_MASK (is_compat_task() ? 0 : 0x7fUL)
#define STACK_RND_MASK MMAP_RND_MASK
+/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
#define ARCH_DLINFO \
do { \
if (vdso_enabled) \
diff --git a/arch/s390/include/asm/fcx.h b/arch/s390/include/asm/fcx.h
index 7ecb92b46..04cb4b4bc 100644
--- a/arch/s390/include/asm/fcx.h
+++ b/arch/s390/include/asm/fcx.h
@@ -6,7 +6,7 @@
*/
#ifndef _ASM_S390_FCX_H
-#define _ASM_S390_FCX_H _ASM_S390_FCX_H
+#define _ASM_S390_FCX_H
#include <linux/types.h>
diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h
index 8ae236b0f..6aba6fc40 100644
--- a/arch/s390/include/asm/fpu/api.h
+++ b/arch/s390/include/asm/fpu/api.h
@@ -1,6 +1,41 @@
/*
* In-kernel FPU support functions
*
+ *
+ * Consider these guidelines before using in-kernel FPU functions:
+ *
+ * 1. Use kernel_fpu_begin() and kernel_fpu_end() to enclose all in-kernel
+ * use of floating-point or vector registers and instructions.
+ *
+ * 2. For kernel_fpu_begin(), specify the vector register range you want to
+ * use with the KERNEL_VXR_* constants. Consider these usage guidelines:
+ *
+ * a) If your function typically runs in process-context, use the lower
+ * half of the vector registers, for example, specify KERNEL_VXR_LOW.
+ * b) If your function typically runs in soft-irq or hard-irq context,
+ * prefer using the upper half of the vector registers, for example,
+ * specify KERNEL_VXR_HIGH.
+ *
+ * If you adhere to these guidelines, an interrupted process context
+ * does not need to save and restore vector registers because the
+ * register ranges are disjoint.
+ *
+ * Also note that the __kernel_fpu_begin()/__kernel_fpu_end() functions
+ * include logic to save and restore up to 16 vector registers at once.
+ *
+ * 3. You can nest kernel_fpu_begin()/kernel_fpu_end() by using different
+ * struct kernel_fpu states. Vector registers that are in use by outer
+ * levels are saved and restored. You can minimize the save and restore
+ * effort by choosing disjoint vector register ranges.
+ *
+ * 4. To use vector floating-point instructions, specify the KERNEL_FPC
+ * flag to save and restore floating-point controls in addition to any
+ * vector register range.
+ *
+ * 5. To use floating-point registers and instructions only, specify the
+ * KERNEL_FPR flag. This flag triggers a save and restore of vector
+ * registers V0 to V15 and floating-point controls.
+ *
* Copyright IBM Corp. 2015
* Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
*/
@@ -8,6 +43,8 @@
#ifndef _ASM_S390_FPU_API_H
#define _ASM_S390_FPU_API_H
+#include <linux/preempt.h>
+
void save_fpu_regs(void);
static inline int test_fp_ctl(u32 fpc)
@@ -27,4 +64,42 @@ static inline int test_fp_ctl(u32 fpc)
return rc;
}
+#define KERNEL_VXR_V0V7 1
+#define KERNEL_VXR_V8V15 2
+#define KERNEL_VXR_V16V23 4
+#define KERNEL_VXR_V24V31 8
+#define KERNEL_FPR 16
+#define KERNEL_FPC 256
+
+#define KERNEL_VXR_LOW (KERNEL_VXR_V0V7|KERNEL_VXR_V8V15)
+#define KERNEL_VXR_MID (KERNEL_VXR_V8V15|KERNEL_VXR_V16V23)
+#define KERNEL_VXR_HIGH (KERNEL_VXR_V16V23|KERNEL_VXR_V24V31)
+
+#define KERNEL_FPU_MASK (KERNEL_VXR_LOW|KERNEL_VXR_HIGH|KERNEL_FPR)
+
+struct kernel_fpu;
+
+/*
+ * Note that the functions below must be called with preemption disabled.
+ * Do not enable preemption before calling __kernel_fpu_end() to prevent
+ * corruption of an existing kernel FPU state.
+ *
+ * Prefer using the kernel_fpu_begin()/kernel_fpu_end() pair of functions.
+ */
+void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags);
+void __kernel_fpu_end(struct kernel_fpu *state);
+
+static inline void kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
+{
+ preempt_disable();
+ __kernel_fpu_begin(state, flags);
+}
+
+static inline void kernel_fpu_end(struct kernel_fpu *state)
+{
+ __kernel_fpu_end(state);
+ preempt_enable();
+}
+
#endif /* _ASM_S390_FPU_API_H */
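
The guidelines above boil down to a simple bracketing pattern. A minimal sketch, assuming a process-context user that needs V0-V15 plus the floating-point controls (the function name and payload are hypothetical):

#include <asm/fpu/api.h>
#include <asm/fpu/types.h>

static void vxr_low_demo(void)
{
	struct kernel_fpu state;

	/* Save any in-use V0-V15 and the FPC, then mark them ours. */
	kernel_fpu_begin(&state, KERNEL_VXR_LOW | KERNEL_FPC);
	/* ... vector/floating-point payload using V0-V15 goes here ... */
	kernel_fpu_end(&state);	/* restore what kernel_fpu_begin() saved */
}
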
diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h
index fe937c9b6..bce255ead 100644
--- a/arch/s390/include/asm/fpu/types.h
+++ b/arch/s390/include/asm/fpu/types.h
@@ -24,4 +24,14 @@ struct fpu {
/* VX array structure for address operand constraints in inline assemblies */
struct vx_array { __vector128 _[__NUM_VXRS]; };
+/* In-kernel FPU state structure */
+struct kernel_fpu {
+ u32 mask;
+ u32 fpc;
+ union {
+ freg_t fprs[__NUM_FPRS];
+ __vector128 vxrs[__NUM_VXRS];
+ };
+};
+
#endif /* _ASM_S390_FPU_TYPES_H */
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index d054c1b07..741ddba0b 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -10,14 +10,25 @@
/**
* struct gmap_struct - guest address space
+ * @list: list head for the mm->context gmap list
* @crst_list: list of all crst tables used in the guest address space
* @mm: pointer to the parent mm_struct
* @guest_to_host: radix tree with guest to host address translation
* @host_to_guest: radix tree with pointer to segment table entries
* @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @ref_count: reference counter for the gmap structure
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
* @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @edat_level: edat level to be used for the shadow translation
+ * @removed: flag to indicate if a shadow guest address space has been removed
+ * @initialized: flag to indicate if a shadow guest address space can be used
*/
struct gmap {
struct list_head list;
@@ -26,26 +37,64 @@ struct gmap {
struct radix_tree_root guest_to_host;
struct radix_tree_root host_to_guest;
spinlock_t guest_table_lock;
+ atomic_t ref_count;
unsigned long *table;
unsigned long asce;
unsigned long asce_end;
void *private;
bool pfault_enabled;
+ /* Additional data for shadow guest address spaces */
+ struct radix_tree_root host_to_rmap;
+ struct list_head children;
+ struct list_head pt_list;
+ spinlock_t shadow_lock;
+ struct gmap *parent;
+ unsigned long orig_asce;
+ int edat_level;
+ bool removed;
+ bool initialized;
};
/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+ struct gmap_rmap *next;
+ unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+ for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+ for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
+/**
* struct gmap_notifier - notify function block for page invalidation
* @notifier_call: address of callback function
*/
struct gmap_notifier {
struct list_head list;
- void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+ struct rcu_head rcu;
+ void (*notifier_call)(struct gmap *gmap, unsigned long start,
+ unsigned long end);
};
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+ return !!gmap->parent;
+}
+
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
+void gmap_remove(struct gmap *gmap);
+struct gmap *gmap_get(struct gmap *gmap);
+void gmap_put(struct gmap *gmap);
+
void gmap_enable(struct gmap *gmap);
void gmap_disable(struct gmap *gmap);
+struct gmap *gmap_get_enabled(void);
int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long to, unsigned long len);
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
@@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
void __gmap_zap(struct gmap *, unsigned long gaddr);
void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection, int *fake);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
+
+void gmap_register_pte_notifier(struct gmap_notifier *);
+void gmap_unregister_pte_notifier(struct gmap_notifier *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+ unsigned long bits);
+
+int gmap_mprotect_notify(struct gmap *, unsigned long start,
+ unsigned long len, int prot);
#endif /* _ASM_S390_GMAP_H */
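
gmap_for_each_rmap_safe() mirrors list_for_each_safe(): the lookahead pointer is fetched before the loop body runs, so the current entry may be freed during the walk. A hedged sketch of draining an rmap chain (kfree() as the release function is an assumption, not from this patch):

#include <linux/slab.h>
#include <asm/gmap.h>

static void gmap_rmap_free_all(struct gmap_rmap *head)
{
	struct gmap_rmap *rmap, *rnext;

	gmap_for_each_rmap_safe(rmap, rnext, head) {
		/* rnext was saved above, so freeing rmap is safe here */
		kfree(rmap);
	}
}
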
diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index d9be7c0c1..4c7fac750 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -41,7 +41,10 @@ static inline int prepare_hugepage_range(struct file *file,
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
pte_t *ptep)
{
- pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY;
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ pte_val(*ptep) = _REGION3_ENTRY_EMPTY;
+ else
+ pte_val(*ptep) = _SEGMENT_ENTRY_EMPTY;
}
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h
index 6fc44dca1..4da22b2f0 100644
--- a/arch/s390/include/asm/ipl.h
+++ b/arch/s390/include/asm/ipl.h
@@ -141,11 +141,11 @@ extern void setup_ipl(void);
* DIAG 308 support
*/
enum diag308_subcode {
- DIAG308_REL_HSA = 2,
- DIAG308_IPL = 3,
- DIAG308_DUMP = 4,
- DIAG308_SET = 5,
- DIAG308_STORE = 6,
+ DIAG308_REL_HSA = 2,
+ DIAG308_LOAD_CLEAR = 3,
+ DIAG308_LOAD_NORMAL_DUMP = 4,
+ DIAG308_SET = 5,
+ DIAG308_STORE = 6,
};
enum diag308_ipl_type {
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index f97b055de..70c9bce76 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -7,11 +7,8 @@
#define NR_IRQS_BASE 3
-#ifdef CONFIG_PCI_NR_MSI
-# define NR_IRQS (NR_IRQS_BASE + CONFIG_PCI_NR_MSI)
-#else
-# define NR_IRQS NR_IRQS_BASE
-#endif
+#define NR_IRQS NR_IRQS_BASE
+#define NR_IRQS_LEGACY NR_IRQS_BASE
/* External interruption codes */
#define EXT_IRQ_INTERRUPT_KEY 0x0040
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index 7f9fd5e3f..9be198f5e 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -4,6 +4,7 @@
#ifndef __ASSEMBLY__
#include <linux/types.h>
+#include <linux/stringify.h>
#define JUMP_LABEL_NOP_SIZE 6
#define JUMP_LABEL_NOP_OFFSET 2
diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h
index b47ad3b64..591e5a527 100644
--- a/arch/s390/include/asm/kprobes.h
+++ b/arch/s390/include/asm/kprobes.h
@@ -43,9 +43,9 @@ typedef u16 kprobe_opcode_t;
#define MAX_INSN_SIZE 0x0003
#define MAX_STACK_SIZE 64
#define MIN_STACK_SIZE(ADDR) (((MAX_STACK_SIZE) < \
- (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) \
+ (((unsigned long)task_stack_page(current)) + THREAD_SIZE - (ADDR))) \
? (MAX_STACK_SIZE) \
- : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR)))
+ : (((unsigned long)task_stack_page(current)) + THREAD_SIZE - (ADDR)))
#define kretprobe_blacklist_size 0
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index ac82e8eb9..8e5daf7a7 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -43,6 +43,7 @@
/* s390-specific vcpu->requests bit members */
#define KVM_REQ_ENABLE_IBS 8
#define KVM_REQ_DISABLE_IBS 9
+#define KVM_REQ_ICPT_OPEREXC 10
#define SIGP_CTRL_C 0x80
#define SIGP_CTRL_SCN_MASK 0x3f
@@ -145,7 +146,7 @@ struct kvm_s390_sie_block {
__u64 cputm; /* 0x0028 */
__u64 ckc; /* 0x0030 */
__u64 epoch; /* 0x0038 */
- __u8 reserved40[4]; /* 0x0040 */
+ __u32 svcc; /* 0x0040 */
#define LCTL_CR0 0x8000
#define LCTL_CR6 0x0200
#define LCTL_CR9 0x0040
@@ -154,6 +155,7 @@ struct kvm_s390_sie_block {
#define LCTL_CR14 0x0002
__u16 lctl; /* 0x0044 */
__s16 icpua; /* 0x0046 */
+#define ICTL_OPEREXC 0x80000000
#define ICTL_PINT 0x20000000
#define ICTL_LPSW 0x00400000
#define ICTL_STCTL 0x00040000
@@ -166,6 +168,9 @@ struct kvm_s390_sie_block {
#define ICPT_INST 0x04
#define ICPT_PROGI 0x08
#define ICPT_INSTPROGI 0x0C
+#define ICPT_EXTINT 0x14
+#define ICPT_VALIDITY 0x20
+#define ICPT_STOP 0x28
#define ICPT_OPEREXC 0x2C
#define ICPT_PARTEXEC 0x38
#define ICPT_IOINST 0x40
@@ -185,7 +190,9 @@ struct kvm_s390_sie_block {
__u32 scaol; /* 0x0064 */
__u8 reserved68[4]; /* 0x0068 */
__u32 todpr; /* 0x006c */
- __u8 reserved70[32]; /* 0x0070 */
+ __u8 reserved70[16]; /* 0x0070 */
+ __u64 mso; /* 0x0080 */
+ __u64 msl; /* 0x0088 */
psw_t gpsw; /* 0x0090 */
__u64 gg14; /* 0x00a0 */
__u64 gg15; /* 0x00a8 */
@@ -223,7 +230,7 @@ struct kvm_s390_sie_block {
__u8 reserved1e6[2]; /* 0x01e6 */
__u64 itdba; /* 0x01e8 */
__u64 riccbd; /* 0x01f0 */
- __u8 reserved1f8[8]; /* 0x01f8 */
+ __u64 gvrd; /* 0x01f8 */
} __attribute__((packed));
struct kvm_s390_itdb {
@@ -256,6 +263,7 @@ struct kvm_vcpu_stat {
u32 instruction_stctg;
u32 exit_program_interruption;
u32 exit_instr_and_program;
+ u32 exit_operation_exception;
u32 deliver_external_call;
u32 deliver_emergency_signal;
u32 deliver_service_signal;
@@ -278,7 +286,9 @@ struct kvm_vcpu_stat {
u32 instruction_stsi;
u32 instruction_stfl;
u32 instruction_tprot;
+ u32 instruction_sie;
u32 instruction_essa;
+ u32 instruction_sthyi;
u32 instruction_sigp_sense;
u32 instruction_sigp_sense_running;
u32 instruction_sigp_external_call;
@@ -541,12 +551,16 @@ struct kvm_guestdbg_info_arch {
struct kvm_vcpu_arch {
struct kvm_s390_sie_block *sie_block;
+ /* if vsie is active, currently executed shadow sie control block */
+ struct kvm_s390_sie_block *vsie_block;
unsigned int host_acrs[NUM_ACRS];
struct fpu host_fpregs;
struct kvm_s390_local_interrupt local_int;
struct hrtimer ckc_timer;
struct kvm_s390_pgm_info pgm;
struct gmap *gmap;
+ /* backup location for the currently enabled gmap when scheduled out */
+ struct gmap *enabled_gmap;
struct kvm_guestdbg_info_arch guestdbg;
unsigned long pfault_token;
unsigned long pfault_select;
@@ -631,6 +645,14 @@ struct sie_page2 {
u8 reserved900[0x1000 - 0x900]; /* 0x0900 */
} __packed;
+struct kvm_s390_vsie {
+ struct mutex mutex;
+ struct radix_tree_root addr_to_page;
+ int page_count;
+ int next;
+ struct page *pages[KVM_MAX_VCPUS];
+};
+
struct kvm_arch{
void *sca;
int use_esca;
@@ -646,15 +668,20 @@ struct kvm_arch{
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
+ int user_instr0;
struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
wait_queue_head_t ipte_wq;
int ipte_lock_count;
struct mutex ipte_mutex;
+ struct ratelimit_state sthyi_limit;
spinlock_t start_stop_lock;
struct sie_page2 *sie_page2;
struct kvm_s390_cpu_model model;
struct kvm_s390_crypto crypto;
+ struct kvm_s390_vsie vsie;
u64 epoch;
+ /* subset of available cpu features enabled by user space */
+ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
};
#define KVM_HVA_ERR_BAD (-1UL)
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 081b2ad99..6d39329c8 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -6,10 +6,11 @@
typedef struct {
cpumask_t cpu_attach_mask;
- atomic_t attach_count;
+ atomic_t flush_count;
unsigned int flush_mm;
- spinlock_t list_lock;
+ spinlock_t pgtable_lock;
struct list_head pgtable_list;
+ spinlock_t gmap_lock;
struct list_head gmap_list;
unsigned long asce;
unsigned long asce_limit;
@@ -22,9 +23,11 @@ typedef struct {
unsigned int use_skey:1;
} mm_context_t;
-#define INIT_MM_CONTEXT(name) \
- .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
- .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+#define INIT_MM_CONTEXT(name) \
+ .context.pgtable_lock = \
+ __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \
+ .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+ .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
static inline int tprot(unsigned long addr)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index c837b79b4..c6a088c91 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,11 +15,12 @@
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
- spin_lock_init(&mm->context.list_lock);
+ spin_lock_init(&mm->context.pgtable_lock);
INIT_LIST_HEAD(&mm->context.pgtable_list);
+ spin_lock_init(&mm->context.gmap_lock);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
- atomic_set(&mm->context.attach_count, 0);
+ atomic_set(&mm->context.flush_count, 0);
mm->context.flush_mm = 0;
#ifdef CONFIG_PGSTE
mm->context.alloc_pgste = page_table_allocate_pgste;
@@ -90,15 +91,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
S390_lowcore.user_asce = next->context.asce;
if (prev == next)
return;
- if (MACHINE_HAS_TLB_LC)
- cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
+ cpumask_set_cpu(cpu, &next->context.cpu_attach_mask);
+ cpumask_set_cpu(cpu, mm_cpumask(next));
/* Clear old ASCE by loading the kernel ASCE. */
__ctl_load(S390_lowcore.kernel_asce, 1, 1);
__ctl_load(S390_lowcore.kernel_asce, 7, 7);
- atomic_inc(&next->context.attach_count);
- atomic_dec(&prev->context.attach_count);
- if (MACHINE_HAS_TLB_LC)
- cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
+ cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask);
}
#define finish_arch_post_lock_switch finish_arch_post_lock_switch
@@ -110,10 +108,9 @@ static inline void finish_arch_post_lock_switch(void)
load_kernel_asce();
if (mm) {
preempt_disable();
- while (atomic_read(&mm->context.attach_count) >> 16)
+ while (atomic_read(&mm->context.flush_count))
cpu_relax();
- cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
if (mm->context.flush_mm)
__tlb_flush_mm(mm);
preempt_enable();
@@ -128,7 +125,6 @@ static inline void activate_mm(struct mm_struct *prev,
struct mm_struct *next)
{
switch_mm(prev, next, current);
- cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
set_user_asce(next);
}
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 53eacbd4f..69b8a41fc 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -21,6 +21,7 @@
#define HPAGE_SIZE (1UL << HPAGE_SHIFT)
#define HPAGE_MASK (~(HPAGE_SIZE - 1))
#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+#define HUGE_MAX_HSTATE 2
#define ARCH_HAS_SETCLEAR_HUGE_PTE
#define ARCH_HAS_HUGE_PTE_TYPE
@@ -30,11 +31,12 @@
#include <asm/setup.h>
#ifndef __ASSEMBLY__
+void __storage_key_init_range(unsigned long start, unsigned long end);
+
static inline void storage_key_init_range(unsigned long start, unsigned long end)
{
-#if PAGE_DEFAULT_KEY
- __storage_key_init_range(start, end);
-#endif
+ if (PAGE_DEFAULT_KEY)
+ __storage_key_init_range(start, end);
}
#define clear_page(page) memset((page), 0, PAGE_SIZE)
@@ -109,13 +111,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr)
static inline int page_reset_referenced(unsigned long addr)
{
- unsigned int ipm;
+ int cc;
asm volatile(
" rrbe 0,%1\n"
" ipm %0\n"
- : "=d" (ipm) : "a" (addr) : "cc");
- return !!(ipm & 0x20000000);
+ " srl %0,28\n"
+ : "=d" (cc) : "a" (addr) : "cc");
+ return cc;
}
 /* Bits in the storage key */
@@ -146,6 +149,8 @@ static inline int devmem_is_allowed(unsigned long pfn)
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT)
+#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h
index 1f7ff85c5..c64c0befd 100644
--- a/arch/s390/include/asm/perf_event.h
+++ b/arch/s390/include/asm/perf_event.h
@@ -86,16 +86,4 @@ struct sf_raw_sample {
u8 padding[]; /* Padding to next multiple of 8 */
} __packed;
-/* Perf hardware reserve and release functions */
-#ifdef CONFIG_PERF_EVENTS
-int perf_reserve_sampling(void);
-void perf_release_sampling(void);
-#else /* CONFIG_PERF_EVENTS */
-static inline int perf_reserve_sampling(void)
-{
- return 0;
-}
-static inline void perf_release_sampling(void) {}
-#endif /* CONFIG_PERF_EVENTS */
-
#endif /* _ASM_S390_PERF_EVENT_H */
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index da34cb6b1..f4eb9843e 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
void crst_table_free(struct mm_struct *, unsigned long *);
unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
void page_table_free(struct mm_struct *, unsigned long *);
void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
extern int page_table_allocate_pgste;
static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 42b968a85..72c7f60bf 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -28,12 +28,33 @@
#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/radix-tree.h>
+#include <linux/atomic.h>
#include <asm/bug.h>
#include <asm/page.h>
-extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096)));
+extern pgd_t swapper_pg_dir[];
extern void paging_init(void);
extern void vmem_map_init(void);
+pmd_t *vmem_pmd_alloc(void);
+pte_t *vmem_pte_alloc(void);
+
+enum {
+ PG_DIRECT_MAP_4K = 0,
+ PG_DIRECT_MAP_1M,
+ PG_DIRECT_MAP_2G,
+ PG_DIRECT_MAP_MAX
+};
+
+extern atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+
+static inline void update_page_count(int level, long count)
+{
+ if (IS_ENABLED(CONFIG_PROC_FS))
+ atomic_long_add(count, &direct_pages_count[level]);
+}
+
+struct seq_file;
+void arch_report_meminfo(struct seq_file *m);
/*
* The S390 doesn't have any external MMU info: the kernel page
@@ -221,8 +242,8 @@ static inline int is_module_addr(void *addr)
* swap .11..ttttt.0
* prot-none, clean, old .11.xx0000.1
* prot-none, clean, young .11.xx0001.1
- * prot-none, dirty, old .10.xx0010.1
- * prot-none, dirty, young .10.xx0011.1
+ * prot-none, dirty, old .11.xx0010.1
+ * prot-none, dirty, young .11.xx0011.1
* read-only, clean, old .11.xx0100.1
* read-only, clean, young .01.xx0101.1
* read-only, dirty, old .11.xx0110.1
@@ -256,6 +277,7 @@ static inline int is_module_addr(void *addr)
/* Bits in the region table entry */
#define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */
#define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */
+#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */
#define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */
#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */
#define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */
@@ -270,8 +292,23 @@ static inline int is_module_addr(void *addr)
#define _REGION3_ENTRY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_LENGTH)
#define _REGION3_ENTRY_EMPTY (_REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID)
-#define _REGION3_ENTRY_LARGE 0x400 /* RTTE-format control, large page */
-#define _REGION3_ENTRY_RO 0x200 /* page protection bit */
+#define _REGION3_ENTRY_ORIGIN_LARGE ~0x7fffffffUL /* large page address */
+#define _REGION3_ENTRY_ORIGIN ~0x7ffUL/* region third table origin */
+
+#define _REGION3_ENTRY_DIRTY 0x2000 /* SW region dirty bit */
+#define _REGION3_ENTRY_YOUNG 0x1000 /* SW region young bit */
+#define _REGION3_ENTRY_LARGE 0x0400 /* RTTE-format control, large page */
+#define _REGION3_ENTRY_READ 0x0002 /* SW region read bit */
+#define _REGION3_ENTRY_WRITE 0x0001 /* SW region write bit */
+
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _REGION3_ENTRY_SOFT_DIRTY 0x4000 /* SW region soft dirty bit */
+#else
+#define _REGION3_ENTRY_SOFT_DIRTY 0x0000 /* SW region soft dirty bit */
+#endif
+
+#define _REGION_ENTRY_BITS 0xfffffffffffff227UL
+#define _REGION_ENTRY_BITS_LARGE 0xffffffff8000fe27UL
/* Bits in the segment table entry */
#define _SEGMENT_ENTRY_BITS 0xfffffffffffffe33UL
@@ -287,8 +324,8 @@ static inline int is_module_addr(void *addr)
#define _SEGMENT_ENTRY_DIRTY 0x2000 /* SW segment dirty bit */
#define _SEGMENT_ENTRY_YOUNG 0x1000 /* SW segment young bit */
#define _SEGMENT_ENTRY_LARGE 0x0400 /* STE-format control, large page */
-#define _SEGMENT_ENTRY_READ 0x0002 /* SW segment read bit */
-#define _SEGMENT_ENTRY_WRITE 0x0001 /* SW segment write bit */
+#define _SEGMENT_ENTRY_WRITE 0x0002 /* SW segment write bit */
+#define _SEGMENT_ENTRY_READ 0x0001 /* SW segment read bit */
#ifdef CONFIG_MEM_SOFT_DIRTY
#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */
@@ -297,16 +334,17 @@ static inline int is_module_addr(void *addr)
#endif
/*
- * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
- * dy..R...I...rw
+ * Segment table and region3 table entry encoding
+ * (R = read-only, I = invalid, y = young bit):
+ * dy..R...I...wr
* prot-none, clean, old 00..1...1...00
* prot-none, clean, young 01..1...1...00
* prot-none, dirty, old 10..1...1...00
* prot-none, dirty, young 11..1...1...00
- * read-only, clean, old 00..1...1...10
- * read-only, clean, young 01..1...0...10
- * read-only, dirty, old 10..1...1...10
- * read-only, dirty, young 11..1...0...10
+ * read-only, clean, old 00..1...1...01
+ * read-only, clean, young 01..1...0...01
+ * read-only, dirty, old 10..1...1...01
+ * read-only, dirty, young 11..1...0...01
* read-write, clean, old 00..1...1...11
* read-write, clean, young 01..1...0...11
* read-write, dirty, old 10..0...1...11
@@ -327,6 +365,7 @@ static inline int is_module_addr(void *addr)
#define PGSTE_GC_BIT 0x0002000000000000UL
#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */
#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */
+#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */
/* Guest Page State used for virtualization */
#define _PGSTE_GPS_ZERO 0x0000000080000000UL
@@ -345,7 +384,7 @@ static inline int is_module_addr(void *addr)
/*
* Page protection definitions.
*/
-#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_INVALID)
+#define PAGE_NONE __pgprot(_PAGE_PRESENT | _PAGE_INVALID | _PAGE_PROTECT)
#define PAGE_READ __pgprot(_PAGE_PRESENT | _PAGE_READ | \
_PAGE_INVALID | _PAGE_PROTECT)
#define PAGE_WRITE __pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
@@ -391,6 +430,33 @@ static inline int is_module_addr(void *addr)
_SEGMENT_ENTRY_READ)
#define SEGMENT_WRITE __pgprot(_SEGMENT_ENTRY_READ | \
_SEGMENT_ENTRY_WRITE)
+#define SEGMENT_KERNEL __pgprot(_SEGMENT_ENTRY | \
+ _SEGMENT_ENTRY_LARGE | \
+ _SEGMENT_ENTRY_READ | \
+ _SEGMENT_ENTRY_WRITE | \
+ _SEGMENT_ENTRY_YOUNG | \
+ _SEGMENT_ENTRY_DIRTY)
+#define SEGMENT_KERNEL_RO __pgprot(_SEGMENT_ENTRY | \
+ _SEGMENT_ENTRY_LARGE | \
+ _SEGMENT_ENTRY_READ | \
+ _SEGMENT_ENTRY_YOUNG | \
+ _SEGMENT_ENTRY_PROTECT)
+
+/*
+ * Region3 entry (large page) protection definitions.
+ */
+
+#define REGION3_KERNEL __pgprot(_REGION_ENTRY_TYPE_R3 | \
+ _REGION3_ENTRY_LARGE | \
+ _REGION3_ENTRY_READ | \
+ _REGION3_ENTRY_WRITE | \
+ _REGION3_ENTRY_YOUNG | \
+ _REGION3_ENTRY_DIRTY)
+#define REGION3_KERNEL_RO __pgprot(_REGION_ENTRY_TYPE_R3 | \
+ _REGION3_ENTRY_LARGE | \
+ _REGION3_ENTRY_READ | \
+ _REGION3_ENTRY_YOUNG | \
+ _REGION_ENTRY_PROTECT)
static inline int mm_has_pgste(struct mm_struct *mm)
{
@@ -424,6 +490,53 @@ static inline int mm_use_skey(struct mm_struct *mm)
return 0;
}
+static inline void csp(unsigned int *ptr, unsigned int old, unsigned int new)
+{
+ register unsigned long reg2 asm("2") = old;
+ register unsigned long reg3 asm("3") = new;
+ unsigned long address = (unsigned long)ptr | 1;
+
+ asm volatile(
+ " csp %0,%3"
+ : "+d" (reg2), "+m" (*ptr)
+ : "d" (reg3), "d" (address)
+ : "cc");
+}
+
+static inline void cspg(unsigned long *ptr, unsigned long old, unsigned long new)
+{
+ register unsigned long reg2 asm("2") = old;
+ register unsigned long reg3 asm("3") = new;
+ unsigned long address = (unsigned long)ptr | 1;
+
+ asm volatile(
+ " .insn rre,0xb98a0000,%0,%3"
+ : "+d" (reg2), "+m" (*ptr)
+ : "d" (reg3), "d" (address)
+ : "cc");
+}
+
+#define CRDTE_DTT_PAGE 0x00UL
+#define CRDTE_DTT_SEGMENT 0x10UL
+#define CRDTE_DTT_REGION3 0x14UL
+#define CRDTE_DTT_REGION2 0x18UL
+#define CRDTE_DTT_REGION1 0x1cUL
+
+static inline void crdte(unsigned long old, unsigned long new,
+ unsigned long table, unsigned long dtt,
+ unsigned long address, unsigned long asce)
+{
+ register unsigned long reg2 asm("2") = old;
+ register unsigned long reg3 asm("3") = new;
+ register unsigned long reg4 asm("4") = table | dtt;
+ register unsigned long reg5 asm("5") = address;
+
+ asm volatile(".insn rrf,0xb98f0000,%0,%2,%4,0"
+ : "+d" (reg2)
+ : "d" (reg3), "d" (reg4), "d" (reg5), "a" (asce)
+ : "memory", "cc");
+}
+
/*
* pgd/pmd/pte query functions
*/
@@ -465,7 +578,7 @@ static inline int pud_none(pud_t pud)
{
if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
return 0;
- return (pud_val(pud) & _REGION_ENTRY_INVALID) != 0UL;
+ return pud_val(pud) == _REGION3_ENTRY_EMPTY;
}
static inline int pud_large(pud_t pud)
@@ -475,17 +588,35 @@ static inline int pud_large(pud_t pud)
return !!(pud_val(pud) & _REGION3_ENTRY_LARGE);
}
+static inline unsigned long pud_pfn(pud_t pud)
+{
+ unsigned long origin_mask;
+
+ origin_mask = _REGION3_ENTRY_ORIGIN;
+ if (pud_large(pud))
+ origin_mask = _REGION3_ENTRY_ORIGIN_LARGE;
+ return (pud_val(pud) & origin_mask) >> PAGE_SHIFT;
+}
+
+static inline int pmd_large(pmd_t pmd)
+{
+ return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
+}
+
+static inline int pmd_bad(pmd_t pmd)
+{
+ if (pmd_large(pmd))
+ return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
+ return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
+}
+
static inline int pud_bad(pud_t pud)
{
- /*
- * With dynamic page table levels the pud can be a region table
- * entry or a segment table entry. Check for the bit that are
- * invalid for either table entry.
- */
- unsigned long mask =
- ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID &
- ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH;
- return (pud_val(pud) & mask) != 0;
+ if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3)
+ return pmd_bad(__pmd(pud_val(pud)));
+ if (pud_large(pud))
+ return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0;
+ return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0;
}
static inline int pmd_present(pmd_t pmd)
@@ -498,11 +629,6 @@ static inline int pmd_none(pmd_t pmd)
return pmd_val(pmd) == _SEGMENT_ENTRY_INVALID;
}
-static inline int pmd_large(pmd_t pmd)
-{
- return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
-}
-
static inline unsigned long pmd_pfn(pmd_t pmd)
{
unsigned long origin_mask;
@@ -513,13 +639,6 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
return (pmd_val(pmd) & origin_mask) >> PAGE_SHIFT;
}
-static inline int pmd_bad(pmd_t pmd)
-{
- if (pmd_large(pmd))
- return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0;
- return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0;
-}
-
#define __HAVE_ARCH_PMD_WRITE
static inline int pmd_write(pmd_t pmd)
{
@@ -885,15 +1004,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t entry);
void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long bits);
+int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+ pte_t *ptep, int prot, unsigned long bit);
void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , int reset);
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq);
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc);
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key);
/*
* Certain architectures need to do special things when PTEs
@@ -963,6 +1093,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
#define pte_page(x) pfn_to_page(pte_pfn(x))
#define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd))
+#define pud_page(pud) pfn_to_page(pud_pfn(pud))
/* Find an entry in the lowest level page table.. */
#define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr))
@@ -970,20 +1101,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address)
#define pte_unmap(pte) do { } while (0)
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
-static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
-{
- /*
- * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
- * Convert to segment table entry format.
- */
- if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
- return pgprot_val(SEGMENT_NONE);
- if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
- return pgprot_val(SEGMENT_READ);
- return pgprot_val(SEGMENT_WRITE);
-}
-
static inline pmd_t pmd_wrprotect(pmd_t pmd)
{
pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE;
@@ -1020,6 +1137,56 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
return pmd;
}
+static inline pud_t pud_wrprotect(pud_t pud)
+{
+ pud_val(pud) &= ~_REGION3_ENTRY_WRITE;
+ pud_val(pud) |= _REGION_ENTRY_PROTECT;
+ return pud;
+}
+
+static inline pud_t pud_mkwrite(pud_t pud)
+{
+ pud_val(pud) |= _REGION3_ENTRY_WRITE;
+ if (pud_large(pud) && !(pud_val(pud) & _REGION3_ENTRY_DIRTY))
+ return pud;
+ pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
+ return pud;
+}
+
+static inline pud_t pud_mkclean(pud_t pud)
+{
+ if (pud_large(pud)) {
+ pud_val(pud) &= ~_REGION3_ENTRY_DIRTY;
+ pud_val(pud) |= _REGION_ENTRY_PROTECT;
+ }
+ return pud;
+}
+
+static inline pud_t pud_mkdirty(pud_t pud)
+{
+ if (pud_large(pud)) {
+ pud_val(pud) |= _REGION3_ENTRY_DIRTY |
+ _REGION3_ENTRY_SOFT_DIRTY;
+ if (pud_val(pud) & _REGION3_ENTRY_WRITE)
+ pud_val(pud) &= ~_REGION_ENTRY_PROTECT;
+ }
+ return pud;
+}
+
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE)
+static inline unsigned long massage_pgprot_pmd(pgprot_t pgprot)
+{
+ /*
+ * pgprot is PAGE_NONE, PAGE_READ, or PAGE_WRITE (see __Pxxx / __Sxxx)
+ * Convert to segment table entry format.
+ */
+ if (pgprot_val(pgprot) == pgprot_val(PAGE_NONE))
+ return pgprot_val(SEGMENT_NONE);
+ if (pgprot_val(pgprot) == pgprot_val(PAGE_READ))
+ return pgprot_val(SEGMENT_READ);
+ return pgprot_val(SEGMENT_WRITE);
+}
+
static inline pmd_t pmd_mkyoung(pmd_t pmd)
{
if (pmd_large(pmd)) {
@@ -1068,15 +1235,8 @@ static inline pmd_t mk_pmd_phys(unsigned long physpage, pgprot_t pgprot)
static inline void __pmdp_csp(pmd_t *pmdp)
{
- register unsigned long reg2 asm("2") = pmd_val(*pmdp);
- register unsigned long reg3 asm("3") = pmd_val(*pmdp) |
- _SEGMENT_ENTRY_INVALID;
- register unsigned long reg4 asm("4") = ((unsigned long) pmdp) + 5;
-
- asm volatile(
- " csp %1,%3"
- : "=m" (*pmdp)
- : "d" (reg2), "d" (reg3), "d" (reg4), "m" (*pmdp) : "cc");
+ csp((unsigned int *)pmdp + 1, pmd_val(*pmdp),
+ pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID);
}
static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp)
@@ -1091,6 +1251,19 @@ static inline void __pmdp_idte(unsigned long address, pmd_t *pmdp)
: "cc" );
}
+static inline void __pudp_idte(unsigned long address, pud_t *pudp)
+{
+ unsigned long r3o;
+
+ r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
+ r3o |= _ASCE_TYPE_REGION3;
+ asm volatile(
+ " .insn rrf,0xb98e0000,%2,%3,0,0"
+ : "=m" (*pudp)
+ : "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
+ : "cc");
+}
+
static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
{
unsigned long sto;
@@ -1103,8 +1276,22 @@ static inline void __pmdp_idte_local(unsigned long address, pmd_t *pmdp)
: "cc" );
}
+static inline void __pudp_idte_local(unsigned long address, pud_t *pudp)
+{
+ unsigned long r3o;
+
+ r3o = (unsigned long) pudp - pud_index(address) * sizeof(pud_t);
+ r3o |= _ASCE_TYPE_REGION3;
+ asm volatile(
+ " .insn rrf,0xb98e0000,%2,%3,0,1"
+ : "=m" (*pudp)
+ : "m" (*pudp), "a" (r3o), "a" ((address & PUD_MASK))
+ : "cc");
+}
+
pmd_t pmdp_xchg_direct(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
pmd_t pmdp_xchg_lazy(struct mm_struct *, unsigned long, pmd_t *, pmd_t);
+pud_t pudp_xchg_direct(struct mm_struct *, unsigned long, pud_t *, pud_t);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
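
The new csp()/cspg()/crdte() helpers wrap compare-and-swap-and-purge style table updates; __pmdp_csp() above is now just a csp() call on the low word of the entry. A hedged sketch of a CRDTE-based segment-table entry swap (the segment-table origin argument and the use of the kernel ASCE are assumptions, not from this patch):

#include <asm/lowcore.h>
#include <asm/pgtable.h>

/* Swap one segment-table entry and purge its TLB entry via CRDTE.
 * "sto" must be the origin of the segment table containing pmdp. */
static void crdte_set_pmd(unsigned long sto, pmd_t *pmdp, pmd_t new,
			  unsigned long addr)
{
	crdte(pmd_val(*pmdp), pmd_val(new), sto, CRDTE_DTT_SEGMENT,
	      addr, S390_lowcore.kernel_asce);
}
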
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 9d4d311d7..03323175d 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -77,7 +77,10 @@ static inline void get_cpu_id(struct cpuid *ptr)
asm volatile("stidp %0" : "=Q" (*ptr));
}
-extern void s390_adjust_jiffies(void);
+void s390_adjust_jiffies(void);
+void s390_update_cpu_mhz(void);
+void cpu_detect_mhz_feature(void);
+
extern const struct seq_operations cpuinfo_op;
extern int sysctl_ieee_emulation_warnings;
extern void execve_tail(void);
@@ -109,6 +112,8 @@ struct thread_struct {
unsigned long ksp; /* kernel stack pointer */
mm_segment_t mm_segment;
unsigned long gmap_addr; /* address of last gmap fault. */
+ unsigned int gmap_write_flag; /* gmap fault write indication */
+ unsigned int gmap_int_code; /* int code of last gmap fault */
unsigned int gmap_pfault; /* signal of a pending guest pfault */
struct per_regs per_user; /* User specified PER registers */
struct per_event per_event; /* Cause of the last PER trap */
@@ -233,6 +238,18 @@ void cpu_relax(void);
#define cpu_relax_lowlatency() barrier()
+#define ECAG_CACHE_ATTRIBUTE 0
+#define ECAG_CPU_ATTRIBUTE 1
+
+static inline unsigned long __ecag(unsigned int asi, unsigned char parm)
+{
+ unsigned long val;
+
+ asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */
+ : "=d" (val) : "a" (asi << 8 | parm));
+ return val;
+}
+
static inline void psw_set_key(unsigned int key)
{
asm volatile("spka 0(%0)" : : "d" (key));
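
__ecag() issues the "extract CPU attribute" instruction; asi selects the attribute space and parm the subcode. A hedged sketch of probing for the CPU attribute word (treating an all-ones result as "not implemented" is an assumption of this sketch):

#include <asm/processor.h>

static int cpu_attribute_present(void)
{
	return __ecag(ECAG_CPU_ATTRIBUTE, 0) != 0xffffffffUL;
}
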
diff --git a/arch/s390/include/asm/rwsem.h b/arch/s390/include/asm/rwsem.h
index c75e4471e..597e7e96b 100644
--- a/arch/s390/include/asm/rwsem.h
+++ b/arch/s390/include/asm/rwsem.h
@@ -207,41 +207,4 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
rwsem_downgrade_wake(sem);
}
-/*
- * implement atomic add functionality
- */
-static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
-{
- signed long old, new;
-
- asm volatile(
- " lg %0,%2\n"
- "0: lgr %1,%0\n"
- " agr %1,%4\n"
- " csg %0,%1,%2\n"
- " jl 0b"
- : "=&d" (old), "=&d" (new), "=Q" (sem->count)
- : "Q" (sem->count), "d" (delta)
- : "cc", "memory");
-}
-
-/*
- * implement exchange and add functionality
- */
-static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
-{
- signed long old, new;
-
- asm volatile(
- " lg %0,%2\n"
- "0: lgr %1,%0\n"
- " agr %1,%4\n"
- " csg %0,%1,%2\n"
- " jl 0b"
- : "=&d" (old), "=&d" (new), "=Q" (sem->count)
- : "Q" (sem->count), "d" (delta)
- : "cc", "memory");
- return new;
-}
-
#endif /* _S390_RWSEM_H */
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index e4f6f73af..2ad9c204b 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -32,12 +32,19 @@ struct sclp_core_entry {
u8 reserved0;
u8 : 4;
u8 sief2 : 1;
- u8 : 3;
- u8 : 3;
+ u8 skey : 1;
+ u8 : 2;
+ u8 : 2;
+ u8 gpere : 1;
u8 siif : 1;
u8 sigpif : 1;
u8 : 3;
- u8 reserved2[10];
+ u8 reserved2[3];
+ u8 : 2;
+ u8 ib : 1;
+ u8 cei : 1;
+ u8 : 4;
+ u8 reserved3[6];
u8 type;
u8 reserved1;
} __attribute__((packed));
@@ -59,6 +66,15 @@ struct sclp_info {
unsigned char has_hvs : 1;
unsigned char has_esca : 1;
unsigned char has_sief2 : 1;
+ unsigned char has_64bscao : 1;
+ unsigned char has_gpere : 1;
+ unsigned char has_cmma : 1;
+ unsigned char has_gsls : 1;
+ unsigned char has_ib : 1;
+ unsigned char has_cei : 1;
+ unsigned char has_pfmfi : 1;
+ unsigned char has_ibs : 1;
+ unsigned char has_skey : 1;
unsigned int ibc;
unsigned int mtid;
unsigned int mtid_cp;
@@ -101,5 +117,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
void sclp_early_detect(void);
void _sclp_print_early(const char *);
+void sclp_ocf_cpc_name_copy(char *dst);
#endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h
index fbd9116eb..5ce29fe10 100644
--- a/arch/s390/include/asm/sections.h
+++ b/arch/s390/include/asm/sections.h
@@ -4,5 +4,6 @@
#include <asm-generic/sections.h>
extern char _eshared[], _ehead[];
+extern char __start_ro_after_init[], __end_ro_after_init[];
#endif
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index c0f0efbb6..5e8d57e1c 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -86,9 +86,13 @@ extern char vmpoff_cmd[];
#define CONSOLE_IS_SCLP (console_mode == 1)
#define CONSOLE_IS_3215 (console_mode == 2)
#define CONSOLE_IS_3270 (console_mode == 3)
+#define CONSOLE_IS_VT220 (console_mode == 4)
+#define CONSOLE_IS_HVC (console_mode == 5)
#define SET_CONSOLE_SCLP do { console_mode = 1; } while (0)
#define SET_CONSOLE_3215 do { console_mode = 2; } while (0)
#define SET_CONSOLE_3270 do { console_mode = 3; } while (0)
+#define SET_CONSOLE_VT220 do { console_mode = 4; } while (0)
+#define SET_CONSOLE_HVC do { console_mode = 5; } while (0)
#define NSS_NAME_SIZE 8
extern char kernel_nss_name[];
diff --git a/arch/s390/include/asm/sigp.h b/arch/s390/include/asm/sigp.h
index 1c8f33fca..72df5f2de 100644
--- a/arch/s390/include/asm/sigp.h
+++ b/arch/s390/include/asm/sigp.h
@@ -37,8 +37,8 @@
#ifndef __ASSEMBLY__
-static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm,
- u32 *status)
+static inline int ____pcpu_sigp(u16 addr, u8 order, unsigned long parm,
+ u32 *status)
{
register unsigned long reg1 asm ("1") = parm;
int cc;
@@ -48,8 +48,19 @@ static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm,
" ipm %0\n"
" srl %0,28\n"
: "=d" (cc), "+d" (reg1) : "d" (addr), "a" (order) : "cc");
+ *status = reg1;
+ return cc;
+}
+
+static inline int __pcpu_sigp(u16 addr, u8 order, unsigned long parm,
+ u32 *status)
+{
+ u32 _status;
+ int cc;
+
+ cc = ____pcpu_sigp(addr, order, parm, &_status);
if (status && cc == 1)
- *status = reg1;
+ *status = _status;
return cc;
}
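
The split above lets callers obtain the status register unconditionally via
____pcpu_sigp(), while __pcpu_sigp() keeps the old contract of writing
*status only for condition code 1 ("status stored"). A minimal caller
sketch, using the order and status constants defined earlier in this header
(the helper name is an illustration, not part of the patch):

static inline int example_cpu_is_stopped(u16 addr)
{
	u32 status = 0;

	if (__pcpu_sigp(addr, SIGP_SENSE, 0, &status) != SIGP_CC_STATUS_STORED)
		return 0;	/* no status stored: treat as not stopped */
	return !!(status & SIGP_STATUS_STOPPED);
}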
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 63ebf37d3..7e9e09f60 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -10,6 +10,8 @@
#define __ASM_SPINLOCK_H
#include <linux/smp.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
#define SPINLOCK_LOCKVAL (S390_lowcore.spinlock_lockval)
@@ -97,6 +99,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
{
while (arch_spin_is_locked(lock))
arch_spin_relax(lock);
+ smp_acquire__after_ctrl_dep();
}
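
A note on the added barrier: the spin loop above only establishes a control
dependency, which does not order later loads. smp_acquire__after_ctrl_dep()
upgrades it to ACQUIRE semantics, so reads placed after
arch_spin_unlock_wait() cannot observe state from before the unlock that
was just seen.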
/*
diff --git a/arch/s390/include/asm/stp.h b/arch/s390/include/asm/stp.h
new file mode 100644
index 000000000..768972758
--- /dev/null
+++ b/arch/s390/include/asm/stp.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright IBM Corp. 2006
+ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com)
+ */
+#ifndef __S390_STP_H
+#define __S390_STP_H
+
+/* notifier for syncs */
+extern struct atomic_notifier_head s390_epoch_delta_notifier;
+
+/* STP interruption parameter */
+struct stp_irq_parm {
+ unsigned int _pad0 : 14;
+ unsigned int tsc : 1; /* Timing status change */
+ unsigned int lac : 1; /* Link availability change */
+ unsigned int tcpc : 1; /* Time control parameter change */
+ unsigned int _pad2 : 15;
+} __attribute__ ((packed));
+
+#define STP_OP_SYNC 1
+#define STP_OP_CTRL 3
+
+struct stp_sstpi {
+ unsigned int rsvd0;
+ unsigned int rsvd1 : 8;
+ unsigned int stratum : 8;
+ unsigned int vbits : 16;
+ unsigned int leaps : 16;
+ unsigned int tmd : 4;
+ unsigned int ctn : 4;
+ unsigned int rsvd2 : 3;
+ unsigned int c : 1;
+ unsigned int tst : 4;
+ unsigned int tzo : 16;
+ unsigned int dsto : 16;
+ unsigned int ctrl : 16;
+ unsigned int rsvd3 : 16;
+ unsigned int tto;
+ unsigned int rsvd4;
+ unsigned int ctnid[3];
+ unsigned int rsvd5;
+ unsigned int todoff[4];
+ unsigned int rsvd6[48];
+} __attribute__ ((packed));
+
+/* Functions needed by the machine check handler */
+int stp_sync_check(void);
+int stp_island_check(void);
+void stp_queue_work(void);
+
+#endif /* __S390_STP_H */
diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index dcb6312a0..0bb08f341 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -52,6 +52,70 @@ static inline void store_clock_comparator(__u64 *time)
void clock_comparator_work(void);
+void __init ptff_init(void);
+
+extern unsigned char ptff_function_mask[16];
+extern unsigned long lpar_offset;
+extern unsigned long initial_leap_seconds;
+
+/* Function codes for the ptff instruction. */
+#define PTFF_QAF 0x00 /* query available functions */
+#define PTFF_QTO 0x01 /* query tod offset */
+#define PTFF_QSI 0x02 /* query steering information */
+#define PTFF_QUI 0x04 /* query UTC information */
+#define PTFF_ATO 0x40 /* adjust tod offset */
+#define PTFF_STO 0x41 /* set tod offset */
+#define PTFF_SFS 0x42 /* set fine steering rate */
+#define PTFF_SGS 0x43 /* set gross steering rate */
+
+/* Query TOD offset result */
+struct ptff_qto {
+ unsigned long long physical_clock;
+ unsigned long long tod_offset;
+ unsigned long long logical_tod_offset;
+ unsigned long long tod_epoch_difference;
+} __packed;
+
+static inline int ptff_query(unsigned int nr)
+{
+ unsigned char *ptr;
+
+ ptr = ptff_function_mask + (nr >> 3);
+ return (*ptr & (0x80 >> (nr & 7))) != 0;
+}
+
+/* Query UTC information result */
+struct ptff_qui {
+ unsigned int tm : 2;
+ unsigned int ts : 2;
+ unsigned int : 28;
+ unsigned int pad_0x04;
+ unsigned long leap_event;
+ short old_leap;
+ short new_leap;
+ unsigned int pad_0x14;
+ unsigned long prt[5];
+ unsigned long cst[3];
+ unsigned int skew;
+ unsigned int pad_0x5c[41];
+} __packed;
+
+static inline int ptff(void *ptff_block, size_t len, unsigned int func)
+{
+ typedef struct { char _[len]; } addrtype;
+ register unsigned int reg0 asm("0") = func;
+ register unsigned long reg1 asm("1") = (unsigned long) ptff_block;
+ int rc;
+
+ asm volatile(
+ " .word 0x0104\n"
+ " ipm %0\n"
+ " srl %0,28\n"
+ : "=d" (rc), "+m" (*(addrtype *) ptff_block)
+ : "d" (reg0), "d" (reg1) : "cc");
+ return rc;
+}
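
A usage sketch for the new PTFF helpers, mirroring how the time code is
expected to call them (the helper name and error codes are illustrative
assumptions):

static int example_get_tod_offset(struct ptff_qto *qto)
{
	/* PTFF_QTO may not be installed on older machines */
	if (!ptff_query(PTFF_QTO))
		return -EOPNOTSUPP;
	/* ptff() returns the condition code; 0 means success */
	if (ptff(qto, sizeof(*qto), PTFF_QTO))
		return -EIO;
	return 0;
}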
+
static inline unsigned long long local_tick_disable(void)
{
unsigned long long old;
@@ -105,7 +169,7 @@ static inline cycles_t get_cycles(void)
return (cycles_t) get_tod_clock() >> 2;
}
-int get_sync_clock(unsigned long long *clock);
+int get_phys_clock(unsigned long long *clock);
void init_cpu_timer(void);
unsigned long long monotonic_clock(void);
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index 7a92e69c5..15711de10 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -87,10 +87,10 @@ static inline void tlb_finish_mmu(struct mmu_gather *tlb,
* tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page
* has already been freed, so just do free_page_and_swap_cache.
*/
-static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
{
free_page_and_swap_cache(page);
- return 1; /* avoid calling tlb_flush_mmu */
+ return false; /* avoid calling tlb_flush_mmu */
}
static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
@@ -98,6 +98,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page)
free_page_and_swap_cache(page);
}
+static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
+ struct page *page, int page_size)
+{
+ return __tlb_remove_page(tlb, page);
+}
+
+static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb,
+ struct page *page)
+{
+ return __tlb_remove_page(tlb, page);
+}
+
+static inline void tlb_remove_page_size(struct mmu_gather *tlb,
+ struct page *page, int page_size)
+{
+ return tlb_remove_page(tlb, page);
+}
+
/*
* pte_free_tlb frees a pte table and clears the CRSTE for the
* page table from the tlb.
diff --git a/arch/s390/include/asm/tlbflush.h b/arch/s390/include/asm/tlbflush.h
index 0a2031618..1a691ef74 100644
--- a/arch/s390/include/asm/tlbflush.h
+++ b/arch/s390/include/asm/tlbflush.h
@@ -5,6 +5,7 @@
#include <linux/sched.h>
#include <asm/processor.h>
#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
/*
* Flush all TLB entries on the local CPU.
@@ -44,17 +45,9 @@ void smp_ptlb_all(void);
*/
static inline void __tlb_flush_global(void)
{
- register unsigned long reg2 asm("2");
- register unsigned long reg3 asm("3");
- register unsigned long reg4 asm("4");
- long dummy;
-
- dummy = 0;
- reg2 = reg3 = 0;
- reg4 = ((unsigned long) &dummy) + 1;
- asm volatile(
- " csp %0,%2"
- : : "d" (reg2), "d" (reg3), "d" (reg4), "m" (dummy) : "cc" );
+ unsigned int dummy = 0;
+
+ csp(&dummy, 0, 0);
}
/*
@@ -64,7 +57,7 @@ static inline void __tlb_flush_global(void)
static inline void __tlb_flush_full(struct mm_struct *mm)
{
preempt_disable();
- atomic_add(0x10000, &mm->context.attach_count);
+ atomic_inc(&mm->context.flush_count);
if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
/* Local TLB flush */
__tlb_flush_local();
@@ -76,7 +69,7 @@ static inline void __tlb_flush_full(struct mm_struct *mm)
cpumask_copy(mm_cpumask(mm),
&mm->context.cpu_attach_mask);
}
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
preempt_enable();
}
@@ -86,12 +79,9 @@ static inline void __tlb_flush_full(struct mm_struct *mm)
*/
static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
{
- int active, count;
-
preempt_disable();
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
__tlb_flush_idte_local(asce);
} else {
@@ -104,7 +94,7 @@ static inline void __tlb_flush_asce(struct mm_struct *mm, unsigned long asce)
cpumask_copy(mm_cpumask(mm),
&mm->context.cpu_attach_mask);
}
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
preempt_enable();
}
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 6b53962e8..f15f5571c 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -14,10 +14,12 @@ struct cpu_topology_s390 {
unsigned short core_id;
unsigned short socket_id;
unsigned short book_id;
+ unsigned short drawer_id;
unsigned short node_id;
cpumask_t thread_mask;
cpumask_t core_mask;
cpumask_t book_mask;
+ cpumask_t drawer_mask;
};
DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
@@ -30,6 +32,8 @@ DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology);
#define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask)
#define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id)
#define topology_book_cpumask(cpu) (&per_cpu(cpu_topology, cpu).book_mask)
+#define topology_drawer_id(cpu) (per_cpu(cpu_topology, cpu).drawer_id)
+#define topology_drawer_cpumask(cpu) (&per_cpu(cpu_topology, cpu).drawer_mask)
#define mc_capable() 1
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index 666fd8ba1..52d7c8709 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -151,8 +151,65 @@ unsigned long __must_check __copy_to_user(void __user *to, const void *from,
__rc; \
})
-#define __put_user_fn(x, ptr, size) __put_get_user_asm(ptr, x, size, 0x810000UL)
-#define __get_user_fn(x, ptr, size) __put_get_user_asm(x, ptr, size, 0x81UL)
+static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
+{
+ unsigned long spec = 0x810000UL;
+ int rc;
+
+ switch (size) {
+ case 1:
+ rc = __put_get_user_asm((unsigned char __user *)ptr,
+ (unsigned char *)x,
+ size, spec);
+ break;
+ case 2:
+ rc = __put_get_user_asm((unsigned short __user *)ptr,
+ (unsigned short *)x,
+ size, spec);
+ break;
+ case 4:
+ rc = __put_get_user_asm((unsigned int __user *)ptr,
+ (unsigned int *)x,
+ size, spec);
+ break;
+ case 8:
+ rc = __put_get_user_asm((unsigned long __user *)ptr,
+ (unsigned long *)x,
+ size, spec);
+ break;
+	}
+ return rc;
+}
+
+static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size)
+{
+ unsigned long spec = 0x81UL;
+ int rc;
+
+ switch (size) {
+ case 1:
+ rc = __put_get_user_asm((unsigned char *)x,
+ (unsigned char __user *)ptr,
+ size, spec);
+ break;
+ case 2:
+ rc = __put_get_user_asm((unsigned short *)x,
+ (unsigned short __user *)ptr,
+ size, spec);
+ break;
+ case 4:
+ rc = __put_get_user_asm((unsigned int *)x,
+ (unsigned int __user *)ptr,
+ size, spec);
+ break;
+ case 8:
+ rc = __put_get_user_asm((unsigned long *)x,
+ (unsigned long __user *)ptr,
+ size, spec);
+ break;
+	}
+ return rc;
+}
#else /* CONFIG_HAVE_MARCH_Z10_FEATURES */
@@ -191,7 +248,7 @@ static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long s
__put_user_bad(); \
break; \
} \
- __pu_err; \
+ __builtin_expect(__pu_err, 0); \
})
#define put_user(x, ptr) \
@@ -240,7 +297,7 @@ int __put_user_bad(void) __attribute__((noreturn));
__get_user_bad(); \
break; \
} \
- __gu_err; \
+ __builtin_expect(__gu_err, 0); \
})
#define get_user(x, ptr) \
@@ -254,6 +311,14 @@ int __get_user_bad(void) __attribute__((noreturn));
#define __put_user_unaligned __put_user
#define __get_user_unaligned __get_user
+extern void __compiletime_error("usercopy buffer size is too small")
+__bad_copy_user(void);
+
+static inline void copy_user_overflow(int size, unsigned long count)
+{
+ WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count);
+}
+
/**
* copy_to_user: - Copy a block of data into user space.
* @to: Destination address, in user space.
@@ -275,12 +340,6 @@ copy_to_user(void __user *to, const void *from, unsigned long n)
return __copy_to_user(to, from, n);
}
-void copy_from_user_overflow(void)
-#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
-__compiletime_warning("copy_from_user() buffer size is not provably correct")
-#endif
-;
-
/**
* copy_from_user: - Copy a block of data from user space.
* @to: Destination address, in kernel space.
@@ -305,7 +364,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
might_fault();
if (unlikely(sz != -1 && sz < n)) {
- copy_from_user_overflow();
+ if (!__builtin_constant_p(n))
+ copy_user_overflow(sz, n);
+ else
+ __bad_copy_user();
return n;
}
return __copy_from_user(to, from, n);
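
The rewritten check distinguishes compile-time from runtime lengths: a
constant n that provably overflows the destination now fails the build
through __bad_copy_user(), while a runtime n falls back to the WARN in
copy_user_overflow() and returns n uncopied. A sketch of both cases
(buffer and length names are hypothetical):

void example(void __user *uptr, unsigned long user_len)
{
	char buf[8];

	/* constant n larger than buf: compile error via __bad_copy_user()
	 * copy_from_user(buf, uptr, 16);
	 */

	/* runtime n: if user_len > sizeof(buf), WARNs and returns n */
	if (copy_from_user(buf, uptr, user_len))
		return;	/* fault or overflow */
}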
diff --git a/arch/s390/include/uapi/asm/auxvec.h b/arch/s390/include/uapi/asm/auxvec.h
index a1f153e89..c53e08442 100644
--- a/arch/s390/include/uapi/asm/auxvec.h
+++ b/arch/s390/include/uapi/asm/auxvec.h
@@ -3,4 +3,6 @@
#define AT_SYSINFO_EHDR 33
+#define AT_VECTOR_SIZE_ARCH 1 /* entries in ARCH_DLINFO */
+
#endif
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 3b8e99ef9..a2ffec413 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -93,6 +93,47 @@ struct kvm_s390_vm_cpu_machine {
__u64 fac_list[256];
};
+#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2
+#define KVM_S390_VM_CPU_MACHINE_FEAT 3
+
+#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024
+#define KVM_S390_VM_CPU_FEAT_ESOP 0
+#define KVM_S390_VM_CPU_FEAT_SIEF2 1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO 2
+#define KVM_S390_VM_CPU_FEAT_SIIF 3
+#define KVM_S390_VM_CPU_FEAT_GPERE 4
+#define KVM_S390_VM_CPU_FEAT_GSLS 5
+#define KVM_S390_VM_CPU_FEAT_IB 6
+#define KVM_S390_VM_CPU_FEAT_CEI 7
+#define KVM_S390_VM_CPU_FEAT_IBS 8
+#define KVM_S390_VM_CPU_FEAT_SKEY 9
+#define KVM_S390_VM_CPU_FEAT_CMMA 10
+#define KVM_S390_VM_CPU_FEAT_PFMFI 11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF 12
+struct kvm_s390_vm_cpu_feat {
+ __u64 feat[16];
+};
+
+#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4
+#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5
+/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */
+struct kvm_s390_vm_cpu_subfunc {
+ __u8 plo[32]; /* always */
+ __u8 ptff[16]; /* with TOD-clock steering */
+ __u8 kmac[16]; /* with MSA */
+ __u8 kmc[16]; /* with MSA */
+ __u8 km[16]; /* with MSA */
+ __u8 kimd[16]; /* with MSA */
+ __u8 klmd[16]; /* with MSA */
+ __u8 pckmo[16]; /* with MSA3 */
+ __u8 kmctr[16]; /* with MSA4 */
+ __u8 kmf[16]; /* with MSA4 */
+ __u8 kmo[16]; /* with MSA4 */
+ __u8 pcc[16]; /* with MSA4 */
+ __u8 ppno[16]; /* with MSA5 */
+ __u8 reserved[1824];
+};
+
/* kvm attributes for crypto */
#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0
#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1
diff --git a/arch/s390/include/uapi/asm/ptrace.h b/arch/s390/include/uapi/asm/ptrace.h
index a150f4fab..77630c74f 100644
--- a/arch/s390/include/uapi/asm/ptrace.h
+++ b/arch/s390/include/uapi/asm/ptrace.h
@@ -359,9 +359,9 @@ typedef struct
per_cr_bits bits;
} control_regs;
/*
- * Use these flags instead of setting em_instruction_fetch
- * directly they are used so that single stepping can be
- * switched on & off while not affecting other tracing
+ * The single_step and instruction_fetch bits are obsolete,
+ * the kernel always sets them to zero. To enable single
+ * stepping use ptrace(PTRACE_SINGLESTEP) instead.
*/
unsigned single_step : 1;
unsigned instruction_fetch : 1;
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
index 8fb5d4a6d..3ac634368 100644
--- a/arch/s390/include/uapi/asm/sie.h
+++ b/arch/s390/include/uapi/asm/sie.h
@@ -140,6 +140,7 @@
exit_code_ipa0(0xB2, 0x4c, "TAR"), \
exit_code_ipa0(0xB2, 0x50, "CSP"), \
exit_code_ipa0(0xB2, 0x54, "MVPG"), \
+ exit_code_ipa0(0xB2, 0x56, "STHYI"), \
exit_code_ipa0(0xB2, 0x58, "BSG"), \
exit_code_ipa0(0xB2, 0x5a, "BSA"), \
exit_code_ipa0(0xB2, 0x5f, "CHSC"), \
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 2f5586ab8..3234817c7 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -2,6 +2,10 @@
# Makefile for the linux kernel.
#
+KCOV_INSTRUMENT_early.o := n
+KCOV_INSTRUMENT_sclp.o := n
+KCOV_INSTRUMENT_als.o := n
+
ifdef CONFIG_FUNCTION_TRACER
# Don't trace early setup code and tracing code
CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE)
@@ -29,23 +33,27 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"'
CFLAGS_sysinfo.o += -w
#
-# Use -march=z900 for sclp.c to be able to print an error message if
-# the kernel is started on a machine which is too old
+# Use -march=z900 for sclp.c and als.c to be able to print an error
+# message if the kernel is started on a machine which is too old
#
CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_als.o = $(CC_FLAGS_FTRACE)
ifneq ($(CC_FLAGS_MARCH),-march=z900)
CFLAGS_REMOVE_sclp.o += $(CC_FLAGS_MARCH)
CFLAGS_sclp.o += -march=z900
+CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH)
+CFLAGS_als.o += -march=z900
AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH)
AFLAGS_head.o += -march=z900
endif
GCOV_PROFILE_sclp.o := n
+GCOV_PROFILE_als.o := n
obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
-obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o
+obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o als.o
obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o
-obj-y += runtime_instr.o cache.o dumpstack.o
+obj-y += runtime_instr.o cache.o fpu.o dumpstack.o
obj-y += entry.o reipl.o relocate_kernel.o
extra-y += head.o head64.o vmlinux.lds
diff --git a/arch/s390/kernel/als.c b/arch/s390/kernel/als.c
new file mode 100644
index 000000000..a16e9d1bf
--- /dev/null
+++ b/arch/s390/kernel/als.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright IBM Corp. 2016
+ */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <asm/processor.h>
+#include <asm/facility.h>
+#include <asm/lowcore.h>
+#include <asm/sclp.h>
+#include "entry.h"
+
+/*
+ * The code within this file will be called very early. It may _not_
+ * access anything within the bss section, since that is not cleared
+ * yet and may contain data (e.g. initrd) that must be saved by other
+ * code.
+ * For temporary objects the stack (16k) should be used.
+ */
+
+static unsigned long als[] __initdata = { FACILITIES_ALS };
+
+static void __init u16_to_hex(char *str, u16 val)
+{
+ int i, num;
+
+ for (i = 1; i <= 4; i++) {
+ num = (val >> (16 - 4 * i)) & 0xf;
+ if (num >= 10)
+ num += 7;
+ *str++ = '0' + num;
+ }
+ *str = '\0';
+}
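
A worked example of the loop above: for val = 0x2af7 the four nibbles
2, 0xa, 0xf and 0x7 are extracted from the most significant end; values
of 10 and above get 7 added to skip the ASCII characters between '9'
and 'A', so '0' + num produces the string "2AF7".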
+
+static void __init print_machine_type(void)
+{
+ static char mach_str[80] __initdata = "Detected machine-type number: ";
+ char type_str[5];
+ struct cpuid id;
+
+ get_cpu_id(&id);
+ u16_to_hex(type_str, id.machine);
+ strcat(mach_str, type_str);
+ _sclp_print_early(mach_str);
+}
+
+static void __init u16_to_decimal(char *str, u16 val)
+{
+ int div = 1;
+
+ while (div * 10 <= val)
+ div *= 10;
+ while (div) {
+ *str++ = '0' + val / div;
+ val %= div;
+ div /= 10;
+ }
+ *str = '\0';
+}
+
+static void __init print_missing_facilities(void)
+{
+ static char als_str[80] __initdata = "Missing facilities: ";
+ unsigned long val;
+ char val_str[6];
+ int i, j, first;
+
+ first = 1;
+ for (i = 0; i < ARRAY_SIZE(als); i++) {
+ val = ~S390_lowcore.stfle_fac_list[i] & als[i];
+ for (j = 0; j < BITS_PER_LONG; j++) {
+ if (!(val & (1UL << (BITS_PER_LONG - 1 - j))))
+ continue;
+ if (!first)
+ strcat(als_str, ",");
+ /*
+ * Make sure we stay within one line. Consider that
+ * each facility bit adds up to five characters and
+ * z/VM adds a four character prefix.
+ */
+ if (strlen(als_str) > 70) {
+ _sclp_print_early(als_str);
+ *als_str = '\0';
+ }
+ u16_to_decimal(val_str, i * BITS_PER_LONG + j);
+ strcat(als_str, val_str);
+ first = 0;
+ }
+ }
+ _sclp_print_early(als_str);
+ _sclp_print_early("See Principles of Operations for facility bits");
+}
+
+static void __init facility_mismatch(void)
+{
+ _sclp_print_early("The Linux kernel requires more recent processor hardware");
+ print_machine_type();
+ print_missing_facilities();
+ disabled_wait(0x8badcccc);
+}
+
+void __init verify_facilities(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(S390_lowcore.stfle_fac_list); i++)
+ S390_lowcore.stfle_fac_list[i] = 0;
+ asm volatile(
+ " stfl 0(0)\n"
+ : "=m" (S390_lowcore.stfl_fac_list));
+ S390_lowcore.stfle_fac_list[0] = (u64)S390_lowcore.stfl_fac_list << 32;
+ if (S390_lowcore.stfl_fac_list & 0x01000000) {
+ register unsigned long reg0 asm("0") = ARRAY_SIZE(als) - 1;
+
+ asm volatile(".insn s,0xb2b00000,0(%1)" /* stfle */
+ : "+d" (reg0)
+ : "a" (&S390_lowcore.stfle_fac_list)
+ : "memory", "cc");
+ }
+ for (i = 0; i < ARRAY_SIZE(als); i++) {
+ if ((S390_lowcore.stfle_fac_list[i] & als[i]) != als[i])
+ facility_mismatch();
+ }
+}
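
How this replaces the deleted assembler check in head.S: stfl always
stores a 4-byte facility list into the lowcore; facility bit 7 (mask
0x01000000 in that word) indicates that the extended stfle instruction
is available, and only then is the full list fetched. The final loop
triggers a disabled wait with a readable SCLP message if any facility
required by FACILITIES_ALS is missing.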
diff --git a/arch/s390/kernel/cache.c b/arch/s390/kernel/cache.c
index 77a84bd78..c8a83276a 100644
--- a/arch/s390/kernel/cache.c
+++ b/arch/s390/kernel/cache.c
@@ -99,12 +99,7 @@ static inline enum cache_type get_cache_type(struct cache_info *ci, int level)
static inline unsigned long ecag(int ai, int li, int ti)
{
- unsigned long cmd, val;
-
- cmd = ai << 4 | li << 1 | ti;
- asm volatile(".insn rsy,0xeb000000004c,%0,0,0(%1)" /* ecag */
- : "=d" (val) : "a" (cmd));
- return val;
+ return __ecag(ECAG_CACHE_ATTRIBUTE, ai << 4 | li << 1 | ti);
}
static void ci_leaf_init(struct cacheinfo *this_leaf, int private,
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 48b37b835..a97354c8c 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -162,6 +162,30 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
}
EXPORT_SYMBOL(diag14);
+static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+{
+ register unsigned long _subcode asm("0") = *subcode;
+ register unsigned long _size asm("1") = size;
+
+ asm volatile(
+ " diag %2,%0,0x204\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b,0b)
+ : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
+ *subcode = _subcode;
+ return _size;
+}
+
+int diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+ diag_stat_inc(DIAG_STAT_X204);
+ size = __diag204(&subcode, size, addr);
+ if (subcode)
+ return -1;
+ return size;
+}
+EXPORT_SYMBOL(diag204);
+
/*
* Diagnose 210: Get information about a virtual device
*/
@@ -196,3 +220,18 @@ int diag210(struct diag210 *addr)
return ccode;
}
EXPORT_SYMBOL(diag210);
+
+int diag224(void *ptr)
+{
+ int rc = -EOPNOTSUPP;
+
+ diag_stat_inc(DIAG_STAT_X224);
+ asm volatile(
+ " diag %1,%2,0x224\n"
+ "0: lhi %0,0x0\n"
+ "1:\n"
+ EX_TABLE(0b,1b)
+ : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+ return rc;
+}
+EXPORT_SYMBOL(diag224);
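
A minimal caller sketch for the newly exported diag224(), modeled on its
hypfs user (an assumption): diagnose 0x224 fills one page with the
CPU-type name table, and the buffer must be addressable below 2 GB,
hence GFP_DMA.

static void *example_diag224_name_table(void)
{
	void *page = (void *)__get_free_page(GFP_KERNEL | GFP_DMA);

	if (!page)
		return NULL;
	if (diag224(page)) {	/* -EOPNOTSUPP if not provided */
		free_page((unsigned long)page);
		return NULL;
	}
	return page;	/* caller releases with free_page() */
}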
diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c
index 8cb9bfdd3..43446fa2a 100644
--- a/arch/s390/kernel/dis.c
+++ b/arch/s390/kernel/dis.c
@@ -26,7 +26,6 @@
#include <asm/dis.h>
#include <asm/io.h>
#include <linux/atomic.h>
-#include <asm/mathemu.h>
#include <asm/cpcmd.h>
#include <asm/lowcore.h>
#include <asm/debug.h>
diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c
index 69f9908ac..6693383bc 100644
--- a/arch/s390/kernel/dumpstack.c
+++ b/arch/s390/kernel/dumpstack.c
@@ -78,14 +78,10 @@ void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task,
sp = __dump_trace(func, data, sp,
S390_lowcore.async_stack + frame_size - ASYNC_SIZE,
S390_lowcore.async_stack + frame_size);
- if (task)
- __dump_trace(func, data, sp,
- (unsigned long)task_stack_page(task),
- (unsigned long)task_stack_page(task) + THREAD_SIZE);
- else
- __dump_trace(func, data, sp,
- S390_lowcore.thread_info,
- S390_lowcore.thread_info + THREAD_SIZE);
+ task = task ?: current;
+ __dump_trace(func, data, sp,
+ (unsigned long)task_stack_page(task),
+ (unsigned long)task_stack_page(task) + THREAD_SIZE);
}
EXPORT_SYMBOL_GPL(dump_trace);
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index a0684de5a..717b03aa1 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -231,6 +231,26 @@ static noinline __init void detect_machine_type(void)
S390_lowcore.machine_flags |= MACHINE_FLAG_VM;
}
+static noinline __init void setup_arch_string(void)
+{
+ struct sysinfo_1_1_1 *mach = (struct sysinfo_1_1_1 *)&sysinfo_page;
+
+ if (stsi(mach, 1, 1, 1))
+ return;
+ EBCASC(mach->manufacturer, sizeof(mach->manufacturer));
+ EBCASC(mach->type, sizeof(mach->type));
+ EBCASC(mach->model, sizeof(mach->model));
+ EBCASC(mach->model_capacity, sizeof(mach->model_capacity));
+ dump_stack_set_arch_desc("%-16.16s %-4.4s %-16.16s %-16.16s (%s)",
+ mach->manufacturer,
+ mach->type,
+ mach->model,
+ mach->model_capacity,
+ MACHINE_IS_LPAR ? "LPAR" :
+ MACHINE_IS_VM ? "z/VM" :
+ MACHINE_IS_KVM ? "KVM" : "unknown");
+}
+
static __init void setup_topology(void)
{
int max_mnest;
@@ -447,11 +467,13 @@ void __init startup_init(void)
ipl_save_parameters();
rescue_initrd();
clear_bss_section();
+ ptff_init();
init_kernel_storage_key();
lockdep_off();
setup_lowcore_early();
setup_facility_list();
detect_machine_type();
+ setup_arch_string();
ipl_update_parameters();
setup_boot_command_line();
create_kernel_nss();
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 2d47f9cfc..c51650a1e 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -163,6 +163,16 @@ _PIF_WORK = (_PIF_PER_TRAP)
.endm
.section .kprobes.text, "ax"
+.Ldummy:
+	/*
+	 * This nop exists only to avoid having __switch_to start at the
+	 * beginning of the kprobes text section. In that case several
+	 * symbols would share the same address, and e.g. objdump would
+	 * pick an arbitrary one of them when disassembling this code.
+	 * With the added nop in between, the __switch_to symbol is
+	 * unique again.
+	 */
+ nop 0
/*
* Scheduler resume function, called by switch_to
@@ -175,7 +185,6 @@ ENTRY(__switch_to)
stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task
lgr %r1,%r2
aghi %r1,__TASK_thread # thread_struct of prev task
- lg %r4,__TASK_thread_info(%r2) # get thread_info of prev
lg %r5,__TASK_thread_info(%r3) # get thread_info of next
stg %r15,__THREAD_ksp(%r1) # store kernel stack of prev
lgr %r1,%r3
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index bedd2f55d..e79f030dd 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -79,4 +79,6 @@ long sys_s390_pci_mmio_read(unsigned long, void __user *, size_t);
DECLARE_PER_CPU(u64, mt_cycles[8]);
+void verify_facilities(void);
+
#endif /* _ENTRY_H */
diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c
new file mode 100644
index 000000000..81d1d1887
--- /dev/null
+++ b/arch/s390/kernel/fpu.c
@@ -0,0 +1,249 @@
+/*
+ * In-kernel vector facility support functions
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+#include <linux/kernel.h>
+#include <linux/cpu.h>
+#include <linux/sched.h>
+#include <asm/fpu/types.h>
+#include <asm/fpu/api.h>
+
+/*
+ * Per-CPU variable to maintain FPU register ranges that are in use
+ * by the kernel.
+ */
+static DEFINE_PER_CPU(u32, kernel_fpu_state);
+
+#define KERNEL_FPU_STATE_MASK (KERNEL_FPU_MASK|KERNEL_FPC)
+
+
+void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags)
+{
+ if (!__this_cpu_read(kernel_fpu_state)) {
+ /*
+		 * Save user space FPU state and register contents. Multiple
+		 * calls due to interruptions do not matter; subsequent calls
+		 * return immediately. This also sets CIF_FPU to lazily restore
+		 * the FP/VX register contents when returning to user space.
+ */
+ save_fpu_regs();
+ }
+
+ /* Update flags to use the vector facility for KERNEL_FPR */
+ if (MACHINE_HAS_VX && (state->mask & KERNEL_FPR)) {
+ flags |= KERNEL_VXR_LOW | KERNEL_FPC;
+ flags &= ~KERNEL_FPR;
+ }
+
+ /* Save and update current kernel VX state */
+ state->mask = __this_cpu_read(kernel_fpu_state);
+ __this_cpu_or(kernel_fpu_state, flags & KERNEL_FPU_STATE_MASK);
+
+ /*
+ * If this is the first call to __kernel_fpu_begin(), no additional
+ * work is required.
+ */
+ if (!(state->mask & KERNEL_FPU_STATE_MASK))
+ return;
+
+ /*
+ * If KERNEL_FPR is still set, the vector facility is not available
+ * and, thus, save floating-point control and registers only.
+ */
+ if (state->mask & KERNEL_FPR) {
+ asm volatile("stfpc %0" : "=Q" (state->fpc));
+ asm volatile("std 0,%0" : "=Q" (state->fprs[0]));
+ asm volatile("std 1,%0" : "=Q" (state->fprs[1]));
+ asm volatile("std 2,%0" : "=Q" (state->fprs[2]));
+ asm volatile("std 3,%0" : "=Q" (state->fprs[3]));
+ asm volatile("std 4,%0" : "=Q" (state->fprs[4]));
+ asm volatile("std 5,%0" : "=Q" (state->fprs[5]));
+ asm volatile("std 6,%0" : "=Q" (state->fprs[6]));
+ asm volatile("std 7,%0" : "=Q" (state->fprs[7]));
+ asm volatile("std 8,%0" : "=Q" (state->fprs[8]));
+ asm volatile("std 9,%0" : "=Q" (state->fprs[9]));
+ asm volatile("std 10,%0" : "=Q" (state->fprs[10]));
+ asm volatile("std 11,%0" : "=Q" (state->fprs[11]));
+ asm volatile("std 12,%0" : "=Q" (state->fprs[12]));
+ asm volatile("std 13,%0" : "=Q" (state->fprs[13]));
+ asm volatile("std 14,%0" : "=Q" (state->fprs[14]));
+ asm volatile("std 15,%0" : "=Q" (state->fprs[15]));
+ return;
+ }
+
+ /*
+ * If this is a nested call to __kernel_fpu_begin(), check the saved
+ * state mask to save and later restore the vector registers that
+ * are already in use. Let's start with checking floating-point
+ * controls.
+ */
+ if (state->mask & KERNEL_FPC)
+ asm volatile("stfpc %0" : "=m" (state->fpc));
+
+ /* Test and save vector registers */
+ asm volatile (
+ /*
+ * Test if any vector register must be saved and, if so,
+		 * test if all registers can be saved.
+ */
+ " tmll %[m],15\n" /* KERNEL_VXR_MASK */
+ " jz 20f\n" /* no work -> done */
+ " la 1,%[vxrs]\n" /* load save area */
+ " jo 18f\n" /* -> save V0..V31 */
+
+ /*
+ * Test if V8..V23 can be saved at once... this speeds up
+		 * for KERNEL_VXR_MID only. Otherwise continue to split the
+ * range of vector registers into two halves and test them
+ * separately.
+ */
+ " tmll %[m],6\n" /* KERNEL_VXR_MID */
+ " jo 17f\n" /* -> save V8..V23 */
+
+ /* Test and save the first half of 16 vector registers */
+ "1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
+ " jz 10f\n" /* -> KERNEL_VXR_HIGH */
+ " jo 2f\n" /* 11 -> save V0..V15 */
+ " brc 4,3f\n" /* 01 -> save V0..V7 */
+ " brc 2,4f\n" /* 10 -> save V8..V15 */
+
+ /* Test and save the second half of 16 vector registers */
+ "10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
+ " jo 19f\n" /* 11 -> save V16..V31 */
+ " brc 4,11f\n" /* 01 -> save V16..V23 */
+ " brc 2,12f\n" /* 10 -> save V24..V31 */
+ " j 20f\n" /* 00 -> done */
+
+ /*
+ * Below are the vstm combinations to save multiple vector
+ * registers at once.
+ */
+ "2: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+ " j 10b\n" /* -> VXR_HIGH */
+ "3: .word 0xe707,0x1000,0x003e\n" /* vstm 0,7,0(1) */
+ " j 10b\n" /* -> VXR_HIGH */
+ "4: .word 0xe78f,0x1080,0x003e\n" /* vstm 8,15,128(1) */
+ " j 10b\n" /* -> VXR_HIGH */
+ "\n"
+ "11: .word 0xe707,0x1100,0x0c3e\n" /* vstm 16,23,256(1) */
+ " j 20f\n" /* -> done */
+ "12: .word 0xe78f,0x1180,0x0c3e\n" /* vstm 24,31,384(1) */
+ " j 20f\n" /* -> done */
+ "\n"
+ "17: .word 0xe787,0x1080,0x043e\n" /* vstm 8,23,128(1) */
+ " nill %[m],249\n" /* m &= ~VXR_MID */
+ " j 1b\n" /* -> VXR_LOW */
+ "\n"
+ "18: .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */
+ "19: .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */
+ "20:"
+ : [vxrs] "=Q" (*(struct vx_array *) &state->vxrs)
+ : [m] "d" (state->mask)
+ : "1", "cc");
+}
+EXPORT_SYMBOL(__kernel_fpu_begin);
+
+void __kernel_fpu_end(struct kernel_fpu *state)
+{
+ /* Just update the per-CPU state if there is nothing to restore */
+ if (!(state->mask & KERNEL_FPU_STATE_MASK))
+ goto update_fpu_state;
+
+ /*
+ * If KERNEL_FPR is specified, the vector facility is not available
+ * and, thus, restore floating-point control and registers only.
+ */
+ if (state->mask & KERNEL_FPR) {
+ asm volatile("lfpc %0" : : "Q" (state->fpc));
+ asm volatile("ld 0,%0" : : "Q" (state->fprs[0]));
+ asm volatile("ld 1,%0" : : "Q" (state->fprs[1]));
+ asm volatile("ld 2,%0" : : "Q" (state->fprs[2]));
+ asm volatile("ld 3,%0" : : "Q" (state->fprs[3]));
+ asm volatile("ld 4,%0" : : "Q" (state->fprs[4]));
+ asm volatile("ld 5,%0" : : "Q" (state->fprs[5]));
+ asm volatile("ld 6,%0" : : "Q" (state->fprs[6]));
+ asm volatile("ld 7,%0" : : "Q" (state->fprs[7]));
+ asm volatile("ld 8,%0" : : "Q" (state->fprs[8]));
+ asm volatile("ld 9,%0" : : "Q" (state->fprs[9]));
+ asm volatile("ld 10,%0" : : "Q" (state->fprs[10]));
+ asm volatile("ld 11,%0" : : "Q" (state->fprs[11]));
+ asm volatile("ld 12,%0" : : "Q" (state->fprs[12]));
+ asm volatile("ld 13,%0" : : "Q" (state->fprs[13]));
+ asm volatile("ld 14,%0" : : "Q" (state->fprs[14]));
+ asm volatile("ld 15,%0" : : "Q" (state->fprs[15]));
+ goto update_fpu_state;
+ }
+
+ /* Test and restore floating-point controls */
+ if (state->mask & KERNEL_FPC)
+ asm volatile("lfpc %0" : : "Q" (state->fpc));
+
+ /* Test and restore (load) vector registers */
+ asm volatile (
+ /*
+ * Test if any vector registers must be loaded and, if so,
+ * test if all registers can be loaded at once.
+ */
+ " tmll %[m],15\n" /* KERNEL_VXR_MASK */
+ " jz 20f\n" /* no work -> done */
+ " la 1,%[vxrs]\n" /* load load area */
+ " jo 18f\n" /* -> load V0..V31 */
+
+ /*
+ * Test if V8..V23 can be restored at once... this speeds up
+ * for KERNEL_VXR_MID only. Otherwise continue to split the
+ * range of vector registers into two halves and test them
+ * separately.
+ */
+ " tmll %[m],6\n" /* KERNEL_VXR_MID */
+ " jo 17f\n" /* -> load V8..V23 */
+
+ /* Test and load the first half of 16 vector registers */
+ "1: tmll %[m],3\n" /* KERNEL_VXR_LOW */
+ " jz 10f\n" /* -> KERNEL_VXR_HIGH */
+ " jo 2f\n" /* 11 -> load V0..V15 */
+ " brc 4,3f\n" /* 01 -> load V0..V7 */
+ " brc 2,4f\n" /* 10 -> load V8..V15 */
+
+ /* Test and load the second half of 16 vector registers */
+ "10: tmll %[m],12\n" /* KERNEL_VXR_HIGH */
+ " jo 19f\n" /* 11 -> load V16..V31 */
+ " brc 4,11f\n" /* 01 -> load V16..V23 */
+ " brc 2,12f\n" /* 10 -> load V24..V31 */
+ " j 20f\n" /* 00 -> done */
+
+ /*
+		 * Below are the vlm combinations to load multiple vector
+ * registers at once.
+ */
+ "2: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+ " j 10b\n" /* -> VXR_HIGH */
+ "3: .word 0xe707,0x1000,0x0036\n" /* vlm 0,7,0(1) */
+ " j 10b\n" /* -> VXR_HIGH */
+ "4: .word 0xe78f,0x1080,0x0036\n" /* vlm 8,15,128(1) */
+ " j 10b\n" /* -> VXR_HIGH */
+ "\n"
+ "11: .word 0xe707,0x1100,0x0c36\n" /* vlm 16,23,256(1) */
+ " j 20f\n" /* -> done */
+ "12: .word 0xe78f,0x1180,0x0c36\n" /* vlm 24,31,384(1) */
+ " j 20f\n" /* -> done */
+ "\n"
+ "17: .word 0xe787,0x1080,0x0436\n" /* vlm 8,23,128(1) */
+ " nill %[m],249\n" /* m &= ~VXR_MID */
+ " j 1b\n" /* -> VXR_LOW */
+ "\n"
+ "18: .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */
+ "19: .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */
+ "20:"
+ :
+ : [vxrs] "Q" (*(struct vx_array *) &state->vxrs),
+ [m] "d" (state->mask)
+ : "1", "cc");
+
+update_fpu_state:
+ /* Update current kernel VX state */
+ __this_cpu_write(kernel_fpu_state, state->mask);
+}
+EXPORT_SYMBOL(__kernel_fpu_end);
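
A hedged usage sketch for the new in-kernel FPU API. The public
kernel_fpu_begin()/kernel_fpu_end() wrappers live in asm/fpu/api.h (also
touched by this series), so the exact signature below is an assumption;
the flag names match those used in this file:

static void example_use_vector_regs(void)
{
	struct kernel_fpu state;

	/* claim V0..V15 plus the floating-point control register */
	kernel_fpu_begin(&state, KERNEL_VXR_LOW | KERNEL_FPC);
	/* ... vector instructions clobbering V0..V15 may run here ... */
	kernel_fpu_end(&state);
}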
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index fcaefb041..4431905f8 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -306,49 +306,16 @@ ENTRY(startup_kdump)
stck __LC_LAST_UPDATE_CLOCK
spt 6f-.LPG0(%r13)
mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13)
- stfl 0(%r0) # store facilities @ __LC_STFL_FAC_LIST
- mvc __LC_STFLE_FAC_LIST(4),__LC_STFL_FAC_LIST
- tm __LC_STFLE_FAC_LIST,0x01 # stfle available ?
- jz 0f
- lghi %r0,FACILITIES_ALS_DWORDS-1
- .insn s,0xb2b00000,__LC_STFLE_FAC_LIST # store facility list extended
- # verify if all required facilities are supported by the machine
-0: la %r1,__LC_STFLE_FAC_LIST
- la %r2,3f+8-.LPG0(%r13)
- lhi %r3,FACILITIES_ALS_DWORDS
-1: lg %r0,0(%r1)
- ng %r0,0(%r2)
- clg %r0,0(%r2)
- jne 2f
- la %r1,8(%r1)
- la %r2,8(%r2)
- ahi %r3,-1
- jnz 1b
- j 4f
-2: l %r15,.Lstack-.LPG0(%r13)
+ l %r15,.Lstack-.LPG0(%r13)
ahi %r15,-STACK_FRAME_OVERHEAD
- la %r2,.Lals_string-.LPG0(%r13)
- l %r3,.Lsclp_print-.LPG0(%r13)
- basr %r14,%r3
- lpsw 3f-.LPG0(%r13) # machine type not good enough, crash
-.Lals_string:
- .asciz "The Linux kernel requires more recent processor hardware"
-.Lsclp_print:
- .long _sclp_print_early
-.Lstack:
- .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER))
- .align 16
-3: .long 0x000a0000,0x8badcccc
-
-# List of facilities that are required. If not all facilities are present
-# the kernel will crash.
-
- .quad FACILITIES_ALS
-
-4:
- /* Continue with startup code in head64.S */
+ brasl %r14,verify_facilities
+# For uncompressed images, continue in
+# arch/s390/kernel/head64.S. For compressed images, continue in
+# arch/s390/boot/compressed/head.S.
jg startup_continue
+.Lstack:
+ .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER))
.align 8
6: .long 0x7fffffff,0xffffffff
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index d14069d4b..295bfb712 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -121,9 +121,9 @@ static char *dump_type_str(enum dump_type type)
* Must be in data section since the bss section
* is not cleared when these are accessed.
*/
-static u8 ipl_ssid __attribute__((__section__(".data"))) = 0;
-static u16 ipl_devno __attribute__((__section__(".data"))) = 0;
-u32 ipl_flags __attribute__((__section__(".data"))) = 0;
+static u8 ipl_ssid __section(.data) = 0;
+static u16 ipl_devno __section(.data) = 0;
+u32 ipl_flags __section(.data) = 0;
enum ipl_method {
REIPL_METHOD_CCW_CIO,
@@ -174,7 +174,7 @@ static inline int __diag308(unsigned long subcode, void *addr)
asm volatile(
" diag %0,%2,0x308\n"
- "0:\n"
+ "0: nopr %%r7\n"
EX_TABLE(0b,0b)
: "+d" (_addr), "+d" (_rc)
: "d" (subcode) : "cc", "memory");
@@ -563,7 +563,7 @@ static struct kset *ipl_kset;
static void __ipl_run(void *unused)
{
- diag308(DIAG308_IPL, NULL);
+ diag308(DIAG308_LOAD_CLEAR, NULL);
if (MACHINE_IS_VM)
__cpcmd("IPL", NULL, 0, NULL);
else if (ipl_info.type == IPL_TYPE_CCW)
@@ -1085,21 +1085,24 @@ static void __reipl_run(void *unused)
break;
case REIPL_METHOD_CCW_DIAG:
diag308(DIAG308_SET, reipl_block_ccw);
- diag308(DIAG308_IPL, NULL);
+ if (MACHINE_IS_LPAR)
+ diag308(DIAG308_LOAD_NORMAL_DUMP, NULL);
+ else
+ diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case REIPL_METHOD_FCP_RW_DIAG:
diag308(DIAG308_SET, reipl_block_fcp);
- diag308(DIAG308_IPL, NULL);
+ diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case REIPL_METHOD_FCP_RO_DIAG:
- diag308(DIAG308_IPL, NULL);
+ diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case REIPL_METHOD_FCP_RO_VM:
__cpcmd("IPL", NULL, 0, NULL);
break;
case REIPL_METHOD_NSS_DIAG:
diag308(DIAG308_SET, reipl_block_nss);
- diag308(DIAG308_IPL, NULL);
+ diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case REIPL_METHOD_NSS:
get_ipl_string(buf, reipl_block_nss, REIPL_METHOD_NSS);
@@ -1108,7 +1111,7 @@ static void __reipl_run(void *unused)
case REIPL_METHOD_DEFAULT:
if (MACHINE_IS_VM)
__cpcmd("IPL", NULL, 0, NULL);
- diag308(DIAG308_IPL, NULL);
+ diag308(DIAG308_LOAD_CLEAR, NULL);
break;
case REIPL_METHOD_FCP_DUMP:
break;
@@ -1423,7 +1426,7 @@ static void diag308_dump(void *dump_block)
{
diag308(DIAG308_SET, dump_block);
while (1) {
- if (diag308(DIAG308_DUMP, NULL) != 0x302)
+ if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302)
break;
udelay_simple(USEC_PER_SEC);
}
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index c373a1d41..285d65610 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -127,9 +127,7 @@ int show_interrupts(struct seq_file *p, void *v)
seq_printf(p, "CPU%d ", cpu);
seq_putc(p, '\n');
}
- if (index < NR_IRQS) {
- if (index >= NR_IRQS_BASE)
- goto out;
+ if (index < NR_IRQS_BASE) {
seq_printf(p, "%s: ", irqclass_main_desc[index].name);
irq = irqclass_main_desc[index].irq;
for_each_online_cpu(cpu)
@@ -137,6 +135,9 @@ int show_interrupts(struct seq_file *p, void *v)
seq_putc(p, '\n');
goto out;
}
+ if (index > NR_IRQS_BASE)
+ goto out;
+
for (index = 0; index < NR_ARCH_IRQS; index++) {
seq_printf(p, "%s: ", irqclass_sub_desc[index].name);
irq = irqclass_sub_desc[index].irq;
diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c
index 250f59725..dd6306c51 100644
--- a/arch/s390/kernel/kprobes.c
+++ b/arch/s390/kernel/kprobes.c
@@ -690,6 +690,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs)
stack = (unsigned long) regs->gprs[15];
memcpy(kcb->jprobes_stack, (void *) stack, MIN_STACK_SIZE(stack));
+
+ /*
+ * jprobes use jprobe_return() which skips the normal return
+ * path of the function, and this messes up the accounting of the
+ * function graph tracer to get messed up.
+ *
+ * Pause function graph tracing while performing the jprobe function.
+ */
+ pause_graph_tracing();
return 1;
}
NOKPROBE_SYMBOL(setjmp_pre_handler);
@@ -705,6 +714,9 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
unsigned long stack;
+ /* It's OK to start function graph tracing again */
+ unpause_graph_tracing();
+
stack = (unsigned long) kcb->jprobe_saved_regs.gprs[15];
/* Put the regs back */
diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c
index 0e64f08d3..3074c1d83 100644
--- a/arch/s390/kernel/machine_kexec.c
+++ b/arch/s390/kernel/machine_kexec.c
@@ -24,6 +24,7 @@
#include <asm/diag.h>
#include <asm/elf.h>
#include <asm/asm-offsets.h>
+#include <asm/cacheflush.h>
#include <asm/os_info.h>
#include <asm/switch_to.h>
@@ -60,8 +61,6 @@ static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action,
static int __init machine_kdump_pm_init(void)
{
pm_notifier(machine_kdump_pm_cb, 0);
- /* Create initial mapping for crashkernel memory */
- arch_kexec_unprotect_crashkres();
return 0;
}
arch_initcall(machine_kdump_pm_init);
@@ -150,42 +149,40 @@ static int kdump_csum_valid(struct kimage *image)
#ifdef CONFIG_CRASH_DUMP
-/*
- * Map or unmap crashkernel memory
- */
-static void crash_map_pages(int enable)
+void crash_free_reserved_phys_range(unsigned long begin, unsigned long end)
{
- unsigned long size = resource_size(&crashk_res);
-
- BUG_ON(crashk_res.start % KEXEC_CRASH_MEM_ALIGN ||
- size % KEXEC_CRASH_MEM_ALIGN);
- if (enable)
- vmem_add_mapping(crashk_res.start, size);
- else {
- vmem_remove_mapping(crashk_res.start, size);
- if (size)
- os_info_crashkernel_add(crashk_res.start, size);
- else
- os_info_crashkernel_add(0, 0);
- }
+ unsigned long addr, size;
+
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
+ size = begin - crashk_res.start;
+ if (size)
+ os_info_crashkernel_add(crashk_res.start, size);
+ else
+ os_info_crashkernel_add(0, 0);
+}
+
+static void crash_protect_pages(int protect)
+{
+ unsigned long size;
+
+ if (!crashk_res.end)
+ return;
+ size = resource_size(&crashk_res);
+ if (protect)
+ set_memory_ro(crashk_res.start, size >> PAGE_SHIFT);
+ else
+ set_memory_rw(crashk_res.start, size >> PAGE_SHIFT);
}
-/*
- * Unmap crashkernel memory
- */
void arch_kexec_protect_crashkres(void)
{
- if (crashk_res.end)
- crash_map_pages(0);
+ crash_protect_pages(1);
}
-/*
- * Map crashkernel memory
- */
void arch_kexec_unprotect_crashkres(void)
{
- if (crashk_res.end)
- crash_map_pages(1);
+ crash_protect_pages(0);
}
#endif
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 07302ce37..29376f0e7 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -16,7 +16,7 @@
#include <linux/module.h>
#include <asm/lowcore.h>
#include <asm/smp.h>
-#include <asm/etr.h>
+#include <asm/stp.h>
#include <asm/cputime.h>
#include <asm/nmi.h>
#include <asm/crw.h>
@@ -27,7 +27,6 @@ struct mcck_struct {
unsigned int kill_task : 1;
unsigned int channel_report : 1;
unsigned int warning : 1;
- unsigned int etr_queue : 1;
unsigned int stp_queue : 1;
unsigned long mcck_code;
};
@@ -82,8 +81,6 @@ void s390_handle_mcck(void)
if (xchg(&mchchk_wng_posted, 1) == 0)
kill_cad_pid(SIGPWR, 1);
}
- if (mcck.etr_queue)
- etr_queue_work();
if (mcck.stp_queue)
stp_queue_work();
if (mcck.kill_task) {
@@ -241,8 +238,6 @@ static int notrace s390_validate_registers(union mci mci)
#define ED_STP_ISLAND 6 /* External damage STP island check */
#define ED_STP_SYNC 7 /* External damage STP sync check */
-#define ED_ETR_SYNC 12 /* External damage ETR sync check */
-#define ED_ETR_SWITCH 13 /* External damage ETR switch to local */
/*
* machine check handler.
@@ -325,15 +320,11 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
}
if (mci.ed && mci.ec) {
/* External damage */
- if (S390_lowcore.external_damage_code & (1U << ED_ETR_SYNC))
- mcck->etr_queue |= etr_sync_check();
- if (S390_lowcore.external_damage_code & (1U << ED_ETR_SWITCH))
- mcck->etr_queue |= etr_switch_to_local();
if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
mcck->stp_queue |= stp_sync_check();
if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
mcck->stp_queue |= stp_island_check();
- if (mcck->etr_queue || mcck->stp_queue)
+ if (mcck->stp_queue)
set_cpu_flag(CIF_MCCK_PENDING);
}
if (mci.se)
diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c
index 7ec63b1d9..037c2a253 100644
--- a/arch/s390/kernel/perf_cpum_cf.c
+++ b/arch/s390/kernel/perf_cpum_cf.c
@@ -664,30 +664,22 @@ static struct pmu cpumf_pmu = {
.cancel_txn = cpumf_pmu_cancel_txn,
};
-static int cpumf_pmu_notifier(struct notifier_block *self, unsigned long action,
- void *hcpu)
+static int cpumf_pmf_setup(unsigned int cpu, int flags)
{
- int flags;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- flags = PMC_INIT;
- local_irq_disable();
- setup_pmc_cpu(&flags);
- local_irq_enable();
- break;
- case CPU_DOWN_PREPARE:
- flags = PMC_RELEASE;
- local_irq_disable();
- setup_pmc_cpu(&flags);
- local_irq_enable();
- break;
- default:
- break;
- }
+ local_irq_disable();
+ setup_pmc_cpu(&flags);
+ local_irq_enable();
+ return 0;
+}
+
+static int s390_pmu_online_cpu(unsigned int cpu)
+{
+ return cpumf_pmf_setup(cpu, PMC_INIT);
+}
- return NOTIFY_OK;
+static int s390_pmu_offline_cpu(unsigned int cpu)
+{
+ return cpumf_pmf_setup(cpu, PMC_RELEASE);
}
static int __init cpumf_pmu_init(void)
@@ -707,7 +699,7 @@ static int __init cpumf_pmu_init(void)
if (rc) {
pr_err("Registering for CPU-measurement alerts "
"failed with rc=%i\n", rc);
- goto out;
+ return rc;
}
cpumf_pmu.attr_groups = cpumf_cf_event_group();
@@ -716,10 +708,10 @@ static int __init cpumf_pmu_init(void)
pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc);
unregister_external_irq(EXT_IRQ_MEASURE_ALERT,
cpumf_measurement_alert);
- goto out;
+ return rc;
}
- perf_cpu_notifier(cpumf_pmu_notifier);
-out:
- return rc;
+ return cpuhp_setup_state(CPUHP_AP_PERF_S390_CF_ONLINE,
+ "AP_PERF_S390_CF_ONLINE",
+ s390_pmu_online_cpu, s390_pmu_offline_cpu);
}
early_initcall(cpumf_pmu_init);
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index a8e832166..fcc634c14 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -601,17 +601,12 @@ static void release_pmc_hardware(void)
irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
on_each_cpu(setup_pmc_cpu, &flags, 1);
- perf_release_sampling();
}
static int reserve_pmc_hardware(void)
{
int flags = PMC_INIT;
- int err;
- err = perf_reserve_sampling();
- if (err)
- return err;
on_each_cpu(setup_pmc_cpu, &flags, 1);
if (flags & PMC_FAILURE) {
release_pmc_hardware();
@@ -979,12 +974,15 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
struct pt_regs regs;
struct perf_sf_sde_regs *sde_regs;
struct perf_sample_data data;
- struct perf_raw_record raw;
+ struct perf_raw_record raw = {
+ .frag = {
+ .size = sfr->size,
+ .data = sfr,
+ },
+ };
/* Setup perf sample */
perf_sample_data_init(&data, 0, event->hw.last_period);
- raw.size = sfr->size;
- raw.data = sfr;
data.raw = &raw;
	/* Setup pt_regs to look like a CPU-measurement external interrupt
@@ -1506,37 +1504,28 @@ static void cpumf_measurement_alert(struct ext_code ext_code,
sf_disable();
}
}
-
-static int cpumf_pmu_notifier(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int cpusf_pmu_setup(unsigned int cpu, int flags)
{
- int flags;
-
/* Ignore the notification if no events are scheduled on the PMU.
* This might be racy...
*/
if (!atomic_read(&num_events))
- return NOTIFY_OK;
-
- switch (action & ~CPU_TASKS_FROZEN) {
- case CPU_ONLINE:
- case CPU_DOWN_FAILED:
- flags = PMC_INIT;
- local_irq_disable();
- setup_pmc_cpu(&flags);
- local_irq_enable();
- break;
- case CPU_DOWN_PREPARE:
- flags = PMC_RELEASE;
- local_irq_disable();
- setup_pmc_cpu(&flags);
- local_irq_enable();
- break;
- default:
- break;
- }
+ return 0;
- return NOTIFY_OK;
+ local_irq_disable();
+ setup_pmc_cpu(&flags);
+ local_irq_enable();
+ return 0;
+}
+
+static int s390_pmu_sf_online_cpu(unsigned int cpu)
+{
+ return cpusf_pmu_setup(cpu, PMC_INIT);
+}
+
+static int s390_pmu_sf_offline_cpu(unsigned int cpu)
+{
+ return cpusf_pmu_setup(cpu, PMC_RELEASE);
}
static int param_get_sfb_size(char *buffer, const struct kernel_param *kp)
@@ -1636,7 +1625,9 @@ static int __init init_cpum_sampling_pmu(void)
cpumf_measurement_alert);
goto out;
}
- perf_cpu_notifier(cpumf_pmu_notifier);
+
+ cpuhp_setup_state(CPUHP_AP_PERF_S390_SF_ONLINE, "AP_PERF_S390_SF_ONLINE",
+ s390_pmu_sf_online_cpu, s390_pmu_sf_offline_cpu);
out:
return err;
}
diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c
index 87035fa58..17431f63d 100644
--- a/arch/s390/kernel/perf_event.c
+++ b/arch/s390/kernel/perf_event.c
@@ -248,33 +248,3 @@ ssize_t cpumf_events_sysfs_show(struct device *dev,
return sprintf(page, "event=0x%04llx,name=%s\n",
pmu_attr->id, attr->attr.name);
}
-
-/* Reserve/release functions for sharing perf hardware */
-static DEFINE_SPINLOCK(perf_hw_owner_lock);
-static void *perf_sampling_owner;
-
-int perf_reserve_sampling(void)
-{
- int err;
-
- err = 0;
- spin_lock(&perf_hw_owner_lock);
- if (perf_sampling_owner) {
- pr_warn("The sampling facility is already reserved by %p\n",
- perf_sampling_owner);
- err = -EBUSY;
- } else
- perf_sampling_owner = __builtin_return_address(0);
- spin_unlock(&perf_hw_owner_lock);
- return err;
-}
-EXPORT_SYMBOL(perf_reserve_sampling);
-
-void perf_release_sampling(void)
-{
- spin_lock(&perf_hw_owner_lock);
- WARN_ON(!perf_sampling_owner);
- perf_sampling_owner = NULL;
- spin_unlock(&perf_hw_owner_lock);
-}
-EXPORT_SYMBOL(perf_release_sampling);
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index de7451065..81d080808 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -13,12 +13,45 @@
#include <linux/delay.h>
#include <linux/cpu.h>
#include <asm/diag.h>
+#include <asm/facility.h>
#include <asm/elf.h>
#include <asm/lowcore.h>
#include <asm/param.h>
#include <asm/smp.h>
-static DEFINE_PER_CPU(struct cpuid, cpu_id);
+struct cpu_info {
+ unsigned int cpu_mhz_dynamic;
+ unsigned int cpu_mhz_static;
+ struct cpuid cpu_id;
+};
+
+static DEFINE_PER_CPU(struct cpu_info, cpu_info);
+
+static bool machine_has_cpu_mhz;
+
+void __init cpu_detect_mhz_feature(void)
+{
+ if (test_facility(34) && __ecag(ECAG_CPU_ATTRIBUTE, 0) != -1UL)
+ machine_has_cpu_mhz = 1;
+}
+
+static void update_cpu_mhz(void *arg)
+{
+ unsigned long mhz;
+ struct cpu_info *c;
+
+ mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0);
+ c = this_cpu_ptr(&cpu_info);
+ c->cpu_mhz_dynamic = mhz >> 32;
+ c->cpu_mhz_static = mhz & 0xffffffff;
+}
+
+void s390_update_cpu_mhz(void)
+{
+ s390_adjust_jiffies();
+ if (machine_has_cpu_mhz)
+ on_each_cpu(update_cpu_mhz, NULL, 0);
+}
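
For reference, __ecag(ECAG_CPU_ATTRIBUTE, 0) returns a single doubleword
that packs both speeds, which is what the shift and mask above unpack: a
return value of 0x0000138800001388 reads as 5000 MHz dynamic and 5000 MHz
static, while -1UL means the attribute is not provided (the condition
cpu_detect_mhz_feature() tests).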
void notrace cpu_relax(void)
{
@@ -35,9 +68,11 @@ EXPORT_SYMBOL(cpu_relax);
*/
void cpu_init(void)
{
- struct cpuid *id = this_cpu_ptr(&cpu_id);
+ struct cpuid *id = this_cpu_ptr(&cpu_info.cpu_id);
get_cpu_id(id);
+ if (machine_has_cpu_mhz)
+ update_cpu_mhz(NULL);
atomic_inc(&init_mm.mm_count);
current->active_mm = &init_mm;
BUG_ON(current->mm);
@@ -53,10 +88,7 @@ int cpu_have_feature(unsigned int num)
}
EXPORT_SYMBOL(cpu_have_feature);
-/*
- * show_cpuinfo - Get information on one CPU for use by procfs.
- */
-static int show_cpuinfo(struct seq_file *m, void *v)
+static void show_cpu_summary(struct seq_file *m, void *v)
{
static const char *hwcap_str[] = {
"esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp",
@@ -65,34 +97,55 @@ static int show_cpuinfo(struct seq_file *m, void *v)
static const char * const int_hwcap_str[] = {
"sie"
};
- unsigned long n = (unsigned long) v - 1;
- int i;
-
- if (!n) {
- s390_adjust_jiffies();
- seq_printf(m, "vendor_id : IBM/S390\n"
- "# processors : %i\n"
- "bogomips per cpu: %lu.%02lu\n",
- num_online_cpus(), loops_per_jiffy/(500000/HZ),
- (loops_per_jiffy/(5000/HZ))%100);
- seq_puts(m, "features\t: ");
- for (i = 0; i < ARRAY_SIZE(hwcap_str); i++)
- if (hwcap_str[i] && (elf_hwcap & (1UL << i)))
- seq_printf(m, "%s ", hwcap_str[i]);
- for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++)
- if (int_hwcap_str[i] && (int_hwcap & (1UL << i)))
- seq_printf(m, "%s ", int_hwcap_str[i]);
- seq_puts(m, "\n");
- show_cacheinfo(m);
- }
- if (cpu_online(n)) {
- struct cpuid *id = &per_cpu(cpu_id, n);
- seq_printf(m, "processor %li: "
+ int i, cpu;
+
+ seq_printf(m, "vendor_id : IBM/S390\n"
+ "# processors : %i\n"
+ "bogomips per cpu: %lu.%02lu\n",
+ num_online_cpus(), loops_per_jiffy/(500000/HZ),
+ (loops_per_jiffy/(5000/HZ))%100);
+ seq_printf(m, "max thread id : %d\n", smp_cpu_mtid);
+ seq_puts(m, "features\t: ");
+ for (i = 0; i < ARRAY_SIZE(hwcap_str); i++)
+ if (hwcap_str[i] && (elf_hwcap & (1UL << i)))
+ seq_printf(m, "%s ", hwcap_str[i]);
+ for (i = 0; i < ARRAY_SIZE(int_hwcap_str); i++)
+ if (int_hwcap_str[i] && (int_hwcap & (1UL << i)))
+ seq_printf(m, "%s ", int_hwcap_str[i]);
+ seq_puts(m, "\n");
+ show_cacheinfo(m);
+ for_each_online_cpu(cpu) {
+ struct cpuid *id = &per_cpu(cpu_info.cpu_id, cpu);
+
+ seq_printf(m, "processor %d: "
"version = %02X, "
"identification = %06X, "
"machine = %04X\n",
- n, id->version, id->ident, id->machine);
+ cpu, id->version, id->ident, id->machine);
}
+}
+
+static void show_cpu_mhz(struct seq_file *m, unsigned long n)
+{
+ struct cpu_info *c = per_cpu_ptr(&cpu_info, n);
+
+ seq_printf(m, "cpu MHz dynamic : %d\n", c->cpu_mhz_dynamic);
+ seq_printf(m, "cpu MHz static : %d\n", c->cpu_mhz_static);
+}
+
+/*
+ * show_cpuinfo - Get information on one CPU for use by procfs.
+ */
+static int show_cpuinfo(struct seq_file *m, void *v)
+{
+ unsigned long n = (unsigned long) v - 1;
+
+ if (!n)
+ show_cpu_summary(m, v);
+ if (!machine_has_cpu_mhz)
+ return 0;
+ seq_printf(m, "\ncpu number : %ld\n", n);
+ show_cpu_mhz(m, n);
return 0;
}
@@ -126,4 +179,3 @@ const struct seq_operations cpuinfo_op = {
.stop = c_stop,
.show = show_cpuinfo,
};
-
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 49b1c13bf..9336e824e 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -821,14 +821,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
{
- long ret = 0;
-
- /* Do the secure computing check first. */
- if (secure_computing()) {
- /* seccomp failures shouldn't expose any additional code. */
- ret = -1;
- goto out;
- }
+ unsigned long mask = -1UL;
/*
* The sysc_tracesys code in entry.S stored the system
@@ -843,17 +836,26 @@ asmlinkage long do_syscall_trace_enter(struct pt_regs *regs)
* the system call and the system call restart handling.
*/
clear_pt_regs_flag(regs, PIF_SYSCALL);
- ret = -1;
+ return -1;
+ }
+
+ /* Do the secure computing check after ptrace. */
+ if (secure_computing(NULL)) {
+ /* seccomp failures shouldn't expose any additional code. */
+ return -1;
}
if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
trace_sys_enter(regs, regs->gprs[2]);
- audit_syscall_entry(regs->gprs[2], regs->orig_gpr2,
- regs->gprs[3], regs->gprs[4],
- regs->gprs[5]);
-out:
- return ret ?: regs->gprs[2];
+ if (is_compat_task())
+ mask = 0xffffffff;
+
+ audit_syscall_entry(regs->gprs[2], regs->orig_gpr2 & mask,
+			    regs->gprs[3] & mask, regs->gprs[4] & mask,
+			    regs->gprs[5] & mask);
+
+ return regs->gprs[2];
}
asmlinkage void do_syscall_trace_exit(struct pt_regs *regs)
diff --git a/arch/s390/kernel/sclp.c b/arch/s390/kernel/sclp.c
index d88db40bd..f08af675f 100644
--- a/arch/s390/kernel/sclp.c
+++ b/arch/s390/kernel/sclp.c
@@ -12,8 +12,9 @@
#define EVTYP_VT220MSG_MASK 0x00000040
#define EVTYP_MSG_MASK 0x40000000
-static char _sclp_work_area[4096] __aligned(PAGE_SIZE);
-static bool have_vt220, have_linemode;
+static char _sclp_work_area[4096] __aligned(PAGE_SIZE) __section(data);
+static bool have_vt220 __section(data);
+static bool have_linemode __section(data);
static void _sclp_wait_int(void)
{
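
A note on the __section(data) annotations above: this early sclp code
can run before .bss has been cleared, so flags kept in .bss could be
wiped; placing the zero-initialized variables explicitly in .data
avoids that. A one-line sketch of the idiom (variable name invented):

    /* lives in .data, survives the later clearing of .bss */
    static int early_flag __attribute__((__section__(".data")));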
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index f31939147..7f7ba5f23 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -130,17 +130,14 @@ __setup("condev=", condev_setup);
static void __init set_preferred_console(void)
{
- if (MACHINE_IS_KVM) {
- if (sclp.has_vt220)
- add_preferred_console("ttyS", 1, NULL);
- else if (sclp.has_linemode)
- add_preferred_console("ttyS", 0, NULL);
- else
- add_preferred_console("hvc", 0, NULL);
- } else if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP)
+ if (CONSOLE_IS_3215 || CONSOLE_IS_SCLP)
add_preferred_console("ttyS", 0, NULL);
else if (CONSOLE_IS_3270)
add_preferred_console("tty3270", 0, NULL);
+ else if (CONSOLE_IS_VT220)
+ add_preferred_console("ttyS", 1, NULL);
+ else if (CONSOLE_IS_HVC)
+ add_preferred_console("hvc", 0, NULL);
}
static int __init conmode_setup(char *str)
@@ -206,6 +203,13 @@ static void __init conmode_default(void)
SET_CONSOLE_SCLP;
#endif
}
+ } else if (MACHINE_IS_KVM) {
+ if (sclp.has_vt220 && IS_ENABLED(CONFIG_SCLP_VT220_CONSOLE))
+ SET_CONSOLE_VT220;
+ else if (sclp.has_linemode && IS_ENABLED(CONFIG_SCLP_CONSOLE))
+ SET_CONSOLE_SCLP;
+ else
+ SET_CONSOLE_HVC;
} else {
#if defined(CONFIG_SCLP_CONSOLE) || defined(CONFIG_SCLP_VT220_CONSOLE)
SET_CONSOLE_SCLP;
@@ -289,7 +293,7 @@ static int __init parse_vmalloc(char *arg)
}
early_param("vmalloc", parse_vmalloc);
-void *restart_stack __attribute__((__section__(".data")));
+void *restart_stack __section(.data);
static void __init setup_lowcore(void)
{
@@ -432,6 +436,20 @@ static void __init setup_resources(void)
}
}
}
+#ifdef CONFIG_CRASH_DUMP
+ /*
+ * Re-add removed crash kernel memory as reserved memory. This makes
+ * sure it will be mapped with the identity mapping and struct pages
+ * will be created, so it can be resized later on.
+ * However add it later since the crash kernel resource should not be
+ * part of the System RAM resource.
+ */
+ if (crashk_res.end) {
+ memblock_add(crashk_res.start, resource_size(&crashk_res));
+ memblock_reserve(crashk_res.start, resource_size(&crashk_res));
+ insert_resource(&iomem_resource, &crashk_res);
+ }
+#endif
}
static void __init setup_memory_end(void)
@@ -602,7 +620,6 @@ static void __init reserve_crashkernel(void)
diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size));
crashk_res.start = crash_base;
crashk_res.end = crash_base + crash_size - 1;
- insert_resource(&iomem_resource, &crashk_res);
memblock_remove(crash_base, crash_size);
pr_info("Reserving %lluMB of memory at %lluMB "
"for crashkernel (System RAM: %luMB)\n",
@@ -901,6 +918,7 @@ void __init setup_arch(char **cmdline_p)
setup_vmcoreinfo();
setup_lowcore();
smp_fill_possible_mask();
+ cpu_detect_mhz_feature();
cpu_init();
numa_setup();
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 7b89a7572..35531fe1c 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -242,10 +242,8 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
{
struct lowcore *lc = pcpu->lowcore;
- if (MACHINE_HAS_TLB_LC)
- cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
+ cpumask_set_cpu(cpu, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
- atomic_inc(&init_mm.context.attach_count);
lc->cpu_nr = cpu;
lc->spinlock_lockval = arch_spin_lockval(cpu);
lc->percpu_offset = __per_cpu_offset[cpu];
@@ -320,17 +318,11 @@ static void pcpu_delegate(struct pcpu *pcpu, void (*func)(void *),
*/
static int pcpu_set_smt(unsigned int mtid)
{
- register unsigned long reg1 asm ("1") = (unsigned long) mtid;
int cc;
if (smp_cpu_mtid == mtid)
return 0;
- asm volatile(
- " sigp %1,0,%2 # sigp set multi-threading\n"
- " ipm %0\n"
- " srl %0,28\n"
- : "=d" (cc) : "d" (reg1), "K" (SIGP_SET_MULTI_THREADING)
- : "cc");
+ cc = __pcpu_sigp(0, SIGP_SET_MULTI_THREADING, mtid, NULL);
if (cc == 0) {
smp_cpu_mtid = mtid;
smp_cpu_mt_shift = 0;
@@ -876,10 +868,8 @@ void __cpu_die(unsigned int cpu)
while (!pcpu_stopped(pcpu))
cpu_relax();
pcpu_free_lowcore(pcpu);
- atomic_dec(&init_mm.context.attach_count);
cpumask_clear_cpu(cpu, mm_cpumask(&init_mm));
- if (MACHINE_HAS_TLB_LC)
- cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask);
+ cpumask_clear_cpu(cpu, &init_mm.context.cpu_attach_mask);
}
void __noreturn cpu_die(void)
@@ -897,7 +887,7 @@ void __init smp_fill_possible_mask(void)
sclp_max = max(sclp.mtid, sclp.mtid_cp) + 1;
sclp_max = min(smp_max_threads, sclp_max);
- sclp_max = sclp.max_cores * sclp_max ?: nr_cpu_ids;
+ sclp_max = (sclp.max_cores * sclp_max) ?: nr_cpu_ids;
possible = setup_possible_cpus ?: nr_cpu_ids;
possible = min(possible, sclp_max);
for (cpu = 0; cpu < possible && cpu < nr_cpu_ids; cpu++)
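
The added parentheses in smp_fill_possible_mask() only make the binding
explicit: "x ?: y" is the GNU C conditional with omitted middle operand,
which yields x when x is non-zero and y otherwise. A minimal
demonstration (values invented, builds with GCC or Clang):

    #include <stdio.h>

    int main(void)
    {
        unsigned int max_cores = 0, threads = 2, nr_cpu_ids = 64;
        unsigned int possible = (max_cores * threads) ?: nr_cpu_ids;

        printf("possible cpus: %u\n", possible);  /* falls back to 64 */
        return 0;
    }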
diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c
index f7dba3887..050b8d067 100644
--- a/arch/s390/kernel/sysinfo.c
+++ b/arch/s390/kernel/sysinfo.c
@@ -16,21 +16,11 @@
#include <asm/sysinfo.h>
#include <asm/cpcmd.h>
#include <asm/topology.h>
-
-/* Sigh, math-emu. Don't ask. */
-#include <asm/sfp-util.h>
-#include <math-emu/soft-fp.h>
-#include <math-emu/single.h>
+#include <asm/fpu/api.h>
int topology_max_mnest;
-/*
- * stsi - store system information
- *
- * Returns the current configuration level if function code 0 was specified.
- * Otherwise returns 0 on success or a negative value on error.
- */
-int stsi(void *sysinfo, int fc, int sel1, int sel2)
+static inline int __stsi(void *sysinfo, int fc, int sel1, int sel2, int *lvl)
{
register int r0 asm("0") = (fc << 28) | sel1;
register int r1 asm("1") = sel2;
@@ -45,9 +35,24 @@ int stsi(void *sysinfo, int fc, int sel1, int sel2)
: "+d" (r0), "+d" (rc)
: "d" (r1), "a" (sysinfo), "K" (-EOPNOTSUPP)
: "cc", "memory");
+ *lvl = ((unsigned int) r0) >> 28;
+ return rc;
+}
+
+/*
+ * stsi - store system information
+ *
+ * Returns the current configuration level if function code 0 was specified.
+ * Otherwise returns 0 on success or a negative value on error.
+ */
+int stsi(void *sysinfo, int fc, int sel1, int sel2)
+{
+ int lvl, rc;
+
+ rc = __stsi(sysinfo, fc, sel1, sel2, &lvl);
if (rc)
return rc;
- return fc ? 0 : ((unsigned int) r0) >> 28;
+ return fc ? 0 : lvl;
}
EXPORT_SYMBOL(stsi);
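
The shape of the refactoring can be sketched in plain C: the low-level
helper always hands back both the condition code and the configuration
level, while the wrapper keeps the historical return convention. The
demo values below are invented (-95 mirrors -EOPNOTSUPP on Linux):

    #include <stdio.h>

    static int __stsi_demo(int fc, int *lvl)
    {
        *lvl = 2;                  /* pretend we run in an LPAR (level 2) */
        return fc > 3 ? -95 : 0;   /* unsupported function code */
    }

    static int stsi_demo(int fc)
    {
        int lvl, rc = __stsi_demo(fc, &lvl);

        if (rc)
            return rc;
        return fc ? 0 : lvl;       /* fc == 0 reports the level */
    }

    int main(void)
    {
        printf("level: %d\n", stsi_demo(0));
        printf("rc:    %d\n", stsi_demo(1));
        return 0;
    }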
@@ -414,10 +419,8 @@ subsys_initcall(create_proc_service_level);
void s390_adjust_jiffies(void)
{
struct sysinfo_1_2_2 *info;
- const unsigned int fmil = 0x4b189680; /* 1e7 as 32-bit float. */
- FP_DECL_S(SA); FP_DECL_S(SB); FP_DECL_S(SR);
- FP_DECL_EX;
- unsigned int capability;
+ unsigned long capability;
+ struct kernel_fpu fpu;
info = (void *) get_zeroed_page(GFP_KERNEL);
if (!info)
@@ -433,15 +436,25 @@ void s390_adjust_jiffies(void)
* higher cpu capacity. Bogomips are the other way round.
* To get to a halfway suitable number we divide 1e7
* by the cpu capability number. Yes, that means a floating
- * point division .. math-emu here we come :-)
+ * point division.
*/
- FP_UNPACK_SP(SA, &fmil);
- if ((info->capability >> 23) == 0)
- FP_FROM_INT_S(SB, (long) info->capability, 64, long);
- else
- FP_UNPACK_SP(SB, &info->capability);
- FP_DIV_S(SR, SA, SB);
- FP_TO_INT_S(capability, SR, 32, 0);
+ kernel_fpu_begin(&fpu, KERNEL_FPR);
+ asm volatile(
+ " sfpc %3\n"
+ " l %0,%1\n"
+ " tmlh %0,0xff80\n"
+ " jnz 0f\n"
+ " cefbr %%f2,%0\n"
+ " j 1f\n"
+ "0: le %%f2,%1\n"
+ "1: cefbr %%f0,%2\n"
+ " debr %%f0,%%f2\n"
+ " cgebr %0,5,%%f0\n"
+ : "=&d" (capability)
+ : "Q" (info->capability), "d" (10000000), "d" (0)
+ : "cc"
+ );
+ kernel_fpu_end(&fpu);
} else
/*
* Really old machine without stsi block for basic
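
What the new inline assembly computes can be modeled in userspace: the
stsi capability value is either a plain integer or, when any of its top
nine bits are set, a 32-bit binary float, and the result is 1e7 divided
by it. A sketch under those assumptions (the cgebr rounding is
approximated with +0.5, the sample value is invented):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    static unsigned long capability_to_factor(uint32_t raw)
    {
        double cap;

        if (raw >> 23) {               /* top bits set: treat as float */
            float f;
            memcpy(&f, &raw, sizeof(f));
            cap = f;
        } else {                       /* plain unsigned integer */
            cap = raw;
        }
        return (unsigned long)(10000000.0 / cap + 0.5);
    }

    int main(void)
    {
        printf("%lu\n", capability_to_factor(450));  /* -> 22222 */
        return 0;
    }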
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 9409d32f2..4e9949800 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -39,13 +39,14 @@
#include <linux/gfp.h>
#include <linux/kprobes.h>
#include <asm/uaccess.h>
+#include <asm/facility.h>
#include <asm/delay.h>
#include <asm/div64.h>
#include <asm/vdso.h>
#include <asm/irq.h>
#include <asm/irq_regs.h>
#include <asm/vtimer.h>
-#include <asm/etr.h>
+#include <asm/stp.h>
#include <asm/cio.h>
#include "entry.h"
@@ -61,6 +62,32 @@ static DEFINE_PER_CPU(struct clock_event_device, comparators);
ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier);
EXPORT_SYMBOL(s390_epoch_delta_notifier);
+unsigned char ptff_function_mask[16];
+unsigned long lpar_offset;
+unsigned long initial_leap_seconds;
+
+/*
+ * Get time offsets with PTFF
+ */
+void __init ptff_init(void)
+{
+ struct ptff_qto qto;
+ struct ptff_qui qui;
+
+ if (!test_facility(28))
+ return;
+ ptff(&ptff_function_mask, sizeof(ptff_function_mask), PTFF_QAF);
+
+ /* get LPAR offset */
+ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
+ lpar_offset = qto.tod_epoch_difference;
+
+ /* get initial leap seconds */
+ if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0)
+ initial_leap_seconds = (unsigned long)
+ ((long) qui.old_leap * 4096000000L);
+}
+
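+
The ptff_query() helper used above tests one bit per PTFF function code
in the mask that PTFF_QAF fills in. A standalone model of that bit test
(mask contents and function codes below are illustrative, not taken
from the architecture):

    #include <stdio.h>

    static int ptff_query_demo(const unsigned char *mask, unsigned int nr)
    {
        /* bit 0 of the mask is the most significant bit of byte 0 */
        return (mask[nr >> 3] & (0x80 >> (nr & 7))) != 0;
    }

    int main(void)
    {
        unsigned char mask[16] = { 0xc0 };  /* functions 0 and 1 available */

        printf("func 0: %d\n", ptff_query_demo(mask, 0));
        printf("func 2: %d\n", ptff_query_demo(mask, 2));
        return 0;
    }

The old_leap * 4096000000L conversion above follows from the TOD clock
resolution, where one microsecond corresponds to 4096 clock units.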
/*
* Scheduler clock - returns current time in nanosec units.
*/
@@ -162,30 +189,32 @@ static void clock_comparator_interrupt(struct ext_code ext_code,
set_clock_comparator(S390_lowcore.clock_comparator);
}
-static void etr_timing_alert(struct etr_irq_parm *);
static void stp_timing_alert(struct stp_irq_parm *);
static void timing_alert_interrupt(struct ext_code ext_code,
unsigned int param32, unsigned long param64)
{
inc_irq_stat(IRQEXT_TLA);
- if (param32 & 0x00c40000)
- etr_timing_alert((struct etr_irq_parm *) &param32);
if (param32 & 0x00038000)
stp_timing_alert((struct stp_irq_parm *) &param32);
}
-static void etr_reset(void);
static void stp_reset(void);
void read_persistent_clock64(struct timespec64 *ts)
{
- tod_to_timeval(get_tod_clock() - TOD_UNIX_EPOCH, ts);
+ __u64 clock;
+
+ clock = get_tod_clock() - initial_leap_seconds;
+ tod_to_timeval(clock - TOD_UNIX_EPOCH, ts);
}
void read_boot_clock64(struct timespec64 *ts)
{
- tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts);
+ __u64 clock;
+
+ clock = sched_clock_base_cc - initial_leap_seconds;
+ tod_to_timeval(clock - TOD_UNIX_EPOCH, ts);
}
static cycle_t read_tod_clock(struct clocksource *cs)
@@ -269,7 +298,6 @@ void update_vsyscall_tz(void)
void __init time_init(void)
{
/* Reset time synchronization interfaces. */
- etr_reset();
stp_reset();
/* request the clock comparator external interrupt */
@@ -337,20 +365,20 @@ static unsigned long clock_sync_flags;
#define CLOCK_SYNC_STP 3
/*
- * The synchronous get_clock function. It will write the current clock
- * value to the clock pointer and return 0 if the clock is in sync with
- * the external time source. If the clock mode is local it will return
- * -EOPNOTSUPP and -EAGAIN if the clock is not in sync with the external
- * reference.
+ * The get_clock function for the physical clock. It will get the current
+ * TOD clock, subtract the LPAR offset and write the result to *clock.
+ * The function returns 0 if the clock is in sync with the external time
+ * source. If the clock mode is local it will return -EOPNOTSUPP and
+ * -EAGAIN if the clock is not in sync with the external reference.
*/
-int get_sync_clock(unsigned long long *clock)
+int get_phys_clock(unsigned long long *clock)
{
atomic_t *sw_ptr;
unsigned int sw0, sw1;
sw_ptr = &get_cpu_var(clock_sync_word);
sw0 = atomic_read(sw_ptr);
- *clock = get_tod_clock();
+ *clock = get_tod_clock() - lpar_offset;
sw1 = atomic_read(sw_ptr);
put_cpu_var(clock_sync_word);
if (sw0 == sw1 && (sw0 & 0x80000000U))
@@ -364,7 +392,7 @@ int get_sync_clock(unsigned long long *clock)
return -EACCES;
return -EAGAIN;
}
-EXPORT_SYMBOL(get_sync_clock);
+EXPORT_SYMBOL(get_phys_clock);
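
The sw0/sw1 pair implements a lock-free validation: the clock sample
only counts if the sync word did not change around it and its high bit
signals "in sync". A standalone model (clock value, sync word and the
-11/-EAGAIN stand-in are invented):

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_uint clock_sync_word = 0x80000001u;

    static int get_phys_clock_demo(unsigned long long *clock)
    {
        unsigned int sw0, sw1;

        sw0 = atomic_load(&clock_sync_word);
        *clock = 424242ULL;   /* stand-in for get_tod_clock() - lpar_offset */
        sw1 = atomic_load(&clock_sync_word);
        if (sw0 == sw1 && (sw0 & 0x80000000u))
            return 0;         /* consistent sample, clock in sync */
        return -11;           /* -EAGAIN on Linux: caller may retry */
    }

    int main(void)
    {
        unsigned long long clk;

        printf("rc=%d clk=%llu\n", get_phys_clock_demo(&clk), clk);
        return 0;
    }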
/*
* Make get_sync_clock return -EAGAIN.
@@ -416,301 +444,6 @@ static void __init time_init_wq(void)
time_sync_wq = create_singlethread_workqueue("timesync");
}
-/*
- * External Time Reference (ETR) code.
- */
-static int etr_port0_online;
-static int etr_port1_online;
-static int etr_steai_available;
-
-static int __init early_parse_etr(char *p)
-{
- if (strncmp(p, "off", 3) == 0)
- etr_port0_online = etr_port1_online = 0;
- else if (strncmp(p, "port0", 5) == 0)
- etr_port0_online = 1;
- else if (strncmp(p, "port1", 5) == 0)
- etr_port1_online = 1;
- else if (strncmp(p, "on", 2) == 0)
- etr_port0_online = etr_port1_online = 1;
- return 0;
-}
-early_param("etr", early_parse_etr);
-
-enum etr_event {
- ETR_EVENT_PORT0_CHANGE,
- ETR_EVENT_PORT1_CHANGE,
- ETR_EVENT_PORT_ALERT,
- ETR_EVENT_SYNC_CHECK,
- ETR_EVENT_SWITCH_LOCAL,
- ETR_EVENT_UPDATE,
-};
-
-/*
- * Valid bit combinations of the eacr register are (x = don't care):
- * e0 e1 dp p0 p1 ea es sl
- * 0 0 x 0 0 0 0 0 initial, disabled state
- * 0 0 x 0 1 1 0 0 port 1 online
- * 0 0 x 1 0 1 0 0 port 0 online
- * 0 0 x 1 1 1 0 0 both ports online
- * 0 1 x 0 1 1 0 0 port 1 online and usable, ETR or PPS mode
- * 0 1 x 0 1 1 0 1 port 1 online, usable and ETR mode
- * 0 1 x 0 1 1 1 0 port 1 online, usable, PPS mode, in-sync
- * 0 1 x 0 1 1 1 1 port 1 online, usable, ETR mode, in-sync
- * 0 1 x 1 1 1 0 0 both ports online, port 1 usable
- * 0 1 x 1 1 1 1 0 both ports online, port 1 usable, PPS mode, in-sync
- * 0 1 x 1 1 1 1 1 both ports online, port 1 usable, ETR mode, in-sync
- * 1 0 x 1 0 1 0 0 port 0 online and usable, ETR or PPS mode
- * 1 0 x 1 0 1 0 1 port 0 online, usable and ETR mode
- * 1 0 x 1 0 1 1 0 port 0 online, usable, PPS mode, in-sync
- * 1 0 x 1 0 1 1 1 port 0 online, usable, ETR mode, in-sync
- * 1 0 x 1 1 1 0 0 both ports online, port 0 usable
- * 1 0 x 1 1 1 1 0 both ports online, port 0 usable, PPS mode, in-sync
- * 1 0 x 1 1 1 1 1 both ports online, port 0 usable, ETR mode, in-sync
- * 1 1 x 1 1 1 1 0 both ports online & usable, ETR, in-sync
- * 1 1 x 1 1 1 1 1 both ports online & usable, ETR, in-sync
- */
-static struct etr_eacr etr_eacr;
-static u64 etr_tolec; /* time of last eacr update */
-static struct etr_aib etr_port0;
-static int etr_port0_uptodate;
-static struct etr_aib etr_port1;
-static int etr_port1_uptodate;
-static unsigned long etr_events;
-static struct timer_list etr_timer;
-
-static void etr_timeout(unsigned long dummy);
-static void etr_work_fn(struct work_struct *work);
-static DEFINE_MUTEX(etr_work_mutex);
-static DECLARE_WORK(etr_work, etr_work_fn);
-
-/*
- * Reset ETR attachment.
- */
-static void etr_reset(void)
-{
- etr_eacr = (struct etr_eacr) {
- .e0 = 0, .e1 = 0, ._pad0 = 4, .dp = 0,
- .p0 = 0, .p1 = 0, ._pad1 = 0, .ea = 0,
- .es = 0, .sl = 0 };
- if (etr_setr(&etr_eacr) == 0) {
- etr_tolec = get_tod_clock();
- set_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags);
- if (etr_port0_online && etr_port1_online)
- set_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
- } else if (etr_port0_online || etr_port1_online) {
- pr_warn("The real or virtual hardware system does not provide an ETR interface\n");
- etr_port0_online = etr_port1_online = 0;
- }
-}
-
-static int __init etr_init(void)
-{
- struct etr_aib aib;
-
- if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags))
- return 0;
- time_init_wq();
- /* Check if this machine has the steai instruction. */
- if (etr_steai(&aib, ETR_STEAI_STEPPING_PORT) == 0)
- etr_steai_available = 1;
- setup_timer(&etr_timer, etr_timeout, 0UL);
- if (etr_port0_online) {
- set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events);
- queue_work(time_sync_wq, &etr_work);
- }
- if (etr_port1_online) {
- set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events);
- queue_work(time_sync_wq, &etr_work);
- }
- return 0;
-}
-
-arch_initcall(etr_init);
-
-/*
- * Two sorts of ETR machine checks. The architecture reads:
- * "When a machine-check niterruption occurs and if a switch-to-local or
- * ETR-sync-check interrupt request is pending but disabled, this pending
- * disabled interruption request is indicated and is cleared".
- * Which means that we can get etr_switch_to_local events from the machine
- * check handler although the interruption condition is disabled. Lovely..
- */
-
-/*
- * Switch to local machine check. This is called when the last usable
- * ETR port goes inactive. After switch to local the clock is not in sync.
- */
-int etr_switch_to_local(void)
-{
- if (!etr_eacr.sl)
- return 0;
- disable_sync_clock(NULL);
- if (!test_and_set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events)) {
- etr_eacr.es = etr_eacr.sl = 0;
- etr_setr(&etr_eacr);
- return 1;
- }
- return 0;
-}
-
-/*
- * ETR sync check machine check. This is called when the ETR OTE and the
- * local clock OTE are farther apart than the ETR sync check tolerance.
- * After an ETR sync check the clock is not in sync. The machine check
- * is broadcast to all cpus at the same time.
- */
-int etr_sync_check(void)
-{
- if (!etr_eacr.es)
- return 0;
- disable_sync_clock(NULL);
- if (!test_and_set_bit(ETR_EVENT_SYNC_CHECK, &etr_events)) {
- etr_eacr.es = 0;
- etr_setr(&etr_eacr);
- return 1;
- }
- return 0;
-}
-
-void etr_queue_work(void)
-{
- queue_work(time_sync_wq, &etr_work);
-}
-
-/*
- * ETR timing alert. There are two causes:
- * 1) port state change, check the usability of the port
- * 2) port alert, one of the ETR-data-validity bits (v1-v2 bits of the
- * sldr-status word) or ETR-data word 1 (edf1) or ETR-data word 3 (edf3)
- * or ETR-data word 4 (edf4) has changed.
- */
-static void etr_timing_alert(struct etr_irq_parm *intparm)
-{
- if (intparm->pc0)
- /* ETR port 0 state change. */
- set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events);
- if (intparm->pc1)
- /* ETR port 1 state change. */
- set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events);
- if (intparm->eai)
- /*
- * ETR port alert on either port 0, 1 or both.
- * Both ports are not up-to-date now.
- */
- set_bit(ETR_EVENT_PORT_ALERT, &etr_events);
- queue_work(time_sync_wq, &etr_work);
-}
-
-static void etr_timeout(unsigned long dummy)
-{
- set_bit(ETR_EVENT_UPDATE, &etr_events);
- queue_work(time_sync_wq, &etr_work);
-}
-
-/*
- * Check if the etr mode is pps.
- */
-static inline int etr_mode_is_pps(struct etr_eacr eacr)
-{
- return eacr.es && !eacr.sl;
-}
-
-/*
- * Check if the etr mode is etr.
- */
-static inline int etr_mode_is_etr(struct etr_eacr eacr)
-{
- return eacr.es && eacr.sl;
-}
-
-/*
- * Check if the port can be used for TOD synchronization.
- * For PPS mode the port has to receive OTEs. For ETR mode
- * the port has to receive OTEs, the ETR stepping bit has to
- * be zero and the validity bits for data frame 1, 2, and 3
- * have to be 1.
- */
-static int etr_port_valid(struct etr_aib *aib, int port)
-{
- unsigned int psc;
-
- /* Check that this port is receiving OTEs. */
- if (aib->tsp == 0)
- return 0;
-
- psc = port ? aib->esw.psc1 : aib->esw.psc0;
- if (psc == etr_lpsc_pps_mode)
- return 1;
- if (psc == etr_lpsc_operational_step)
- return !aib->esw.y && aib->slsw.v1 &&
- aib->slsw.v2 && aib->slsw.v3;
- return 0;
-}
-
-/*
- * Check if two ports are on the same network.
- */
-static int etr_compare_network(struct etr_aib *aib1, struct etr_aib *aib2)
-{
- // FIXME: any other fields we have to compare?
- return aib1->edf1.net_id == aib2->edf1.net_id;
-}
-
-/*
- * Wrapper for etr_steai that converts physical port states
- * to logical port states to be consistent with the output
- * of stetr (see etr_psc vs. etr_lpsc).
- */
-static void etr_steai_cv(struct etr_aib *aib, unsigned int func)
-{
- BUG_ON(etr_steai(aib, func) != 0);
- /* Convert port state to logical port state. */
- if (aib->esw.psc0 == 1)
- aib->esw.psc0 = 2;
- else if (aib->esw.psc0 == 0 && aib->esw.p == 0)
- aib->esw.psc0 = 1;
- if (aib->esw.psc1 == 1)
- aib->esw.psc1 = 2;
- else if (aib->esw.psc1 == 0 && aib->esw.p == 1)
- aib->esw.psc1 = 1;
-}
-
-/*
- * Check if the aib a2 is still connected to the same attachment as
- * aib a1, the etv values differ by one and a2 is valid.
- */
-static int etr_aib_follows(struct etr_aib *a1, struct etr_aib *a2, int p)
-{
- int state_a1, state_a2;
-
- /* Paranoia check: e0/e1 should better be the same. */
- if (a1->esw.eacr.e0 != a2->esw.eacr.e0 ||
- a1->esw.eacr.e1 != a2->esw.eacr.e1)
- return 0;
-
- /* Still connected to the same etr ? */
- state_a1 = p ? a1->esw.psc1 : a1->esw.psc0;
- state_a2 = p ? a2->esw.psc1 : a2->esw.psc0;
- if (state_a1 == etr_lpsc_operational_step) {
- if (state_a2 != etr_lpsc_operational_step ||
- a1->edf1.net_id != a2->edf1.net_id ||
- a1->edf1.etr_id != a2->edf1.etr_id ||
- a1->edf1.etr_pn != a2->edf1.etr_pn)
- return 0;
- } else if (state_a2 != etr_lpsc_pps_mode)
- return 0;
-
- /* The ETV value of a2 needs to be ETV of a1 + 1. */
- if (a1->edf2.etv + 1 != a2->edf2.etv)
- return 0;
-
- if (!etr_port_valid(a2, p))
- return 0;
-
- return 1;
-}
-
struct clock_sync_data {
atomic_t cpus;
int in_sync;
@@ -748,688 +481,6 @@ static void clock_sync_cpu(struct clock_sync_data *sync)
}
/*
- * Sync the TOD clock using the port referred to by aibp. This port
- * has to be enabled and the other port has to be disabled. The
- * last eacr update has to be more than 1.6 seconds in the past.
- */
-static int etr_sync_clock(void *data)
-{
- static int first;
- unsigned long long clock, old_clock, clock_delta, delay, delta;
- struct clock_sync_data *etr_sync;
- struct etr_aib *sync_port, *aib;
- int port;
- int rc;
-
- etr_sync = data;
-
- if (xchg(&first, 1) == 1) {
- /* Slave */
- clock_sync_cpu(etr_sync);
- return 0;
- }
-
- /* Wait until all other cpus entered the sync function. */
- while (atomic_read(&etr_sync->cpus) != 0)
- cpu_relax();
-
- port = etr_sync->etr_port;
- aib = etr_sync->etr_aib;
- sync_port = (port == 0) ? &etr_port0 : &etr_port1;
- enable_sync_clock();
-
- /* Set clock to next OTE. */
- __ctl_set_bit(14, 21);
- __ctl_set_bit(0, 29);
- clock = ((unsigned long long) (aib->edf2.etv + 1)) << 32;
- old_clock = get_tod_clock();
- if (set_tod_clock(clock) == 0) {
- __udelay(1); /* Wait for the clock to start. */
- __ctl_clear_bit(0, 29);
- __ctl_clear_bit(14, 21);
- etr_stetr(aib);
- /* Adjust Linux timing variables. */
- delay = (unsigned long long)
- (aib->edf2.etv - sync_port->edf2.etv) << 32;
- delta = adjust_time(old_clock, clock, delay);
- clock_delta = clock - old_clock;
- atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0,
- &clock_delta);
- etr_sync->fixup_cc = delta;
- fixup_clock_comparator(delta);
- /* Verify that the clock is properly set. */
- if (!etr_aib_follows(sync_port, aib, port)) {
- /* Didn't work. */
- disable_sync_clock(NULL);
- etr_sync->in_sync = -EAGAIN;
- rc = -EAGAIN;
- } else {
- etr_sync->in_sync = 1;
- rc = 0;
- }
- } else {
- /* Could not set the clock ?!? */
- __ctl_clear_bit(0, 29);
- __ctl_clear_bit(14, 21);
- disable_sync_clock(NULL);
- etr_sync->in_sync = -EAGAIN;
- rc = -EAGAIN;
- }
- xchg(&first, 0);
- return rc;
-}
-
-static int etr_sync_clock_stop(struct etr_aib *aib, int port)
-{
- struct clock_sync_data etr_sync;
- struct etr_aib *sync_port;
- int follows;
- int rc;
-
- /* Check if the current aib is adjacent to the sync port aib. */
- sync_port = (port == 0) ? &etr_port0 : &etr_port1;
- follows = etr_aib_follows(sync_port, aib, port);
- memcpy(sync_port, aib, sizeof(*aib));
- if (!follows)
- return -EAGAIN;
- memset(&etr_sync, 0, sizeof(etr_sync));
- etr_sync.etr_aib = aib;
- etr_sync.etr_port = port;
- get_online_cpus();
- atomic_set(&etr_sync.cpus, num_online_cpus() - 1);
- rc = stop_machine(etr_sync_clock, &etr_sync, cpu_online_mask);
- put_online_cpus();
- return rc;
-}
-
-/*
- * Handle the immediate effects of the different events.
- * The port change event is used for online/offline changes.
- */
-static struct etr_eacr etr_handle_events(struct etr_eacr eacr)
-{
- if (test_and_clear_bit(ETR_EVENT_SYNC_CHECK, &etr_events))
- eacr.es = 0;
- if (test_and_clear_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events))
- eacr.es = eacr.sl = 0;
- if (test_and_clear_bit(ETR_EVENT_PORT_ALERT, &etr_events))
- etr_port0_uptodate = etr_port1_uptodate = 0;
-
- if (test_and_clear_bit(ETR_EVENT_PORT0_CHANGE, &etr_events)) {
- if (eacr.e0)
- /*
- * Port change of an enabled port. We have to
- * assume that this may have caused a stepping
- * port switch.
- */
- etr_tolec = get_tod_clock();
- eacr.p0 = etr_port0_online;
- if (!eacr.p0)
- eacr.e0 = 0;
- etr_port0_uptodate = 0;
- }
- if (test_and_clear_bit(ETR_EVENT_PORT1_CHANGE, &etr_events)) {
- if (eacr.e1)
- /*
- * Port change of an enabled port. We have to
- * assume that this may have caused a stepping
- * port switch.
- */
- etr_tolec = get_tod_clock();
- eacr.p1 = etr_port1_online;
- if (!eacr.p1)
- eacr.e1 = 0;
- etr_port1_uptodate = 0;
- }
- clear_bit(ETR_EVENT_UPDATE, &etr_events);
- return eacr;
-}
-
-/*
- * Set up a timer that expires 1.6 seconds after etr_tolec if
- * one of the ports needs an update.
- */
-static void etr_set_tolec_timeout(unsigned long long now)
-{
- unsigned long micros;
-
- if ((!etr_eacr.p0 || etr_port0_uptodate) &&
- (!etr_eacr.p1 || etr_port1_uptodate))
- return;
- micros = (now > etr_tolec) ? ((now - etr_tolec) >> 12) : 0;
- micros = (micros > 1600000) ? 0 : 1600000 - micros;
- mod_timer(&etr_timer, jiffies + (micros * HZ) / 1000000 + 1);
-}
-
-/*
- * Set up a timer that expires after 1/2 second.
- */
-static void etr_set_sync_timeout(void)
-{
- mod_timer(&etr_timer, jiffies + HZ/2);
-}
-
-/*
- * Update the aib information for one or both ports.
- */
-static struct etr_eacr etr_handle_update(struct etr_aib *aib,
- struct etr_eacr eacr)
-{
- /* With both ports disabled the aib information is useless. */
- if (!eacr.e0 && !eacr.e1)
- return eacr;
-
- /* Update port0 or port1 with aib stored in etr_work_fn. */
- if (aib->esw.q == 0) {
- /* Information for port 0 stored. */
- if (eacr.p0 && !etr_port0_uptodate) {
- etr_port0 = *aib;
- if (etr_port0_online)
- etr_port0_uptodate = 1;
- }
- } else {
- /* Information for port 1 stored. */
- if (eacr.p1 && !etr_port1_uptodate) {
- etr_port1 = *aib;
- if (etr_port0_online)
- etr_port1_uptodate = 1;
- }
- }
-
- /*
- * Do not try to get the alternate port aib if the clock
- * is not in sync yet.
- */
- if (!eacr.es || !check_sync_clock())
- return eacr;
-
- /*
- * If steai is available we can get the information about
- * the other port immediately. If only stetr is available the
- * data-port bit toggle has to be used.
- */
- if (etr_steai_available) {
- if (eacr.p0 && !etr_port0_uptodate) {
- etr_steai_cv(&etr_port0, ETR_STEAI_PORT_0);
- etr_port0_uptodate = 1;
- }
- if (eacr.p1 && !etr_port1_uptodate) {
- etr_steai_cv(&etr_port1, ETR_STEAI_PORT_1);
- etr_port1_uptodate = 1;
- }
- } else {
- /*
- * One port was updated above, if the other
- * port is not uptodate toggle dp bit.
- */
- if ((eacr.p0 && !etr_port0_uptodate) ||
- (eacr.p1 && !etr_port1_uptodate))
- eacr.dp ^= 1;
- else
- eacr.dp = 0;
- }
- return eacr;
-}
-
-/*
- * Write new etr control register if it differs from the current one.
- * Return 1 if etr_tolec has been updated as well.
- */
-static void etr_update_eacr(struct etr_eacr eacr)
-{
- int dp_changed;
-
- if (memcmp(&etr_eacr, &eacr, sizeof(eacr)) == 0)
- /* No change, return. */
- return;
- /*
- * The disable of an active port or the change of the data port
- * bit can/will cause a change in the data port.
- */
- dp_changed = etr_eacr.e0 > eacr.e0 || etr_eacr.e1 > eacr.e1 ||
- (etr_eacr.dp ^ eacr.dp) != 0;
- etr_eacr = eacr;
- etr_setr(&etr_eacr);
- if (dp_changed)
- etr_tolec = get_tod_clock();
-}
-
-/*
- * ETR work. In this function you'll find the main logic. In
- * particular this is the only function that calls etr_update_eacr(),
- * it "controls" the etr control register.
- */
-static void etr_work_fn(struct work_struct *work)
-{
- unsigned long long now;
- struct etr_eacr eacr;
- struct etr_aib aib;
- int sync_port;
-
- /* prevent multiple execution. */
- mutex_lock(&etr_work_mutex);
-
- /* Create working copy of etr_eacr. */
- eacr = etr_eacr;
-
- /* Check for the different events and their immediate effects. */
- eacr = etr_handle_events(eacr);
-
- /* Check if ETR is supposed to be active. */
- eacr.ea = eacr.p0 || eacr.p1;
- if (!eacr.ea) {
- /* Both ports offline. Reset everything. */
- eacr.dp = eacr.es = eacr.sl = 0;
- on_each_cpu(disable_sync_clock, NULL, 1);
- del_timer_sync(&etr_timer);
- etr_update_eacr(eacr);
- goto out_unlock;
- }
-
- /* Store aib to get the current ETR status word. */
- BUG_ON(etr_stetr(&aib) != 0);
- etr_port0.esw = etr_port1.esw = aib.esw; /* Copy status word. */
- now = get_tod_clock();
-
- /*
- * Update the port information if the last stepping port change
- * or data port change is older than 1.6 seconds.
- */
- if (now >= etr_tolec + (1600000 << 12))
- eacr = etr_handle_update(&aib, eacr);
-
- /*
- * Select ports to enable. The preferred synchronization mode is PPS.
- * Whether a port can be enabled depends on a number of things:
- * 1) The port needs to be online and uptodate. A port is not
- * disabled just because it is not uptodate, but it is only
- * enabled if it is uptodate.
- * 2) The port needs to have the same mode (pps / etr).
- * 3) The port needs to be usable -> etr_port_valid() == 1
- * 4) To enable the second port the clock needs to be in sync.
- * 5) If both ports are usable and are ETR ports, the network id
- * has to be the same.
- * The eacr.sl bit is used to indicate etr mode vs. pps mode.
- */
- if (eacr.p0 && aib.esw.psc0 == etr_lpsc_pps_mode) {
- eacr.sl = 0;
- eacr.e0 = 1;
- if (!etr_mode_is_pps(etr_eacr))
- eacr.es = 0;
- if (!eacr.es || !eacr.p1 || aib.esw.psc1 != etr_lpsc_pps_mode)
- eacr.e1 = 0;
- // FIXME: uptodate checks ?
- else if (etr_port0_uptodate && etr_port1_uptodate)
- eacr.e1 = 1;
- sync_port = (etr_port0_uptodate &&
- etr_port_valid(&etr_port0, 0)) ? 0 : -1;
- } else if (eacr.p1 && aib.esw.psc1 == etr_lpsc_pps_mode) {
- eacr.sl = 0;
- eacr.e0 = 0;
- eacr.e1 = 1;
- if (!etr_mode_is_pps(etr_eacr))
- eacr.es = 0;
- sync_port = (etr_port1_uptodate &&
- etr_port_valid(&etr_port1, 1)) ? 1 : -1;
- } else if (eacr.p0 && aib.esw.psc0 == etr_lpsc_operational_step) {
- eacr.sl = 1;
- eacr.e0 = 1;
- if (!etr_mode_is_etr(etr_eacr))
- eacr.es = 0;
- if (!eacr.es || !eacr.p1 ||
- aib.esw.psc1 != etr_lpsc_operational_alt)
- eacr.e1 = 0;
- else if (etr_port0_uptodate && etr_port1_uptodate &&
- etr_compare_network(&etr_port0, &etr_port1))
- eacr.e1 = 1;
- sync_port = (etr_port0_uptodate &&
- etr_port_valid(&etr_port0, 0)) ? 0 : -1;
- } else if (eacr.p1 && aib.esw.psc1 == etr_lpsc_operational_step) {
- eacr.sl = 1;
- eacr.e0 = 0;
- eacr.e1 = 1;
- if (!etr_mode_is_etr(etr_eacr))
- eacr.es = 0;
- sync_port = (etr_port1_uptodate &&
- etr_port_valid(&etr_port1, 1)) ? 1 : -1;
- } else {
- /* Both ports not usable. */
- eacr.es = eacr.sl = 0;
- sync_port = -1;
- }
-
- /*
- * If the clock is in sync just update the eacr and return.
- * If there is no valid sync port wait for a port update.
- */
- if ((eacr.es && check_sync_clock()) || sync_port < 0) {
- etr_update_eacr(eacr);
- etr_set_tolec_timeout(now);
- goto out_unlock;
- }
-
- /*
- * Prepare control register for clock syncing
- * (reset data port bit, set sync check control).
- */
- eacr.dp = 0;
- eacr.es = 1;
-
- /*
- * Update eacr and try to synchronize the clock. If the update
- * of eacr caused a stepping port switch (or if we have to
- * assume that a stepping port switch has occurred) or the
- * clock syncing failed, reset the sync check control bit
- * and set up a timer to try again after 0.5 seconds
- */
- etr_update_eacr(eacr);
- if (now < etr_tolec + (1600000 << 12) ||
- etr_sync_clock_stop(&aib, sync_port) != 0) {
- /* Sync failed. Try again in 1/2 second. */
- eacr.es = 0;
- etr_update_eacr(eacr);
- etr_set_sync_timeout();
- } else
- etr_set_tolec_timeout(now);
-out_unlock:
- mutex_unlock(&etr_work_mutex);
-}
-
-/*
- * Sysfs interface functions
- */
-static struct bus_type etr_subsys = {
- .name = "etr",
- .dev_name = "etr",
-};
-
-static struct device etr_port0_dev = {
- .id = 0,
- .bus = &etr_subsys,
-};
-
-static struct device etr_port1_dev = {
- .id = 1,
- .bus = &etr_subsys,
-};
-
-/*
- * ETR subsys attributes
- */
-static ssize_t etr_stepping_port_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%i\n", etr_port0.esw.p);
-}
-
-static DEVICE_ATTR(stepping_port, 0400, etr_stepping_port_show, NULL);
-
-static ssize_t etr_stepping_mode_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- char *mode_str;
-
- if (etr_mode_is_pps(etr_eacr))
- mode_str = "pps";
- else if (etr_mode_is_etr(etr_eacr))
- mode_str = "etr";
- else
- mode_str = "local";
- return sprintf(buf, "%s\n", mode_str);
-}
-
-static DEVICE_ATTR(stepping_mode, 0400, etr_stepping_mode_show, NULL);
-
-/*
- * ETR port attributes
- */
-static inline struct etr_aib *etr_aib_from_dev(struct device *dev)
-{
- if (dev == &etr_port0_dev)
- return etr_port0_online ? &etr_port0 : NULL;
- else
- return etr_port1_online ? &etr_port1 : NULL;
-}
-
-static ssize_t etr_online_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- unsigned int online;
-
- online = (dev == &etr_port0_dev) ? etr_port0_online : etr_port1_online;
- return sprintf(buf, "%i\n", online);
-}
-
-static ssize_t etr_online_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned int value;
-
- value = simple_strtoul(buf, NULL, 0);
- if (value != 0 && value != 1)
- return -EINVAL;
- if (!test_bit(CLOCK_SYNC_HAS_ETR, &clock_sync_flags))
- return -EOPNOTSUPP;
- mutex_lock(&clock_sync_mutex);
- if (dev == &etr_port0_dev) {
- if (etr_port0_online == value)
- goto out; /* Nothing to do. */
- etr_port0_online = value;
- if (etr_port0_online && etr_port1_online)
- set_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
- else
- clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
- set_bit(ETR_EVENT_PORT0_CHANGE, &etr_events);
- queue_work(time_sync_wq, &etr_work);
- } else {
- if (etr_port1_online == value)
- goto out; /* Nothing to do. */
- etr_port1_online = value;
- if (etr_port0_online && etr_port1_online)
- set_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
- else
- clear_bit(CLOCK_SYNC_ETR, &clock_sync_flags);
- set_bit(ETR_EVENT_PORT1_CHANGE, &etr_events);
- queue_work(time_sync_wq, &etr_work);
- }
-out:
- mutex_unlock(&clock_sync_mutex);
- return count;
-}
-
-static DEVICE_ATTR(online, 0600, etr_online_show, etr_online_store);
-
-static ssize_t etr_stepping_control_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%i\n", (dev == &etr_port0_dev) ?
- etr_eacr.e0 : etr_eacr.e1);
-}
-
-static DEVICE_ATTR(stepping_control, 0400, etr_stepping_control_show, NULL);
-
-static ssize_t etr_mode_code_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- if (!etr_port0_online && !etr_port1_online)
- /* Status word is not uptodate if both ports are offline. */
- return -ENODATA;
- return sprintf(buf, "%i\n", (dev == &etr_port0_dev) ?
- etr_port0.esw.psc0 : etr_port0.esw.psc1);
-}
-
-static DEVICE_ATTR(state_code, 0400, etr_mode_code_show, NULL);
-
-static ssize_t etr_untuned_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct etr_aib *aib = etr_aib_from_dev(dev);
-
- if (!aib || !aib->slsw.v1)
- return -ENODATA;
- return sprintf(buf, "%i\n", aib->edf1.u);
-}
-
-static DEVICE_ATTR(untuned, 0400, etr_untuned_show, NULL);
-
-static ssize_t etr_network_id_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct etr_aib *aib = etr_aib_from_dev(dev);
-
- if (!aib || !aib->slsw.v1)
- return -ENODATA;
- return sprintf(buf, "%i\n", aib->edf1.net_id);
-}
-
-static DEVICE_ATTR(network, 0400, etr_network_id_show, NULL);
-
-static ssize_t etr_id_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct etr_aib *aib = etr_aib_from_dev(dev);
-
- if (!aib || !aib->slsw.v1)
- return -ENODATA;
- return sprintf(buf, "%i\n", aib->edf1.etr_id);
-}
-
-static DEVICE_ATTR(id, 0400, etr_id_show, NULL);
-
-static ssize_t etr_port_number_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct etr_aib *aib = etr_aib_from_dev(dev);
-
- if (!aib || !aib->slsw.v1)
- return -ENODATA;
- return sprintf(buf, "%i\n", aib->edf1.etr_pn);
-}
-
-static DEVICE_ATTR(port, 0400, etr_port_number_show, NULL);
-
-static ssize_t etr_coupled_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct etr_aib *aib = etr_aib_from_dev(dev);
-
- if (!aib || !aib->slsw.v3)
- return -ENODATA;
- return sprintf(buf, "%i\n", aib->edf3.c);
-}
-
-static DEVICE_ATTR(coupled, 0400, etr_coupled_show, NULL);
-
-static ssize_t etr_local_time_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct etr_aib *aib = etr_aib_from_dev(dev);
-
- if (!aib || !aib->slsw.v3)
- return -ENODATA;
- return sprintf(buf, "%i\n", aib->edf3.blto);
-}
-
-static DEVICE_ATTR(local_time, 0400, etr_local_time_show, NULL);
-
-static ssize_t etr_utc_offset_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct etr_aib *aib = etr_aib_from_dev(dev);
-
- if (!aib || !aib->slsw.v3)
- return -ENODATA;
- return sprintf(buf, "%i\n", aib->edf3.buo);
-}
-
-static DEVICE_ATTR(utc_offset, 0400, etr_utc_offset_show, NULL);
-
-static struct device_attribute *etr_port_attributes[] = {
- &dev_attr_online,
- &dev_attr_stepping_control,
- &dev_attr_state_code,
- &dev_attr_untuned,
- &dev_attr_network,
- &dev_attr_id,
- &dev_attr_port,
- &dev_attr_coupled,
- &dev_attr_local_time,
- &dev_attr_utc_offset,
- NULL
-};
-
-static int __init etr_register_port(struct device *dev)
-{
- struct device_attribute **attr;
- int rc;
-
- rc = device_register(dev);
- if (rc)
- goto out;
- for (attr = etr_port_attributes; *attr; attr++) {
- rc = device_create_file(dev, *attr);
- if (rc)
- goto out_unreg;
- }
- return 0;
-out_unreg:
- for (; attr >= etr_port_attributes; attr--)
- device_remove_file(dev, *attr);
- device_unregister(dev);
-out:
- return rc;
-}
-
-static void __init etr_unregister_port(struct device *dev)
-{
- struct device_attribute **attr;
-
- for (attr = etr_port_attributes; *attr; attr++)
- device_remove_file(dev, *attr);
- device_unregister(dev);
-}
-
-static int __init etr_init_sysfs(void)
-{
- int rc;
-
- rc = subsys_system_register(&etr_subsys, NULL);
- if (rc)
- goto out;
- rc = device_create_file(etr_subsys.dev_root, &dev_attr_stepping_port);
- if (rc)
- goto out_unreg_subsys;
- rc = device_create_file(etr_subsys.dev_root, &dev_attr_stepping_mode);
- if (rc)
- goto out_remove_stepping_port;
- rc = etr_register_port(&etr_port0_dev);
- if (rc)
- goto out_remove_stepping_mode;
- rc = etr_register_port(&etr_port1_dev);
- if (rc)
- goto out_remove_port0;
- return 0;
-
-out_remove_port0:
- etr_unregister_port(&etr_port0_dev);
-out_remove_stepping_mode:
- device_remove_file(etr_subsys.dev_root, &dev_attr_stepping_mode);
-out_remove_stepping_port:
- device_remove_file(etr_subsys.dev_root, &dev_attr_stepping_port);
-out_unreg_subsys:
- bus_unregister(&etr_subsys);
-out:
- return rc;
-}
-
-device_initcall(etr_init_sysfs);
-
-/*
* Server Time Protocol (STP) code.
*/
static bool stp_online;
@@ -1455,7 +506,7 @@ static void __init stp_reset(void)
int rc;
stp_page = (void *) get_zeroed_page(GFP_ATOMIC);
- rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000);
+ rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL);
if (rc == 0)
set_bit(CLOCK_SYNC_HAS_STP, &clock_sync_flags);
else if (stp_online) {
@@ -1533,6 +584,7 @@ static int stp_sync_clock(void *data)
static int first;
unsigned long long old_clock, delta, new_clock, clock_delta;
struct clock_sync_data *stp_sync;
+ struct ptff_qto qto;
int rc;
stp_sync = data;
@@ -1554,11 +606,14 @@ static int stp_sync_clock(void *data)
stp_info.todoff[2] || stp_info.todoff[3] ||
stp_info.tmd != 2) {
old_clock = get_tod_clock();
- rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0);
+ rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta);
if (rc == 0) {
- new_clock = get_tod_clock();
+ new_clock = old_clock + clock_delta;
delta = adjust_time(old_clock, new_clock, 0);
- clock_delta = new_clock - old_clock;
+ if (ptff_query(PTFF_QTO) &&
+ ptff(&qto, sizeof(qto), PTFF_QTO) == 0)
+ /* Update LPAR offset */
+ lpar_offset = qto.tod_epoch_difference;
atomic_notifier_call_chain(&s390_epoch_delta_notifier,
0, &clock_delta);
fixup_clock_comparator(delta);
@@ -1590,12 +645,12 @@ static void stp_work_fn(struct work_struct *work)
mutex_lock(&stp_work_mutex);
if (!stp_online) {
- chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000);
+ chsc_sstpc(stp_page, STP_OP_CTRL, 0x0000, NULL);
del_timer_sync(&stp_timer);
goto out_unlock;
}
- rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0);
+ rc = chsc_sstpc(stp_page, STP_OP_CTRL, 0xb0e0, NULL);
if (rc)
goto out_unlock;
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index 64298a867..e959c02e0 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -46,6 +46,7 @@ static DECLARE_WORK(topology_work, topology_work_fn);
*/
static struct mask_info socket_info;
static struct mask_info book_info;
+static struct mask_info drawer_info;
DEFINE_PER_CPU(struct cpu_topology_s390, cpu_topology);
EXPORT_PER_CPU_SYMBOL_GPL(cpu_topology);
@@ -79,10 +80,10 @@ static cpumask_t cpu_thread_map(unsigned int cpu)
return mask;
}
-static struct mask_info *add_cpus_to_mask(struct topology_core *tl_core,
- struct mask_info *book,
- struct mask_info *socket,
- int one_socket_per_cpu)
+static void add_cpus_to_mask(struct topology_core *tl_core,
+ struct mask_info *drawer,
+ struct mask_info *book,
+ struct mask_info *socket)
{
struct cpu_topology_s390 *topo;
unsigned int core;
@@ -97,21 +98,17 @@ static struct mask_info *add_cpus_to_mask(struct topology_core *tl_core,
continue;
for (i = 0; i <= smp_cpu_mtid; i++) {
topo = &per_cpu(cpu_topology, lcpu + i);
+ topo->drawer_id = drawer->id;
topo->book_id = book->id;
+ topo->socket_id = socket->id;
topo->core_id = rcore;
topo->thread_id = lcpu + i;
+ cpumask_set_cpu(lcpu + i, &drawer->mask);
cpumask_set_cpu(lcpu + i, &book->mask);
cpumask_set_cpu(lcpu + i, &socket->mask);
- if (one_socket_per_cpu)
- topo->socket_id = rcore;
- else
- topo->socket_id = socket->id;
smp_cpu_set_polarization(lcpu + i, tl_core->pp);
}
- if (one_socket_per_cpu)
- socket = socket->next;
}
- return socket;
}
static void clear_masks(void)
@@ -128,6 +125,11 @@ static void clear_masks(void)
cpumask_clear(&info->mask);
info = info->next;
}
+ info = &drawer_info;
+ while (info) {
+ cpumask_clear(&info->mask);
+ info = info->next;
+ }
}
static union topology_entry *next_tle(union topology_entry *tle)
@@ -137,16 +139,22 @@ static union topology_entry *next_tle(union topology_entry *tle)
return (union topology_entry *)((struct topology_container *)tle + 1);
}
-static void __tl_to_masks_generic(struct sysinfo_15_1_x *info)
+static void tl_to_masks(struct sysinfo_15_1_x *info)
{
struct mask_info *socket = &socket_info;
struct mask_info *book = &book_info;
+ struct mask_info *drawer = &drawer_info;
union topology_entry *tle, *end;
+ clear_masks();
tle = info->tle;
end = (union topology_entry *)((unsigned long)info + info->length);
while (tle < end) {
switch (tle->nl) {
+ case 3:
+ drawer = drawer->next;
+ drawer->id = tle->container.id;
+ break;
case 2:
book = book->next;
book->id = tle->container.id;
@@ -156,32 +164,7 @@ static void __tl_to_masks_generic(struct sysinfo_15_1_x *info)
socket->id = tle->container.id;
break;
case 0:
- add_cpus_to_mask(&tle->cpu, book, socket, 0);
- break;
- default:
- clear_masks();
- return;
- }
- tle = next_tle(tle);
- }
-}
-
-static void __tl_to_masks_z10(struct sysinfo_15_1_x *info)
-{
- struct mask_info *socket = &socket_info;
- struct mask_info *book = &book_info;
- union topology_entry *tle, *end;
-
- tle = info->tle;
- end = (union topology_entry *)((unsigned long)info + info->length);
- while (tle < end) {
- switch (tle->nl) {
- case 1:
- book = book->next;
- book->id = tle->container.id;
- break;
- case 0:
- socket = add_cpus_to_mask(&tle->cpu, book, socket, 1);
+ add_cpus_to_mask(&tle->cpu, drawer, book, socket);
break;
default:
clear_masks();
@@ -191,22 +174,6 @@ static void __tl_to_masks_z10(struct sysinfo_15_1_x *info)
}
}
-static void tl_to_masks(struct sysinfo_15_1_x *info)
-{
- struct cpuid cpu_id;
-
- get_cpu_id(&cpu_id);
- clear_masks();
- switch (cpu_id.machine) {
- case 0x2097:
- case 0x2098:
- __tl_to_masks_z10(info);
- break;
- default:
- __tl_to_masks_generic(info);
- }
-}
-
static void topology_update_polarization_simple(void)
{
int cpu;
@@ -257,11 +224,13 @@ static void update_cpu_masks(void)
topo->thread_mask = cpu_thread_map(cpu);
topo->core_mask = cpu_group_map(&socket_info, cpu);
topo->book_mask = cpu_group_map(&book_info, cpu);
+ topo->drawer_mask = cpu_group_map(&drawer_info, cpu);
if (!MACHINE_HAS_TOPOLOGY) {
topo->thread_id = cpu;
topo->core_id = cpu;
topo->socket_id = cpu;
topo->book_id = cpu;
+ topo->drawer_id = cpu;
}
}
numa_update_cpu_topology();
@@ -269,10 +238,7 @@ static void update_cpu_masks(void)
void store_topology(struct sysinfo_15_1_x *info)
{
- if (topology_max_mnest >= 3)
- stsi(info, 15, 1, 3);
- else
- stsi(info, 15, 1, 2);
+ stsi(info, 15, 1, min(topology_max_mnest, 4));
}
int arch_update_cpu_topology(void)
@@ -442,6 +408,11 @@ static const struct cpumask *cpu_book_mask(int cpu)
return &per_cpu(cpu_topology, cpu).book_mask;
}
+static const struct cpumask *cpu_drawer_mask(int cpu)
+{
+ return &per_cpu(cpu_topology, cpu).drawer_mask;
+}
+
static int __init early_parse_topology(char *p)
{
return kstrtobool(p, &topology_enabled);
@@ -452,6 +423,7 @@ static struct sched_domain_topology_level s390_topology[] = {
{ cpu_thread_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
{ cpu_book_mask, SD_INIT_NAME(BOOK) },
+ { cpu_drawer_mask, SD_INIT_NAME(DRAWER) },
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
@@ -487,6 +459,7 @@ static int __init s390_topology_init(void)
printk(KERN_CONT " / %d\n", info->mnest);
alloc_masks(info, &socket_info, 1);
alloc_masks(info, &book_info, 2);
+ alloc_masks(info, &drawer_info, 3);
set_sched_topology(s390_topology);
return 0;
}
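
The nesting-level walk in tl_to_masks() can be modeled standalone:
container entries (level 3 = drawer, 2 = book, 1 = socket) update the
current id, and a CPU entry (level 0) inherits whatever containers were
seen last. The entry list below is invented:

    #include <stdio.h>

    struct tle { int nl; int id; };

    int main(void)
    {
        struct tle list[] = {
            {3, 0}, {2, 0}, {1, 0}, {0, 0},
            {1, 1}, {0, 1},
            {2, 1}, {1, 2}, {0, 2},
        };
        int drawer = -1, book = -1, socket = -1;

        for (unsigned int i = 0; i < sizeof(list) / sizeof(list[0]); i++) {
            switch (list[i].nl) {
            case 3: drawer = list[i].id; break;
            case 2: book = list[i].id; break;
            case 1: socket = list[i].id; break;
            case 0:
                printf("cpu %d: drawer %d book %d socket %d\n",
                       list[i].id, drawer, book, socket);
                break;
            }
        }
        return 0;
    }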
diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile
index f9c459586..68145456f 100644
--- a/arch/s390/kernel/vdso32/Makefile
+++ b/arch/s390/kernel/vdso32/Makefile
@@ -1,5 +1,7 @@
# List of files in the vdso, has to be asm only for now
+KCOV_INSTRUMENT := n
+
obj-vdso32 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o
# Build rules
diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile
index 058659c1b..0b0fd22c8 100644
--- a/arch/s390/kernel/vdso64/Makefile
+++ b/arch/s390/kernel/vdso64/Makefile
@@ -1,5 +1,7 @@
# List of files in the vdso, has to be asm only for now
+KCOV_INSTRUMENT := n
+
obj-vdso64 = gettimeofday.o clock_getres.o clock_gettime.o note.o getcpu.o
# Build rules
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index 0f41a8286..429bfd111 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -4,6 +4,16 @@
#include <asm/thread_info.h>
#include <asm/page.h>
+
+/*
+ * Put .bss..swapper_pg_dir as the first thing in .bss. This will
+ * make sure it has 16k alignment.
+ */
+#define BSS_FIRST_SECTIONS *(.bss..swapper_pg_dir)
+
+/* Handle ro_after_init data on our own. */
+#define RO_AFTER_INIT_DATA
+
#include <asm-generic/vmlinux.lds.h>
OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390")
@@ -49,7 +59,14 @@ SECTIONS
_eshared = .; /* End of shareable data */
_sdata = .; /* Start of data section */
- EXCEPTION_TABLE(16) :data
+ . = ALIGN(PAGE_SIZE);
+ __start_ro_after_init = .;
+ .data..ro_after_init : {
+ *(.data..ro_after_init)
+ }
+ EXCEPTION_TABLE(16)
+ . = ALIGN(PAGE_SIZE);
+ __end_ro_after_init = .;
RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE)
@@ -81,7 +98,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
__init_end = .; /* freed after init ends here */
- BSS_SECTION(0, 2, 0)
+ BSS_SECTION(PAGE_SIZE, 4 * PAGE_SIZE, PAGE_SIZE)
_end = . ;
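
With the .data..ro_after_init window carved out between
__start_ro_after_init and __end_ro_after_init, variables marked
__ro_after_init stay writable while init code runs and are expected to
be mapped read-only afterwards. An illustrative kernel-style fragment
(names invented, not part of this patch):

    #include <linux/cache.h>
    #include <linux/init.h>

    /* writable during boot, read-only once init memory is freed */
    static int sample_param __ro_after_init = 42;

    static int __init sample_param_setup(char *str)
    {
        sample_param = 43;    /* still legal at this point */
        return 1;
    }
    __setup("sample_param=", sample_param_setup);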
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index d42fa38c2..09a9e6dfc 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1ea4095b6..ce865bd4f 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
(vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
return -EOPNOTSUPP;
+ VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx",
+ (u32) vcpu->run->s.regs.gprs[2],
+ (u32) vcpu->run->s.regs.gprs[3],
+ vcpu->run->s.regs.gprs[4]);
+
/*
* The layout is as follows:
* - gpr 2 contains the subchannel id (passed as addr)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 66938d283..54200208b 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -8,6 +8,7 @@
#include <linux/vmalloc.h>
#include <linux/err.h>
#include <asm/pgtable.h>
+#include <asm/gmap.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include <asm/switch_to.h>
@@ -476,18 +477,73 @@ enum {
FSI_FETCH = 2 /* Exception was due to fetch operation */
};
-static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
- ar_t ar, enum gacc_mode mode)
+enum prot_type {
+ PROT_TYPE_LA = 0,
+ PROT_TYPE_KEYC = 1,
+ PROT_TYPE_ALC = 2,
+ PROT_TYPE_DAT = 3,
+};
+
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
+ ar_t ar, enum gacc_mode mode, enum prot_type prot)
{
- int rc;
- struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- struct trans_exc_code_bits *tec_bits;
+ struct trans_exc_code_bits *tec;
memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec_bits->as = psw.as;
+ pgm->code = code;
+ tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+
+ switch (code) {
+ case PGM_ASCE_TYPE:
+ case PGM_PAGE_TRANSLATION:
+ case PGM_REGION_FIRST_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_SEGMENT_TRANSLATION:
+ /*
+ * op_access_id only applies to MOVE_PAGE -> set bit 61
+ * exc_access_id has to be set to 0 for some instructions. Both
+ * cases have to be handled by the caller. We can always store
+ * exc_access_id, as it is undefined for non-ar cases.
+ */
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* FALL THROUGH */
+ case PGM_ALEN_TRANSLATION:
+ case PGM_ALE_SEQUENCE:
+ case PGM_ASTE_VALIDITY:
+ case PGM_ASTE_SEQUENCE:
+ case PGM_EXTENDED_AUTHORITY:
+ pgm->exc_access_id = ar;
+ break;
+ case PGM_PROTECTION:
+ switch (prot) {
+ case PROT_TYPE_ALC:
+ tec->b60 = 1;
+ /* FALL THROUGH */
+ case PROT_TYPE_DAT:
+ tec->b61 = 1;
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* exc_access_id is undefined for most cases */
+ pgm->exc_access_id = ar;
+ break;
+ default: /* LA and KEYC set b61 to 0, other params undefined */
+ break;
+ }
+ break;
+ }
+ return code;
+}
+
+static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
+ unsigned long ga, ar_t ar, enum gacc_mode mode)
+{
+ int rc;
+ struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
if (!psw.t) {
asce->val = 0;
@@ -510,21 +566,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
return 0;
case PSW_AS_ACCREG:
rc = ar_translation(vcpu, asce, ar, mode);
- switch (rc) {
- case PGM_ALEN_TRANSLATION:
- case PGM_ALE_SEQUENCE:
- case PGM_ASTE_VALIDITY:
- case PGM_ASTE_SEQUENCE:
- case PGM_EXTENDED_AUTHORITY:
- vcpu->arch.pgm.exc_access_id = ar;
- break;
- case PGM_PROTECTION:
- tec_bits->b60 = 1;
- tec_bits->b61 = 1;
- break;
- }
if (rc > 0)
- pgm->code = rc;
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC);
return rc;
}
return 0;
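
trans_exc() above relies on deliberate switch fall-through: the
translation-exception codes fill in the address, access mode and
address-space bits and then fall into the cases that only record the
access register id. A toy standalone version of that layering (codes
and fields invented):

    #include <stdio.h>

    struct exc { unsigned long addr; int access_id; };

    static void fill_exc(struct exc *e, int code, unsigned long gva, int ar)
    {
        switch (code) {
        case 1:                    /* translation class: record address */
            e->addr = gva >> 12;
            /* fall through */
        case 2:                    /* access-register class: record AR */
            e->access_id = ar;
            break;
        }
    }

    int main(void)
    {
        struct exc e = { 0, 0 };

        fill_exc(&e, 1, 0x12345000UL, 7);
        printf("addr=%#lx id=%d\n", e.addr, e.access_id);
        return 0;
    }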
@@ -729,40 +772,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
unsigned long *pages, unsigned long nr_pages,
const union asce asce, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
- int lap_enabled, rc;
+ int lap_enabled, rc = 0;
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
lap_enabled = low_address_protection_enabled(vcpu, asce);
while (nr_pages) {
ga = kvm_s390_logical_to_effective(vcpu, ga);
- tec_bits->addr = ga >> PAGE_SHIFT;
- if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
- pgm->code = PGM_PROTECTION;
- return pgm->code;
- }
+ if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
+ return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
+ PROT_TYPE_LA);
ga &= PAGE_MASK;
if (psw_bits(*psw).t) {
rc = guest_translate(vcpu, ga, pages, asce, mode);
if (rc < 0)
return rc;
- if (rc == PGM_PROTECTION)
- tec_bits->b61 = 1;
- if (rc)
- pgm->code = rc;
} else {
*pages = kvm_s390_real_to_abs(vcpu, ga);
if (kvm_is_error_gpa(vcpu->kvm, *pages))
- pgm->code = PGM_ADDRESSING;
+ rc = PGM_ADDRESSING;
}
- if (pgm->code)
- return pgm->code;
+ if (rc)
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
ga += PAGE_SIZE;
pages++;
nr_pages--;
@@ -783,7 +817,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
if (!len)
return 0;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+ ga = kvm_s390_logical_to_effective(vcpu, ga);
+ rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode);
if (rc)
return rc;
nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -795,7 +830,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
need_ipte_lock = psw_bits(*psw).t && !asce.r;
if (need_ipte_lock)
ipte_lock(vcpu);
- rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
+ rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
for (idx = 0; idx < nr_pages && !rc; idx++) {
gpa = *(pages + idx) + (ga & ~PAGE_MASK);
_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
@@ -846,37 +881,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
unsigned long *gpa, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec;
union asce asce;
int rc;
gva = kvm_s390_logical_to_effective(vcpu, gva);
- tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
- tec->addr = gva >> PAGE_SHIFT;
+ rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
- if (mode == GACC_STORE) {
- rc = pgm->code = PGM_PROTECTION;
- return rc;
- }
+ if (mode == GACC_STORE)
+ return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
+ mode, PROT_TYPE_LA);
}
if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */
rc = guest_translate(vcpu, gva, gpa, asce, mode);
- if (rc > 0) {
- if (rc == PGM_PROTECTION)
- tec->b61 = 1;
- pgm->code = rc;
- }
+ if (rc > 0)
+ return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
} else {
- rc = 0;
*gpa = kvm_s390_real_to_abs(vcpu, gva);
if (kvm_is_error_gpa(vcpu->kvm, *gpa))
- rc = pgm->code = PGM_ADDRESSING;
+ return trans_exc(vcpu, PGM_ADDRESSING, gva, 0, mode, 0);
}
return rc;
@@ -915,20 +941,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
*/
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
if (!ctlreg0.lap || !is_low_address(gra))
return 0;
+ return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
+}
- memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = FSI_STORE;
- tec_bits->as = psw_bits(*psw).as;
- tec_bits->addr = gra >> PAGE_SHIFT;
- pgm->code = PGM_PROTECTION;
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ struct gmap *parent;
+ union asce asce;
+ union vaddress vaddr;
+ unsigned long ptr;
+ int rc;
+
+ *fake = 0;
+ *dat_protection = 0;
+ parent = sg->parent;
+ vaddr.addr = saddr;
+ asce.val = sg->orig_asce;
+ ptr = asce.origin * 4096;
+ if (asce.r) {
+ *fake = 1;
+ asce.dt = ASCE_TYPE_REGION1;
+ }
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1:
+ if (vaddr.rfx01 > asce.tl && !asce.r)
+ return PGM_REGION_FIRST_TRANS;
+ break;
+ case ASCE_TYPE_REGION2:
+ if (vaddr.rfx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rsx01 > asce.tl)
+ return PGM_REGION_SECOND_TRANS;
+ break;
+ case ASCE_TYPE_REGION3:
+ if (vaddr.rfx || vaddr.rsx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rtx01 > asce.tl)
+ return PGM_REGION_THIRD_TRANS;
+ break;
+ case ASCE_TYPE_SEGMENT:
+ if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.sx01 > asce.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ break;
+ }
+
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1: {
+ union region1_table_entry rfte;
+
+ if (*fake) {
+ /* offset in 16EB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rfx << 53UL);
+ rfte.val = ptr;
+ goto shadow_r2t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+ if (rc)
+ return rc;
+ if (rfte.i)
+ return PGM_REGION_FIRST_TRANS;
+ if (rfte.tt != TABLE_TYPE_REGION1)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+ return PGM_REGION_SECOND_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rfte.p;
+ ptr = rfte.rto << 12UL;
+shadow_r2t:
+ rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION2: {
+ union region2_table_entry rste;
+
+ if (*fake) {
+ /* offset in 8PB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rsx << 42UL);
+ rste.val = ptr;
+ goto shadow_r3t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+ if (rc)
+ return rc;
+ if (rste.i)
+ return PGM_REGION_SECOND_TRANS;
+ if (rste.tt != TABLE_TYPE_REGION2)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+ return PGM_REGION_THIRD_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rste.p;
+ ptr = rste.rto << 12UL;
+shadow_r3t:
+ rste.p |= *dat_protection;
+ rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION3: {
+ union region3_table_entry rtte;
+
+ if (*fake) {
+ /* offset in 4TB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rtx << 31UL);
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+ if (rc)
+ return rc;
+ if (rtte.i)
+ return PGM_REGION_THIRD_TRANS;
+ if (rtte.tt != TABLE_TYPE_REGION3)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.cr && asce.p && sg->edat_level >= 2)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.fc && sg->edat_level >= 2) {
+ *dat_protection |= rtte.fc0.p;
+ *fake = 1;
+ ptr = rtte.fc1.rfaa << 31UL;
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rtte.fc0.p;
+ ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+ rtte.fc0.p |= *dat_protection;
+ rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_SEGMENT: {
+ union segment_table_entry ste;
+
+ if (*fake) {
+ /* offset in 2G guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+ if (rc)
+ return rc;
+ if (ste.i)
+ return PGM_SEGMENT_TRANSLATION;
+ if (ste.tt != TABLE_TYPE_SEGMENT)
+ return PGM_TRANSLATION_SPEC;
+ if (ste.cs && asce.p)
+ return PGM_TRANSLATION_SPEC;
+ *dat_protection |= ste.fc0.p;
+ if (ste.fc && sg->edat_level >= 1) {
+ *fake = 1;
+ ptr = ste.fc1.sfaa << 20UL;
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+ ste.fc0.p |= *dat_protection;
+ rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+ if (rc)
+ return rc;
+ }
+ }
+ /* Return the parent address of the page table */
+ *pgt = ptr;
+ return 0;
+}
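
For a fully fake walk (real-space designation), ptr accumulates one table
offset per level, so the value returned in *pgt already addresses guest
memory directly. A minimal sketch of the arithmetic, using the index fields
of union vaddress from the lookups above (token_origin stands in for
asce.origin * 4096 here):

	/* sketch: address assembled when every level is faked */
	unsigned long abs_addr = token_origin
		+ ((unsigned long) vaddr.rfx << 53)  /* region-first index  */
		+ ((unsigned long) vaddr.rsx << 42)  /* region-second index */
		+ ((unsigned long) vaddr.rtx << 31)  /* region-third index  */
		+ ((unsigned long) vaddr.sx << 20);  /* segment index       */
	/* kvm_s390_shadow_fault() below adds vaddr.px << 12 for the page */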
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ * - > 0 (pgm exception code) on exceptions while faulting
+ * - -EAGAIN if the caller can retry immediately
+ * - -EFAULT when accessing invalid guest addresses
+ * - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+ unsigned long saddr)
+{
+ union vaddress vaddr;
+ union page_table_entry pte;
+ unsigned long pgt;
+ int dat_protection, fake;
+ int rc;
+
+ down_read(&sg->mm->mmap_sem);
+ /*
+ * We don't want any guest-2 tables to change - so the parent
+ * tables/pointers we read stay valid - unshadowing is however
+ * always possible - only guest_table_lock protects us.
+ */
+ ipte_lock(vcpu);
+
+ rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ if (rc)
+ rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+ &fake);
+
+ vaddr.addr = saddr;
+ if (fake) {
+ /* offset in 1MB guest memory block */
+ pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+ goto shadow_page;
+ }
+ if (!rc)
+ rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+ if (!rc && pte.i)
+ rc = PGM_PAGE_TRANSLATION;
+ if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+ rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+ pte.p |= dat_protection;
+ if (!rc)
+ rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+ ipte_unlock(vcpu);
+ up_read(&sg->mm->mmap_sem);
+ return rc;
}
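
The conversions in this file funnel all program-exception reporting through
trans_exc(), which is defined outside the hunks shown here. Reconstructed
from the deleted open-coded lines (fsi, as, addr, b61) and the call sites,
it roughly does the following - a sketch, not the verbatim implementation
(FSI_FETCH and the exact prot handling are assumptions):

	/* sketch: what trans_exc(vcpu, code, gva, ar, mode, prot) centralizes */
	memset(pgm, 0, sizeof(*pgm));
	pgm->code = code;
	tec->addr = gva >> PAGE_SHIFT;
	tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
	tec->as = psw_bits(*psw).as;
	if (code == PGM_PROTECTION && prot == PROT_TYPE_DAT)
		tec->b61 = 1;	/* DAT protection, not low-address */
	return code;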
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index df0a79dd8..8756569ad 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+ unsigned long saddr);
+
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index e8c6843b9..31a05330d 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -439,6 +439,23 @@ exit_required:
#define guest_per_enabled(vcpu) \
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
+{
+ const u8 ilen = kvm_s390_get_ilen(vcpu);
+ struct kvm_s390_pgm_info pgm_info = {
+ .code = PGM_PER,
+ .per_code = PER_EVENT_IFETCH >> 24,
+ .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
+ };
+
+ /*
+ * The PSW points to the next instruction, therefore the intercepted
+ * instruction generated a PER i-fetch event. PER address therefore
+ * points at the previous PSW address (could be an EXECUTE function).
+ */
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+}
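
__rewind_psw() itself is outside this diff; per the comment above, it steps
the PSW back by the instruction length while wrapping within the current
addressing mode - roughly (a sketch, assuming the usual 24/31/64-bit masks):

	/* sketch: PSW rewind by ilen, honoring the addressing mode */
	unsigned long mask = (psw.mask & PSW_MASK_EA) ? -1UL :
			     (psw.mask & PSW_MASK_BA) ? (1UL << 31) - 1 :
							(1UL << 24) - 1;
	return (psw.addr - ilen) & mask;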
+
static void filter_guest_per_event(struct kvm_vcpu *vcpu)
{
u32 perc = vcpu->arch.sie_block->perc << 24;
@@ -465,7 +482,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
guest_perc &= ~PER_EVENT_IFETCH;
/* All other PER events will be given to the guest */
- /* TODO: Check alterated address/address space */
+ /* TODO: Check altered address/address space */
vcpu->arch.sie_block->perc = guest_perc >> 24;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 252157181..dfd0ca263 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -351,8 +351,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
+static int handle_operexc(struct kvm_vcpu *vcpu)
+{
+ vcpu->stat.exit_operation_exception++;
+ trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+ vcpu->arch.sie_block->ipb);
+
+ if (vcpu->arch.sie_block->ipa == 0xb256 &&
+ test_kvm_facility(vcpu->kvm, 74))
+ return handle_sthyi(vcpu);
+
+ if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+ return -EOPNOTSUPP;
+
+ return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
+ int rc, per_rc = 0;
+
if (kvm_is_ucontrol(vcpu->kvm))
return -EOPNOTSUPP;
@@ -361,7 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
case 0x18:
return handle_noop(vcpu);
case 0x04:
- return handle_instruction(vcpu);
+ rc = handle_instruction(vcpu);
+ break;
case 0x08:
return handle_prog(vcpu);
case 0x14:
@@ -372,9 +391,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
return handle_validity(vcpu);
case 0x28:
return handle_stop(vcpu);
+ case 0x2c:
+ rc = handle_operexc(vcpu);
+ break;
case 0x38:
- return handle_partial_execution(vcpu);
+ rc = handle_partial_execution(vcpu);
+ break;
default:
return -EOPNOTSUPP;
}
+
+ /* process PER, also if the instruction is processed in user space */
+ if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+ (!rc || rc == -EOPNOTSUPP))
+ per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+ return per_rc ? per_rc : rc;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5a80af740..24524c0f3 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -28,9 +28,6 @@
#include "gaccess.h"
#include "trace-s390.h"
-#define IOINT_SCHID_MASK 0x0000ffff
-#define IOINT_SSID_MASK 0x00030000
-#define IOINT_CSSID_MASK 0x03fc0000
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
#define VIRTIO_PARAM 0x0d00
@@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
struct kvm_s390_interrupt_info,
list);
if (inti) {
- VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type);
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)");
+ else
+ VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
+
vcpu->stat.deliver_io_int++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
inti->type,
@@ -991,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
+ /*
+ * The VCPU might not be sleeping but is executing the VSIE. Let's
+ * kick it, so it leaves the SIE to process the request.
+ */
+ kvm_s390_vsie_kick(vcpu);
}
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
@@ -1415,6 +1424,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
}
fi->counters[FIRQ_CNTR_IO] += 1;
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)");
+ else
+ VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
isc = int_word_to_isc(inti->io.io_int_word);
list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
list_add_tail(&inti->list, list);
@@ -1531,13 +1547,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
inti->mchk.mcic = s390int->parm64;
break;
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
- if (inti->type & KVM_S390_INT_IO_AI_MASK)
- VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
- else
- VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
- s390int->type & IOINT_CSSID_MASK,
- s390int->type & IOINT_SSID_MASK,
- s390int->type & IOINT_SCHID_MASK);
inti->io.subchannel_id = s390int->parm >> 16;
inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
inti->io.io_int_parm = s390int->parm64 >> 32;
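
The new trace lines above decode the container fields packed into the 16-bit
subchannel_id; the bit layout is inferred from the shifts used there:

	/* sketch: fields recovered from subchannel_id */
	u8 cssid = inti->io.subchannel_id >> 8;		/* channel subsystem id */
	u8 ssid = (inti->io.subchannel_id >> 1) & 0x3;	/* subchannel set id */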
@@ -2237,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
return ret;
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int ret;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 13c62e036..607ec9196 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -21,20 +21,24 @@
#include <linux/init.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/mman.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
-#include <asm/etr.h>
+#include <asm/stp.h>
#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/nmi.h>
#include <asm/switch_to.h>
#include <asm/isc.h>
#include <asm/sclp.h>
+#include <asm/cpacf.h>
+#include <asm/timex.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -64,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exit_pei", VCPU_STAT(exit_pei) },
{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+ { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@ -94,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
+ { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+ { "instruction_sie", VCPU_STAT(instruction_sie) },
{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -119,6 +126,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
/* upper facilities limit for kvm */
unsigned long kvm_s390_fac_list_mask[16] = {
0xffe6000000000000UL,
@@ -131,7 +143,13 @@ unsigned long kvm_s390_fac_list_mask_size(void)
return ARRAY_SIZE(kvm_s390_fac_list_mask);
}
+/* available cpu features supported by kvm */
+static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+/* available subfunctions indicated via query / "test bit" */
+static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
+
static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
/* Section: not file related */
@@ -141,7 +159,8 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
/*
* This callback is executed during stop_machine(). All CPUs are therefore
@@ -163,6 +182,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
vcpu->arch.sie_block->epoch -= *delta;
if (vcpu->arch.cputm_enabled)
vcpu->arch.cputm_start += *delta;
+ if (vcpu->arch.vsie_block)
+ vcpu->arch.vsie_block->epoch -= *delta;
}
}
return NOTIFY_OK;
@@ -175,7 +196,9 @@ static struct notifier_block kvm_clock_notifier = {
int kvm_arch_hardware_setup(void)
{
gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_ipte_notifier(&gmap_notifier);
+ gmap_register_pte_notifier(&gmap_notifier);
+ vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+ gmap_register_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_register(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
return 0;
@@ -183,11 +206,109 @@ int kvm_arch_hardware_setup(void)
void kvm_arch_hardware_unsetup(void)
{
- gmap_unregister_ipte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
}
+static void allow_cpu_feat(unsigned long nr)
+{
+ set_bit_inv(nr, kvm_s390_available_cpu_feat);
+}
+
+static inline int plo_test_bit(unsigned char nr)
+{
+ register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ int cc = 3; /* subfunction not available */
+
+ asm volatile(
+ /* Parameter registers are ignored for "test bit" */
+ " plo 0,0,0,0(0)\n"
+ " ipm %0\n"
+ " srl %0,28\n"
+ : "=d" (cc)
+ : "d" (r0)
+ : "cc");
+ return cc == 0;
+}
+
+static void kvm_s390_cpu_feat_init(void)
+{
+ int i;
+
+ for (i = 0; i < 256; ++i) {
+ if (plo_test_bit(i))
+ kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+ }
+
+ if (test_facility(28)) /* TOD-clock steering */
+ ptff(kvm_s390_available_subfunc.ptff,
+ sizeof(kvm_s390_available_subfunc.ptff),
+ PTFF_QAF);
+
+ if (test_facility(17)) { /* MSA */
+ __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+ __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+ __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+ __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+ __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+ }
+ if (test_facility(76)) /* MSA3 */
+ __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+ if (test_facility(77)) { /* MSA4 */
+ __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+ __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+ __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+ __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+ }
+ if (test_facility(57)) /* MSA5 */
+ __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+
+ if (MACHINE_HAS_ESOP)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+ /*
+ * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+ * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+ */
+ if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+ !test_facility(3) || !nested)
+ return;
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+ if (sclp.has_64bscao)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+ if (sclp.has_siif)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+ if (sclp.has_gpere)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+ if (sclp.has_gsls)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+ if (sclp.has_ib)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+ if (sclp.has_cei)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+ if (sclp.has_ibs)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+ /*
+ * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+ * all skey handling functions read/set the skey from the PGSTE
+ * instead of the real storage key.
+ *
+ * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+ * pages being detected as preserved although they are resident.
+ *
+ * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+ * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+ *
+ * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+ * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+ * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+ *
+ * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+ * cannot easily shadow the SCA because of the ipte lock.
+ */
+}
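
The PLO query loop above packs subfunction bits MSB-first within each byte,
the same convention set_bit_inv()/test_bit_inv() use for the feature bitmap.
A worked example:

	/* sketch: i = 10 -> byte 10 >> 3 == 1, mask 0x80 >> (10 & 7) == 0x20 */
	kvm_s390_available_subfunc.plo[1] |= 0x20;	/* i.e. subfunction 10 */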
+
int kvm_arch_init(void *opaque)
{
kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
@@ -199,6 +320,8 @@ int kvm_arch_init(void *opaque)
return -ENOMEM;
}
+ kvm_s390_cpu_feat_init();
+
/* Register floating interrupt controller interface. */
return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
}
@@ -244,6 +367,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_USER_STSI:
case KVM_CAP_S390_SKEYS:
case KVM_CAP_S390_IRQ_STATE:
+ case KVM_CAP_S390_USER_INSTR0:
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
@@ -251,8 +375,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
- r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
- : KVM_S390_BSCA_CPU_SLOTS;
+ r = KVM_S390_BSCA_CPU_SLOTS;
+ if (sclp.has_esca && sclp.has_64bscao)
+ r = KVM_S390_ESCA_CPU_SLOTS;
break;
case KVM_CAP_NR_MEMSLOTS:
r = KVM_USER_MEM_SLOTS;
@@ -335,6 +460,16 @@ out:
return r;
}
+static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+ }
+}
+
static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
{
int r;
@@ -355,7 +490,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
break;
case KVM_CAP_S390_VECTOR_REGISTERS:
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (MACHINE_HAS_VX) {
set_kvm_facility(kvm->arch.model.fac_mask, 129);
@@ -370,7 +505,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
case KVM_CAP_S390_RI:
r = -EINVAL;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (test_facility(64)) {
set_kvm_facility(kvm->arch.model.fac_mask, 64);
@@ -386,6 +521,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
kvm->arch.user_stsi = 1;
r = 0;
break;
+ case KVM_CAP_S390_USER_INSTR0:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+ kvm->arch.user_instr0 = 1;
+ icpt_operexc_on_all_vcpus(kvm);
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -418,21 +559,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
unsigned int idx;
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
- /* enable CMMA only for z10 and later (EDAT_1) */
- ret = -EINVAL;
- if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
break;
ret = -EBUSY;
VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
+ if (!kvm->created_vcpus) {
kvm->arch.use_cmma = 1;
ret = 0;
}
mutex_unlock(&kvm->lock);
break;
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
+ break;
ret = -EINVAL;
if (!kvm->arch.use_cmma)
break;
@@ -461,20 +604,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!new_limit)
return -EINVAL;
- /* gmap_alloc takes last usable address */
+ /* gmap_create takes last usable address */
if (new_limit != KVM_S390_NO_MEM_LIMIT)
new_limit -= 1;
ret = -EBUSY;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
- /* gmap_alloc will round the limit up */
- struct gmap *new = gmap_alloc(current->mm, new_limit);
+ if (!kvm->created_vcpus) {
+ /* gmap_create will round the limit up */
+ struct gmap *new = gmap_create(current->mm, new_limit);
if (!new) {
ret = -ENOMEM;
} else {
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
new->private = kvm;
kvm->arch.gmap = new;
ret = 0;
@@ -644,7 +787,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
int ret = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
ret = -EBUSY;
goto out;
}
@@ -676,6 +819,39 @@ out:
return ret;
}
+static int kvm_s390_set_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+ int ret = -EBUSY;
+
+ if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+ return -EFAULT;
+ if (!bitmap_subset((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ if (!atomic_read(&kvm->online_vcpus)) {
+ bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ ret = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once supported by kernel + hw, we have to store the subfunctions
+ * in kvm->arch and remember that user space configured them.
+ */
+ return -ENXIO;
+}
+
static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -684,6 +860,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_PROCESSOR:
ret = kvm_s390_set_processor(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_set_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_set_processor_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -732,6 +914,50 @@ out:
return ret;
}
+static int kvm_s390_get_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_machine_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once we can actually configure subfunctions (kernel + hw support),
+ * we have to check if they were already set by user space, if so copy
+ * them from kvm->arch.
+ */
+ return -ENXIO;
+}
+
+static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+ sizeof(struct kvm_s390_vm_cpu_subfunc)))
+ return -EFAULT;
+ return 0;
+}
static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -743,6 +969,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE:
ret = kvm_s390_get_machine(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_get_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ ret = kvm_s390_get_machine_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_get_processor_subfunc(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+ ret = kvm_s390_get_machine_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -803,6 +1041,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = sclp.has_cmma ? 0 : -ENXIO;
+ break;
case KVM_S390_VM_MEM_LIMIT_SIZE:
ret = 0;
break;
@@ -826,8 +1066,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_CPU_PROCESSOR:
case KVM_S390_VM_CPU_MACHINE:
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
ret = 0;
break;
+ /* configuring subfunctions is not supported yet */
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
default:
ret = -ENXIO;
break;
@@ -858,7 +1103,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
- unsigned long curkey;
int i, r = 0;
if (args->flags != 0)
@@ -879,26 +1123,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (!keys)
return -ENOMEM;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
- curkey = get_guest_storage_key(current->mm, hva);
- if (IS_ERR_VALUE(curkey)) {
- r = curkey;
- goto out;
- }
- keys[i] = curkey;
+ r = get_guest_storage_key(current->mm, hva, &keys[i]);
+ if (r)
+ break;
+ }
+ up_read(&current->mm->mmap_sem);
+
+ if (!r) {
+ r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+ sizeof(uint8_t) * args->count);
+ if (r)
+ r = -EFAULT;
}
- r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
- sizeof(uint8_t) * args->count);
- if (r)
- r = -EFAULT;
-out:
kvfree(keys);
return r;
}
@@ -935,24 +1180,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (r)
goto out;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
/* Lowest order bit is reserved */
if (keys[i] & 0x01) {
r = -EINVAL;
- goto out;
+ break;
}
- r = set_guest_storage_key(current->mm, hva,
- (unsigned long)keys[i], 0);
+ r = set_guest_storage_key(current->mm, hva, keys[i], 0);
if (r)
- goto out;
+ break;
}
+ up_read(&current->mm->mmap_sem);
out:
kvfree(keys);
return r;
@@ -1129,6 +1375,7 @@ static void sca_dispose(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ gfp_t alloc_flags = GFP_KERNEL;
int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -1150,9 +1397,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
rc = -ENOMEM;
+ ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
+
kvm->arch.use_esca = 0; /* start with basic SCA */
+ if (!sclp.has_64bscao)
+ alloc_flags |= GFP_DMA;
rwlock_init(&kvm->arch.sca_lock);
- kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+ kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
if (!kvm->arch.sca)
goto out_err;
spin_lock(&kvm_lock);
@@ -1189,6 +1440,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
S390_ARCH_FAC_LIST_SIZE_BYTE);
+ set_kvm_facility(kvm->arch.model.fac_mask, 74);
+ set_kvm_facility(kvm->arch.model.fac_list, 74);
+
kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
kvm->arch.model.ibc = sclp.ibc & 0x0fff;
@@ -1212,7 +1466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
else
kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
sclp.hamax + 1);
- kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+ kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
if (!kvm->arch.gmap)
goto out_err;
kvm->arch.gmap->private = kvm;
@@ -1224,6 +1478,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.epoch = 0;
spin_lock_init(&kvm->arch.start_stop_lock);
+ kvm_s390_vsie_init(kvm);
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
return 0;
@@ -1245,7 +1500,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
sca_del_vcpu(vcpu);
if (kvm_is_ucontrol(vcpu->kvm))
- gmap_free(vcpu->arch.gmap);
+ gmap_remove(vcpu->arch.gmap);
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1278,16 +1533,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
if (!kvm_is_ucontrol(kvm))
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
+ kvm_s390_vsie_destroy(kvm);
KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
}
/* Section: vcpu related */
static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
{
- vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+ vcpu->arch.gmap = gmap_create(current->mm, -1UL);
if (!vcpu->arch.gmap)
return -ENOMEM;
vcpu->arch.gmap->private = vcpu->kvm;
@@ -1396,7 +1652,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
if (id < KVM_S390_BSCA_CPU_SLOTS)
return true;
- if (!sclp.has_esca)
+ if (!sclp.has_esca || !sclp.has_64bscao)
return false;
mutex_lock(&kvm->lock);
@@ -1416,6 +1672,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
KVM_SYNC_CRS |
KVM_SYNC_ARCH0 |
KVM_SYNC_PFAULT;
+ kvm_s390_set_prefix(vcpu, 0);
if (test_kvm_facility(vcpu->kvm, 64))
vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
/* fprs can be synchronized via vrs, even if the guest has no vx. With
@@ -1537,7 +1794,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
save_access_regs(vcpu->arch.host_acrs);
restore_access_regs(vcpu->run->s.regs.acrs);
- gmap_enable(vcpu->arch.gmap);
+ gmap_enable(vcpu->arch.enabled_gmap);
atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__start_cpu_timer_accounting(vcpu);
@@ -1550,7 +1807,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__stop_cpu_timer_accounting(vcpu);
atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
- gmap_disable(vcpu->arch.gmap);
+ vcpu->arch.enabled_gmap = gmap_get_enabled();
+ gmap_disable(vcpu->arch.enabled_gmap);
/* Save guest register state */
save_fpu_regs();
@@ -1599,7 +1857,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
vcpu->arch.gmap = vcpu->kvm->arch.gmap;
sca_add_vcpu(vcpu);
}
-
+ if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ /* make vcpu_load load the right gmap on the first trigger */
+ vcpu->arch.enabled_gmap = vcpu->arch.gmap;
}
static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1658,15 +1919,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_setup_model(vcpu);
- vcpu->arch.sie_block->ecb = 0x02;
+ /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+ if (MACHINE_HAS_ESOP)
+ vcpu->arch.sie_block->ecb |= 0x02;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= 0x04;
- if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+ if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= 0x10;
- if (test_kvm_facility(vcpu->kvm, 8))
+ if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
vcpu->arch.sie_block->ecb2 |= 0x08;
- vcpu->arch.sie_block->eca = 0xC1002000U;
+ vcpu->arch.sie_block->eca = 0x1002000U;
+ if (sclp.has_cei)
+ vcpu->arch.sie_block->eca |= 0x80000000U;
+ if (sclp.has_ib)
+ vcpu->arch.sie_block->eca |= 0x40000000U;
if (sclp.has_siif)
vcpu->arch.sie_block->eca |= 1;
if (sclp.has_sigpif)
@@ -1716,6 +1983,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu->arch.sie_block = &sie_page->sie_block;
vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+ /* the real guest size will always be smaller than msl */
+ vcpu->arch.sie_block->mso = 0;
+ vcpu->arch.sie_block->msl = sclp.hamax;
+
vcpu->arch.sie_block->icpua = id;
spin_lock_init(&vcpu->arch.local_int.lock);
vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -1784,16 +2055,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
kvm_s390_vcpu_request(vcpu);
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
{
- int i;
struct kvm *kvm = gmap->private;
struct kvm_vcpu *vcpu;
+ unsigned long prefix;
+ int i;
+ if (gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
kvm_for_each_vcpu(i, vcpu, kvm) {
/* match against both prefix pages */
- if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
- VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+ prefix = kvm_s390_get_prefix(vcpu);
+ if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+ VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+ start, end);
kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
}
}
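
The prefix check above is a closed-interval overlap test between the notified
range [start, end] and the two guest prefix pages
[prefix, prefix + 2 * PAGE_SIZE - 1]. With a hypothetical prefix of 0x8000:

	/* sketch: overlap test with assumed values, PAGE_SIZE == 4096 */
	unsigned long prefix = 0x8000;	/* prefix pages cover 0x8000 - 0x9fff */
	/* start = 0x9000, end = 0x9fff -> reload (ranges overlap) */
	/* start = 0xa000, end = 0xafff -> no reload */
	bool reload = prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1;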
@@ -2004,6 +2284,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
if (dbg->control & ~VALID_GUESTDBG_FLAGS)
return -EINVAL;
+ if (!sclp.has_gpere)
+ return -EINVAL;
if (dbg->control & KVM_GUESTDBG_ENABLE) {
vcpu->guest_debug = dbg->control;
@@ -2072,18 +2354,20 @@ retry:
return 0;
/*
* We use MMU_RELOAD just to re-arm the ipte notifier for the
- * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+ * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
* This ensures that the ipte instruction for this request has
* already finished. We might race against a second unmapper that
* wants to set the blocking bit. Lets just retry the request loop.
*/
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
int rc;
- rc = gmap_ipte_notify(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu),
- PAGE_SIZE * 2);
- if (rc)
+ rc = gmap_mprotect_notify(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu),
+ PAGE_SIZE * 2, PROT_WRITE);
+ if (rc) {
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
return rc;
+ }
goto retry;
}
@@ -2110,6 +2394,11 @@ retry:
goto retry;
}
+ if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ goto retry;
+ }
+
/* nothing to do, just clear the request */
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
@@ -2364,14 +2653,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
* guest_enter and guest_exit should be no uaccess.
*/
local_irq_disable();
- __kvm_guest_enter();
+ guest_enter_irqoff();
__disable_cpu_timer_accounting(vcpu);
local_irq_enable();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
local_irq_disable();
__enable_cpu_timer_accounting(vcpu);
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -2600,6 +2889,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!sclp.has_ibs)
+ return;
kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8621ab00e..b8432862a 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+ return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
return 0;
}
+static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr)
+{
+ WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS);
+ return test_bit_inv(nr, kvm->arch.cpu_feat);
+}
+
/* are cpu states controlled by user space */
static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
{
@@ -232,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
}
static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
{
+ /* don't inject PER events if we re-execute the instruction */
+ vcpu->arch.sie_block->icptstatus &= ~0x02;
kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
}
@@ -246,10 +254,21 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
+/* implemented in sthyi.c */
+int handle_sthyi(struct kvm_vcpu *vcpu);
+
/* implemented in kvm-s390.c */
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
@@ -360,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
/* support for Basic/Extended SCA handling */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 95916fa7c..46160388e 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -27,6 +27,7 @@
#include <asm/io.h>
#include <asm/ptrace.h>
#include <asm/compat.h>
+#include <asm/sclp.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
@@ -152,30 +153,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
static int __skey_check_enable(struct kvm_vcpu *vcpu)
{
int rc = 0;
+
+ trace_kvm_s390_skey_related_inst(vcpu);
if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
return rc;
rc = s390_enable_skey();
- VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest");
- trace_kvm_s390_skey_related_inst(vcpu);
- vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+ VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
+ if (!rc)
+ vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
return rc;
}
-
-static int handle_skey(struct kvm_vcpu *vcpu)
+static int try_handle_skey(struct kvm_vcpu *vcpu)
{
- int rc = __skey_check_enable(vcpu);
+ int rc;
+ vcpu->stat.instruction_storage_key++;
+ rc = __skey_check_enable(vcpu);
if (rc)
return rc;
- vcpu->stat.instruction_storage_key++;
-
+ if (sclp.has_skey) {
+ /* with storage-key facility, SIE interprets it for us */
+ kvm_s390_retry_instr(vcpu);
+ VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+ return -EAGAIN;
+ }
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+ return 0;
+}
- kvm_s390_retry_instr(vcpu);
- VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+static int handle_iske(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ unsigned char key;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = get_guest_storage_key(current->mm, addr, &key);
+ up_read(&current->mm->mmap_sem);
+ if (rc)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff;
+ vcpu->run->s.regs.gprs[reg1] |= key;
+ return 0;
+}
+
+static int handle_rrbe(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = reset_guest_reference_bit(current->mm, addr);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ kvm_s390_set_psw_cc(vcpu, rc);
+ return 0;
+}
+
+#define SSKE_NQ 0x8
+#define SSKE_MR 0x4
+#define SSKE_MC 0x2
+#define SSKE_MB 0x1
+static int handle_sske(struct kvm_vcpu *vcpu)
+{
+ unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
+ unsigned long start, end;
+ unsigned char key, oldkey;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ if (!test_kvm_facility(vcpu->kvm, 8))
+ m3 &= ~SSKE_MB;
+ if (!test_kvm_facility(vcpu->kvm, 10))
+ m3 &= ~(SSKE_MC | SSKE_MR);
+ if (!test_kvm_facility(vcpu->kvm, 14))
+ m3 &= ~SSKE_NQ;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+ start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ start = kvm_s390_logical_to_effective(vcpu, start);
+ if (m3 & SSKE_MB) {
+ /* start already designates an absolute address */
+ end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+ } else {
+ start = kvm_s390_real_to_abs(vcpu, start);
+ end = start + PAGE_SIZE;
+ }
+
+ while (start != end) {
+ unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+ m3 & SSKE_NQ, m3 & SSKE_MR,
+ m3 & SSKE_MC);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ start += PAGE_SIZE;
+ };
+
+ if (m3 & (SSKE_MC | SSKE_MR)) {
+ if (m3 & SSKE_MB) {
+ /* skey in reg1 is unpredictable */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ } else {
+ kvm_s390_set_psw_cc(vcpu, rc);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
+ vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+ }
+ }
+ if (m3 & SSKE_MB) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT)
+ vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK;
+ else
+ vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
return 0;
}
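
With SSKE_MB set, one invocation processes keys from the absolute starting
page up to the next 1 MB boundary, as computed above. For example:

	/* sketch: multiple-block end computation */
	start = 0x00123000UL;					/* page-aligned */
	end = (start + (1UL << 20)) & ~((1UL << 20) - 1);	/* 0x00200000 */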
@@ -582,10 +719,11 @@ static const intercept_handler_t b2_handlers[256] = {
[0x10] = handle_set_prefix,
[0x11] = handle_store_prefix,
[0x12] = handle_store_cpu_address,
+ [0x14] = kvm_s390_handle_vsie,
[0x21] = handle_ipte_interlock,
- [0x29] = handle_skey,
- [0x2a] = handle_skey,
- [0x2b] = handle_skey,
+ [0x29] = handle_iske,
+ [0x2a] = handle_rrbe,
+ [0x2b] = handle_sske,
[0x2c] = handle_test_block,
[0x30] = handle_io_inst,
[0x31] = handle_io_inst,
@@ -654,8 +792,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
static int handle_pfmf(struct kvm_vcpu *vcpu)
{
+ bool mr = false, mc = false, nq;
int reg1, reg2;
unsigned long start, end;
+ unsigned char key;
vcpu->stat.instruction_pfmf++;
@@ -675,15 +815,27 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
!test_kvm_facility(vcpu->kvm, 14))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- /* No support for conditional-SSKE */
- if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
- return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ /* Only provide conditional-SSKE support if enabled for the guest */
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK &&
+ test_kvm_facility(vcpu->kvm, 10)) {
+ mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR;
+ mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC;
+ }
+ nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
+ key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+ if (kvm_s390_check_low_addr_prot_real(vcpu, start))
+ return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+ }
+
switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
case 0x00000000:
+ /* only 4k frames specify a real address */
+ start = kvm_s390_real_to_abs(vcpu, start);
end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
break;
case 0x00001000:
@@ -701,20 +853,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
- if (kvm_s390_check_low_addr_prot_real(vcpu, start))
- return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
- }
-
- while (start < end) {
- unsigned long useraddr, abs_addr;
+ while (start != end) {
+ unsigned long useraddr;
/* Translate guest address to host address */
- if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
- abs_addr = kvm_s390_real_to_abs(vcpu, start);
- else
- abs_addr = start;
- useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+ useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
if (kvm_is_error_hva(useraddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -728,16 +871,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- if (set_guest_storage_key(current->mm, useraddr,
- vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
- vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, useraddr,
+ key, NULL, nq, mr, mc);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
}
start += PAGE_SIZE;
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
- vcpu->run->s.regs.gprs[reg2] = end;
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) {
+ vcpu->run->s.regs.gprs[reg2] = end;
+ } else {
+ vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
+ }
return 0;
}
@@ -1033,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
return 0;
}
+static int handle_ptff(struct kvm_vcpu *vcpu)
+{
+ /* we don't emulate any control instructions yet */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ return 0;
+}
+
static const intercept_handler_t x01_handlers[256] = {
+ [0x04] = handle_ptff,
[0x07] = handle_sckpf,
};
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 28ea0cab1..1a252f537 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
u16 p_asn, s_asn;
psw_t *psw;
- u32 flags;
+ bool idle;
- flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+ idle = is_vcpu_idle(vcpu);
psw = &dst_vcpu->arch.sie_block->gpsw;
p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */
s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */
/* Inject the emergency signal? */
- if (!(flags & CPUSTAT_STOPPED)
+ if (!is_vcpu_stopped(vcpu)
|| (psw->mask & psw_int_mask) != psw_int_mask
- || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
- || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+ || (idle && psw->addr != 0)
+ || (!idle && (asn == p_asn || asn == s_asn))) {
return __inject_sigp_emergency(vcpu, dst_vcpu);
} else {
*reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
new file mode 100644
index 000000000..bd98b7d25
--- /dev/null
+++ b/arch/s390/kvm/sthyi.c
@@ -0,0 +1,471 @@
+/*
+ * store hypervisor information instruction emulation functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/sclp.h>
+#include <asm/diag.h>
+#include <asm/sysinfo.h>
+#include <asm/ebcdic.h>
+
+#include "kvm-s390.h"
+#include "gaccess.h"
+#include "trace.h"
+
+#define DED_WEIGHT 0xffff
+/*
+ * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string
+ * as they are justified with spaces.
+ */
+#define CP 0xc3d7404040404040UL
+#define IFL 0xc9c6d34040404040UL
+
+enum hdr_flags {
+ HDR_NOT_LPAR = 0x10,
+ HDR_STACK_INCM = 0x20,
+ HDR_STSI_UNAV = 0x40,
+ HDR_PERF_UNAV = 0x80,
+};
+
+enum mac_validity {
+ MAC_NAME_VLD = 0x20,
+ MAC_ID_VLD = 0x40,
+ MAC_CNT_VLD = 0x80,
+};
+
+enum par_flag {
+ PAR_MT_EN = 0x80,
+};
+
+enum par_validity {
+ PAR_GRP_VLD = 0x08,
+ PAR_ID_VLD = 0x10,
+ PAR_ABS_VLD = 0x20,
+ PAR_WGHT_VLD = 0x40,
+ PAR_PCNT_VLD = 0x80,
+};
+
+struct hdr_sctn {
+ u8 infhflg1;
+ u8 infhflg2; /* reserved */
+ u8 infhval1; /* reserved */
+ u8 infhval2; /* reserved */
+ u8 reserved[3];
+ u8 infhygct;
+ u16 infhtotl;
+ u16 infhdln;
+ u16 infmoff;
+ u16 infmlen;
+ u16 infpoff;
+ u16 infplen;
+ u16 infhoff1;
+ u16 infhlen1;
+ u16 infgoff1;
+ u16 infglen1;
+ u16 infhoff2;
+ u16 infhlen2;
+ u16 infgoff2;
+ u16 infglen2;
+ u16 infhoff3;
+ u16 infhlen3;
+ u16 infgoff3;
+ u16 infglen3;
+ u8 reserved2[4];
+} __packed;
+
+struct mac_sctn {
+ u8 infmflg1; /* reserved */
+ u8 infmflg2; /* reserved */
+ u8 infmval1;
+ u8 infmval2; /* reserved */
+ u16 infmscps;
+ u16 infmdcps;
+ u16 infmsifl;
+ u16 infmdifl;
+ char infmname[8];
+ char infmtype[4];
+ char infmmanu[16];
+ char infmseq[16];
+ char infmpman[4];
+ u8 reserved[4];
+} __packed;
+
+struct par_sctn {
+ u8 infpflg1;
+ u8 infpflg2; /* reserved */
+ u8 infpval1;
+ u8 infpval2; /* reserved */
+ u16 infppnum;
+ u16 infpscps;
+ u16 infpdcps;
+ u16 infpsifl;
+ u16 infpdifl;
+ u16 reserved;
+ char infppnam[8];
+ u32 infpwbcp;
+ u32 infpabcp;
+ u32 infpwbif;
+ u32 infpabif;
+ char infplgnm[8];
+ u32 infplgcp;
+ u32 infplgif;
+} __packed;
+
+struct sthyi_sctns {
+ struct hdr_sctn hdr;
+ struct mac_sctn mac;
+ struct par_sctn par;
+} __packed;
+
+struct cpu_inf {
+ u64 lpar_cap;
+ u64 lpar_grp_cap;
+ u64 lpar_weight;
+ u64 all_weight;
+ int cpu_num_ded;
+ int cpu_num_shd;
+};
+
+struct lpar_cpu_inf {
+ struct cpu_inf cp;
+ struct cpu_inf ifl;
+};
+
+static inline u64 cpu_id(u8 ctidx, void *diag224_buf)
+{
+ return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN));
+}
+
+/*
+ * Scales the cpu capping from the lpar range to the one expected in
+ * sthyi data.
+ *
+ * diag204 reports a cap in hundredths of processor units.
+ * z/VM's range for one core is 0 - 0x10000.
+ */
+static u32 scale_cap(u32 in)
+{
+ return (0x10000 * in) / 100;
+}
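
For example, a diag204 cap of 100 (one full core) scales to z/VM's 0x10000:

	/* sketch: (0x10000 * in) / 100 */
	scale_cap(100);	/* 0x10000, one core   */
	scale_cap(50);	/* 0x08000, half a core */
	scale_cap(250);	/* 0x28000, 2.5 cores  */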
+
+static void fill_hdr(struct sthyi_sctns *sctns)
+{
+ sctns->hdr.infhdln = sizeof(sctns->hdr);
+ sctns->hdr.infmoff = sizeof(sctns->hdr);
+ sctns->hdr.infmlen = sizeof(sctns->mac);
+ sctns->hdr.infplen = sizeof(sctns->par);
+ sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen;
+ sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen;
+}
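
With the struct sizes above, the response sections are laid out back to back;
a sketch of the offsets fill_hdr() establishes:

	/* [0, infhdln)                  header (hdr)                      */
	/* [infmoff, infmoff + infmlen)  machine (mac), infmoff == infhdln */
	/* [infpoff, infpoff + infplen)  partition (par)                   */
	/* infhtotl == infpoff + infplen total response length             */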
+
+static void fill_stsi_mac(struct sthyi_sctns *sctns,
+ struct sysinfo_1_1_1 *sysinfo)
+{
+ if (stsi(sysinfo, 1, 1, 1))
+ return;
+
+ sclp_ocf_cpc_name_copy(sctns->mac.infmname);
+
+ memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype));
+ memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu));
+ memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman));
+ memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq));
+
+ sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD;
+}
+
+static void fill_stsi_par(struct sthyi_sctns *sctns,
+ struct sysinfo_2_2_2 *sysinfo)
+{
+ if (stsi(sysinfo, 2, 2, 2))
+ return;
+
+ sctns->par.infppnum = sysinfo->lpar_number;
+ memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam));
+
+ sctns->par.infpval1 |= PAR_ID_VLD;
+}
+
+static void fill_stsi(struct sthyi_sctns *sctns)
+{
+ void *sysinfo;
+
+ /* Errors are handled through the validity bits in the response. */
+ sysinfo = (void *)__get_free_page(GFP_KERNEL);
+ if (!sysinfo)
+ return;
+
+ fill_stsi_mac(sctns, sysinfo);
+ fill_stsi_par(sctns, sysinfo);
+
+ free_pages((unsigned long)sysinfo, 0);
+}
+
+static void fill_diag_mac(struct sthyi_sctns *sctns,
+ struct diag204_x_phys_block *block,
+ void *diag224_buf)
+{
+ int i;
+
+ for (i = 0; i < block->hdr.cpus; i++) {
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdcps++;
+ else
+ sctns->mac.infmscps++;
+ break;
+ case IFL:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdifl++;
+ else
+ sctns->mac.infmsifl++;
+ break;
+ }
+ }
+ sctns->mac.infmval1 |= MAC_CNT_VLD;
+}
+
+/* Returns a pointer to the next partition block. */
+static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
+ bool this_lpar,
+ void *diag224_buf,
+ struct diag204_x_part_block *block)
+{
+ int i, capped = 0, weight_cp = 0, weight_ifl = 0;
+ struct cpu_inf *cpu_inf;
+
+ for (i = 0; i < block->hdr.rcpus; i++) {
+ if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE))
+ continue;
+
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ cpu_inf = &part_inf->cp;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_cp |= block->cpus[i].cur_weight;
+ break;
+ case IFL:
+ cpu_inf = &part_inf->ifl;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_ifl |= block->cpus[i].cur_weight;
+ break;
+ default:
+ continue;
+ }
+
+ if (!this_lpar)
+ continue;
+
+ capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED;
+ cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap;
+ cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap;
+
+ if (block->cpus[i].weight == DED_WEIGHT)
+ cpu_inf->cpu_num_ded += 1;
+ else
+ cpu_inf->cpu_num_shd += 1;
+ }
+
+ if (this_lpar && capped) {
+ part_inf->cp.lpar_weight = weight_cp;
+ part_inf->ifl.lpar_weight = weight_ifl;
+ }
+ part_inf->cp.all_weight += weight_cp;
+ part_inf->ifl.all_weight += weight_ifl;
+ return (struct diag204_x_part_block *)&block->cpus[i];
+}
+
+static void fill_diag(struct sthyi_sctns *sctns)
+{
+ int i, r, pages;
+ bool this_lpar;
+ void *diag204_buf;
+ void *diag224_buf = NULL;
+ struct diag204_x_info_blk_hdr *ti_hdr;
+ struct diag204_x_part_block *part_block;
+ struct diag204_x_phys_block *phys_block;
+ struct lpar_cpu_inf lpar_inf = {};
+
+ /* Errors are handled through the validity bits in the response. */
+ pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+ (unsigned long)DIAG204_INFO_EXT, 0, NULL);
+ if (pages <= 0)
+ return;
+
+ diag204_buf = vmalloc(PAGE_SIZE * pages);
+ if (!diag204_buf)
+ return;
+
+ r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
+ (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
+ if (r < 0)
+ goto out;
+
+ diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+ if (!diag224_buf || diag224(diag224_buf))
+ goto out;
+
+ ti_hdr = diag204_buf;
+ part_block = diag204_buf + sizeof(*ti_hdr);
+
+ for (i = 0; i < ti_hdr->npar; i++) {
+ /*
+ * For the calling lpar we also need to get the cpu
+ * caps and weights. The time information block header
+ * specifies the offset to the partition block of the
+ * caller lpar, so we can tell when we are processing its data.
+ */
+ this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part;
+ part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf,
+ part_block);
+ }
+
+ phys_block = (struct diag204_x_phys_block *)part_block;
+ part_block = diag204_buf + ti_hdr->this_part;
+ if (part_block->hdr.mtid)
+ sctns->par.infpflg1 = PAR_MT_EN;
+
+ sctns->par.infpval1 |= PAR_GRP_VLD;
+ sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap);
+ sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap);
+ memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name,
+ sizeof(sctns->par.infplgnm));
+
+ sctns->par.infpscps = lpar_inf.cp.cpu_num_shd;
+ sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded;
+ sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd;
+ sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded;
+ sctns->par.infpval1 |= PAR_PCNT_VLD;
+
+ sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap);
+ sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap);
+ sctns->par.infpval1 |= PAR_ABS_VLD;
+
+ /*
+ * Everything below needs global performance data to be
+ * meaningful.
+ */
+ if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) {
+ sctns->hdr.infhflg1 |= HDR_PERF_UNAV;
+ goto out;
+ }
+
+ fill_diag_mac(sctns, phys_block, diag224_buf);
+
+ if (lpar_inf.cp.lpar_weight) {
+ sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 *
+ lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight;
+ }
+
+ if (lpar_inf.ifl.lpar_weight) {
+ sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 *
+ lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight;
+ }
+ sctns->par.infpval1 |= PAR_WGHT_VLD;
+
+out:
+ kfree(diag224_buf);
+ vfree(diag204_buf);
+}
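+
+/*
+ * Worked example for the weight-based capacity computed above
+ * (hypothetical numbers, editor's sketch): with 4 shared CPs
+ * (infmscps == 4), an lpar CP weight of 20 and an all-lpar CP weight
+ * of 100:
+ *
+ *	infpwbcp = 4 * 0x10000 * 20 / 100 == 0xcccc
+ *
+ * i.e. roughly 0.8 of a core on the 0x10000-per-core sthyi scale.
+ */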
+
+static int sthyi(u64 vaddr)
+{
+ register u64 code asm("0") = 0;
+ register u64 addr asm("2") = vaddr;
+ int cc;
+
+ asm volatile(
+ ".insn rre,0xB2560000,%[code],%[addr]\n"
+ "ipm %[cc]\n"
+ "srl %[cc],28\n"
+ : [cc] "=d" (cc)
+ : [code] "d" (code), [addr] "a" (addr)
+ : "memory", "cc");
+ return cc;
+}
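+
+/*
+ * Sketch of a caller (illustrative only, sthyi_try_fill is a
+ * hypothetical name): gr0 carries the function code (only 0,
+ * capacity information, is handled below), gr2 the page aligned
+ * response buffer address; cc == 0 means the buffer was filled.
+ *
+ *	static int sthyi_try_fill(void *resp_4k)
+ *	{
+ *		if (!test_facility(74))
+ *			return -EOPNOTSUPP;
+ *		return sthyi((u64)resp_4k) ? -EIO : 0;
+ *	}
+ */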
+
+int handle_sthyi(struct kvm_vcpu *vcpu)
+{
+ int reg1, reg2, r = 0;
+ u64 code, addr, cc = 0;
+ struct sthyi_sctns *sctns = NULL;
+
+ /*
+ * STHYI requires extensive locking in the higher hypervisors
+ * and is very expensive in terms of computation and memory. Therefore we
+ * ratelimit the executions per VM.
+ */
+ if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) {
+ kvm_s390_retry_instr(vcpu);
+ return 0;
+ }
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+ code = vcpu->run->s.regs.gprs[reg1];
+ addr = vcpu->run->s.regs.gprs[reg2];
+
+ vcpu->stat.instruction_sthyi++;
+ VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
+ trace_kvm_s390_handle_sthyi(vcpu, code, addr);
+
+ if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (code & 0xffff) {
+ cc = 3;
+ goto out;
+ }
+
+ /*
+ * If the page has not yet been faulted in, we want to do that
+ * now and not after all the expensive calculations.
+ */
+ r = write_guest(vcpu, addr, reg2, &cc, 1);
+ if (r)
+ return kvm_s390_inject_prog_cond(vcpu, r);
+
+ sctns = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!sctns)
+ return -ENOMEM;
+
+ /*
+ * If we are a guest, we don't want to emulate an emulated
+ * instruction. We ask the hypervisor to provide the data.
+ */
+ if (test_facility(74)) {
+ cc = sthyi((u64)sctns);
+ goto out;
+ }
+
+ fill_hdr(sctns);
+ fill_stsi(sctns);
+ fill_diag(sctns);
+
+out:
+ if (!cc) {
+ r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+ if (r) {
+ free_page((unsigned long)sctns);
+ return kvm_s390_inject_prog_cond(vcpu, r);
+ }
+ }
+
+ free_page((unsigned long)sctns);
+ vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0;
+ kvm_s390_set_psw_cc(vcpu, cc);
+ return r;
+}
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 916834d7a..4fc9d4e5b 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst,
TP_fast_assign(
VCPU_ASSIGN_COMMON
),
- VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+ VCPU_TP_PRINTK("%s", "storage key related instruction")
);
TRACE_EVENT(kvm_s390_major_guest_pfault,
@@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog,
__entry->code = code;
),
- VCPU_TP_PRINTK("intercepted program interruption %04x",
- __entry->code)
+ VCPU_TP_PRINTK("intercepted program interruption %04x (%s)",
+ __entry->code,
+ __print_symbolic(__entry->code,
+ icpt_prog_codes))
);
/*
@@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi,
__entry->addr)
);
+TRACE_EVENT(kvm_s390_handle_operexc,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+ TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u64, instruction)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->instruction = ((__u64)ipa << 48) |
+ ((__u64)ipb << 16);
+ ),
+
+ VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)",
+ __entry->instruction,
+ __print_symbolic(icpt_insn_decoder(__entry->instruction),
+ icpt_insn_codes))
+ );
+
+TRACE_EVENT(kvm_s390_handle_sthyi,
+ TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, code, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u64, code)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx",
+ __entry->code, __entry->addr)
+ );
+
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644
index 000000000..d8673e243
--- /dev/null
+++ b/arch/s390/kvm/vsie.c
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+ struct kvm_s390_sie_block scb_s; /* 0x0000 */
+ /* the pinned original scb */
+ struct kvm_s390_sie_block *scb_o; /* 0x0200 */
+ /* the shadow gmap in use by the vsie_page */
+ struct gmap *gmap; /* 0x0208 */
+ /* address of the last reported fault to guest2 */
+ unsigned long fault_addr; /* 0x0210 */
+ __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */
+ struct kvm_s390_crypto_cb crycb; /* 0x0700 */
+ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
+} __packed;
+
+/* trigger a validity icpt for the given scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+ __u16 reason_code)
+{
+ scb->ipa = 0x1000;
+ scb->ipb = ((__u32) reason_code) << 16;
+ scb->icptcode = ICPT_VALIDITY;
+ return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+ atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+ prefix_unmapped(vsie_page);
+ if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+ while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+ atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+ return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
+
+/* copy the updated intervention request bits into the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+ const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+ int cpuflags;
+
+ cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+ atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
+ atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
+}
+
+/* shadow (filter and validate) the cpuflags */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+ /* we don't allow ESA/390 guests */
+ if (!(cpuflags & CPUSTAT_ZARCH))
+ return set_validity_icpt(scb_s, 0x0001U);
+
+ if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+ return set_validity_icpt(scb_s, 0x0001U);
+ else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+ return set_validity_icpt(scb_s, 0x0007U);
+
+ /* intervention requests will be set later */
+ newflags = CPUSTAT_ZARCH;
+ if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+ newflags |= CPUSTAT_GED;
+ if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+ if (cpuflags & CPUSTAT_GED)
+ return set_validity_icpt(scb_s, 0x0001U);
+ newflags |= CPUSTAT_GED2;
+ }
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+ newflags |= cpuflags & CPUSTAT_P;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+ newflags |= cpuflags & CPUSTAT_SM;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+ newflags |= cpuflags & CPUSTAT_IBS;
+
+ atomic_set(&scb_s->cpuflags, newflags);
+ return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and set up key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+ unsigned long *b1, *b2;
+ u8 ecb3_flags;
+
+ scb_s->crycbd = 0;
+ if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+ return 0;
+ /* format-1 is supported with message-security-assist extension 3 */
+ if (!test_kvm_facility(vcpu->kvm, 76))
+ return 0;
+ /* we may only allow it if enabled for guest 2 */
+ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+ (ECB3_AES | ECB3_DEA);
+ if (!ecb3_flags)
+ return 0;
+
+ if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+ return set_validity_icpt(scb_s, 0x003CU);
+ else if (!crycb_addr)
+ return set_validity_icpt(scb_s, 0x0039U);
+
+ /* copy only the wrapping keys */
+ if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56))
+ return set_validity_icpt(scb_s, 0x0035U);
+
+ scb_s->ecb3 |= ecb3_flags;
+ scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+ CRYCB_FORMAT2;
+
+ /* xor both blocks in one run */
+ b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+ b2 = (unsigned long *)
+ vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+ /* as 56 % 8 == 0, bitmap_xor won't overwrite any data */
+ bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+ return 0;
+}
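+
+/*
+ * Editor's sketch: the bitmap_xor above combines the two masks, so the
+ * wrapping keys seen by guest 3 are, byte for byte over the 56 copied
+ * bytes (hypothetical names):
+ *
+ *	effective[i] = g3_mask[i] ^ g2_mask[i];
+ */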
+
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+ scb_s->ibc = 0;
+ /* ibc installed in g2 and requested for g3 */
+ if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+ scb_s->ibc = scb_o->ibc & 0x0fffU;
+ /* take care of the minimum ibc level of the machine */
+ if (scb_s->ibc < min_ibc)
+ scb_s->ibc = min_ibc;
+ /* take care of the maximum ibc level set for the guest */
+ if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+ scb_s->ibc = vcpu->kvm->arch.model.ibc;
+ }
+}
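+
+/*
+ * Editor's sketch: the two fixups above amount to clamping the
+ * requested ibc into the supported window:
+ *
+ *	scb_s->ibc = clamp(scb_o->ibc & 0x0fffU, min_ibc,
+ *			   vcpu->kvm->arch.model.ibc);
+ */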
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+ /* interception */
+ scb_o->icptcode = scb_s->icptcode;
+ scb_o->icptstatus = scb_s->icptstatus;
+ scb_o->ipa = scb_s->ipa;
+ scb_o->ipb = scb_s->ipb;
+ scb_o->gbea = scb_s->gbea;
+
+ /* timer */
+ scb_o->cputm = scb_s->cputm;
+ scb_o->ckc = scb_s->ckc;
+ scb_o->todpr = scb_s->todpr;
+
+ /* guest state */
+ scb_o->gpsw = scb_s->gpsw;
+ scb_o->gg14 = scb_s->gg14;
+ scb_o->gg15 = scb_s->gg15;
+ memcpy(scb_o->gcr, scb_s->gcr, 128);
+ scb_o->pp = scb_s->pp;
+
+ /* interrupt intercept */
+ switch (scb_s->icptcode) {
+ case ICPT_PROGI:
+ case ICPT_INSTPROGI:
+ case ICPT_EXTINT:
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+ break;
+ case ICPT_PARTEXEC:
+ /* MVPG only */
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+ break;
+ }
+
+ if (scb_s->ihcpu != 0xffffU)
+ scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Set up the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ bool had_tx = scb_s->ecb & 0x10U;
+ unsigned long new_mso = 0;
+ int rc;
+
+ /* make sure we don't have any leftovers when reusing the scb */
+ scb_s->icptcode = 0;
+ scb_s->eca = 0;
+ scb_s->ecb = 0;
+ scb_s->ecb2 = 0;
+ scb_s->ecb3 = 0;
+ scb_s->ecd = 0;
+ scb_s->fac = 0;
+
+ rc = prepare_cpuflags(vcpu, vsie_page);
+ if (rc)
+ goto out;
+
+ /* timer */
+ scb_s->cputm = scb_o->cputm;
+ scb_s->ckc = scb_o->ckc;
+ scb_s->todpr = scb_o->todpr;
+ scb_s->epoch = scb_o->epoch;
+
+ /* guest state */
+ scb_s->gpsw = scb_o->gpsw;
+ scb_s->gg14 = scb_o->gg14;
+ scb_s->gg15 = scb_o->gg15;
+ memcpy(scb_s->gcr, scb_o->gcr, 128);
+ scb_s->pp = scb_o->pp;
+
+ /* interception / execution handling */
+ scb_s->gbea = scb_o->gbea;
+ scb_s->lctl = scb_o->lctl;
+ scb_s->svcc = scb_o->svcc;
+ scb_s->ictl = scb_o->ictl;
+ /*
+ * SKEY handling functions can't deal with false setting of PTE invalid
+ * bits. Therefore we cannot provide interpretation and would later
+ * have to provide our own emulation handlers.
+ */
+ scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+ scb_s->icpua = scb_o->icpua;
+
+ if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+ new_mso = scb_o->mso & 0xfffffffffff00000UL;
+ /* if the hva of the prefix changes, we have to remap the prefix */
+ if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+ prefix_unmapped(vsie_page);
+ /* SIE will do mso/msl validity and exception checks for us */
+ scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+ scb_s->mso = new_mso;
+ scb_s->prefix = scb_o->prefix;
+
+ /* We definitely have to flush the tlb if this scb never ran */
+ if (scb_s->ihcpu != 0xffffU)
+ scb_s->ihcpu = scb_o->ihcpu;
+
+ /* MVPG and Protection Exception Interpretation are always available */
+ scb_s->eca |= scb_o->eca & 0x01002000U;
+ /* Host-protection-interruption introduced with ESOP */
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+ scb_s->ecb |= scb_o->ecb & 0x02U;
+ /* transactional execution */
+ if (test_kvm_facility(vcpu->kvm, 73)) {
+ /* remap the prefix if tx is toggled on */
+ if ((scb_o->ecb & 0x10U) && !had_tx)
+ prefix_unmapped(vsie_page);
+ scb_s->ecb |= scb_o->ecb & 0x10U;
+ }
+ /* SIMD */
+ if (test_kvm_facility(vcpu->kvm, 129)) {
+ scb_s->eca |= scb_o->eca & 0x00020000U;
+ scb_s->ecd |= scb_o->ecd & 0x20000000U;
+ }
+ /* Run-time-Instrumentation */
+ if (test_kvm_facility(vcpu->kvm, 64))
+ scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+ scb_s->eca |= scb_o->eca & 0x00000001U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+ scb_s->eca |= scb_o->eca & 0x40000000U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+ scb_s->eca |= scb_o->eca & 0x80000000U;
+
+ prepare_ibc(vcpu, vsie_page);
+ rc = shadow_crycb(vcpu, vsie_page);
+out:
+ if (rc)
+ unshadow_scb(vcpu, vsie_page);
+ return rc;
+}
+
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = gmap->private;
+ struct vsie_page *cur;
+ unsigned long prefix;
+ struct page *page;
+ int i;
+
+ if (!gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
+
+ /*
+ * Only new shadow blocks are added to the list during runtime,
+ * therefore we can safely reference them all the time.
+ */
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = READ_ONCE(kvm->arch.vsie.pages[i]);
+ if (!page)
+ continue;
+ cur = page_to_virt(page);
+ if (READ_ONCE(cur->gmap) != gmap)
+ continue;
+ prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+ /* with mso/msl, the prefix lies at an offset */
+ prefix += cur->scb_s.mso;
+ if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+ prefix_unmapped_sync(cur);
+ }
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 if successfully mapped or already mapped
+ * - > 0 if control has to be given to guest 2
+ * - -EAGAIN if the caller can retry immediately
+ * - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+ int rc;
+
+ if (prefix_is_mapped(vsie_page))
+ return 0;
+
+ /* mark it as mapped so we can catch any concurrent unmappers */
+ prefix_mapped(vsie_page);
+
+ /* with mso/msl, the prefix lies at offset *mso* */
+ prefix += scb_s->mso;
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ if (!rc && (scb_s->ecb & 0x10U))
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ prefix + PAGE_SIZE);
+ /*
+ * We don't have to mprotect, we will be called for all unshadows.
+ * SIE will detect if protection applies and trigger a validity.
+ */
+ if (rc)
+ prefix_unmapped(vsie_page);
+ if (rc > 0 || rc == -EFAULT)
+ rc = set_validity_icpt(scb_s, 0x0037U);
+ return rc;
+}
+
+/*
+ * Pin the guest page given by gpa and set hpa to the pinned host address.
+ * Will always be pinned writable.
+ *
+ * Returns: - 0 on success
+ * - -EINVAL if the gpa is not valid guest storage
+ * - -ENOMEM if out of memory
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+ struct page *page;
+ hva_t hva;
+ int rc;
+
+ hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ return -EINVAL;
+ rc = get_user_pages_fast(hva, 1, 1, &page);
+ if (rc < 0)
+ return rc;
+ else if (rc != 1)
+ return -ENOMEM;
+ *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+ return 0;
+}
+
+/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+ struct page *page;
+
+ page = virt_to_page(hpa);
+ set_page_dirty_lock(page);
+ put_page(page);
+ /* mark the page always as dirty for migration */
+ mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+
+ hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+ if (hpa) {
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->scaol = 0;
+ scb_s->scaoh = 0;
+ }
+
+ hpa = scb_s->itdba;
+ if (hpa) {
+ gpa = scb_o->itdba & ~0xffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->itdba = 0;
+ }
+
+ hpa = scb_s->gvrd;
+ if (hpa) {
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->gvrd = 0;
+ }
+
+ hpa = scb_s->riccbd;
+ if (hpa) {
+ gpa = scb_o->riccbd & ~0x3fUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->riccbd = 0;
+ }
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bits long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Returns: - 0 if all blocks were pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+ int rc = 0;
+
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ if (gpa) {
+ if (!(gpa & ~0x1fffUL))
+ rc = set_validity_icpt(scb_s, 0x0038U);
+ else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+ rc = set_validity_icpt(scb_s, 0x0011U);
+ else if ((gpa & PAGE_MASK) !=
+ ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+ rc = set_validity_icpt(scb_s, 0x003bU);
+ if (!rc) {
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0034U);
+ }
+ if (rc)
+ goto unpin;
+ scb_s->scaoh = (u32)((u64)hpa >> 32);
+ scb_s->scaol = (u32)(u64)hpa;
+ }
+
+ gpa = scb_o->itdba & ~0xffUL;
+ if (gpa && (scb_s->ecb & 0x10U)) {
+ if (!(gpa & ~0x1fffU)) {
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ goto unpin;
+ }
+ /* 256 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ if (rc)
+ goto unpin;
+ scb_s->itdba = hpa;
+ }
+
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ if (gpa && (scb_s->eca & 0x00020000U) &&
+ !(scb_s->ecd & 0x20000000U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ goto unpin;
+ }
+ /*
+ * The 512 byte vector register save area cannot cross page boundaries;
+ * if this block gets bigger, we have to shadow it.
+ */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ if (rc)
+ goto unpin;
+ scb_s->gvrd = hpa;
+ }
+
+ gpa = scb_o->riccbd & ~0x3fUL;
+ if (gpa && (scb_s->ecb3 & 0x01U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ goto unpin;
+ }
+ /* 64 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ /* Validity 0x0044 will be checked by SIE */
+ if (rc)
+ goto unpin;
+ scb_s->riccbd = hpa;
+ }
+ return 0;
+unpin:
+ unpin_blocks(vcpu, vsie_page);
+ return rc;
+}
+
+/* unpin the scb provided by guest 2, marking it as dirty */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa = (hpa_t) vsie_page->scb_o;
+
+ if (hpa)
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb at the gpa provided by guest 2, storing a pointer to it
+ * in vsie_page->scb_o.
+ *
+ * Returns: - 0 if the scb was pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa;
+ int rc;
+
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL) {
+ rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ if (!rc)
+ rc = 1;
+ }
+ if (!rc)
+ vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+ return rc;
+}
+
+/*
+ * Inject a fault into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred during injection
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+ bool write_flag)
+{
+ struct kvm_s390_pgm_info pgm = {
+ .code = code,
+ .trans_exc_code =
+ /* 0-51: virtual address */
+ (vaddr & 0xfffffffffffff000UL) |
+ /* 52-53: store / fetch */
+ (((unsigned int) !write_flag) + 1) << 10,
+ /* 62-63: asce id (always primary == 0) */
+ .exc_access_id = 0, /* always primary */
+ .op_access_id = 0, /* not MVPG */
+ };
+ int rc;
+
+ if (code == PGM_PROTECTION)
+ pgm.trans_exc_code |= 0x4UL;
+
+ rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+ return rc ? rc : 1;
+}
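+
+/*
+ * Editor's note on the trans_exc_code values produced above:
+ *
+ *	vaddr & ~0xfffUL	bits 0-51, page aligned faulting address
+ *	0x400			store (write) access
+ *	0x800			fetch (read) access
+ *	0x4			added for protection exceptions
+ */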
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * Returns: - 0 if the fault was resolved
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ int rc;
+
+ if (current->thread.gmap_int_code == PGM_PROTECTION)
+ /* we can directly forward all protection exceptions */
+ return inject_fault(vcpu, PGM_PROTECTION,
+ current->thread.gmap_addr, 1);
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ current->thread.gmap_addr);
+ if (rc > 0) {
+ rc = inject_fault(vcpu, rc,
+ current->thread.gmap_addr,
+ current->thread.gmap_write_flag);
+ if (rc >= 0)
+ vsie_page->fault_addr = current->thread.gmap_addr;
+ }
+ return rc;
+}
+
+/*
+ * Retry the previous fault that required guest 2 intervention. This avoids
+ * one superfluous SIE re-entry and direct exit.
+ *
+ * Will ignore any errors. The next SIE fault will do proper fault handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ if (vsie_page->fault_addr)
+ kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ vsie_page->fault_addr);
+ vsie_page->fault_addr = 0;
+}
+
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+ vsie_page->scb_s.icptcode = 0;
+}
+
+/* rewind the psw and clear the vsie icpt, so we can retry execution */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int ilen = insn_length(scb_s->ipa >> 8);
+
+ /* take care of EXECUTE instructions */
+ if (scb_s->icptstatus & 1) {
+ ilen = (scb_s->icptstatus >> 4) & 0x6;
+ if (!ilen)
+ ilen = 4;
+ }
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+ clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ * - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+ if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+ retry_vsie_icpt(vsie_page);
+ if (read_guest_real(vcpu, fac, &vsie_page->fac,
+ sizeof(vsie_page->fac)))
+ return set_validity_icpt(scb_s, 0x1090U);
+ scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+ }
+ return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Returns: - 0 everything went fine
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int rc;
+
+ handle_last_fault(vcpu, vsie_page);
+
+ if (need_resched())
+ schedule();
+ if (test_cpu_flag(CIF_MCCK_PENDING))
+ s390_handle_mcck();
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ local_irq_disable();
+ guest_enter_irqoff();
+ local_irq_enable();
+
+ rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+ local_irq_disable();
+ guest_exit_irqoff();
+ local_irq_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ if (rc > 0)
+ rc = 0; /* we could still have an icpt */
+ else if (rc == -EFAULT)
+ return handle_fault(vcpu, vsie_page);
+
+ switch (scb_s->icptcode) {
+ case ICPT_INST:
+ if (scb_s->ipa == 0xb2b0)
+ rc = handle_stfle(vcpu, vsie_page);
+ break;
+ case ICPT_STOP:
+ /* stop not requested by g2 - must have been a kick */
+ if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+ clear_vsie_icpt(vsie_page);
+ break;
+ case ICPT_VALIDITY:
+ if ((scb_s->ipa & 0xf000) != 0xf000)
+ scb_s->ipa += 0x1000;
+ break;
+ }
+ return rc;
+}
+
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+ if (vsie_page->gmap)
+ gmap_put(vsie_page->gmap);
+ WRITE_ONCE(vsie_page->gmap, NULL);
+ prefix_unmapped(vsie_page);
+}
+
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ unsigned long asce;
+ union ctlreg0 cr0;
+ struct gmap *gmap;
+ int edat;
+
+ asce = vcpu->arch.sie_block->gcr[1];
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+ /*
+ * ASCE or EDAT could have changed since last icpt, or the gmap
+ * we're holding has been unshadowed. If the gmap is still valid,
+ * we can safely reuse it.
+ */
+ if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+ return 0;
+
+ /* release the old shadow - if any, and mark the prefix as unmapped */
+ release_gmap_shadow(vsie_page);
+ gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+ if (IS_ERR(gmap))
+ return PTR_ERR(gmap);
+ gmap->private = vcpu->kvm;
+ WRITE_ONCE(vsie_page->gmap, gmap);
+ return 0;
+}
+
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+ WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+ /*
+ * External calls have to lead to a kick of the vcpu and
+ * therefore the vsie -> Simulate Wait state.
+ */
+ atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ /*
+ * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+ * automatically be adjusted on tod clock changes via kvm_sync_clock.
+ */
+ preempt_disable();
+ scb_s->epoch += vcpu->kvm->arch.epoch;
+ preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+ atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * Returns: - 0 if no errors occurred
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int rc = 0;
+
+ while (1) {
+ rc = acquire_gmap_shadow(vcpu, vsie_page);
+ if (!rc)
+ rc = map_prefix(vcpu, vsie_page);
+ if (!rc) {
+ gmap_enable(vsie_page->gmap);
+ update_intervention_requests(vsie_page);
+ rc = do_vsie_run(vcpu, vsie_page);
+ gmap_enable(vcpu->arch.gmap);
+ }
+ atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+ if (rc == -EAGAIN)
+ rc = 0;
+ if (rc || scb_s->icptcode || signal_pending(current) ||
+ kvm_s390_vcpu_has_irq(vcpu, 0))
+ break;
+ }
+
+ if (rc == -EFAULT) {
+ /*
+ * Addressing exceptions are always presented as intercepts.
+ * As addressing exceptions are suppressing and our guest 3 PSW
+ * points at the responsible instruction, we have to
+ * forward the PSW and set the ilc. If we can't read the guest 3
+ * instruction, we can use an arbitrary ilc. Let's always use
+ * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+ * memory. (We could also fake the shadow so the hardware
+ * handles it.)
+ */
+ scb_s->icptcode = ICPT_PROGI;
+ scb_s->iprcc = PGM_ADDRESSING;
+ scb_s->pgmilc = 4;
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+ }
+ return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ * - NULL if the same scb address is already used by another VCPU
+ * - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int nr_vcpus;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+ rcu_read_unlock();
+ if (page) {
+ if (page_ref_inc_return(page) == 2)
+ return page_to_virt(page);
+ page_ref_dec(page);
+ }
+
+ /*
+ * We want at least #online_vcpus shadows, so every VCPU can execute
+ * the VSIE in parallel.
+ */
+ nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ if (kvm->arch.vsie.page_count < nr_vcpus) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+ if (!page) {
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ page_ref_inc(page);
+ kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+ kvm->arch.vsie.page_count++;
+ } else {
+ /* reuse an existing entry that belongs to nobody */
+ while (true) {
+ page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+ if (page_ref_inc_return(page) == 2)
+ break;
+ page_ref_dec(page);
+ kvm->arch.vsie.next++;
+ kvm->arch.vsie.next %= nr_vcpus;
+ }
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ }
+ page->index = addr;
+ /* double use of the same address */
+ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+ page_ref_dec(page);
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return NULL;
+ }
+ mutex_unlock(&kvm->arch.vsie.mutex);
+
+ vsie_page = page_to_virt(page);
+ memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+ release_gmap_shadow(vsie_page);
+ vsie_page->fault_addr = 0;
+ vsie_page->scb_s.ihcpu = 0xffffU;
+ return vsie_page;
+}
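+
+/*
+ * Editor's note: scb addresses are 512 byte aligned (enforced in
+ * kvm_s390_handle_vsie below), so addr >> 9 yields a collision free
+ * radix tree key, while page->index keeps the full address for the
+ * reverse lookup when an entry is reused.
+ */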
+
+/* put a vsie page acquired via get_vsie_page */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+ struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
+
+ page_ref_dec(page);
+}
+
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+ struct vsie_page *vsie_page;
+ unsigned long scb_addr;
+ int rc;
+
+ vcpu->stat.instruction_sie++;
+ if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+ return -EOPNOTSUPP;
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+ BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+ scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+ /* 512 byte alignment */
+ if (unlikely(scb_addr & 0x1ffUL))
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+ return 0;
+
+ vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+ if (IS_ERR(vsie_page))
+ return PTR_ERR(vsie_page);
+ else if (!vsie_page)
+ /* double use of sie control block - simply do nothing */
+ return 0;
+
+ rc = pin_scb(vcpu, vsie_page, scb_addr);
+ if (rc)
+ goto out_put;
+ rc = shadow_scb(vcpu, vsie_page);
+ if (rc)
+ goto out_unpin_scb;
+ rc = pin_blocks(vcpu, vsie_page);
+ if (rc)
+ goto out_unshadow;
+ register_shadow_scb(vcpu, vsie_page);
+ rc = vsie_run(vcpu, vsie_page);
+ unregister_shadow_scb(vcpu);
+ unpin_blocks(vcpu, vsie_page);
+out_unshadow:
+ unshadow_scb(vcpu, vsie_page);
+out_unpin_scb:
+ unpin_scb(vcpu, vsie_page, scb_addr);
+out_put:
+ put_vsie_page(vcpu->kvm, vsie_page);
+
+ return rc < 0 ? rc : 0;
+}
+
+/* Init the vsie data structures. To be called when a vm is initialized. */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.vsie.mutex);
+ INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/* Destroy the vsie data structures. To be called when a vm is destroyed. */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int i;
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = kvm->arch.vsie.pages[i];
+ kvm->arch.vsie.pages[i] = NULL;
+ vsie_page = page_to_virt(page);
+ release_gmap_shadow(vsie_page);
+ /* free the radix tree entry */
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ __free_page(page);
+ }
+ kvm->arch.vsie.page_count = 0;
+ mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+ /*
+ * Even if the VCPU lets go of the shadow sie block reference, it is
+ * still valid in the cache. So we can safely kick it.
+ */
+ if (scb) {
+ atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+ if (scb->prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+ }
+}
diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c
index b647d5ff0..48352bffb 100644
--- a/arch/s390/lib/string.c
+++ b/arch/s390/lib/string.c
@@ -236,6 +236,24 @@ char * strrchr(const char * s, int c)
}
EXPORT_SYMBOL(strrchr);
+static inline int clcle(const char *s1, unsigned long l1,
+ const char *s2, unsigned long l2)
+{
+ register unsigned long r2 asm("2") = (unsigned long) s1;
+ register unsigned long r3 asm("3") = (unsigned long) l1;
+ register unsigned long r4 asm("4") = (unsigned long) s2;
+ register unsigned long r5 asm("5") = (unsigned long) l2;
+ int cc;
+
+ asm volatile ("0: clcle %1,%3,0\n"
+ " jo 0b\n"
+ " ipm %0\n"
+ " srl %0,28"
+ : "=&d" (cc), "+a" (r2), "+a" (r3),
+ "+a" (r4), "+a" (r5) : : "cc");
+ return cc;
+}
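+
+/*
+ * Editor's note: CLCLE sets cc 3 while the comparison is incomplete,
+ * hence the "jo 0b" retry loop above. The final condition codes
+ * consumed by the callers below are:
+ *
+ *	cc == 0		operands are equal
+ *	cc == 1		first operand is low
+ *	cc == 2		first operand is high
+ */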
+
/**
* strstr - Find the first substring in a %NUL terminated string
* @s1: The string to be searched
@@ -250,18 +268,9 @@ char * strstr(const char * s1,const char * s2)
return (char *) s1;
l1 = __strend(s1) - s1;
while (l1-- >= l2) {
- register unsigned long r2 asm("2") = (unsigned long) s1;
- register unsigned long r3 asm("3") = (unsigned long) l2;
- register unsigned long r4 asm("4") = (unsigned long) s2;
- register unsigned long r5 asm("5") = (unsigned long) l2;
int cc;
- asm volatile ("0: clcle %1,%3,0\n"
- " jo 0b\n"
- " ipm %0\n"
- " srl %0,28"
- : "=&d" (cc), "+a" (r2), "+a" (r3),
- "+a" (r4), "+a" (r5) : : "cc" );
+ cc = clcle(s1, l2, s2, l2);
if (!cc)
return (char *) s1;
s1++;
@@ -302,20 +311,11 @@ EXPORT_SYMBOL(memchr);
*/
int memcmp(const void *cs, const void *ct, size_t n)
{
- register unsigned long r2 asm("2") = (unsigned long) cs;
- register unsigned long r3 asm("3") = (unsigned long) n;
- register unsigned long r4 asm("4") = (unsigned long) ct;
- register unsigned long r5 asm("5") = (unsigned long) n;
int ret;
- asm volatile ("0: clcle %1,%3,0\n"
- " jo 0b\n"
- " ipm %0\n"
- " srl %0,28"
- : "=&d" (ret), "+a" (r2), "+a" (r3), "+a" (r4), "+a" (r5)
- : : "cc" );
+ ret = clcle(cs, n, ct, n);
if (ret)
- ret = *(char *) r2 - *(char *) r4;
+ ret = ret == 1 ? -1 : 1;
return ret;
}
EXPORT_SYMBOL(memcmp);
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index ae4de559e..f481fcde0 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -49,7 +49,7 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr
" jnm 5b\n"
" ex %4,0(%3)\n"
" j 8f\n"
- "7:slgr %0,%0\n"
+ "7: slgr %0,%0\n"
"8:\n"
EX_TABLE(0b,2b) EX_TABLE(3b,4b) EX_TABLE(9b,2b) EX_TABLE(10b,4b)
: "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
@@ -93,7 +93,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
" jnm 6b\n"
" ex %4,0(%3)\n"
" j 9f\n"
- "8:slgr %0,%0\n"
+ "8: slgr %0,%0\n"
"9: sacf 768\n"
EX_TABLE(0b,3b) EX_TABLE(2b,3b) EX_TABLE(4b,5b)
EX_TABLE(10b,3b) EX_TABLE(11b,3b) EX_TABLE(12b,5b)
@@ -104,6 +104,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr,
unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n)
{
+ check_object_size(to, n, false);
if (static_branch_likely(&have_mvcos))
return copy_from_user_mvcos(to, from, n);
return copy_from_user_mvcp(to, from, n);
@@ -177,6 +178,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x,
unsigned long __copy_to_user(void __user *to, const void *from, unsigned long n)
{
+ check_object_size(from, n, true);
if (static_branch_likely(&have_mvcos))
return copy_to_user_mvcos(to, from, n);
return copy_to_user_mvcs(to, from, n);
@@ -266,7 +268,7 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size
"3: .insn ss,0xc80000000000,0(%3,%1),0(%4),0\n"
" slgr %0,%3\n"
" j 5f\n"
- "4:slgr %0,%0\n"
+ "4: slgr %0,%0\n"
"5:\n"
EX_TABLE(0b,2b) EX_TABLE(3b,5b)
: "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2)
diff --git a/arch/s390/mm/dump_pagetables.c b/arch/s390/mm/dump_pagetables.c
index 8556d6be9..861880df1 100644
--- a/arch/s390/mm/dump_pagetables.c
+++ b/arch/s390/mm/dump_pagetables.c
@@ -157,7 +157,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st,
pud = pud_offset(pgd, addr);
if (!pud_none(*pud))
if (pud_large(*pud)) {
- prot = pud_val(*pud) & _REGION3_ENTRY_RO;
+ prot = pud_val(*pud) & _REGION_ENTRY_PROTECT;
note_page(m, st, prot, 2);
} else
walk_pmd_level(m, st, pud, addr);
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 19288c1b3..a58bca62a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
(struct gmap *) S390_lowcore.gmap : NULL;
if (gmap) {
current->thread.gmap_addr = address;
+ current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+ current->thread.gmap_int_code = regs->int_code & 0xffff;
address = __gmap_translate(gmap, address);
if (address == -EFAULT) {
fault = VM_FAULT_BADMAP;
@@ -456,7 +458,7 @@ retry:
* make sure we exit gracefully rather than endlessly redo
* the fault.
*/
- fault = handle_mm_fault(mm, vma, address, flags);
+ fault = handle_mm_fault(vma, address, flags);
/* No reason to continue if interrupted by SIGKILL. */
if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
fault = VM_FAULT_SIGNAL;
@@ -624,7 +626,7 @@ void pfault_fini(void)
diag_stat_inc(DIAG_STAT_X258);
asm volatile(
" diag %0,0,0x258\n"
- "0:\n"
+ "0: nopr %%r7\n"
EX_TABLE(0b,0b)
: : "a" (&refbk), "m" (refbk) : "cc");
}
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 313c3b8cf..2ce6bb3ba 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -20,14 +20,16 @@
#include <asm/gmap.h>
#include <asm/tlb.h>
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
/**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
- * @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
*/
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
{
struct gmap *gmap;
struct page *page;
@@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
+ INIT_LIST_HEAD(&gmap->children);
+ INIT_LIST_HEAD(&gmap->pt_list);
INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
spin_lock_init(&gmap->guest_table_lock);
- gmap->mm = mm;
+ spin_lock_init(&gmap->shadow_lock);
+ atomic_set(&gmap->ref_count, 1);
page = alloc_pages(GFP_KERNEL, 2);
if (!page)
goto out_free;
@@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
gmap->asce = atype | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | __pa(table);
gmap->asce_end = limit;
- down_write(&mm->mmap_sem);
- list_add(&gmap->list, &mm->context.gmap_list);
- up_write(&mm->mmap_sem);
return gmap;
out_free:
@@ -80,7 +83,28 @@ out_free:
out:
return NULL;
}
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+ struct gmap *gmap;
+
+ gmap = gmap_alloc(limit);
+ if (!gmap)
+ return NULL;
+ gmap->mm = mm;
+ spin_lock(&mm->context.gmap_lock);
+ list_add_rcu(&gmap->list, &mm->context.gmap_list);
+ spin_unlock(&mm->context.gmap_lock);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
static void gmap_flush_tlb(struct gmap *gmap)
{
@@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
} while (nr > 0);
}
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ struct radix_tree_iter iter;
+ unsigned long indices[16];
+ unsigned long index;
+ void **slot;
+ int i, nr;
+
+ /* A radix tree is freed by deleting all of its entries */
+ index = 0;
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == 16)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ head = radix_tree_delete(root, index);
+ gmap_for_each_rmap_safe(rmap, rnext, head)
+ kfree(rmap);
+ }
+ } while (nr > 0);
+}
+
/**
* gmap_free - free a guest address space
* @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
*/
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
{
struct page *page, *next;
- /* Flush tlb. */
- if (MACHINE_HAS_IDTE)
- __tlb_flush_idte(gmap->asce);
- else
- __tlb_flush_global();
-
+ /* Flush tlb of all gmaps (if not already done for shadows) */
+ if (!(gmap_is_shadow(gmap) && gmap->removed))
+ gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
__free_pages(page, 2);
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
- down_write(&gmap->mm->mmap_sem);
- list_del(&gmap->list);
- up_write(&gmap->mm->mmap_sem);
+
+ /* Free additional data for a shadow gmap */
+ if (gmap_is_shadow(gmap)) {
+ /* Free all page tables. */
+ list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+ page_table_free_pgste(page);
+ gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+ /* Release reference to the parent */
+ gmap_put(gmap->parent);
+ }
+
kfree(gmap);
}
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+ atomic_inc(&gmap->ref_count);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+ if (atomic_dec_return(&gmap->ref_count) == 0)
+ gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+ struct gmap *sg, *next;
+
+ /* Remove all shadow gmaps linked to this gmap */
+ if (!list_empty(&gmap->children)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next, &gmap->children, list) {
+ list_del(&sg->list);
+ gmap_put(sg);
+ }
+ spin_unlock(&gmap->shadow_lock);
+ }
+ /* Remove gmap from the per-mm list */
+ spin_lock(&gmap->mm->context.gmap_lock);
+ list_del_rcu(&gmap->list);
+ spin_unlock(&gmap->mm->context.gmap_lock);
+ synchronize_rcu();
+ /* Put reference */
+ gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
/**
* gmap_enable - switch primary space to the guest address space
@@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_disable);
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns a pointer to the currently enabled gmap, or NULL if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+ return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
/*
* gmap_alloc_table is assumed to be called with mmap_sem held
*/
@@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
crst_table_init(new, init);
- spin_lock(&gmap->mm->page_table_lock);
+ spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
list_add(&page->lru, &gmap->crst_list);
*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
page->index = gaddr;
page = NULL;
}
- spin_unlock(&gmap->mm->page_table_lock);
+ spin_unlock(&gmap->guest_table_lock);
if (page)
__free_pages(page, 2);
return 0;
@@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
unsigned long *entry;
int flush = 0;
+ BUG_ON(gmap_is_shadow(gmap));
spin_lock(&gmap->guest_table_lock);
entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
if (entry) {
@@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || to + len < to)
@@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((from | to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || from + len < from || to + len < to ||
@@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
* This function does not establish potentially missing page table entries.
* The mmap_sem of the mm that belongs to the address space must be held
* when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
@@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
vmaddr = (unsigned long)
radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+ /* Note: guest_to_host is empty for a shadow gmap */
return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
struct gmap *gmap;
int flush;
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
if (flush)
gmap_flush_tlb(gmap);
}
+ rcu_read_unlock();
}
/**
@@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
pmd_t *pmd;
int rc;
+ BUG_ON(gmap_is_shadow(gmap));
/* Create higher level tables in the gmap page table */
table = gmap->table;
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -430,6 +560,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
VM_BUG_ON(pgd_none(*pgd));
pud = pud_offset(pgd, vmaddr);
VM_BUG_ON(pud_none(*pud));
+ /* large puds cannot yet be handled */
+ if (pud_large(*pud))
+ return -EFAULT;
pmd = pmd_offset(pud, vmaddr);
VM_BUG_ON(pmd_none(*pmd));
/* large pmds cannot yet be handled */
@@ -549,116 +682,1412 @@ static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
/**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_add(&nb->list, &gmap_notifier_list);
+ list_add_rcu(&nb->list, &gmap_notifier_list);
spin_unlock(&gmap_notifier_lock);
}
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
/**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_del_init(&nb->list);
+ list_del_rcu(&nb->list);
spin_unlock(&gmap_notifier_lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct gmap_notifier *nb;
+
+ list_for_each_entry(nb, &gmap_notifier_list, list)
+ nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+ unsigned long gaddr, int level)
+{
+ unsigned long *table;
+
+ if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+ return NULL;
+ if (gmap_is_shadow(gmap) && gmap->removed)
+ return NULL;
+ if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+ return NULL;
+ table = gmap->table;
+ switch (gmap->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ table += (gaddr >> 53) & 0x7ff;
+ if (level == 4)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION2:
+ table += (gaddr >> 42) & 0x7ff;
+ if (level == 3)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION3:
+ table += (gaddr >> 31) & 0x7ff;
+ if (level == 2)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_SEGMENT:
+ table += (gaddr >> 20) & 0x7ff;
+ if (level == 1)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table += (gaddr >> 12) & 0xff;
+ }
+ return table;
+}
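+
+/*
+ * Example (illustrative): to inspect the segment table entry for a guest
+ * address, stop the walk at level 1:
+ *
+ *	unsigned long *ste = gmap_table_walk(gmap, gaddr, 1);
+ *
+ *	if (ste && !(*ste & _SEGMENT_ENTRY_INVALID))
+ *		... *ste is the segment table entry for gaddr ...
+ *
+ * A NULL result means the asce covers fewer levels than requested, an
+ * intermediate entry is invalid, or the shadow gmap was removed.
+ */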
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ * and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+ spinlock_t **ptl)
+{
+ unsigned long *table;
+
+ if (gmap_is_shadow(gmap))
+ spin_lock(&gmap->guest_table_lock);
+ /* Walk the gmap page table, lock and get pte pointer */
+ table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+ if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+ if (gmap_is_shadow(gmap))
+ spin_unlock(&gmap->guest_table_lock);
+ return NULL;
+ }
+ if (gmap_is_shadow(gmap)) {
+ *ptl = &gmap->guest_table_lock;
+ return pte_offset_map((pmd_t *) table, gaddr);
+ }
+ return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+ unsigned long vmaddr, int prot)
+{
+ struct mm_struct *mm = gmap->mm;
+ unsigned int fault_flags;
+ bool unlocked = false;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+ if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ return -EFAULT;
+ if (unlocked)
+ /* lost mmap_sem, caller has to retry __gmap_translate */
+ return 0;
+ /* Connect the page tables */
+ return __gmap_link(gmap, gaddr, vmaddr);
}
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
/**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+ spin_unlock(ptl);
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
* @gmap: pointer to guest mapping meta data structure
* @gaddr: virtual address in the guest address space
* @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
+ *
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
*
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Called with sg->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot, unsigned long bits)
{
- unsigned long addr;
+ unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
- bool unlocked;
- int rc = 0;
+ int rc;
+
+ while (len) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+ gmap_pte_op_end(ptl);
+ }
+ if (rc) {
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+ if (rc)
+ return rc;
+ continue;
+ }
+ gaddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
- if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ * call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot)
+{
+ int rc;
+
+ if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+ return -EINVAL;
+ if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
- while (len) {
- unlocked = false;
- /* Convert gmap address and connect the page tables */
- addr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(addr)) {
- rc = addr;
+ rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+ up_read(&gmap->mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
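+
+/*
+ * Example (illustrative sketch): to be notified about the next write to a
+ * single guest page, make it read-only and arm the notification bit:
+ *
+ *	rc = gmap_mprotect_notify(gmap, gaddr & PAGE_MASK, PAGE_SIZE,
+ *				  PROT_READ);
+ *
+ * The registered notifier is then called from ptep_notify() once the pte
+ * is changed or invalidated.
+ */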
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ * absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+ unsigned long address, vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+ int rc;
+
+ while (1) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ pte = *ptep;
+ if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+ address = pte_val(pte) & PAGE_MASK;
+ address += gaddr & ~PAGE_MASK;
+ *val = *(unsigned long *) address;
+ pte_val(*ptep) |= _PAGE_YOUNG;
+ /* Do *NOT* clear the _PAGE_INVALID bit! */
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ }
+ if (!rc)
+ break;
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
break;
}
- /* Get the page mapped */
- if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
- &unlocked)) {
- rc = -EFAULT;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
+ if (rc)
break;
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_read_table);
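+
+/*
+ * Example (illustrative): fetching an 8-byte guest DAT table entry
+ * without marking the backing page referenced:
+ *
+ *	unsigned long entry;
+ *	int rc = gmap_read_table(parent, gaddr & ~7UL, &entry);
+ *
+ * On success entry holds the doubleword at the guest address gaddr.
+ */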
+
+/**
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
+ *
+ * Called with the sg->guest_table_lock
+ */
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+ struct gmap_rmap *rmap)
+{
+ void **slot;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+ if (slot) {
+ rmap->next = radix_tree_deref_slot_protected(slot,
+ &sg->guest_table_lock);
+ radix_tree_replace_slot(slot, rmap);
+ } else {
+ rmap->next = NULL;
+ radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+ rmap);
+ }
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+ unsigned long paddr, unsigned long len, int prot)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ while (len) {
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = raddr;
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc) {
+ kfree(rmap);
+ return rc;
}
- /* While trying to map mmap_sem got unlocked. Let us retry */
- if (unlocked)
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (ptep) {
+ spin_lock(&sg->guest_table_lock);
+ rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+ PGSTE_VSIE_BIT);
+ if (!rc)
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ }
+ radix_tree_preload_end();
+ if (rc) {
+ kfree(rmap);
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ return rc;
+ continue;
+ }
+ paddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
+
+#define _SHADOW_RMAP_MASK 0x7
+#define _SHADOW_RMAP_REGION1 0x5
+#define _SHADOW_RMAP_REGION2 0x4
+#define _SHADOW_RMAP_REGION3 0x3
+#define _SHADOW_RMAP_SEGMENT 0x2
+#define _SHADOW_RMAP_PGTABLE 0x1
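+
+/*
+ * An rmap->raddr value encodes both the shadow address and the table
+ * level in its low bits, e.g. for a page table (illustrative):
+ *
+ *	rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+ *
+ * gmap_shadow_notify() masks the level bits out again with
+ * _SHADOW_RMAP_MASK to select the matching gmap_unshadow_*() call.
+ */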
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+ asm volatile(
+ " .insn rrf,0xb98e0000,%0,%1,0,0"
+ : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
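+
+/* The hand-coded .insn above is IDTE (invalidate DAT table entry). */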
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+ if (!table || *table & _PAGE_INVALID)
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+ ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *pgt)
+{
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < 256; i++, raddr += 1UL << 12)
+ pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long sto, *ste, *pgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+ sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+ gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+ pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+ *ste = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *sgt)
+{
+ unsigned long asce, *pgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+ if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+ continue;
+ pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+ sgt[i] = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+ }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r3o, *r3e, *sgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+ r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+ gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+ sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+ *r3e = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r3t)
+{
+ unsigned long asce, *sgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+ if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+ r3t[i] = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r2o, *r2e, *r3t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+ r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+ gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+ r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+ *r2e = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r2t)
+{
+ unsigned long asce, *r3t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+ if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
continue;
- rc = __gmap_link(gmap, gaddr, addr);
+ r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+ r2t[i] = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r1o, *r1e, *r2t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+ r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+ gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+ r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+ *r1e = _REGION1_ENTRY_EMPTY;
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r1t)
+{
+ unsigned long asce, *r2t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+ if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Clear entry and flush translation r1t -> r2t */
+ gmap_idte_one(asce, raddr);
+ r1t[i] = _REGION1_ENTRY_EMPTY;
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ if (sg->removed)
+ return;
+ sg->removed = 1;
+ gmap_call_notifier(sg, 0, -1UL);
+ gmap_flush_tlb(sg);
+ table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+ switch (sg->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ __gmap_unshadow_r1t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION2:
+ __gmap_unshadow_r2t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION3:
+ __gmap_unshadow_r3t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_SEGMENT:
+ __gmap_unshadow_sgt(sg, 0, table);
+ break;
+ }
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg;
+
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+ sg->removed)
+ continue;
+ if (!sg->initialized)
+ return ERR_PTR(-EAGAIN);
+ atomic_inc(&sg->ref_count);
+ return sg;
+ }
+ return NULL;
+}
+
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ * given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties, the caller can continue using it. Returns 0 otherwise, the
+ * caller has to request a new shadow gmap in this case.
+ *
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+ if (sg->removed)
+ return 0;
+ return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg, *new;
+ unsigned long limit;
+ int rc;
+
+ BUG_ON(gmap_is_shadow(parent));
+ spin_lock(&parent->shadow_lock);
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ spin_unlock(&parent->shadow_lock);
+ if (sg)
+ return sg;
+ /* Create a new shadow gmap */
+ limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+ if (asce & _ASCE_REAL_SPACE)
+ limit = -1UL;
+ new = gmap_alloc(limit);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ new->mm = parent->mm;
+ new->parent = gmap_get(parent);
+ new->orig_asce = asce;
+ new->edat_level = edat_level;
+ new->initialized = false;
+ spin_lock(&parent->shadow_lock);
+ /* Recheck if another CPU created the same shadow */
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ spin_unlock(&parent->shadow_lock);
+ gmap_free(new);
+ return sg;
+ }
+ if (asce & _ASCE_REAL_SPACE) {
+ /* only allow one real-space gmap shadow */
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce & _ASCE_REAL_SPACE) {
+ spin_lock(&sg->guest_table_lock);
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ break;
+ }
+ }
+ }
+ atomic_set(&new->ref_count, 2);
+ list_add(&new->list, &parent->children);
+ if (asce & _ASCE_REAL_SPACE) {
+ /* nothing to protect, return right away */
+ new->initialized = true;
+ spin_unlock(&parent->shadow_lock);
+ return new;
+ }
+ spin_unlock(&parent->shadow_lock);
+ /* protect after insertion, so it will get properly invalidated */
+ down_read(&parent->mm->mmap_sem);
+ rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+ ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+ PROT_READ, PGSTE_VSIE_BIT);
+ up_read(&parent->mm->mmap_sem);
+ spin_lock(&parent->shadow_lock);
+ new->initialized = true;
+ if (rc) {
+ list_del(&new->list);
+ gmap_free(new);
+ new = ERR_PTR(rc);
+ }
+ spin_unlock(&parent->shadow_lock);
+ return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
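+
+/*
+ * Example (illustrative sketch): a vsie-style caller reuses an existing
+ * shadow if it still matches, and only then requests a new one:
+ *
+ *	if (!sg || !gmap_shadow_valid(sg, asce, edat_level)) {
+ *		sg = gmap_shadow(parent, asce, edat_level);
+ *		if (IS_ERR(sg))
+ *			return PTR_ERR(sg);
+ *	}
+ *
+ * PTR_ERR() is then -ENOMEM, -EAGAIN or -EFAULT as documented above.
+ */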
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its descendants.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r2t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r2t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r2t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r2t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r2t read-only in parent gmap page table */
+ raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+ origin = r2t & _REGION_ENTRY_ORIGIN;
+ offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 4);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r2t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r2t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r3t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region third table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r3t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r3t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region third table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r3t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r3t read-only in parent gmap page table */
+ raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+ origin = r3t & _REGION_ENTRY_ORIGIN;
+ offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 3);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r3t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r3t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_sgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+ /* Allocate a shadow segment table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = sgt & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_sgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow segment table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= sgt & _REGION_ENTRY_PROTECT;
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make sgt read-only in parent gmap page table */
+ raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+ origin = sgt & _REGION_ENTRY_ORIGIN;
+ offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 2);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_sgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_sgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_pgt_lookup - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow guest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ unsigned long *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+ /* Shadow page tables are full pages (pte+pgste) */
+ page = pfn_to_page(*table >> PAGE_SHIFT);
+ *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+ *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+ *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+ rc = 0;
+ } else {
+ rc = -EAGAIN;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory,
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake)
+{
+ unsigned long raddr, origin;
+ unsigned long *s_pgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+ /* Allocate a shadow page table */
+ page = page_table_alloc_pgste(sg->mm);
+ if (!page)
+ return -ENOMEM;
+ page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_pgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow page table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+ (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+ list_add(&page->lru, &sg->pt_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make pgt read-only in parent gmap page table (not the pgste) */
+ raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+ rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 1);
+ if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+ (unsigned long) s_pgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_pgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ page_table_free_pgste(page);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr, paddr;
+ spinlock_t *ptl;
+ pte_t *sptep, *tptep;
+ int prot;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+ while (1) {
+ paddr = pte_val(pte) & PAGE_MASK;
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
+ break;
+ }
+ rc = radix_tree_preload(GFP_KERNEL);
if (rc)
break;
- /* Walk the process page table, lock and get pte pointer */
- ptep = get_locked_pte(gmap->mm, addr, &ptl);
- VM_BUG_ON(!ptep);
- /* Set notification bit in the pgste of the pte */
- if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
- ptep_set_notify(gmap->mm, addr, ptep);
- gaddr += PAGE_SIZE;
- len -= PAGE_SIZE;
+ rc = -EAGAIN;
+ sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (sptep) {
+ spin_lock(&sg->guest_table_lock);
+ /* Get page table pointer */
+ tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+ if (!tptep) {
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ radix_tree_preload_end();
+ break;
+ }
+ rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+ if (rc > 0) {
+ /* Success and a new mapping */
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ rmap = NULL;
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ spin_unlock(&sg->guest_table_lock);
}
- pte_unmap_unlock(ptep, ptl);
+ radix_tree_preload_end();
+ if (!rc)
+ break;
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ break;
}
- up_read(&gmap->mm->mmap_sem);
+ kfree(rmap);
return rc;
}
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address of the changed pte
+ * @offset: offset of the changed page within the 1 MB segment
+ * @pte: pointer to the changed pte
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+ unsigned long offset, pte_t *pte)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ unsigned long gaddr, start, end, bits, raddr;
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->parent->guest_table_lock);
+ table = radix_tree_lookup(&sg->parent->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+ spin_unlock(&sg->parent->guest_table_lock);
+ if (!table)
+ return;
+
+ spin_lock(&sg->guest_table_lock);
+ if (sg->removed) {
+ spin_unlock(&sg->guest_table_lock);
+ return;
+ }
+ /* Check for top level table */
+ start = sg->orig_asce & _ASCE_ORIGIN;
+ end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+ if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+ gaddr < end) {
+ /* The complete shadow table has to go */
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ return;
+ }
+ /* Remove the page table tree for one specific entry */
+ head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+ gmap_for_each_rmap_safe(rmap, rnext, head) {
+ bits = rmap->raddr & _SHADOW_RMAP_MASK;
+ raddr = rmap->raddr ^ bits;
+ switch (bits) {
+ case _SHADOW_RMAP_REGION1:
+ gmap_unshadow_r2t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION2:
+ gmap_unshadow_r3t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION3:
+ gmap_unshadow_sgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_SEGMENT:
+ gmap_unshadow_pgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_PGTABLE:
+ gmap_unshadow_page(sg, raddr);
+ break;
+ }
+ kfree(rmap);
+ }
+ spin_unlock(&sg->guest_table_lock);
+}
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
* @addr: virtual address in the process address space
* @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
*
* This function is assumed to be called with the page table lock held
* for the pte to notify.
*/
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+ pte_t *pte, unsigned long bits)
{
unsigned long offset, gaddr;
unsigned long *table;
- struct gmap_notifier *nb;
- struct gmap *gmap;
+ struct gmap *gmap, *sg, *next;
offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
offset = offset * (4096 / sizeof(pte_t));
- spin_lock(&gmap_notifier_lock);
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next,
+ &gmap->children, list)
+ gmap_shadow_notify(sg, vmaddr, offset, pte);
+ spin_unlock(&gmap->shadow_lock);
+ }
+ if (!(bits & PGSTE_IN_BIT))
+ continue;
+ spin_lock(&gmap->guest_table_lock);
table = radix_tree_lookup(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT);
- if (!table)
- continue;
- gaddr = __gmap_segment_gaddr(table) + offset;
- list_for_each_entry(nb, &gmap_notifier_list, list)
- nb->notifier_call(gmap, gaddr);
+ if (table)
+ gaddr = __gmap_segment_gaddr(table) + offset;
+ spin_unlock(&gmap->guest_table_lock);
+ if (table)
+ gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
}
- spin_unlock(&gmap_notifier_lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ptep_notify);
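+
+/*
+ * Summary (illustrative): PGSTE_VSIE_BIT in @bits tears down the affected
+ * shadow structures via gmap_shadow_notify(), while PGSTE_IN_BIT invokes
+ * the notifiers registered with gmap_register_pte_notifier().
+ */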
diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c
index a8a6765f1..adb0c34bf 100644
--- a/arch/s390/mm/gup.c
+++ b/arch/s390/mm/gup.c
@@ -128,6 +128,44 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr,
return 1;
}
+static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
+ unsigned long end, int write, struct page **pages, int *nr)
+{
+ struct page *head, *page;
+ unsigned long mask;
+ int refs;
+
+ mask = (write ? _REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID;
+ if ((pud_val(pud) & mask) != 0)
+ return 0;
+ VM_BUG_ON(!pfn_valid(pud_pfn(pud)));
+
+ refs = 0;
+ head = pud_page(pud);
+ page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON_PAGE(compound_head(page) != head, page);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ if (!page_cache_add_speculative(head, refs)) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pud_val(pud) != pud_val(*pudp))) {
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ return 1;
+}
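+
+/*
+ * Note: gup_huge_pud() follows the usual fast-gup pattern: take a
+ * speculative reference on the head page first, then re-read the pud and
+ * back out if it changed underneath us.
+ */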
+
static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
unsigned long end, int write, struct page **pages, int *nr)
{
@@ -144,7 +182,12 @@ static inline int gup_pud_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr,
next = pud_addr_end(addr, end);
if (pud_none(pud))
return 0;
- if (!gup_pmd_range(pudp, pud, addr, next, write, pages, nr))
+ if (unlikely(pud_large(pud))) {
+ if (!gup_huge_pud(pudp, pud, addr, next, write, pages,
+ nr))
+ return 0;
+ } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages,
+ nr))
return 0;
} while (pudp++, addr = next, addr != end);
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 1b5e8983f..cd404aa39 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -1,19 +1,28 @@
/*
* IBM System z Huge TLB Page Support for Kernel.
*
- * Copyright IBM Corp. 2007
+ * Copyright IBM Corp. 2007,2016
* Author(s): Gerald Schaefer <gerald.schaefer@de.ibm.com>
*/
+#define KMSG_COMPONENT "hugetlb"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
#include <linux/mm.h>
#include <linux/hugetlb.h>
-static inline pmd_t __pte_to_pmd(pte_t pte)
+/*
+ * If the bit selected by single-bit bitmask "a" is set within "x", move
+ * it to the position indicated by single-bit bitmask "b".
+ */
+#define move_set_bit(x, a, b) (((x) & (a)) >> ilog2(a) << ilog2(b))
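+
+/*
+ * For example (illustrative): move_set_bit(0x010, 0x010, 0x400) yields
+ * 0x400, while move_set_bit(0x000, 0x010, 0x400) yields 0.
+ */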
+
+static inline unsigned long __pte_to_rste(pte_t pte)
{
- pmd_t pmd;
+ unsigned long rste;
/*
- * Convert encoding pte bits pmd bits
+ * Convert encoding pte bits pmd / pud bits
* lIR.uswrdy.p dy..R...I...wr
* empty 010.000000.0 -> 00..0...1...00
* prot-none, clean, old 111.000000.1 -> 00..1...1...00
@@ -33,25 +42,40 @@ static inline pmd_t __pte_to_pmd(pte_t pte)
* u unused, l large
*/
if (pte_present(pte)) {
- pmd_val(pmd) = pte_val(pte) & PAGE_MASK;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_READ) >> 4;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_WRITE) >> 4;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_INVALID) >> 5;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_PROTECT);
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_DIRTY) << 10;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_YOUNG) << 10;
- pmd_val(pmd) |= (pte_val(pte) & _PAGE_SOFT_DIRTY) << 13;
+ rste = pte_val(pte) & PAGE_MASK;
+ rste |= move_set_bit(pte_val(pte), _PAGE_READ,
+ _SEGMENT_ENTRY_READ);
+ rste |= move_set_bit(pte_val(pte), _PAGE_WRITE,
+ _SEGMENT_ENTRY_WRITE);
+ rste |= move_set_bit(pte_val(pte), _PAGE_INVALID,
+ _SEGMENT_ENTRY_INVALID);
+ rste |= move_set_bit(pte_val(pte), _PAGE_PROTECT,
+ _SEGMENT_ENTRY_PROTECT);
+ rste |= move_set_bit(pte_val(pte), _PAGE_DIRTY,
+ _SEGMENT_ENTRY_DIRTY);
+ rste |= move_set_bit(pte_val(pte), _PAGE_YOUNG,
+ _SEGMENT_ENTRY_YOUNG);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ rste |= move_set_bit(pte_val(pte), _PAGE_SOFT_DIRTY,
+ _SEGMENT_ENTRY_SOFT_DIRTY);
+#endif
} else
- pmd_val(pmd) = _SEGMENT_ENTRY_INVALID;
- return pmd;
+ rste = _SEGMENT_ENTRY_INVALID;
+ return rste;
}
-static inline pte_t __pmd_to_pte(pmd_t pmd)
+static inline pte_t __rste_to_pte(unsigned long rste)
{
+ int present;
pte_t pte;
+ if ((rste & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ present = pud_present(__pud(rste));
+ else
+ present = pmd_present(__pmd(rste));
+
/*
- * Convert encoding pmd bits pte bits
+ * Convert encoding pmd / pud bits pte bits
* dy..R...I...wr lIR.uswrdy.p
* empty 00..0...1...00 -> 010.000000.0
* prot-none, clean, old 00..1...1...00 -> 111.000000.1
@@ -70,16 +94,25 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
* SW-bits: p present, y young, d dirty, r read, w write, s special,
* u unused, l large
*/
- if (pmd_present(pmd)) {
- pte_val(pte) = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN_LARGE;
+ if (present) {
+ pte_val(pte) = rste & _SEGMENT_ENTRY_ORIGIN_LARGE;
pte_val(pte) |= _PAGE_LARGE | _PAGE_PRESENT;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_READ) << 4;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE) << 4;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_INVALID) << 5;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT);
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) >> 10;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) >> 10;
- pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY) >> 13;
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_READ,
+ _PAGE_READ);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_WRITE,
+ _PAGE_WRITE);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_INVALID,
+ _PAGE_INVALID);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_PROTECT,
+ _PAGE_PROTECT);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_DIRTY,
+ _PAGE_DIRTY);
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_YOUNG,
+ _PAGE_YOUNG);
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ pte_val(pte) |= move_set_bit(rste, _SEGMENT_ENTRY_SOFT_DIRTY,
+ _PAGE_SOFT_DIRTY);
+#endif
} else
pte_val(pte) = _PAGE_INVALID;
return pte;
@@ -88,27 +121,33 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
- pmd_t pmd = __pte_to_pmd(pte);
-
- pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
- *(pmd_t *) ptep = pmd;
+ unsigned long rste = __pte_to_rste(pte);
+
+ /* Set correct table type for 2G hugepages */
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ rste |= _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE;
+ else
+ rste |= _SEGMENT_ENTRY_LARGE;
+ pte_val(*ptep) = rste;
}
pte_t huge_ptep_get(pte_t *ptep)
{
- pmd_t pmd = *(pmd_t *) ptep;
-
- return __pmd_to_pte(pmd);
+ return __rste_to_pte(pte_val(*ptep));
}
pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
+ pte_t pte = huge_ptep_get(ptep);
pmd_t *pmdp = (pmd_t *) ptep;
- pmd_t old;
+ pud_t *pudp = (pud_t *) ptep;
- old = pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
- return __pmd_to_pte(old);
+ if ((pte_val(*ptep) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3)
+ pudp_xchg_direct(mm, addr, pudp, __pud(_REGION3_ENTRY_EMPTY));
+ else
+ pmdp_xchg_direct(mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY));
+ return pte;
}
pte_t *huge_pte_alloc(struct mm_struct *mm,
@@ -120,8 +159,12 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
pgdp = pgd_offset(mm, addr);
pudp = pud_alloc(mm, pgdp, addr);
- if (pudp)
- pmdp = pmd_alloc(mm, pudp, addr);
+ if (pudp) {
+ if (sz == PUD_SIZE)
+ return (pte_t *) pudp;
+ else if (sz == PMD_SIZE)
+ pmdp = pmd_alloc(mm, pudp, addr);
+ }
return (pte_t *) pmdp;
}
@@ -134,8 +177,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
pgdp = pgd_offset(mm, addr);
if (pgd_present(*pgdp)) {
pudp = pud_offset(pgdp, addr);
- if (pud_present(*pudp))
+ if (pud_present(*pudp)) {
+ if (pud_large(*pudp))
+ return (pte_t *) pudp;
pmdp = pmd_offset(pudp, addr);
+ }
}
return (pte_t *) pmdp;
}
@@ -147,5 +193,34 @@ int pmd_huge(pmd_t pmd)
int pud_huge(pud_t pud)
{
- return 0;
+ return pud_large(pud);
+}
+
+struct page *
+follow_huge_pud(struct mm_struct *mm, unsigned long address,
+ pud_t *pud, int flags)
+{
+ if (flags & FOLL_GET)
+ return NULL;
+
+ return pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+}
+
+static __init int setup_hugepagesz(char *opt)
+{
+ unsigned long size;
+ char *string = opt;
+
+ size = memparse(opt, &opt);
+ if (MACHINE_HAS_EDAT1 && size == PMD_SIZE) {
+ hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
+ } else if (MACHINE_HAS_EDAT2 && size == PUD_SIZE) {
+ hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
+ } else {
+ pr_err("hugepagesz= specifies an unsupported page size %s\n",
+ string);
+ return 0;
+ }
+ return 1;
}
+__setup("hugepagesz=", setup_hugepagesz);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index 2489b2e91..f56a39bd8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -40,7 +40,7 @@
#include <asm/ctl_reg.h>
#include <asm/sclp.h>
-pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((__aligned__(PAGE_SIZE)));
+pgd_t swapper_pg_dir[PTRS_PER_PGD] __section(.bss..swapper_pg_dir);
unsigned long empty_zero_page, zero_page_mask;
EXPORT_SYMBOL(empty_zero_page);
@@ -111,17 +111,16 @@ void __init paging_init(void)
void mark_rodata_ro(void)
{
- /* Text and rodata are already protected. Nothing to do here. */
- pr_info("Write protecting the kernel read-only data: %luk\n",
- ((unsigned long)&_eshared - (unsigned long)&_stext) >> 10);
+ unsigned long size = __end_ro_after_init - __start_ro_after_init;
+
+ set_memory_ro((unsigned long)__start_ro_after_init, size >> PAGE_SHIFT);
+ pr_info("Write protected read-only-after-init data: %luk\n", size >> 10);
}
void __init mem_init(void)
{
- if (MACHINE_HAS_TLB_LC)
- cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
+ cpumask_set_cpu(0, &init_mm.context.cpu_attach_mask);
cpumask_set_cpu(0, mm_cpumask(&init_mm));
- atomic_set(&init_mm.context.attach_count, 1);
set_max_mapnr(max_low_pfn);
high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
diff --git a/arch/s390/mm/page-states.c b/arch/s390/mm/page-states.c
index a90d45e9d..3330ea124 100644
--- a/arch/s390/mm/page-states.c
+++ b/arch/s390/mm/page-states.c
@@ -34,20 +34,25 @@ static int __init cmma(char *str)
}
__setup("cmma=", cmma);
-void __init cmma_init(void)
+static inline int cmma_test_essa(void)
{
register unsigned long tmp asm("0") = 0;
register int rc asm("1") = -EOPNOTSUPP;
- if (!cmma_flag)
- return;
asm volatile(
" .insn rrf,0xb9ab0000,%1,%1,0,0\n"
"0: la %0,0\n"
"1:\n"
EX_TABLE(0b,1b)
: "+&d" (rc), "+&d" (tmp));
- if (rc)
+ return rc;
+}
+
+void __init cmma_init(void)
+{
+ if (!cmma_flag)
+ return;
+ if (cmma_test_essa())
cmma_flag = 0;
}
diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c
index f2a5c29a9..af7cf28cf 100644
--- a/arch/s390/mm/pageattr.c
+++ b/arch/s390/mm/pageattr.c
@@ -10,7 +10,6 @@
#include <asm/pgtable.h>
#include <asm/page.h>
-#if PAGE_DEFAULT_KEY
static inline unsigned long sske_frame(unsigned long addr, unsigned char skey)
{
asm volatile(".insn rrf,0xb22b0000,%[skey],%[addr],9,0"
@@ -22,6 +21,8 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
{
unsigned long boundary, size;
+ if (!PAGE_DEFAULT_KEY)
+ return;
while (start < end) {
if (MACHINE_HAS_EDAT1) {
/* set storage keys for a 1MB frame */
@@ -38,56 +39,256 @@ void __storage_key_init_range(unsigned long start, unsigned long end)
start += PAGE_SIZE;
}
}
-#endif
-static pte_t *walk_page_table(unsigned long addr)
+#ifdef CONFIG_PROC_FS
+atomic_long_t direct_pages_count[PG_DIRECT_MAP_MAX];
+
+void arch_report_meminfo(struct seq_file *m)
{
- pgd_t *pgdp;
- pud_t *pudp;
+ seq_printf(m, "DirectMap4k: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_4K]) << 2);
+ seq_printf(m, "DirectMap1M: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_1M]) << 10);
+ seq_printf(m, "DirectMap2G: %8lu kB\n",
+ atomic_long_read(&direct_pages_count[PG_DIRECT_MAP_2G]) << 21);
+}
+#endif /* CONFIG_PROC_FS */
+
+static void pgt_set(unsigned long *old, unsigned long new, unsigned long addr,
+ unsigned long dtt)
+{
+ unsigned long table, mask;
+
+ mask = 0;
+ if (MACHINE_HAS_EDAT2) {
+ switch (dtt) {
+ case CRDTE_DTT_REGION3:
+ mask = ~(PTRS_PER_PUD * sizeof(pud_t) - 1);
+ break;
+ case CRDTE_DTT_SEGMENT:
+ mask = ~(PTRS_PER_PMD * sizeof(pmd_t) - 1);
+ break;
+ case CRDTE_DTT_PAGE:
+ mask = ~(PTRS_PER_PTE * sizeof(pte_t) - 1);
+ break;
+ }
+ table = (unsigned long)old & mask;
+ crdte(*old, new, table, dtt, addr, S390_lowcore.kernel_asce);
+ } else if (MACHINE_HAS_IDTE) {
+ cspg(old, *old, new);
+ } else {
+ csp((unsigned int *)old + 1, *old, new);
+ }
+}
+
+struct cpa {
+ unsigned int set_ro : 1;
+ unsigned int clear_ro : 1;
+};
+
+static int walk_pte_level(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ pte_t *ptep, new;
+
+ ptep = pte_offset(pmdp, addr);
+ do {
+ if (pte_none(*ptep))
+ return -EINVAL;
+ if (cpa.set_ro)
+ new = pte_wrprotect(*ptep);
+ else if (cpa.clear_ro)
+ new = pte_mkwrite(pte_mkdirty(*ptep));
+ pgt_set((unsigned long *)ptep, pte_val(new), addr, CRDTE_DTT_PAGE);
+ ptep++;
+ addr += PAGE_SIZE;
+ cond_resched();
+ } while (addr < end);
+ return 0;
+}
+
+static int split_pmd_page(pmd_t *pmdp, unsigned long addr)
+{
+ unsigned long pte_addr, prot;
+ pte_t *pt_dir, *ptep;
+ pmd_t new;
+ int i, ro;
+
+ pt_dir = vmem_pte_alloc();
+ if (!pt_dir)
+ return -ENOMEM;
+ pte_addr = pmd_pfn(*pmdp) << PAGE_SHIFT;
+ ro = !!(pmd_val(*pmdp) & _SEGMENT_ENTRY_PROTECT);
+ prot = pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL);
+ ptep = pt_dir;
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte_val(*ptep) = pte_addr | prot;
+ pte_addr += PAGE_SIZE;
+ ptep++;
+ }
+ pmd_val(new) = __pa(pt_dir) | _SEGMENT_ENTRY;
+ pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+ update_page_count(PG_DIRECT_MAP_4K, PTRS_PER_PTE);
+ update_page_count(PG_DIRECT_MAP_1M, -1);
+ return 0;
+}
+
+static void modify_pmd_page(pmd_t *pmdp, unsigned long addr, struct cpa cpa)
+{
+ pmd_t new;
+
+ if (cpa.set_ro)
+ new = pmd_wrprotect(*pmdp);
+ else if (cpa.clear_ro)
+ new = pmd_mkwrite(pmd_mkdirty(*pmdp));
+ pgt_set((unsigned long *)pmdp, pmd_val(new), addr, CRDTE_DTT_SEGMENT);
+}
+
+static int walk_pmd_level(pud_t *pudp, unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ unsigned long next;
pmd_t *pmdp;
- pte_t *ptep;
+ int rc = 0;
- pgdp = pgd_offset_k(addr);
- if (pgd_none(*pgdp))
- return NULL;
- pudp = pud_offset(pgdp, addr);
- if (pud_none(*pudp) || pud_large(*pudp))
- return NULL;
pmdp = pmd_offset(pudp, addr);
- if (pmd_none(*pmdp) || pmd_large(*pmdp))
- return NULL;
- ptep = pte_offset_kernel(pmdp, addr);
- if (pte_none(*ptep))
- return NULL;
- return ptep;
+ do {
+ if (pmd_none(*pmdp))
+ return -EINVAL;
+ next = pmd_addr_end(addr, end);
+ if (pmd_large(*pmdp)) {
+ if (addr & ~PMD_MASK || addr + PMD_SIZE > next) {
+ rc = split_pmd_page(pmdp, addr);
+ if (rc)
+ return rc;
+ continue;
+ }
+ modify_pmd_page(pmdp, addr, cpa);
+ } else {
+ rc = walk_pte_level(pmdp, addr, next, cpa);
+ if (rc)
+ return rc;
+ }
+ pmdp++;
+ addr = next;
+ cond_resched();
+ } while (addr < end);
+ return rc;
}
-static void change_page_attr(unsigned long addr, int numpages,
- pte_t (*set) (pte_t))
+static int split_pud_page(pud_t *pudp, unsigned long addr)
{
- pte_t *ptep;
- int i;
+ unsigned long pmd_addr, prot;
+ pmd_t *pm_dir, *pmdp;
+ pud_t new;
+ int i, ro;
- for (i = 0; i < numpages; i++) {
- ptep = walk_page_table(addr);
- if (WARN_ON_ONCE(!ptep))
- break;
- *ptep = set(*ptep);
- addr += PAGE_SIZE;
+ pm_dir = vmem_pmd_alloc();
+ if (!pm_dir)
+ return -ENOMEM;
+ pmd_addr = pud_pfn(*pudp) << PAGE_SHIFT;
+ ro = !!(pud_val(*pudp) & _REGION_ENTRY_PROTECT);
+ prot = pgprot_val(ro ? SEGMENT_KERNEL_RO : SEGMENT_KERNEL);
+ pmdp = pm_dir;
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_val(*pmdp) = pmd_addr | prot;
+ pmd_addr += PMD_SIZE;
+ pmdp++;
}
- __tlb_flush_kernel();
+ pud_val(new) = __pa(pm_dir) | _REGION3_ENTRY;
+ pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+ update_page_count(PG_DIRECT_MAP_1M, PTRS_PER_PMD);
+ update_page_count(PG_DIRECT_MAP_2G, -1);
+ return 0;
+}
+
+static void modify_pud_page(pud_t *pudp, unsigned long addr, struct cpa cpa)
+{
+ pud_t new;
+
+ if (cpa.set_ro)
+ new = pud_wrprotect(*pudp);
+ else if (cpa.clear_ro)
+ new = pud_mkwrite(pud_mkdirty(*pudp));
+ pgt_set((unsigned long *)pudp, pud_val(new), addr, CRDTE_DTT_REGION3);
+}
+
+static int walk_pud_level(pgd_t *pgd, unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ unsigned long next;
+ pud_t *pudp;
+ int rc = 0;
+
+ pudp = pud_offset(pgd, addr);
+ do {
+ if (pud_none(*pudp))
+ return -EINVAL;
+ next = pud_addr_end(addr, end);
+ if (pud_large(*pudp)) {
+ if (addr & ~PUD_MASK || addr + PUD_SIZE > next) {
+ rc = split_pud_page(pudp, addr);
+ if (rc)
+ break;
+ continue;
+ }
+ modify_pud_page(pudp, addr, cpa);
+ } else {
+ rc = walk_pmd_level(pudp, addr, next, cpa);
+ }
+ pudp++;
+ addr = next;
+ cond_resched();
+ } while (addr < end && !rc);
+ return rc;
+}
+
+static DEFINE_MUTEX(cpa_mutex);
+
+static int change_page_attr(unsigned long addr, unsigned long end,
+ struct cpa cpa)
+{
+ unsigned long next;
+ int rc = -EINVAL;
+ pgd_t *pgdp;
+
+ if (addr == end)
+ return 0;
+ if (end >= MODULES_END)
+ return -EINVAL;
+ mutex_lock(&cpa_mutex);
+ pgdp = pgd_offset_k(addr);
+ do {
+ if (pgd_none(*pgdp))
+ break;
+ next = pgd_addr_end(addr, end);
+ rc = walk_pud_level(pgdp, addr, next, cpa);
+ if (rc)
+ break;
+ cond_resched();
+ } while (pgdp++, addr = next, addr < end && !rc);
+ mutex_unlock(&cpa_mutex);
+ return rc;
}
int set_memory_ro(unsigned long addr, int numpages)
{
- change_page_attr(addr, numpages, pte_wrprotect);
- return 0;
+ struct cpa cpa = {
+ .set_ro = 1,
+ };
+
+ addr &= PAGE_MASK;
+ return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
}
int set_memory_rw(unsigned long addr, int numpages)
{
- change_page_attr(addr, numpages, pte_mkwrite);
- return 0;
+ struct cpa cpa = {
+ .clear_ro = 1,
+ };
+
+ addr &= PAGE_MASK;
+ return change_page_attr(addr, addr + numpages * PAGE_SIZE, cpa);
}
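A hedged usage sketch (buffer and function names hypothetical): both entry points round the address down to a page boundary, operate on whole pages, and fail with -EINVAL for ranges reaching MODULES_END or beyond:

static char example_buf[PAGE_SIZE] __aligned(PAGE_SIZE);

static int example_toggle_protection(void)
{
	int rc;

	rc = set_memory_ro((unsigned long)example_buf, 1);
	if (rc)
		return rc;
	/* example_buf is now write-protected in the kernel 1:1 mapping */
	return set_memory_rw((unsigned long)example_buf, 1);
}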
/* not possible */
@@ -138,7 +339,7 @@ void __kernel_map_pages(struct page *page, int numpages, int enable)
nr = min(numpages - i, nr);
if (enable) {
for (j = 0; j < nr; j++) {
- pte_val(*pte) = __pa(address);
+ pte_val(*pte) = address | pgprot_val(PAGE_KERNEL);
address += PAGE_SIZE;
pte++;
}
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index e2565d2d0..995f78532 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
return new;
}
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+ struct page *page;
+ unsigned long *table;
+
+ page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+ if (page) {
+ table = (unsigned long *) page_to_phys(page);
+ clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+ clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+ }
+ return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+ __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
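A minimal caller sketch for the new pgste helpers (function name hypothetical), assuming an mm created with the allocate-pgste attribute; the page holds ptes in its lower half and pgstes in its upper half:

static int example_alloc_pgste_table(struct mm_struct *mm)
{
	struct page *page;

	page = page_table_alloc_pgste(mm);
	if (!page)
		return -ENOMEM;
	/* install and use the table here, then tear it down again */
	page_table_free_pgste(page);
	return 0;
}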
+
/*
* page table entry allocation/free routines.
*/
@@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Try to get a fragment of a 4K page as a 2K page table */
if (!mm_alloc_pgste(mm)) {
table = NULL;
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
@@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
list_del(&page->lru);
}
}
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (table)
return table;
}
@@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Return the first 2K fragment of the page */
atomic_set(&page->_mapcount, 1);
clear_table(table, _PAGE_INVALID, PAGE_SIZE);
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
list_add(&page->lru, &mm->context.pgtable_list);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
}
return table;
}
@@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (mask != 0)
return;
}
@@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
return;
}
bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
if (mask & 3)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
table = (unsigned long *) (__pa(table) | (1U << bit));
tlb_remove_table(tlb, table);
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ebb4f8711..5f092015a 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -27,40 +27,37 @@
static inline pte_t ptep_flush_direct(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- int active, count;
pte_t old;
old = *ptep;
if (unlikely(pte_val(old) & _PAGE_INVALID))
return old;
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
__ptep_ipte_local(addr, ptep);
else
__ptep_ipte(addr, ptep);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
static inline pte_t ptep_flush_lazy(struct mm_struct *mm,
unsigned long addr, pte_t *ptep)
{
- int active, count;
pte_t old;
old = *ptep;
if (unlikely(pte_val(old) & _PAGE_INVALID))
return old;
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if ((count & 0xffff) <= active) {
+ atomic_inc(&mm->context.flush_count);
+ if (cpumask_equal(&mm->context.cpu_attach_mask,
+ cpumask_of(smp_processor_id()))) {
pte_val(*ptep) |= _PAGE_INVALID;
mm->context.flush_mm = 1;
} else
__ptep_ipte(addr, ptep);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
@@ -70,7 +67,6 @@ static inline pgste_t pgste_get_lock(pte_t *ptep)
#ifdef CONFIG_PGSTE
unsigned long old;
- preempt_disable();
asm(
" lg %0,%2\n"
"0: lgr %1,%0\n"
@@ -93,7 +89,6 @@ static inline void pgste_set_unlock(pte_t *ptep, pgste_t pgste)
: "=Q" (ptep[PTRS_PER_PTE])
: "d" (pgste_val(pgste)), "Q" (ptep[PTRS_PER_PTE])
: "cc", "memory");
- preempt_enable();
#endif
}
@@ -179,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
return pgste;
}
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
- if (pgste_val(pgste) & PGSTE_IN_BIT) {
- pgste_val(pgste) &= ~PGSTE_IN_BIT;
- ptep_notify(mm, addr, ptep);
+ unsigned long bits;
+
+ bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+ if (bits) {
+ pgste_val(pgste) ^= bits;
+ ptep_notify(mm, addr, ptep, bits);
}
#endif
return pgste;
@@ -199,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
}
return pgste;
}
@@ -230,9 +228,11 @@ pte_t ptep_xchg_direct(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
pte_t old;
+ preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
old = ptep_flush_direct(mm, addr, ptep);
ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(ptep_xchg_direct);
@@ -243,9 +243,11 @@ pte_t ptep_xchg_lazy(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
pte_t old;
+ preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
old = ptep_flush_lazy(mm, addr, ptep);
ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(ptep_xchg_lazy);
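The preempt_disable()/preempt_enable() pairs moved out of pgste_get_lock()/pgste_set_unlock() and into the exported entry points; the resulting pattern, sketched here by mirroring what ptep_xchg_lazy() now does internally:

static void example_pte_exchange(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t new)
{
	pgste_t pgste;
	pte_t old;

	preempt_disable();			/* now the caller's duty */
	pgste = ptep_xchg_start(mm, addr, ptep);
	old = ptep_flush_lazy(mm, addr, ptep);
	ptep_xchg_commit(mm, addr, ptep, pgste, old, new);
	preempt_enable();
}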
@@ -256,6 +258,7 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
pte_t old;
+ preempt_disable();
pgste = ptep_xchg_start(mm, addr, ptep);
old = ptep_flush_lazy(mm, addr, ptep);
if (mm_has_pgste(mm)) {
@@ -279,13 +282,13 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
} else {
*ptep = pte;
}
+ preempt_enable();
}
EXPORT_SYMBOL(ptep_modify_prot_commit);
static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- int active, count;
pmd_t old;
old = *pmdp;
@@ -295,36 +298,34 @@ static inline pmd_t pmdp_flush_direct(struct mm_struct *mm,
__pmdp_csp(pmdp);
return old;
}
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if (MACHINE_HAS_TLB_LC && (count & 0xffff) <= active &&
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
__pmdp_idte_local(addr, pmdp);
else
__pmdp_idte(addr, pmdp);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm,
unsigned long addr, pmd_t *pmdp)
{
- int active, count;
pmd_t old;
old = *pmdp;
if (pmd_val(old) & _SEGMENT_ENTRY_INVALID)
return old;
- active = (mm == current->active_mm) ? 1 : 0;
- count = atomic_add_return(0x10000, &mm->context.attach_count);
- if ((count & 0xffff) <= active) {
+ atomic_inc(&mm->context.flush_count);
+ if (cpumask_equal(&mm->context.cpu_attach_mask,
+ cpumask_of(smp_processor_id()))) {
pmd_val(*pmdp) |= _SEGMENT_ENTRY_INVALID;
mm->context.flush_mm = 1;
} else if (MACHINE_HAS_IDTE)
__pmdp_idte(addr, pmdp);
else
__pmdp_csp(pmdp);
- atomic_sub(0x10000, &mm->context.attach_count);
+ atomic_dec(&mm->context.flush_count);
return old;
}
@@ -333,8 +334,10 @@ pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr,
{
pmd_t old;
+ preempt_disable();
old = pmdp_flush_direct(mm, addr, pmdp);
*pmdp = new;
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(pmdp_xchg_direct);
@@ -344,12 +347,53 @@ pmd_t pmdp_xchg_lazy(struct mm_struct *mm, unsigned long addr,
{
pmd_t old;
+ preempt_disable();
old = pmdp_flush_lazy(mm, addr, pmdp);
*pmdp = new;
+ preempt_enable();
return old;
}
EXPORT_SYMBOL(pmdp_xchg_lazy);
+static inline pud_t pudp_flush_direct(struct mm_struct *mm,
+ unsigned long addr, pud_t *pudp)
+{
+ pud_t old;
+
+ old = *pudp;
+ if (pud_val(old) & _REGION_ENTRY_INVALID)
+ return old;
+ if (!MACHINE_HAS_IDTE) {
+ /*
+ * Invalid bit position is the same for pmd and pud, so we can
+ * re-use __pmdp_csp() here
+ */
+ __pmdp_csp((pmd_t *) pudp);
+ return old;
+ }
+ atomic_inc(&mm->context.flush_count);
+ if (MACHINE_HAS_TLB_LC &&
+ cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id())))
+ __pudp_idte_local(addr, pudp);
+ else
+ __pudp_idte(addr, pudp);
+ atomic_dec(&mm->context.flush_count);
+ return old;
+}
+
+pud_t pudp_xchg_direct(struct mm_struct *mm, unsigned long addr,
+ pud_t *pudp, pud_t new)
+{
+ pud_t old;
+
+ preempt_disable();
+ old = pudp_flush_direct(mm, addr, pudp);
+ *pudp = new;
+ preempt_enable();
+ return old;
+}
+EXPORT_SYMBOL(pudp_xchg_direct);
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
@@ -398,20 +442,108 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
pgste_t pgste;
/* the mm_has_pgste() check is done in set_pte_at() */
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
pgste_set_key(ptep, pgste, entry, mm);
pgste = pgste_set_pte(ptep, pgste, entry);
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
}
void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
pgste_t pgste;
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgste_val(pgste) |= PGSTE_IN_BIT;
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
+}
+
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int prot, unsigned long bit)
+{
+ pte_t entry;
+ pgste_t pgste;
+ int pte_i, pte_p;
+
+ pgste = pgste_get_lock(ptep);
+ entry = *ptep;
+ /* Check pte entry after all locks have been acquired */
+ pte_i = pte_val(entry) & _PAGE_INVALID;
+ pte_p = pte_val(entry) & _PAGE_PROTECT;
+ if ((pte_i && (prot != PROT_NONE)) ||
+ (pte_p && (prot & PROT_WRITE))) {
+ pgste_set_unlock(ptep, pgste);
+ return -EAGAIN;
+ }
+ /* Change access rights and set pgste bit */
+ if (prot == PROT_NONE && !pte_i) {
+ ptep_flush_direct(mm, addr, ptep);
+ pgste = pgste_update_all(entry, pgste, mm);
+ pte_val(entry) |= _PAGE_INVALID;
+ }
+ if (prot == PROT_READ && !pte_p) {
+ ptep_flush_direct(mm, addr, ptep);
+ pte_val(entry) &= ~_PAGE_INVALID;
+ pte_val(entry) |= _PAGE_PROTECT;
+ }
+ pgste_val(pgste) |= bit;
+ pgste = pgste_set_pte(ptep, pgste, entry);
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
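A hedged sketch of a shadow-paging caller (name hypothetical; the gmap plumbing around it is assumed): write-protect a guest mapping and request notification, retrying when the current access rights are incompatible:

static int example_write_protect(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep)
{
	/* PROT_READ drops write access, PGSTE_IN_BIT arms notification;
	 * -EAGAIN tells the caller to refault and retry.
	 */
	return ptep_force_prot(mm, addr, ptep, PROT_READ, PGSTE_IN_BIT);
}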
+
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+ pgste_t spgste, tpgste;
+ pte_t spte, tpte;
+ int rc = -EAGAIN;
+
+ if (!(pte_val(*tptep) & _PAGE_INVALID))
+ return 0; /* already shadowed */
+ spgste = pgste_get_lock(sptep);
+ spte = *sptep;
+ if (!(pte_val(spte) & _PAGE_INVALID) &&
+ !((pte_val(spte) & _PAGE_PROTECT) &&
+ !(pte_val(pte) & _PAGE_PROTECT))) {
+ pgste_val(spgste) |= PGSTE_VSIE_BIT;
+ tpgste = pgste_get_lock(tptep);
+ pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT);
+ /* don't touch the storage key - it belongs to parent pgste */
+ tpgste = pgste_set_pte(tptep, tpgste, tpte);
+ pgste_set_unlock(tptep, tpgste);
+ rc = 1;
+ }
+ pgste_set_unlock(sptep, spgste);
+ return rc;
+}
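The return convention is tri-state; a hypothetical caller would handle it roughly like this:

static int example_shadow_one(struct mm_struct *mm, unsigned long saddr,
			      pte_t *sptep, pte_t *tptep, pte_t pte)
{
	int rc = ptep_shadow_pte(mm, saddr, sptep, tptep, pte);

	if (rc < 0)
		return rc;	/* -EAGAIN: source pte not usable, retry */
	/* rc == 0: already shadowed; rc == 1: shadow pte just created */
	return 0;
}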
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+ pgste_t pgste;
+
+ pgste = pgste_get_lock(ptep);
+	/* the caller is responsible for invoking the notifier */
+ ptep_flush_direct(mm, saddr, ptep);
+ /* don't touch the storage key - it belongs to parent pgste */
+ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+ pgste_set_unlock(ptep, pgste);
}
static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
@@ -434,6 +566,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_t pte;
/* Zap unused and logically-zero pages */
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgstev = pgste_val(pgste);
pte = *ptep;
@@ -446,6 +579,7 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
if (reset)
pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
}
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -454,6 +588,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
pgste_t pgste;
/* Clear storage key */
+ preempt_disable();
pgste = pgste_get_lock(ptep);
pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -461,6 +596,7 @@ void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
pgste_set_unlock(ptep, pgste);
+ preempt_enable();
}
/*
@@ -483,7 +619,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
pgste_val(pgste) &= ~PGSTE_UC_BIT;
pte = *ptep;
if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
__ptep_ipte(addr, ptep);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
pte_val(pte) |= _PAGE_PROTECT;
@@ -506,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_t old, new;
pte_t *ptep;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -538,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
-unsigned long get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+/**
+ * cond_set_guest_storage_key - conditionally set a guest storage key
+ * (handling csske)
+ * oldkey will be updated when either mr or mc is set and a pointer is given.
+ *
+ * Returns 0 if a guest's storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc)
+{
+ unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+ int rc;
+
+ /* we can drop the pgste lock between getting and setting the key */
+ if (mr | mc) {
+ rc = get_guest_storage_key(current->mm, addr, &tmp);
+ if (rc)
+ return rc;
+ if (oldkey)
+ *oldkey = tmp;
+ if (!mr)
+ mask |= _PAGE_REFERENCED;
+ if (!mc)
+ mask |= _PAGE_CHANGED;
+ if (!((tmp ^ key) & mask))
+ return 0;
+ }
+ rc = set_guest_storage_key(current->mm, addr, key, nq);
+ return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
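The skip logic compares the old and new key only in the bits the caller did not mark as ignored; spelled out as a hypothetical helper:

static bool example_key_update_needed(unsigned char old, unsigned char new,
				      bool mr, bool mc)
{
	unsigned char mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;

	if (!mr)
		mask |= _PAGE_REFERENCED;
	if (!mc)
		mask |= _PAGE_CHANGED;
	return (old ^ new) & mask;	/* zero -> csske may skip the write */
}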
+
+/**
+ * reset_guest_reference_bit - reset a guest reference bit (rrbe),
+ * returning the reference and changed bit
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
- unsigned char key;
spinlock_t *ptl;
- pgste_t pgste;
+ pgste_t old, new;
pte_t *ptep;
+ int cc = 0;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
- pgste = pgste_get_lock(ptep);
- if (pte_val(*ptep) & _PAGE_INVALID) {
- key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
- key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
- key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
- key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
- } else {
- key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ new = old = pgste_get_lock(ptep);
+ /* Reset guest reference bit only */
+ pgste_val(new) &= ~PGSTE_GR_BIT;
- /* Reflect guest's logical view, not physical */
- if (pgste_val(pgste) & PGSTE_GR_BIT)
- key |= _PAGE_REFERENCED;
- if (pgste_val(pgste) & PGSTE_GC_BIT)
- key |= _PAGE_CHANGED;
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+ /* Merge real referenced bit into host-set */
+ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
}
+ /* Reflect guest's logical view, not physical */
+ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+ /* Changing the guest storage key is considered a change of the page */
+ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+ pgste_val(new) |= PGSTE_UC_BIT;
+
+ pgste_set_unlock(ptep, new);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
+
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key)
+{
+ spinlock_t *ptl;
+ pgste_t pgste;
+ pte_t *ptep;
+ ptep = get_locked_pte(mm, addr, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+
+ pgste = pgste_get_lock(ptep);
+ *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+ if (!(pte_val(*ptep) & _PAGE_INVALID))
+ *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ /* Reflect guest's logical view, not physical */
+ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
pgste_set_unlock(ptep, pgste);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
- return key;
+ return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);
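With errors now reported via the return value rather than encoded into the key, a minimal caller sketch (name hypothetical) looks like:

static int example_read_guest_key(struct mm_struct *mm, unsigned long addr)
{
	unsigned char key;

	if (get_guest_storage_key(mm, addr, &key))
		return -EFAULT;
	/* key holds ACC/FP plus the guest's logical R and C bits */
	return key;
}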
#endif
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index d48cf25cf..184829276 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -11,6 +11,7 @@
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/memblock.h>
+#include <asm/cacheflush.h>
#include <asm/pgalloc.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
@@ -29,9 +30,11 @@ static LIST_HEAD(mem_segs);
static void __ref *vmem_alloc_pages(unsigned int order)
{
+ unsigned long size = PAGE_SIZE << order;
+
if (slab_is_available())
return (void *)__get_free_pages(GFP_KERNEL, order);
- return alloc_bootmem_pages((1 << order) * PAGE_SIZE);
+ return alloc_bootmem_align(size, size);
}
static inline pud_t *vmem_pud_alloc(void)
@@ -45,7 +48,7 @@ static inline pud_t *vmem_pud_alloc(void)
return pud;
}
-static inline pmd_t *vmem_pmd_alloc(void)
+pmd_t *vmem_pmd_alloc(void)
{
pmd_t *pmd = NULL;
@@ -56,7 +59,7 @@ static inline pmd_t *vmem_pmd_alloc(void)
return pmd;
}
-static pte_t __ref *vmem_pte_alloc(void)
+pte_t __ref *vmem_pte_alloc(void)
{
pte_t *pte;
@@ -75,8 +78,9 @@ static pte_t __ref *vmem_pte_alloc(void)
/*
* Add a physical memory range to the 1:1 mapping.
*/
-static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
+static int vmem_add_mem(unsigned long start, unsigned long size)
{
+ unsigned long pages4k, pages1m, pages2g;
unsigned long end = start + size;
unsigned long address = start;
pgd_t *pg_dir;
@@ -85,6 +89,7 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
pte_t *pt_dir;
int ret = -ENOMEM;
+ pages4k = pages1m = pages2g = 0;
while (address < end) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
@@ -97,10 +102,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
if (MACHINE_HAS_EDAT2 && pud_none(*pu_dir) && address &&
!(address & ~PUD_MASK) && (address + PUD_SIZE <= end) &&
!debug_pagealloc_enabled()) {
- pud_val(*pu_dir) = __pa(address) |
- _REGION_ENTRY_TYPE_R3 | _REGION3_ENTRY_LARGE |
- (ro ? _REGION_ENTRY_PROTECT : 0);
+ pud_val(*pu_dir) = address | pgprot_val(REGION3_KERNEL);
address += PUD_SIZE;
+ pages2g++;
continue;
}
if (pud_none(*pu_dir)) {
@@ -113,11 +117,9 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
if (MACHINE_HAS_EDAT1 && pmd_none(*pm_dir) && address &&
!(address & ~PMD_MASK) && (address + PMD_SIZE <= end) &&
!debug_pagealloc_enabled()) {
- pmd_val(*pm_dir) = __pa(address) |
- _SEGMENT_ENTRY | _SEGMENT_ENTRY_LARGE |
- _SEGMENT_ENTRY_YOUNG |
- (ro ? _SEGMENT_ENTRY_PROTECT : 0);
+ pmd_val(*pm_dir) = address | pgprot_val(SEGMENT_KERNEL);
address += PMD_SIZE;
+ pages1m++;
continue;
}
if (pmd_none(*pm_dir)) {
@@ -128,12 +130,15 @@ static int vmem_add_mem(unsigned long start, unsigned long size, int ro)
}
pt_dir = pte_offset_kernel(pm_dir, address);
- pte_val(*pt_dir) = __pa(address) |
- pgprot_val(ro ? PAGE_KERNEL_RO : PAGE_KERNEL);
+ pte_val(*pt_dir) = address | pgprot_val(PAGE_KERNEL);
address += PAGE_SIZE;
+ pages4k++;
}
ret = 0;
out:
+ update_page_count(PG_DIRECT_MAP_4K, pages4k);
+ update_page_count(PG_DIRECT_MAP_1M, pages1m);
+ update_page_count(PG_DIRECT_MAP_2G, pages2g);
return ret;
}
@@ -143,15 +148,15 @@ out:
*/
static void vmem_remove_range(unsigned long start, unsigned long size)
{
+ unsigned long pages4k, pages1m, pages2g;
unsigned long end = start + size;
unsigned long address = start;
pgd_t *pg_dir;
pud_t *pu_dir;
pmd_t *pm_dir;
pte_t *pt_dir;
- pte_t pte;
- pte_val(pte) = _PAGE_INVALID;
+ pages4k = pages1m = pages2g = 0;
while (address < end) {
pg_dir = pgd_offset_k(address);
if (pgd_none(*pg_dir)) {
@@ -166,6 +171,7 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
if (pud_large(*pu_dir)) {
pud_clear(pu_dir);
address += PUD_SIZE;
+ pages2g++;
continue;
}
pm_dir = pmd_offset(pu_dir, address);
@@ -176,13 +182,18 @@ static void vmem_remove_range(unsigned long start, unsigned long size)
if (pmd_large(*pm_dir)) {
pmd_clear(pm_dir);
address += PMD_SIZE;
+ pages1m++;
continue;
}
pt_dir = pte_offset_kernel(pm_dir, address);
- *pt_dir = pte;
+ pte_clear(&init_mm, address, pt_dir);
address += PAGE_SIZE;
+ pages4k++;
}
flush_tlb_kernel_range(start, end);
+ update_page_count(PG_DIRECT_MAP_4K, -pages4k);
+ update_page_count(PG_DIRECT_MAP_1M, -pages1m);
+ update_page_count(PG_DIRECT_MAP_2G, -pages2g);
}
/*
@@ -341,7 +352,7 @@ int vmem_add_mapping(unsigned long start, unsigned long size)
if (ret)
goto out_free;
- ret = vmem_add_mem(start, size, 0);
+ ret = vmem_add_mem(start, size);
if (ret)
goto out_remove;
goto out;
@@ -362,31 +373,13 @@ out:
*/
void __init vmem_map_init(void)
{
- unsigned long ro_start, ro_end;
+ unsigned long size = _eshared - _stext;
struct memblock_region *reg;
- phys_addr_t start, end;
- ro_start = PFN_ALIGN((unsigned long)&_stext);
- ro_end = (unsigned long)&_eshared & PAGE_MASK;
- for_each_memblock(memory, reg) {
- start = reg->base;
- end = reg->base + reg->size;
- if (start >= ro_end || end <= ro_start)
- vmem_add_mem(start, end - start, 0);
- else if (start >= ro_start && end <= ro_end)
- vmem_add_mem(start, end - start, 1);
- else if (start >= ro_start) {
- vmem_add_mem(start, ro_end - start, 1);
- vmem_add_mem(ro_end, end - ro_end, 0);
- } else if (end < ro_end) {
- vmem_add_mem(start, ro_start - start, 0);
- vmem_add_mem(ro_start, end - ro_start, 1);
- } else {
- vmem_add_mem(start, ro_start - start, 0);
- vmem_add_mem(ro_start, ro_end - ro_start, 1);
- vmem_add_mem(ro_end, end - ro_end, 0);
- }
- }
+ for_each_memblock(memory, reg)
+ vmem_add_mem(reg->base, reg->size);
+ set_memory_ro((unsigned long)_stext, size >> PAGE_SHIFT);
+ pr_info("Write protected kernel read-only data: %luk\n", size >> 10);
}
/*
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
index 828d0695d..37e0bb835 100644
--- a/arch/s390/numa/mode_emu.c
+++ b/arch/s390/numa/mode_emu.c
@@ -34,7 +34,8 @@
#define DIST_CORE 1
#define DIST_MC 2
#define DIST_BOOK 3
-#define DIST_MAX 4
+#define DIST_DRAWER 4
+#define DIST_MAX 5
/* Node distance reported to common code */
#define EMU_NODE_DIST 10
@@ -43,7 +44,7 @@
#define NODE_ID_FREE -1
/* Different levels of toptree */
-enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY};
+enum toptree_level {CORE, MC, BOOK, DRAWER, NODE, TOPOLOGY};
/* The two toptree IDs */
enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA};
@@ -114,6 +115,14 @@ static int cores_free(struct toptree *tree)
*/
static struct toptree *core_node(struct toptree *core)
{
+ return core->parent->parent->parent->parent;
+}
+
+/*
+ * Return drawer of core
+ */
+static struct toptree *core_drawer(struct toptree *core)
+{
return core->parent->parent->parent;
}
@@ -138,6 +147,8 @@ static struct toptree *core_mc(struct toptree *core)
*/
static int dist_core_to_core(struct toptree *core1, struct toptree *core2)
{
+ if (core_drawer(core1)->id != core_drawer(core2)->id)
+ return DIST_DRAWER;
if (core_book(core1)->id != core_book(core2)->id)
return DIST_BOOK;
if (core_mc(core1)->id != core_mc(core2)->id)
@@ -262,6 +273,8 @@ static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys)
struct toptree *core;
/* Always try to move perfectly fitting structures first */
+ move_level_to_numa(numa, phys, DRAWER, true);
+ move_level_to_numa(numa, phys, DRAWER, false);
move_level_to_numa(numa, phys, BOOK, true);
move_level_to_numa(numa, phys, BOOK, false);
move_level_to_numa(numa, phys, MC, true);
@@ -335,7 +348,7 @@ static struct toptree *toptree_to_numa(struct toptree *phys)
*/
static struct toptree *toptree_from_topology(void)
{
- struct toptree *phys, *node, *book, *mc, *core;
+ struct toptree *phys, *node, *drawer, *book, *mc, *core;
struct cpu_topology_s390 *top;
int cpu;
@@ -344,10 +357,11 @@ static struct toptree *toptree_from_topology(void)
for_each_online_cpu(cpu) {
top = &per_cpu(cpu_topology, cpu);
node = toptree_get_child(phys, 0);
- book = toptree_get_child(node, top->book_id);
+ drawer = toptree_get_child(node, top->drawer_id);
+ book = toptree_get_child(drawer, top->book_id);
mc = toptree_get_child(book, top->socket_id);
core = toptree_get_child(mc, top->core_id);
- if (!book || !mc || !core)
+ if (!drawer || !book || !mc || !core)
panic("NUMA emulation could not allocate memory");
cpumask_set_cpu(cpu, &core->mask);
toptree_update_mask(mc);
@@ -368,6 +382,7 @@ static void topology_add_core(struct toptree *core)
cpumask_copy(&top->thread_mask, &core->mask);
cpumask_copy(&top->core_mask, &core_mc(core)->mask);
cpumask_copy(&top->book_mask, &core_book(core)->mask);
+ cpumask_copy(&top->drawer_mask, &core_drawer(core)->mask);
cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]);
top->node_id = core_node(core)->id;
}
@@ -467,8 +482,12 @@ static int emu_setup_nodes_adjust(int nodes)
*/
static void emu_setup(void)
{
+ int nid;
+
emu_size = emu_setup_size_adjust(emu_size);
emu_nodes = emu_setup_nodes_adjust(emu_nodes);
+ for (nid = 0; nid < emu_nodes; nid++)
+ node_set(nid, node_possible_map);
pr_info("Creating %d nodes with memory stripe size %ld MB\n",
emu_nodes, emu_size >> 20);
}
diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c
index 279484506..f576f1073 100644
--- a/arch/s390/numa/numa.c
+++ b/arch/s390/numa/numa.c
@@ -26,8 +26,14 @@ EXPORT_SYMBOL(node_data);
cpumask_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);
+static void plain_setup(void)
+{
+ node_set(0, node_possible_map);
+}
+
const struct numa_mode numa_mode_plain = {
.name = "plain",
+ .setup = plain_setup,
};
static const struct numa_mode *mode = &numa_mode_plain;
@@ -126,13 +132,13 @@ static void __init numa_setup_memory(void)
void __init numa_setup(void)
{
pr_info("NUMA mode: %s\n", mode->name);
+ nodes_clear(node_possible_map);
if (mode->setup)
mode->setup();
numa_setup_memory();
memblock_dump_all();
}
-
/*
* numa_init_early() - Initialization initcall
*
diff --git a/arch/s390/oprofile/Makefile b/arch/s390/oprofile/Makefile
index 496e4a7ee..e9dd41b0b 100644
--- a/arch/s390/oprofile/Makefile
+++ b/arch/s390/oprofile/Makefile
@@ -7,4 +7,3 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
timer_int.o )
oprofile-y := $(DRIVER_OBJS) init.o
-oprofile-y += hwsampler.o
diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c
index 791935a65..16f4c3960 100644
--- a/arch/s390/oprofile/init.c
+++ b/arch/s390/oprofile/init.c
@@ -10,488 +10,8 @@
*/
#include <linux/oprofile.h>
-#include <linux/perf_event.h>
#include <linux/init.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/module.h>
#include <asm/processor.h>
-#include <asm/perf_event.h>
-
-#include "../../../drivers/oprofile/oprof.h"
-
-#include "hwsampler.h"
-#include "op_counter.h"
-
-#define DEFAULT_INTERVAL 4127518
-
-#define DEFAULT_SDBT_BLOCKS 1
-#define DEFAULT_SDB_BLOCKS 511
-
-static unsigned long oprofile_hw_interval = DEFAULT_INTERVAL;
-static unsigned long oprofile_min_interval;
-static unsigned long oprofile_max_interval;
-
-static unsigned long oprofile_sdbt_blocks = DEFAULT_SDBT_BLOCKS;
-static unsigned long oprofile_sdb_blocks = DEFAULT_SDB_BLOCKS;
-
-static int hwsampler_enabled;
-static int hwsampler_running; /* start_mutex must be held to change */
-static int hwsampler_available;
-
-static struct oprofile_operations timer_ops;
-
-struct op_counter_config counter_config;
-
-enum __force_cpu_type {
- reserved = 0, /* do not force */
- timer,
-};
-static int force_cpu_type;
-
-static int set_cpu_type(const char *str, struct kernel_param *kp)
-{
- if (!strcmp(str, "timer")) {
- force_cpu_type = timer;
- printk(KERN_INFO "oprofile: forcing timer to be returned "
- "as cpu type\n");
- } else {
- force_cpu_type = 0;
- }
-
- return 0;
-}
-module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0);
-MODULE_PARM_DESC(cpu_type, "Force legacy basic mode sampling"
- "(report cpu_type \"timer\"");
-
-static int __oprofile_hwsampler_start(void)
-{
- int retval;
-
- retval = hwsampler_allocate(oprofile_sdbt_blocks, oprofile_sdb_blocks);
- if (retval)
- return retval;
-
- retval = hwsampler_start_all(oprofile_hw_interval);
- if (retval)
- hwsampler_deallocate();
-
- return retval;
-}
-
-static int oprofile_hwsampler_start(void)
-{
- int retval;
-
- hwsampler_running = hwsampler_enabled;
-
- if (!hwsampler_running)
- return timer_ops.start();
-
- retval = perf_reserve_sampling();
- if (retval)
- return retval;
-
- retval = __oprofile_hwsampler_start();
- if (retval)
- perf_release_sampling();
-
- return retval;
-}
-
-static void oprofile_hwsampler_stop(void)
-{
- if (!hwsampler_running) {
- timer_ops.stop();
- return;
- }
-
- hwsampler_stop_all();
- hwsampler_deallocate();
- perf_release_sampling();
- return;
-}
-
-/*
- * File ops used for:
- * /dev/oprofile/0/enabled
- * /dev/oprofile/hwsampling/hwsampler (cpu_type = timer)
- */
-
-static ssize_t hwsampler_read(struct file *file, char __user *buf,
- size_t count, loff_t *offset)
-{
- return oprofilefs_ulong_to_user(hwsampler_enabled, buf, count, offset);
-}
-
-static ssize_t hwsampler_write(struct file *file, char const __user *buf,
- size_t count, loff_t *offset)
-{
- unsigned long val;
- int retval;
-
- if (*offset)
- return -EINVAL;
-
- retval = oprofilefs_ulong_from_user(&val, buf, count);
- if (retval <= 0)
- return retval;
-
- if (val != 0 && val != 1)
- return -EINVAL;
-
- if (oprofile_started)
- /*
- * save to do without locking as we set
- * hwsampler_running in start() when start_mutex is
- * held
- */
- return -EBUSY;
-
- hwsampler_enabled = val;
-
- return count;
-}
-
-static const struct file_operations hwsampler_fops = {
- .read = hwsampler_read,
- .write = hwsampler_write,
-};
-
-/*
- * File ops used for:
- * /dev/oprofile/0/count
- * /dev/oprofile/hwsampling/hw_interval (cpu_type = timer)
- *
- * Make sure that the value is within the hardware range.
- */
-
-static ssize_t hw_interval_read(struct file *file, char __user *buf,
- size_t count, loff_t *offset)
-{
- return oprofilefs_ulong_to_user(oprofile_hw_interval, buf,
- count, offset);
-}
-
-static ssize_t hw_interval_write(struct file *file, char const __user *buf,
- size_t count, loff_t *offset)
-{
- unsigned long val;
- int retval;
-
- if (*offset)
- return -EINVAL;
- retval = oprofilefs_ulong_from_user(&val, buf, count);
- if (retval <= 0)
- return retval;
- if (val < oprofile_min_interval)
- oprofile_hw_interval = oprofile_min_interval;
- else if (val > oprofile_max_interval)
- oprofile_hw_interval = oprofile_max_interval;
- else
- oprofile_hw_interval = val;
-
- return count;
-}
-
-static const struct file_operations hw_interval_fops = {
- .read = hw_interval_read,
- .write = hw_interval_write,
-};
-
-/*
- * File ops used for:
- * /dev/oprofile/0/event
- * Only a single event with number 0 is supported with this counter.
- *
- * /dev/oprofile/0/unit_mask
- * This is a dummy file needed by the user space tools.
- * No value other than 0 is accepted or returned.
- */
-
-static ssize_t hwsampler_zero_read(struct file *file, char __user *buf,
- size_t count, loff_t *offset)
-{
- return oprofilefs_ulong_to_user(0, buf, count, offset);
-}
-
-static ssize_t hwsampler_zero_write(struct file *file, char const __user *buf,
- size_t count, loff_t *offset)
-{
- unsigned long val;
- int retval;
-
- if (*offset)
- return -EINVAL;
-
- retval = oprofilefs_ulong_from_user(&val, buf, count);
- if (retval <= 0)
- return retval;
- if (val != 0)
- return -EINVAL;
- return count;
-}
-
-static const struct file_operations zero_fops = {
- .read = hwsampler_zero_read,
- .write = hwsampler_zero_write,
-};
-
-/* /dev/oprofile/0/kernel file ops. */
-
-static ssize_t hwsampler_kernel_read(struct file *file, char __user *buf,
- size_t count, loff_t *offset)
-{
- return oprofilefs_ulong_to_user(counter_config.kernel,
- buf, count, offset);
-}
-
-static ssize_t hwsampler_kernel_write(struct file *file, char const __user *buf,
- size_t count, loff_t *offset)
-{
- unsigned long val;
- int retval;
-
- if (*offset)
- return -EINVAL;
-
- retval = oprofilefs_ulong_from_user(&val, buf, count);
- if (retval <= 0)
- return retval;
-
- if (val != 0 && val != 1)
- return -EINVAL;
-
- counter_config.kernel = val;
-
- return count;
-}
-
-static const struct file_operations kernel_fops = {
- .read = hwsampler_kernel_read,
- .write = hwsampler_kernel_write,
-};
-
-/* /dev/oprofile/0/user file ops. */
-
-static ssize_t hwsampler_user_read(struct file *file, char __user *buf,
- size_t count, loff_t *offset)
-{
- return oprofilefs_ulong_to_user(counter_config.user,
- buf, count, offset);
-}
-
-static ssize_t hwsampler_user_write(struct file *file, char const __user *buf,
- size_t count, loff_t *offset)
-{
- unsigned long val;
- int retval;
-
- if (*offset)
- return -EINVAL;
-
- retval = oprofilefs_ulong_from_user(&val, buf, count);
- if (retval <= 0)
- return retval;
-
- if (val != 0 && val != 1)
- return -EINVAL;
-
- counter_config.user = val;
-
- return count;
-}
-
-static const struct file_operations user_fops = {
- .read = hwsampler_user_read,
- .write = hwsampler_user_write,
-};
-
-
-/*
- * File ops used for: /dev/oprofile/timer/enabled
- * The value always has to be the inverted value of hwsampler_enabled. So
- * no separate variable is created. That way we do not need locking.
- */
-
-static ssize_t timer_enabled_read(struct file *file, char __user *buf,
- size_t count, loff_t *offset)
-{
- return oprofilefs_ulong_to_user(!hwsampler_enabled, buf, count, offset);
-}
-
-static ssize_t timer_enabled_write(struct file *file, char const __user *buf,
- size_t count, loff_t *offset)
-{
- unsigned long val;
- int retval;
-
- if (*offset)
- return -EINVAL;
-
- retval = oprofilefs_ulong_from_user(&val, buf, count);
- if (retval <= 0)
- return retval;
-
- if (val != 0 && val != 1)
- return -EINVAL;
-
- /* Timer cannot be disabled without having hardware sampling. */
- if (val == 0 && !hwsampler_available)
- return -EINVAL;
-
- if (oprofile_started)
- /*
- * save to do without locking as we set
- * hwsampler_running in start() when start_mutex is
- * held
- */
- return -EBUSY;
-
- hwsampler_enabled = !val;
-
- return count;
-}
-
-static const struct file_operations timer_enabled_fops = {
- .read = timer_enabled_read,
- .write = timer_enabled_write,
-};
-
-
-static int oprofile_create_hwsampling_files(struct dentry *root)
-{
- struct dentry *dir;
-
- dir = oprofilefs_mkdir(root, "timer");
- if (!dir)
- return -EINVAL;
-
- oprofilefs_create_file(dir, "enabled", &timer_enabled_fops);
-
- if (!hwsampler_available)
- return 0;
-
- /* reinitialize default values */
- hwsampler_enabled = 1;
- counter_config.kernel = 1;
- counter_config.user = 1;
-
- if (!force_cpu_type) {
- /*
- * Create the counter file system. A single virtual
- * counter is created which can be used to
- * enable/disable hardware sampling dynamically from
- * user space. The user space will configure a single
- * counter with a single event. The value of 'event'
- * and 'unit_mask' are not evaluated by the kernel code
- * and can only be set to 0.
- */
-
- dir = oprofilefs_mkdir(root, "0");
- if (!dir)
- return -EINVAL;
-
- oprofilefs_create_file(dir, "enabled", &hwsampler_fops);
- oprofilefs_create_file(dir, "event", &zero_fops);
- oprofilefs_create_file(dir, "count", &hw_interval_fops);
- oprofilefs_create_file(dir, "unit_mask", &zero_fops);
- oprofilefs_create_file(dir, "kernel", &kernel_fops);
- oprofilefs_create_file(dir, "user", &user_fops);
- oprofilefs_create_ulong(dir, "hw_sdbt_blocks",
- &oprofile_sdbt_blocks);
-
- } else {
- /*
- * Hardware sampling can be used but the cpu_type is
- * forced to timer in order to deal with legacy user
- * space tools. The /dev/oprofile/hwsampling fs is
- * provided in that case.
- */
- dir = oprofilefs_mkdir(root, "hwsampling");
- if (!dir)
- return -EINVAL;
-
- oprofilefs_create_file(dir, "hwsampler",
- &hwsampler_fops);
- oprofilefs_create_file(dir, "hw_interval",
- &hw_interval_fops);
- oprofilefs_create_ro_ulong(dir, "hw_min_interval",
- &oprofile_min_interval);
- oprofilefs_create_ro_ulong(dir, "hw_max_interval",
- &oprofile_max_interval);
- oprofilefs_create_ulong(dir, "hw_sdbt_blocks",
- &oprofile_sdbt_blocks);
- }
- return 0;
-}
-
-static int oprofile_hwsampler_init(struct oprofile_operations *ops)
-{
- /*
- * Initialize the timer mode infrastructure as well in order
- * to be able to switch back dynamically. oprofile_timer_init
- * is not supposed to fail.
- */
- if (oprofile_timer_init(ops))
- BUG();
-
- memcpy(&timer_ops, ops, sizeof(timer_ops));
- ops->create_files = oprofile_create_hwsampling_files;
-
- /*
- * If the user space tools do not support newer cpu types,
- * the force_cpu_type module parameter
- * can be used to always return \"timer\" as cpu type.
- */
- if (force_cpu_type != timer) {
- struct cpuid id;
-
- get_cpu_id (&id);
-
- switch (id.machine) {
- case 0x2097: case 0x2098: ops->cpu_type = "s390/z10"; break;
- case 0x2817: case 0x2818: ops->cpu_type = "s390/z196"; break;
- case 0x2827: case 0x2828: ops->cpu_type = "s390/zEC12"; break;
- case 0x2964: case 0x2965: ops->cpu_type = "s390/z13"; break;
- default: return -ENODEV;
- }
- }
-
- if (hwsampler_setup())
- return -ENODEV;
-
- /*
- * Query the range for the sampling interval from the
- * hardware.
- */
- oprofile_min_interval = hwsampler_query_min_interval();
- if (oprofile_min_interval == 0)
- return -ENODEV;
- oprofile_max_interval = hwsampler_query_max_interval();
- if (oprofile_max_interval == 0)
- return -ENODEV;
-
- /* The initial value should be sane */
- if (oprofile_hw_interval < oprofile_min_interval)
- oprofile_hw_interval = oprofile_min_interval;
- if (oprofile_hw_interval > oprofile_max_interval)
- oprofile_hw_interval = oprofile_max_interval;
-
- printk(KERN_INFO "oprofile: System z hardware sampling "
- "facility found.\n");
-
- ops->start = oprofile_hwsampler_start;
- ops->stop = oprofile_hwsampler_stop;
-
- return 0;
-}
-
-static void oprofile_hwsampler_exit(void)
-{
- hwsampler_shutdown();
-}
static int __s390_backtrace(void *data, unsigned long address)
{
@@ -514,18 +34,9 @@ static void s390_backtrace(struct pt_regs *regs, unsigned int depth)
int __init oprofile_arch_init(struct oprofile_operations *ops)
{
ops->backtrace = s390_backtrace;
-
- /*
- * -ENODEV is not reported to the caller. The module itself
- * will use the timer mode sampling as fallback and this is
- * always available.
- */
- hwsampler_available = oprofile_hwsampler_init(ops) == 0;
-
return 0;
}
void oprofile_arch_exit(void)
{
- oprofile_hwsampler_exit();
}
diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index 1ea8c07ea..7297fce9b 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -226,7 +226,8 @@ static unsigned long __dma_alloc_iommu(struct device *dev,
boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
PAGE_SIZE) >> PAGE_SHIFT;
return iommu_area_alloc(zdev->iommu_bitmap, zdev->iommu_pages,
- start, size, 0, boundary_size, 0);
+ start, size, zdev->start_dma >> PAGE_SHIFT,
+ boundary_size, 0);
}
static unsigned long dma_alloc_iommu(struct device *dev, int size)
@@ -285,7 +286,7 @@ static inline void zpci_err_dma(unsigned long rc, unsigned long addr)
static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page,
unsigned long offset, size_t size,
enum dma_data_direction direction,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
unsigned long nr_pages, iommu_page_index;
@@ -331,7 +332,7 @@ out_err:
static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr,
size_t size, enum dma_data_direction direction,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
unsigned long iommu_page_index;
@@ -354,7 +355,7 @@ static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr,
static void *s390_dma_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t flag,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
struct page *page;
@@ -369,7 +370,7 @@ static void *s390_dma_alloc(struct device *dev, size_t size,
pa = page_to_phys(page);
memset((void *) pa, 0, size);
- map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, NULL);
+ map = s390_dma_map_pages(dev, page, 0, size, DMA_BIDIRECTIONAL, 0);
if (dma_mapping_error(dev, map)) {
free_pages(pa, get_order(size));
return NULL;
@@ -383,19 +384,19 @@ static void *s390_dma_alloc(struct device *dev, size_t size,
static void s390_dma_free(struct device *dev, size_t size,
void *pa, dma_addr_t dma_handle,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct zpci_dev *zdev = to_zpci(to_pci_dev(dev));
size = PAGE_ALIGN(size);
atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages);
- s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
+ s390_dma_unmap_pages(dev, dma_handle, size, DMA_BIDIRECTIONAL, 0);
free_pages((unsigned long) pa, get_order(size));
}
static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
int nr_elements, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
int mapped_elements = 0;
struct scatterlist *s;
@@ -404,7 +405,7 @@ static int s390_dma_map_sg(struct device *dev, struct scatterlist *sg,
for_each_sg(sg, s, nr_elements, i) {
struct page *page = sg_page(s);
s->dma_address = s390_dma_map_pages(dev, page, s->offset,
- s->length, dir, NULL);
+ s->length, dir, 0);
if (!dma_mapping_error(dev, s->dma_address)) {
s->dma_length = s->length;
mapped_elements++;
@@ -418,7 +419,7 @@ unmap:
for_each_sg(sg, s, mapped_elements, i) {
if (s->dma_address)
s390_dma_unmap_pages(dev, s->dma_address, s->dma_length,
- dir, NULL);
+ dir, 0);
s->dma_address = 0;
s->dma_length = 0;
}
@@ -428,13 +429,14 @@ unmap:
static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
int nr_elements, enum dma_data_direction dir,
- struct dma_attrs *attrs)
+ unsigned long attrs)
{
struct scatterlist *s;
int i;
for_each_sg(sg, s, nr_elements, i) {
- s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, dir, NULL);
+ s390_dma_unmap_pages(dev, s->dma_address, s->dma_length, dir,
+ 0);
s->dma_address = 0;
s->dma_length = 0;
}
@@ -469,6 +471,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev)
* Also set zdev->end_dma to the actual end address of the usable
* range, instead of the theoretical maximum as reported by hardware.
*/
+ zdev->start_dma = PAGE_ALIGN(zdev->start_dma);
zdev->iommu_size = min3((u64) high_memory,
ZPCI_TABLE_SIZE_RT - zdev->start_dma,
zdev->end_dma - zdev->start_dma + 1);
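Passing zdev->start_dma >> PAGE_SHIFT as the start offset keeps iommu_area_alloc() from handing out iommu addresses below the usable DMA window, and the PAGE_ALIGN() added above guards against a firmware-reported start_dma that is not page aligned. A sketch with a hypothetical value:

static unsigned long example_first_iommu_page(struct zpci_dev *zdev)
{
	/* e.g. start_dma == 0x10080 rounds up to 0x11000, i.e. page 0x11 */
	return PAGE_ALIGN(zdev->start_dma) >> PAGE_SHIFT;
}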
diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c
index fb2a9a560..c2b27ad8e 100644
--- a/arch/s390/pci/pci_event.c
+++ b/arch/s390/pci/pci_event.c
@@ -145,8 +145,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf)
default:
break;
}
- if (pdev)
- pci_dev_put(pdev);
+ pci_dev_put(pdev);
}
void zpci_event_availability(void *data)
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index 10ca15dca..fa8d7d4b9 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -99,7 +99,7 @@ void zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc)
}
/* PCI Load */
-static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status)
+static inline int ____pcilg(u64 *data, u64 req, u64 offset, u8 *status)
{
register u64 __req asm("2") = req;
register u64 __offset asm("3") = offset;
@@ -116,6 +116,16 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status)
: "d" (__offset)
: "cc");
*status = __req >> 24 & 0xff;
+ *data = __data;
+ return cc;
+}
+
+static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status)
+{
+ u64 __data;
+ int cc;
+
+ cc = ____pcilg(&__data, req, offset, status);
if (!cc)
*data = __data;
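The split leaves __pcilg() with its old store-only-on-success contract while ____pcilg() stores the loaded value unconditionally; a hypothetical caller that wants the raw value even for a nonzero condition code:

static int example_pcilg_raw(u64 *data, u64 req, u64 offset)
{
	u8 status;

	/* *data is written even when cc != 0 */
	return ____pcilg(data, req, offset, &status);
}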
diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c
index e2660d278..fe4e6c910 100644
--- a/arch/s390/tools/gen_facilities.c
+++ b/arch/s390/tools/gen_facilities.c
@@ -39,7 +39,6 @@ static void print_facility_list(struct facility_def *def)
printf("#define %s ", def->name);
for (i = 0; i <= high; i++)
printf("_AC(0x%016llx,UL)%c", array[i], i < high ? ',' : '\n');
- printf("#define %s_DWORDS %d\n", def->name, high + 1);
free(array);
}