diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2015-08-05 17:04:01 -0300 |
commit | 57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch) | |
tree | 5e910f0e82173f4ef4f51111366a3f1299037a7b /kernel/bpf |
Initial import
Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/Makefile | 2 | ||||
-rw-r--r-- | kernel/bpf/arraymap.c | 156 | ||||
-rw-r--r-- | kernel/bpf/core.c | 674 | ||||
-rw-r--r-- | kernel/bpf/hashtab.c | 367 | ||||
-rw-r--r-- | kernel/bpf/helpers.c | 113 | ||||
-rw-r--r-- | kernel/bpf/syscall.c | 621 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 2146 |
7 files changed, 4079 insertions, 0 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile new file mode 100644 index 000000000..e6983be12 --- /dev/null +++ b/kernel/bpf/Makefile @@ -0,0 +1,2 @@ +obj-y := core.o +obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c new file mode 100644 index 000000000..8a6616583 --- /dev/null +++ b/kernel/bpf/arraymap.c @@ -0,0 +1,156 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/err.h> +#include <linux/vmalloc.h> +#include <linux/slab.h> +#include <linux/mm.h> + +struct bpf_array { + struct bpf_map map; + u32 elem_size; + char value[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + struct bpf_array *array; + u32 elem_size, array_size; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + elem_size = round_up(attr->value_size, 8); + + /* check round_up into zero and u32 overflow */ + if (elem_size == 0 || + attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + return ERR_PTR(-ENOMEM); + + array_size = sizeof(*array) + attr->max_entries * elem_size; + + /* allocate all map elements and zero-initialize them */ + array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); + if (!array) { + array = vzalloc(array_size); + if (!array) + return ERR_PTR(-ENOMEM); + } + + /* copy mandatory map attributes */ + array->map.key_size = attr->key_size; + array->map.value_size = attr->value_size; + array->map.max_entries = attr->max_entries; + + array->elem_size = elem_size; + + return &array->map; +} + +/* Called from syscall or from eBPF program */ +static void *array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (index >= array->map.max_entries) + return NULL; + + return array->value + array->elem_size * index; +} + +/* Called from syscall */ +static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + u32 *next = (u32 *)next_key; + + if (index >= array->map.max_entries) { + *next = 0; + return 0; + } + + if (index == array->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + if (index >= array->map.max_entries) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (map_flags == BPF_NOEXIST) + /* all elements already exist */ + return -EEXIST; + + memcpy(array->value + array->elem_size * index, value, array->elem_size); + return 0; +} + +/* Called from syscall or from eBPF program */ +static int array_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void array_map_free(struct bpf_map *map) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding programs to complete + * and free the array + */ + synchronize_rcu(); + + kvfree(array); +} + +static const struct bpf_map_ops array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list array_type __read_mostly = { + .ops = &array_ops, + .type = BPF_MAP_TYPE_ARRAY, +}; + +static int __init register_array_map(void) +{ + bpf_register_map_type(&array_type); + return 0; +} +late_initcall(register_array_map); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c new file mode 100644 index 000000000..54f0e7fcd --- /dev/null +++ b/kernel/bpf/core.c @@ -0,0 +1,674 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Based on the design of the Berkeley Packet Filter. The new + * internal format has been designed by PLUMgrid: + * + * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com + * + * Authors: + * + * Jay Schulist <jschlst@samba.org> + * Alexei Starovoitov <ast@plumgrid.com> + * Daniel Borkmann <dborkman@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in bpf_check_classic() + */ + +#include <linux/filter.h> +#include <linux/skbuff.h> +#include <linux/vmalloc.h> +#include <linux/random.h> +#include <linux/moduleloader.h> +#include <asm/unaligned.h> +#include <linux/bpf.h> + +/* Registers */ +#define BPF_R0 regs[BPF_REG_0] +#define BPF_R1 regs[BPF_REG_1] +#define BPF_R2 regs[BPF_REG_2] +#define BPF_R3 regs[BPF_REG_3] +#define BPF_R4 regs[BPF_REG_4] +#define BPF_R5 regs[BPF_REG_5] +#define BPF_R6 regs[BPF_REG_6] +#define BPF_R7 regs[BPF_REG_7] +#define BPF_R8 regs[BPF_REG_8] +#define BPF_R9 regs[BPF_REG_9] +#define BPF_R10 regs[BPF_REG_10] + +/* Named registers */ +#define DST regs[insn->dst_reg] +#define SRC regs[insn->src_reg] +#define FP regs[BPF_REG_FP] +#define ARG1 regs[BPF_REG_ARG1] +#define CTX regs[BPF_REG_CTX] +#define IMM insn->imm + +/* No hurry in this branch + * + * Exported for the bpf jit load helper. + */ +void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb_network_header(skb) + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb_mac_header(skb) + k - SKF_LL_OFF; + if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb)) + return ptr; + + return NULL; +} + +struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog_aux *aux; + struct bpf_prog *fp; + + size = round_up(size, PAGE_SIZE); + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp == NULL) + return NULL; + + aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + if (aux == NULL) { + vfree(fp); + return NULL; + } + + fp->pages = size / PAGE_SIZE; + fp->aux = aux; + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_alloc); + +struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, + gfp_t gfp_extra_flags) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | + gfp_extra_flags; + struct bpf_prog *fp; + + BUG_ON(fp_old == NULL); + + size = round_up(size, PAGE_SIZE); + if (size <= fp_old->pages * PAGE_SIZE) + return fp_old; + + fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + if (fp != NULL) { + memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); + fp->pages = size / PAGE_SIZE; + + /* We keep fp->aux from fp_old around in the new + * reallocated structure. + */ + fp_old->aux = NULL; + __bpf_prog_free(fp_old); + } + + return fp; +} +EXPORT_SYMBOL_GPL(bpf_prog_realloc); + +void __bpf_prog_free(struct bpf_prog *fp) +{ + kfree(fp->aux); + vfree(fp); +} +EXPORT_SYMBOL_GPL(__bpf_prog_free); + +#ifdef CONFIG_BPF_JIT +struct bpf_binary_header * +bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, + unsigned int alignment, + bpf_jit_fill_hole_t bpf_fill_ill_insns) +{ + struct bpf_binary_header *hdr; + unsigned int size, hole, start; + + /* Most of BPF filters are really small, but if some of them + * fill a page, allow at least 128 extra bytes to insert a + * random section of illegal instructions. + */ + size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); + hdr = module_alloc(size); + if (hdr == NULL) + return NULL; + + /* Fill space with illegal/arch-dep instructions. */ + bpf_fill_ill_insns(hdr, size); + + hdr->pages = size / PAGE_SIZE; + hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), + PAGE_SIZE - sizeof(*hdr)); + start = (prandom_u32() % hole) & ~(alignment - 1); + + /* Leave a random number of instructions before BPF code. */ + *image_ptr = &hdr->image[start]; + + return hdr; +} + +void bpf_jit_binary_free(struct bpf_binary_header *hdr) +{ + module_memfree(hdr); +} +#endif /* CONFIG_BPF_JIT */ + +/* Base function for offset calculation. Needs to go into .text section, + * therefore keeping it non-static as well; will also be used by JITs + * anyway later on, so do not let the compiler omit it. + */ +noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return 0; +} + +/** + * __bpf_prog_run - run eBPF program on a given context + * @ctx: is the data we are operating on + * @insn: is the array of eBPF instructions + * + * Decode and execute eBPF instructions. + */ +static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +{ + u64 stack[MAX_BPF_STACK / sizeof(u64)]; + u64 regs[MAX_BPF_REG], tmp; + static const void *jumptable[256] = { + [0 ... 255] = &&default_label, + /* Now overwrite non-defaults ... */ + /* 32 bit ALU operations */ + [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, + [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, + [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, + [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, + [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, + [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, + [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, + [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, + [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, + [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, + [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, + [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, + [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, + [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, + [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, + [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, + [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, + [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, + [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, + [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, + [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, + [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, + [BPF_ALU | BPF_NEG] = &&ALU_NEG, + [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, + [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, + /* 64 bit ALU operations */ + [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, + [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, + [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, + [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, + [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, + [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, + [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, + [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, + [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, + [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, + [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, + [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, + [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, + [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, + [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, + [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, + [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, + [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, + [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, + [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, + [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, + [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, + [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, + [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, + [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, + /* Call instruction */ + [BPF_JMP | BPF_CALL] = &&JMP_CALL, + /* Jumps */ + [BPF_JMP | BPF_JA] = &&JMP_JA, + [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, + [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, + [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, + [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, + [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, + [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, + [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, + [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, + [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, + [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, + /* Program return */ + [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, + /* Store instructions */ + [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, + [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, + [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, + [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, + [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, + [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, + [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, + [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, + [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, + [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, + /* Load instructions */ + [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, + [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, + [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, + [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, + [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, + [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, + [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, + [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, + [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, + [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, + [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, + }; + void *ptr; + int off; + +#define CONT ({ insn++; goto select_insn; }) +#define CONT_JMP ({ insn++; goto select_insn; }) + + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; + ARG1 = (u64) (unsigned long) ctx; + + /* Registers used in classic BPF programs need to be reset first. */ + regs[BPF_REG_A] = 0; + regs[BPF_REG_X] = 0; + +select_insn: + goto *jumptable[insn->code]; + + /* ALU */ +#define ALU(OPCODE, OP) \ + ALU64_##OPCODE##_X: \ + DST = DST OP SRC; \ + CONT; \ + ALU_##OPCODE##_X: \ + DST = (u32) DST OP (u32) SRC; \ + CONT; \ + ALU64_##OPCODE##_K: \ + DST = DST OP IMM; \ + CONT; \ + ALU_##OPCODE##_K: \ + DST = (u32) DST OP (u32) IMM; \ + CONT; + + ALU(ADD, +) + ALU(SUB, -) + ALU(AND, &) + ALU(OR, |) + ALU(LSH, <<) + ALU(RSH, >>) + ALU(XOR, ^) + ALU(MUL, *) +#undef ALU + ALU_NEG: + DST = (u32) -DST; + CONT; + ALU64_NEG: + DST = -DST; + CONT; + ALU_MOV_X: + DST = (u32) SRC; + CONT; + ALU_MOV_K: + DST = (u32) IMM; + CONT; + ALU64_MOV_X: + DST = SRC; + CONT; + ALU64_MOV_K: + DST = IMM; + CONT; + LD_IMM_DW: + DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; + insn++; + CONT; + ALU64_ARSH_X: + (*(s64 *) &DST) >>= SRC; + CONT; + ALU64_ARSH_K: + (*(s64 *) &DST) >>= IMM; + CONT; + ALU64_MOD_X: + if (unlikely(SRC == 0)) + return 0; + div64_u64_rem(DST, SRC, &tmp); + DST = tmp; + CONT; + ALU_MOD_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + DST = do_div(tmp, (u32) SRC); + CONT; + ALU64_MOD_K: + div64_u64_rem(DST, IMM, &tmp); + DST = tmp; + CONT; + ALU_MOD_K: + tmp = (u32) DST; + DST = do_div(tmp, (u32) IMM); + CONT; + ALU64_DIV_X: + if (unlikely(SRC == 0)) + return 0; + DST = div64_u64(DST, SRC); + CONT; + ALU_DIV_X: + if (unlikely(SRC == 0)) + return 0; + tmp = (u32) DST; + do_div(tmp, (u32) SRC); + DST = (u32) tmp; + CONT; + ALU64_DIV_K: + DST = div64_u64(DST, IMM); + CONT; + ALU_DIV_K: + tmp = (u32) DST; + do_div(tmp, (u32) IMM); + DST = (u32) tmp; + CONT; + ALU_END_TO_BE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_be16(DST); + break; + case 32: + DST = (__force u32) cpu_to_be32(DST); + break; + case 64: + DST = (__force u64) cpu_to_be64(DST); + break; + } + CONT; + ALU_END_TO_LE: + switch (IMM) { + case 16: + DST = (__force u16) cpu_to_le16(DST); + break; + case 32: + DST = (__force u32) cpu_to_le32(DST); + break; + case 64: + DST = (__force u64) cpu_to_le64(DST); + break; + } + CONT; + + /* CALL */ + JMP_CALL: + /* Function call scratches BPF_R1-BPF_R5 registers, + * preserves BPF_R6-BPF_R9, and stores return value + * into BPF_R0. + */ + BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3, + BPF_R4, BPF_R5); + CONT; + + /* JMP */ + JMP_JA: + insn += insn->off; + CONT; + JMP_JEQ_X: + if (DST == SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JEQ_K: + if (DST == IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_X: + if (DST != SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JNE_K: + if (DST != IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_X: + if (DST > SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGT_K: + if (DST > IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_X: + if (DST >= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JGE_K: + if (DST >= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_X: + if (((s64) DST) > ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGT_K: + if (((s64) DST) > ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_X: + if (((s64) DST) >= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSGE_K: + if (((s64) DST) >= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_X: + if (DST & SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSET_K: + if (DST & IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_EXIT: + return BPF_R0; + + /* STX and ST and LDX*/ +#define LDST(SIZEOP, SIZE) \ + STX_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \ + CONT; \ + ST_MEM_##SIZEOP: \ + *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \ + CONT; \ + LDX_MEM_##SIZEOP: \ + DST = *(SIZE *)(unsigned long) (SRC + insn->off); \ + CONT; + + LDST(B, u8) + LDST(H, u16) + LDST(W, u32) + LDST(DW, u64) +#undef LDST + STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + CONT; + STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + CONT; + LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ + off = IMM; +load_word: + /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are + * only appearing in the programs where ctx == + * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] + * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, + * internal BPF verifier will check that BPF_R6 == + * ctx. + * + * BPF_ABS and BPF_IND are wrappers of function calls, + * so they scratch BPF_R1-BPF_R5 registers, preserve + * BPF_R6-BPF_R9, and store return value into BPF_R0. + * + * Implicit input: + * ctx == skb == BPF_R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness + */ + + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be32(ptr); + CONT; + } + + return 0; + LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ + off = IMM; +load_half: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = get_unaligned_be16(ptr); + CONT; + } + + return 0; + LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ + off = IMM; +load_byte: + ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); + if (likely(ptr != NULL)) { + BPF_R0 = *(u8 *)ptr; + CONT; + } + + return 0; + LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_word; + LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ + off = IMM + SRC; + goto load_half; + LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ + off = IMM + SRC; + goto load_byte; + + default_label: + /* If we ever reach this, we have a bug somewhere. */ + WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + return 0; +} + +void __weak bpf_int_jit_compile(struct bpf_prog *prog) +{ +} + +/** + * bpf_prog_select_runtime - select execution runtime for BPF program + * @fp: bpf_prog populated with internal BPF program + * + * try to JIT internal BPF program, if JIT is not available select interpreter + * BPF program will be executed via BPF_PROG_RUN() macro + */ +void bpf_prog_select_runtime(struct bpf_prog *fp) +{ + fp->bpf_func = (void *) __bpf_prog_run; + + /* Probe if internal BPF can be JITed */ + bpf_int_jit_compile(fp); + /* Lock whole bpf_prog as read-only */ + bpf_prog_lock_ro(fp); +} +EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); + +static void bpf_prog_free_deferred(struct work_struct *work) +{ + struct bpf_prog_aux *aux; + + aux = container_of(work, struct bpf_prog_aux, work); + bpf_jit_free(aux->prog); +} + +/* Free internal BPF program */ +void bpf_prog_free(struct bpf_prog *fp) +{ + struct bpf_prog_aux *aux = fp->aux; + + INIT_WORK(&aux->work, bpf_prog_free_deferred); + aux->prog = fp; + schedule_work(&aux->work); +} +EXPORT_SYMBOL_GPL(bpf_prog_free); + +/* Weak definitions of helper functions in case we don't have bpf syscall. */ +const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; +const struct bpf_func_proto bpf_map_update_elem_proto __weak; +const struct bpf_func_proto bpf_map_delete_elem_proto __weak; + +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; + +/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call + * skb_copy_bits(), so provide a weak definition of it for NET-less config. + */ +int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, + int len) +{ + return -EFAULT; +} diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c new file mode 100644 index 000000000..83c209d9b --- /dev/null +++ b/kernel/bpf/hashtab.c @@ -0,0 +1,367 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> + +struct bpf_htab { + struct bpf_map map; + struct hlist_head *buckets; + spinlock_t lock; + u32 count; /* number of elements in this hashtable */ + u32 n_buckets; /* number of hash buckets */ + u32 elem_size; /* size of each element in bytes */ +}; + +/* each htab element is struct htab_elem + key + value */ +struct htab_elem { + struct hlist_node hash_node; + struct rcu_head rcu; + u32 hash; + char key[0] __aligned(8); +}; + +/* Called from syscall */ +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_htab *htab; + int err, i; + + htab = kzalloc(sizeof(*htab), GFP_USER); + if (!htab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + htab->map.key_size = attr->key_size; + htab->map.value_size = attr->value_size; + htab->map.max_entries = attr->max_entries; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + err = -EINVAL; + if (htab->map.max_entries == 0 || htab->map.key_size == 0 || + htab->map.value_size == 0) + goto free_htab; + + /* hash table size must be power of 2 */ + htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); + + err = -E2BIG; + if (htab->map.key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + goto free_htab; + + err = -ENOMEM; + /* prevent zero size kmalloc and check for u32 overflow */ + if (htab->n_buckets == 0 || + htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + goto free_htab; + + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + GFP_USER | __GFP_NOWARN); + + if (!htab->buckets) { + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + if (!htab->buckets) + goto free_htab; + } + + for (i = 0; i < htab->n_buckets; i++) + INIT_HLIST_HEAD(&htab->buckets[i]); + + spin_lock_init(&htab->lock); + htab->count = 0; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + return &htab->map; + +free_htab: + kfree(htab); + return ERR_PTR(err); +} + +static inline u32 htab_map_hash(const void *key, u32 key_len) +{ + return jhash(key, key_len, 0); +} + +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &htab->buckets[hash & (htab->n_buckets - 1)]; +} + +static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, + void *key, u32 key_size) +{ + struct htab_elem *l; + + hlist_for_each_entry_rcu(l, head, hash_node) + if (l->hash == hash && !memcmp(&l->key, key, key_size)) + return l; + + return NULL; +} + +/* Called from syscall or from eBPF program */ +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + u32 hash, key_size; + + /* Must be called with rcu_read_lock. */ + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) + return l->key + round_up(map->key_size, 8); + + return NULL; +} + +/* Called from syscall */ +static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l, *next_l; + u32 hash, key_size; + int i; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + head = select_bucket(htab, hash); + + /* lookup the key */ + l = lookup_elem_raw(head, hash, key, key_size); + + if (!l) { + i = 0; + goto find_first_elem; + } + + /* key was found, get next key in the same bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), + struct htab_elem, hash_node); + + if (next_l) { + /* if next elem in this hash list is non-zero, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + + /* no more elements in this hash list, go to the next bucket */ + i = hash & (htab->n_buckets - 1); + i++; + +find_first_elem: + /* iterate over buckets */ + for (; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + /* pick first element in the bucket */ + next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), + struct htab_elem, hash_node); + if (next_l) { + /* if it's not empty, just return it */ + memcpy(next_key, next_l->key, key_size); + return 0; + } + } + + /* itereated over all buckets and all elements */ + return -ENOENT; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new, *l_old; + struct hlist_head *head; + unsigned long flags; + u32 key_size; + int ret; + + if (map_flags > BPF_EXIST) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* allocate new element outside of lock */ + l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + if (!l_new) + return -ENOMEM; + + key_size = map->key_size; + + memcpy(l_new->key, key, key_size); + memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + + l_new->hash = htab_map_hash(l_new->key, key_size); + + /* bpf_map_update_elem() can be called in_irq() */ + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, l_new->hash); + + l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + + if (!l_old && unlikely(htab->count >= map->max_entries)) { + /* if elem with this 'key' doesn't exist and we've reached + * max_entries limit, fail insertion of new elem + */ + ret = -E2BIG; + goto err; + } + + if (l_old && map_flags == BPF_NOEXIST) { + /* elem already exists */ + ret = -EEXIST; + goto err; + } + + if (!l_old && map_flags == BPF_EXIST) { + /* elem doesn't exist, cannot update it */ + ret = -ENOENT; + goto err; + } + + /* add new element to the head of the list, so that concurrent + * search will find it before old elem + */ + hlist_add_head_rcu(&l_new->hash_node, head); + if (l_old) { + hlist_del_rcu(&l_old->hash_node); + kfree_rcu(l_old, rcu); + } else { + htab->count++; + } + spin_unlock_irqrestore(&htab->lock, flags); + + return 0; +err: + spin_unlock_irqrestore(&htab->lock, flags); + kfree(l_new); + return ret; +} + +/* Called from syscall or from eBPF program */ +static int htab_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_head *head; + struct htab_elem *l; + unsigned long flags; + u32 hash, key_size; + int ret = -ENOENT; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + spin_lock_irqsave(&htab->lock, flags); + + head = select_bucket(htab, hash); + + l = lookup_elem_raw(head, hash, key, key_size); + + if (l) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree_rcu(l, rcu); + ret = 0; + } + + spin_unlock_irqrestore(&htab->lock, flags); + return ret; +} + +static void delete_all_elements(struct bpf_htab *htab) +{ + int i; + + for (i = 0; i < htab->n_buckets; i++) { + struct hlist_head *head = select_bucket(htab, i); + struct hlist_node *n; + struct htab_elem *l; + + hlist_for_each_entry_safe(l, n, head, hash_node) { + hlist_del_rcu(&l->hash_node); + htab->count--; + kfree(l); + } + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + /* some of kfree_rcu() callbacks for elements of this map may not have + * executed. It's ok. Proceed to free residual elements and map itself + */ + delete_all_elements(htab); + kvfree(htab->buckets); + kfree(htab); +} + +static const struct bpf_map_ops htab_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_map_lookup_elem, + .map_update_elem = htab_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_type __read_mostly = { + .ops = &htab_ops, + .type = BPF_MAP_TYPE_HASH, +}; + +static int __init register_htab_map(void) +{ + bpf_register_map_type(&htab_type); + return 0; +} +late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c new file mode 100644 index 000000000..bd7f5988e --- /dev/null +++ b/kernel/bpf/helpers.c @@ -0,0 +1,113 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/rcupdate.h> +#include <linux/random.h> +#include <linux/smp.h> + +/* If kernel subsystem is allowing eBPF programs to call this function, + * inside its own verifier_ops->get_func_proto() callback it should return + * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments + * + * Different map implementations will rely on rcu in map methods + * lookup/update/delete, therefore eBPF programs must run under rcu lock + * if program is allowed to access maps, so check rcu_read_lock_held in + * all three functions. + */ +static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* verifier checked that R1 contains a valid pointer to bpf_map + * and R2 points to a program stack and map->key_size bytes were + * initialized + */ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + value = map->ops->map_lookup_elem(map, key); + + /* lookup() returns either pointer to element value or NULL + * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type + */ + return (unsigned long) value; +} + +const struct bpf_func_proto bpf_map_lookup_elem_proto = { + .func = bpf_map_lookup_elem, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + void *value = (void *) (unsigned long) r3; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_update_elem(map, key, value, r4); +} + +const struct bpf_func_proto bpf_map_update_elem_proto = { + .func = bpf_map_update_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, + .arg3_type = ARG_PTR_TO_MAP_VALUE, + .arg4_type = ARG_ANYTHING, +}; + +static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + void *key = (void *) (unsigned long) r2; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + return map->ops->map_delete_elem(map, key); +} + +const struct bpf_func_proto bpf_map_delete_elem_proto = { + .func = bpf_map_delete_elem, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_KEY, +}; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c new file mode 100644 index 000000000..3bae6c591 --- /dev/null +++ b/kernel/bpf/syscall.c @@ -0,0 +1,621 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/bpf.h> +#include <linux/syscalls.h> +#include <linux/slab.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/license.h> +#include <linux/filter.h> +#include <linux/version.h> + +static LIST_HEAD(bpf_map_types); + +static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) +{ + struct bpf_map_type_list *tl; + struct bpf_map *map; + + list_for_each_entry(tl, &bpf_map_types, list_node) { + if (tl->type == attr->map_type) { + map = tl->ops->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = tl->ops; + map->map_type = attr->map_type; + return map; + } + } + return ERR_PTR(-EINVAL); +} + +/* boot time registration of different map implementations */ +void bpf_register_map_type(struct bpf_map_type_list *tl) +{ + list_add(&tl->list_node, &bpf_map_types); +} + +/* called from workqueue */ +static void bpf_map_free_deferred(struct work_struct *work) +{ + struct bpf_map *map = container_of(work, struct bpf_map, work); + + /* implementation dependent freeing */ + map->ops->map_free(map); +} + +/* decrement map refcnt and schedule it for freeing via workqueue + * (unrelying map implementation ops->map_free() might sleep) + */ +void bpf_map_put(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->refcnt)) { + INIT_WORK(&map->work, bpf_map_free_deferred); + schedule_work(&map->work); + } +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + struct bpf_map *map = filp->private_data; + + bpf_map_put(map); + return 0; +} + +static const struct file_operations bpf_map_fops = { + .release = bpf_map_release, +}; + +/* helper macro to check that unused fields 'union bpf_attr' are zero */ +#define CHECK_ATTR(CMD) \ + memchr_inv((void *) &attr->CMD##_LAST_FIELD + \ + sizeof(attr->CMD##_LAST_FIELD), 0, \ + sizeof(*attr) - \ + offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ + sizeof(attr->CMD##_LAST_FIELD)) != NULL + +#define BPF_MAP_CREATE_LAST_FIELD max_entries +/* called via syscall */ +static int map_create(union bpf_attr *attr) +{ + struct bpf_map *map; + int err; + + err = CHECK_ATTR(BPF_MAP_CREATE); + if (err) + return -EINVAL; + + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ + map = find_and_alloc_map(attr); + if (IS_ERR(map)) + return PTR_ERR(map); + + atomic_set(&map->refcnt, 1); + + err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC); + + if (err < 0) + /* failed to allocate fd */ + goto free_map; + + return err; + +free_map: + map->ops->map_free(map); + return err; +} + +/* if error is returned, fd is released. + * On success caller should complete fd access with matching fdput() + */ +struct bpf_map *bpf_map_get(struct fd f) +{ + struct bpf_map *map; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_map_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + map = f.file->private_data; + + return map; +} + +/* helper to convert user pointers passed inside __aligned_u64 fields */ +static void __user *u64_to_ptr(__u64 val) +{ + return (void __user *) (unsigned long) val; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value + +static int map_lookup_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value, *ptr; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, map->value_size); + rcu_read_unlock(); + + err = -ENOENT; + if (!ptr) + goto free_value; + + err = -EFAULT; + if (copy_to_user(uvalue, value, map->value_size) != 0) + goto free_value; + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags + +static int map_update_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *uvalue = u64_to_ptr(attr->value); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *value; + int err; + + if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); + if (!value) + goto free_key; + + err = -EFAULT; + if (copy_from_user(value, uvalue, map->value_size) != 0) + goto free_value; + + /* eBPF program that use maps are running under rcu_read_lock(), + * therefore all map accessors rely on this fact, so do the same here + */ + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +#define BPF_MAP_DELETE_ELEM_LAST_FIELD key + +static int map_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key; + int err; + + if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_delete_elem(map, key); + rcu_read_unlock(); + +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key + +static int map_get_next_key(union bpf_attr *attr) +{ + void __user *ukey = u64_to_ptr(attr->key); + void __user *unext_key = u64_to_ptr(attr->next_key); + int ufd = attr->map_fd; + struct fd f = fdget(ufd); + struct bpf_map *map; + void *key, *next_key; + int err; + + if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) + return -EINVAL; + + map = bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; + + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + + err = -ENOMEM; + next_key = kmalloc(map->key_size, GFP_USER); + if (!next_key) + goto free_key; + + rcu_read_lock(); + err = map->ops->map_get_next_key(map, key, next_key); + rcu_read_unlock(); + if (err) + goto free_next_key; + + err = -EFAULT; + if (copy_to_user(unext_key, next_key, map->key_size) != 0) + goto free_next_key; + + err = 0; + +free_next_key: + kfree(next_key); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + +static LIST_HEAD(bpf_prog_types); + +static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +{ + struct bpf_prog_type_list *tl; + + list_for_each_entry(tl, &bpf_prog_types, list_node) { + if (tl->type == type) { + prog->aux->ops = tl->ops; + prog->type = type; + return 0; + } + } + + return -EINVAL; +} + +void bpf_register_prog_type(struct bpf_prog_type_list *tl) +{ + list_add(&tl->list_node, &bpf_prog_types); +} + +/* fixup insn->imm field of bpf_call instructions: + * if (insn->imm == BPF_FUNC_map_lookup_elem) + * insn->imm = bpf_map_lookup_elem - __bpf_call_base; + * else if (insn->imm == BPF_FUNC_map_update_elem) + * insn->imm = bpf_map_update_elem - __bpf_call_base; + * else ... + * + * this function is called after eBPF program passed verification + */ +static void fixup_bpf_calls(struct bpf_prog *prog) +{ + const struct bpf_func_proto *fn; + int i; + + for (i = 0; i < prog->len; i++) { + struct bpf_insn *insn = &prog->insnsi[i]; + + if (insn->code == (BPF_JMP | BPF_CALL)) { + /* we reach here when program has bpf_call instructions + * and it passed bpf_check(), means that + * ops->get_func_proto must have been supplied, check it + */ + BUG_ON(!prog->aux->ops->get_func_proto); + + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + BUG_ON(!fn->func); + insn->imm = fn->func - __bpf_call_base; + } + } +} + +/* drop refcnt on maps used by eBPF program and free auxilary data */ +static void free_used_maps(struct bpf_prog_aux *aux) +{ + int i; + + for (i = 0; i < aux->used_map_cnt; i++) + bpf_map_put(aux->used_maps[i]); + + kfree(aux->used_maps); +} + +void bpf_prog_put(struct bpf_prog *prog) +{ + if (atomic_dec_and_test(&prog->aux->refcnt)) { + free_used_maps(prog->aux); + bpf_prog_free(prog); + } +} +EXPORT_SYMBOL_GPL(bpf_prog_put); + +static int bpf_prog_release(struct inode *inode, struct file *filp) +{ + struct bpf_prog *prog = filp->private_data; + + bpf_prog_put(prog); + return 0; +} + +static const struct file_operations bpf_prog_fops = { + .release = bpf_prog_release, +}; + +static struct bpf_prog *get_prog(struct fd f) +{ + struct bpf_prog *prog; + + if (!f.file) + return ERR_PTR(-EBADF); + + if (f.file->f_op != &bpf_prog_fops) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + prog = f.file->private_data; + + return prog; +} + +/* called by sockets/tracing/seccomp before attaching program to an event + * pairs with bpf_prog_put() + */ +struct bpf_prog *bpf_prog_get(u32 ufd) +{ + struct fd f = fdget(ufd); + struct bpf_prog *prog; + + prog = get_prog(f); + + if (IS_ERR(prog)) + return prog; + + atomic_inc(&prog->aux->refcnt); + fdput(f); + return prog; +} +EXPORT_SYMBOL_GPL(bpf_prog_get); + +/* last field in 'union bpf_attr' used by this command */ +#define BPF_PROG_LOAD_LAST_FIELD kern_version + +static int bpf_prog_load(union bpf_attr *attr) +{ + enum bpf_prog_type type = attr->prog_type; + struct bpf_prog *prog; + int err; + char license[128]; + bool is_gpl; + + if (CHECK_ATTR(BPF_PROG_LOAD)) + return -EINVAL; + + /* copy eBPF program license from user space */ + if (strncpy_from_user(license, u64_to_ptr(attr->license), + sizeof(license) - 1) < 0) + return -EFAULT; + license[sizeof(license) - 1] = 0; + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + is_gpl = license_is_gpl_compatible(license); + + if (attr->insn_cnt >= BPF_MAXINSNS) + return -EINVAL; + + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + + /* plain bpf_prog allocation */ + prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); + if (!prog) + return -ENOMEM; + + prog->len = attr->insn_cnt; + + err = -EFAULT; + if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), + prog->len * sizeof(struct bpf_insn)) != 0) + goto free_prog; + + prog->orig_prog = NULL; + prog->jited = false; + + atomic_set(&prog->aux->refcnt, 1); + prog->gpl_compatible = is_gpl; + + /* find program type: socket_filter vs tracing_filter */ + err = find_prog_type(type, prog); + if (err < 0) + goto free_prog; + + /* run eBPF verifier */ + err = bpf_check(&prog, attr); + if (err < 0) + goto free_used_maps; + + /* fixup BPF_CALL->imm field */ + fixup_bpf_calls(prog); + + /* eBPF program is ready to be JITed */ + bpf_prog_select_runtime(prog); + + err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); + if (err < 0) + /* failed to allocate fd */ + goto free_used_maps; + + return err; + +free_used_maps: + free_used_maps(prog->aux); +free_prog: + bpf_prog_free(prog); + return err; +} + +SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +{ + union bpf_attr attr = {}; + int err; + + /* the syscall is limited to root temporarily. This restriction will be + * lifted when security audit is clean. Note that eBPF+tracing must have + * this restriction, since it may pass kernel data to user space + */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!access_ok(VERIFY_READ, uattr, 1)) + return -EFAULT; + + if (size > PAGE_SIZE) /* silly large */ + return -E2BIG; + + /* If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + err = get_user(val, addr); + if (err) + return err; + if (val) + return -E2BIG; + } + size = sizeof(attr); + } + + /* copy attributes from user space, may be less than sizeof(bpf_attr) */ + if (copy_from_user(&attr, uattr, size) != 0) + return -EFAULT; + + switch (cmd) { + case BPF_MAP_CREATE: + err = map_create(&attr); + break; + case BPF_MAP_LOOKUP_ELEM: + err = map_lookup_elem(&attr); + break; + case BPF_MAP_UPDATE_ELEM: + err = map_update_elem(&attr); + break; + case BPF_MAP_DELETE_ELEM: + err = map_delete_elem(&attr); + break; + case BPF_MAP_GET_NEXT_KEY: + err = map_get_next_key(&attr); + break; + case BPF_PROG_LOAD: + err = bpf_prog_load(&attr); + break; + default: + err = -EINVAL; + break; + } + + return err; +} diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c new file mode 100644 index 000000000..47dcd3aa6 --- /dev/null +++ b/kernel/bpf/verifier.c @@ -0,0 +1,2146 @@ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/bpf.h> +#include <linux/filter.h> +#include <net/netlink.h> +#include <linux/file.h> +#include <linux/vmalloc.h> + +/* bpf_check() is a static code analyzer that walks eBPF program + * instruction by instruction and updates register/stack state. + * All paths of conditional branches are analyzed until 'bpf_exit' insn. + * + * The first pass is depth-first-search to check that the program is a DAG. + * It rejects the following programs: + * - larger than BPF_MAXINSNS insns + * - if loop is present (detected via back-edge) + * - unreachable insns exist (shouldn't be a forest. program = one function) + * - out of bounds or malformed jumps + * The second pass is all possible path descent from the 1st insn. + * Since it's analyzing all pathes through the program, the length of the + * analysis is limited to 32k insn, which may be hit even if total number of + * insn is less then 4K, but there are too many branches that change stack/regs. + * Number of 'branches to be analyzed' is limited to 1k + * + * On entry to each instruction, each register has a type, and the instruction + * changes the types of the registers depending on instruction semantics. + * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is + * copied to R1. + * + * All registers are 64-bit. + * R0 - return register + * R1-R5 argument passing registers + * R6-R9 callee saved registers + * R10 - frame pointer read-only + * + * At the start of BPF program the register R1 contains a pointer to bpf_context + * and has type PTR_TO_CTX. + * + * Verifier tracks arithmetic operations on pointers in case: + * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20), + * 1st insn copies R10 (which has FRAME_PTR) type into R1 + * and 2nd arithmetic instruction is pattern matched to recognize + * that it wants to construct a pointer to some element within stack. + * So after 2nd insn, the register R1 has type PTR_TO_STACK + * (and -20 constant is saved for further stack bounds checking). + * Meaning that this reg is a pointer to stack plus known immediate constant. + * + * Most of the time the registers have UNKNOWN_VALUE type, which + * means the register has some value, but it's not a valid pointer. + * (like pointer plus pointer becomes UNKNOWN_VALUE type) + * + * When verifier sees load or store instructions the type of base register + * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer + * types recognized by check_mem_access() function. + * + * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' + * and the range of [ptr, ptr + map's value_size) is accessible. + * + * registers used to pass values to function calls are checked against + * function argument constraints. + * + * ARG_PTR_TO_MAP_KEY is one of such argument constraints. + * It means that the register type passed to this function must be + * PTR_TO_STACK and it will be used inside the function as + * 'pointer to map element key' + * + * For example the argument constraints for bpf_map_lookup_elem(): + * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + * .arg1_type = ARG_CONST_MAP_PTR, + * .arg2_type = ARG_PTR_TO_MAP_KEY, + * + * ret_type says that this function returns 'pointer to map elem value or null' + * function expects 1st argument to be a const pointer to 'struct bpf_map' and + * 2nd argument should be a pointer to stack, which will be used inside + * the helper function as a pointer to map element key. + * + * On the kernel side the helper function looks like: + * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) + * { + * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; + * void *key = (void *) (unsigned long) r2; + * void *value; + * + * here kernel can access 'key' and 'map' pointers safely, knowing that + * [key, key + map->key_size) bytes are valid and were initialized on + * the stack of eBPF program. + * } + * + * Corresponding eBPF program may look like: + * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR + * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK + * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP + * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), + * here verifier looks at prototype of map_lookup_elem() and sees: + * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok, + * Now verifier knows that this map has key of R1->map_ptr->key_size bytes + * + * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far, + * Now verifier checks that [R2, R2 + map's key_size) are within stack limits + * and were initialized prior to this call. + * If it's ok, then verifier allows this BPF_CALL insn and looks at + * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets + * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function + * returns ether pointer to map value or NULL. + * + * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off' + * insn, the register holding that pointer in the true branch changes state to + * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false + * branch. See check_cond_jmp_op(). + * + * After the call R0 is set to return type of the function and registers R1-R5 + * are set to NOT_INIT to indicate that they are no longer readable. + */ + +/* types of values stored in eBPF registers */ +enum bpf_reg_type { + NOT_INIT = 0, /* nothing was written into register */ + UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ + PTR_TO_CTX, /* reg points to bpf_context */ + CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ + PTR_TO_MAP_VALUE, /* reg points to map element value */ + PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ + FRAME_PTR, /* reg == frame_pointer */ + PTR_TO_STACK, /* reg == frame_pointer + imm */ + CONST_IMM, /* constant integer value */ +}; + +struct reg_state { + enum bpf_reg_type type; + union { + /* valid when type == CONST_IMM | PTR_TO_STACK */ + int imm; + + /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | + * PTR_TO_MAP_VALUE_OR_NULL + */ + struct bpf_map *map_ptr; + }; +}; + +enum bpf_stack_slot_type { + STACK_INVALID, /* nothing was stored in this stack slot */ + STACK_SPILL, /* register spilled into stack */ + STACK_MISC /* BPF program wrote some data into this slot */ +}; + +#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + +/* state of the program: + * type of all registers and stack info + */ +struct verifier_state { + struct reg_state regs[MAX_BPF_REG]; + u8 stack_slot_type[MAX_BPF_STACK]; + struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; +}; + +/* linked list of verifier states used to prune search */ +struct verifier_state_list { + struct verifier_state state; + struct verifier_state_list *next; +}; + +/* verifier_state + insn_idx are pushed to stack when branch is encountered */ +struct verifier_stack_elem { + /* verifer state is 'st' + * before processing instruction 'insn_idx' + * and after processing instruction 'prev_insn_idx' + */ + struct verifier_state st; + int insn_idx; + int prev_insn_idx; + struct verifier_stack_elem *next; +}; + +#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ + +/* single container for all structs + * one verifier_env per bpf_check() call + */ +struct verifier_env { + struct bpf_prog *prog; /* eBPF program being verified */ + struct verifier_stack_elem *head; /* stack of verifier states to be processed */ + int stack_size; /* number of states to be processed */ + struct verifier_state cur_state; /* current verifier state */ + struct verifier_state_list **explored_states; /* search pruning optimization */ + struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + u32 used_map_cnt; /* number of used maps */ +}; + +/* verbose verifier prints what it's seeing + * bpf_check() is called under lock, so no race to access these global vars + */ +static u32 log_level, log_size, log_len; +static char *log_buf; + +static DEFINE_MUTEX(bpf_verifier_lock); + +/* log_level controls verbosity level of eBPF verifier. + * verbose() is used to dump the verification trace to the log, so the user + * can figure out what's wrong with the program + */ +static void verbose(const char *fmt, ...) +{ + va_list args; + + if (log_level == 0 || log_len >= log_size - 1) + return; + + va_start(args, fmt); + log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); + va_end(args); +} + +/* string representation of 'enum bpf_reg_type' */ +static const char * const reg_type_str[] = { + [NOT_INIT] = "?", + [UNKNOWN_VALUE] = "inv", + [PTR_TO_CTX] = "ctx", + [CONST_PTR_TO_MAP] = "map_ptr", + [PTR_TO_MAP_VALUE] = "map_value", + [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", + [FRAME_PTR] = "fp", + [PTR_TO_STACK] = "fp", + [CONST_IMM] = "imm", +}; + +static void print_verifier_state(struct verifier_env *env) +{ + enum bpf_reg_type t; + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + t = env->cur_state.regs[i].type; + if (t == NOT_INIT) + continue; + verbose(" R%d=%s", i, reg_type_str[t]); + if (t == CONST_IMM || t == PTR_TO_STACK) + verbose("%d", env->cur_state.regs[i].imm); + else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || + t == PTR_TO_MAP_VALUE_OR_NULL) + verbose("(ks=%d,vs=%d)", + env->cur_state.regs[i].map_ptr->key_size, + env->cur_state.regs[i].map_ptr->value_size); + } + for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { + if (env->cur_state.stack_slot_type[i] == STACK_SPILL) + verbose(" fp%d=%s", -MAX_BPF_STACK + i, + reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); + } + verbose("\n"); +} + +static const char *const bpf_class_string[] = { + [BPF_LD] = "ld", + [BPF_LDX] = "ldx", + [BPF_ST] = "st", + [BPF_STX] = "stx", + [BPF_ALU] = "alu", + [BPF_JMP] = "jmp", + [BPF_RET] = "BUG", + [BPF_ALU64] = "alu64", +}; + +static const char *const bpf_alu_string[] = { + [BPF_ADD >> 4] = "+=", + [BPF_SUB >> 4] = "-=", + [BPF_MUL >> 4] = "*=", + [BPF_DIV >> 4] = "/=", + [BPF_OR >> 4] = "|=", + [BPF_AND >> 4] = "&=", + [BPF_LSH >> 4] = "<<=", + [BPF_RSH >> 4] = ">>=", + [BPF_NEG >> 4] = "neg", + [BPF_MOD >> 4] = "%=", + [BPF_XOR >> 4] = "^=", + [BPF_MOV >> 4] = "=", + [BPF_ARSH >> 4] = "s>>=", + [BPF_END >> 4] = "endian", +}; + +static const char *const bpf_ldst_string[] = { + [BPF_W >> 3] = "u32", + [BPF_H >> 3] = "u16", + [BPF_B >> 3] = "u8", + [BPF_DW >> 3] = "u64", +}; + +static const char *const bpf_jmp_string[] = { + [BPF_JA >> 4] = "jmp", + [BPF_JEQ >> 4] = "==", + [BPF_JGT >> 4] = ">", + [BPF_JGE >> 4] = ">=", + [BPF_JSET >> 4] = "&", + [BPF_JNE >> 4] = "!=", + [BPF_JSGT >> 4] = "s>", + [BPF_JSGE >> 4] = "s>=", + [BPF_CALL >> 4] = "call", + [BPF_EXIT >> 4] = "exit", +}; + +static void print_bpf_insn(struct bpf_insn *insn) +{ + u8 class = BPF_CLASS(insn->code); + + if (class == BPF_ALU || class == BPF_ALU64) { + if (BPF_SRC(insn->code) == BPF_X) + verbose("(%02x) %sr%d %s %sr%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->src_reg); + else + verbose("(%02x) %sr%d %s %s%d\n", + insn->code, class == BPF_ALU ? "(u32) " : "", + insn->dst_reg, + bpf_alu_string[BPF_OP(insn->code) >> 4], + class == BPF_ALU ? "(u32) " : "", + insn->imm); + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_MEM) + verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->src_reg); + else if (BPF_MODE(insn->code) == BPF_XADD) + verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + else + verbose("BUG_%02x\n", insn->code); + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_st_%02x\n", insn->code); + return; + } + verbose("(%02x) *(%s *)(r%d %+d) = %d\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, + insn->off, insn->imm); + } else if (class == BPF_LDX) { + if (BPF_MODE(insn->code) != BPF_MEM) { + verbose("BUG_ldx_%02x\n", insn->code); + return; + } + verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", + insn->code, insn->dst_reg, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->off); + } else if (class == BPF_LD) { + if (BPF_MODE(insn->code) == BPF_ABS) { + verbose("(%02x) r0 = *(%s *)skb[%d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IND) { + verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", + insn->code, + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->src_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM) { + verbose("(%02x) r%d = 0x%x\n", + insn->code, insn->dst_reg, insn->imm); + } else { + verbose("BUG_ld_%02x\n", insn->code); + return; + } + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + verbose("(%02x) call %d\n", insn->code, insn->imm); + } else if (insn->code == (BPF_JMP | BPF_JA)) { + verbose("(%02x) goto pc%+d\n", + insn->code, insn->off); + } else if (insn->code == (BPF_JMP | BPF_EXIT)) { + verbose("(%02x) exit\n", insn->code); + } else if (BPF_SRC(insn->code) == BPF_X) { + verbose("(%02x) if r%d %s r%d goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->src_reg, insn->off); + } else { + verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", + insn->code, insn->dst_reg, + bpf_jmp_string[BPF_OP(insn->code) >> 4], + insn->imm, insn->off); + } + } else { + verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); + } +} + +static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +{ + struct verifier_stack_elem *elem; + int insn_idx; + + if (env->head == NULL) + return -1; + + memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); + insn_idx = env->head->insn_idx; + if (prev_insn_idx) + *prev_insn_idx = env->head->prev_insn_idx; + elem = env->head->next; + kfree(env->head); + env->head = elem; + env->stack_size--; + return insn_idx; +} + +static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, + int prev_insn_idx) +{ + struct verifier_stack_elem *elem; + + elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); + if (!elem) + goto err; + + memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); + elem->insn_idx = insn_idx; + elem->prev_insn_idx = prev_insn_idx; + elem->next = env->head; + env->head = elem; + env->stack_size++; + if (env->stack_size > 1024) { + verbose("BPF program is too complex\n"); + goto err; + } + return &elem->st; +err: + /* pop all elements and return */ + while (pop_stack(env, NULL) >= 0); + return NULL; +} + +#define CALLER_SAVED_REGS 6 +static const int caller_saved[CALLER_SAVED_REGS] = { + BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 +}; + +static void init_reg_state(struct reg_state *regs) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + regs[i].type = NOT_INIT; + regs[i].imm = 0; + regs[i].map_ptr = NULL; + } + + /* frame pointer */ + regs[BPF_REG_FP].type = FRAME_PTR; + + /* 1st arg to a function */ + regs[BPF_REG_1].type = PTR_TO_CTX; +} + +static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +{ + BUG_ON(regno >= MAX_BPF_REG); + regs[regno].type = UNKNOWN_VALUE; + regs[regno].imm = 0; + regs[regno].map_ptr = NULL; +} + +enum reg_arg_type { + SRC_OP, /* register is used as source operand */ + DST_OP, /* register is used as destination operand */ + DST_OP_NO_MARK /* same as above, check only, don't mark */ +}; + +static int check_reg_arg(struct reg_state *regs, u32 regno, + enum reg_arg_type t) +{ + if (regno >= MAX_BPF_REG) { + verbose("R%d is invalid\n", regno); + return -EINVAL; + } + + if (t == SRC_OP) { + /* check whether register used as source operand can be read */ + if (regs[regno].type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + } else { + /* check whether register used as dest operand can be written to */ + if (regno == BPF_REG_FP) { + verbose("frame pointer is read only\n"); + return -EACCES; + } + if (t == DST_OP) + mark_reg_unknown_value(regs, regno); + } + return 0; +} + +static int bpf_size_to_bytes(int bpf_size) +{ + if (bpf_size == BPF_W) + return 4; + else if (bpf_size == BPF_H) + return 2; + else if (bpf_size == BPF_B) + return 1; + else if (bpf_size == BPF_DW) + return 8; + else + return -EINVAL; +} + +/* check_stack_read/write functions track spill/fill of registers, + * stack boundary and alignment are checked in check_mem_access() + */ +static int check_stack_write(struct verifier_state *state, int off, int size, + int value_regno) +{ + int i; + /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, + * so it's aligned access and [off, off + size) are within stack limits + */ + + if (value_regno >= 0 && + (state->regs[value_regno].type == PTR_TO_MAP_VALUE || + state->regs[value_regno].type == PTR_TO_STACK || + state->regs[value_regno].type == PTR_TO_CTX)) { + + /* register containing pointer is being spilled into stack */ + if (size != BPF_REG_SIZE) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + + /* save register state */ + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + state->regs[value_regno]; + + for (i = 0; i < BPF_REG_SIZE; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + } else { + /* regular write of data into stack */ + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = + (struct reg_state) {}; + + for (i = 0; i < size; i++) + state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + } + return 0; +} + +static int check_stack_read(struct verifier_state *state, int off, int size, + int value_regno) +{ + u8 *slot_type; + int i; + + slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + + if (slot_type[0] == STACK_SPILL) { + if (size != BPF_REG_SIZE) { + verbose("invalid size of register spill\n"); + return -EACCES; + } + for (i = 1; i < BPF_REG_SIZE; i++) { + if (slot_type[i] != STACK_SPILL) { + verbose("corrupted spill memory\n"); + return -EACCES; + } + } + + if (value_regno >= 0) + /* restore register state from stack */ + state->regs[value_regno] = + state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; + return 0; + } else { + for (i = 0; i < size; i++) { + if (slot_type[i] != STACK_MISC) { + verbose("invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + } + if (value_regno >= 0) + /* have read misc data from the stack */ + mark_reg_unknown_value(state->regs, value_regno); + return 0; + } +} + +/* check read/write into map element returned by bpf_map_lookup_elem() */ +static int check_map_access(struct verifier_env *env, u32 regno, int off, + int size) +{ + struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + + if (off < 0 || off + size > map->value_size) { + verbose("invalid access to map value, value_size=%d off=%d size=%d\n", + map->value_size, off, size); + return -EACCES; + } + return 0; +} + +/* check access to 'struct bpf_context' fields */ +static int check_ctx_access(struct verifier_env *env, int off, int size, + enum bpf_access_type t) +{ + if (env->prog->aux->ops->is_valid_access && + env->prog->aux->ops->is_valid_access(off, size, t)) + return 0; + + verbose("invalid bpf_context access off=%d size=%d\n", off, size); + return -EACCES; +} + +/* check whether memory at (regno + off) is accessible for t = (read | write) + * if t==write, value_regno is a register which value is stored into memory + * if t==read, value_regno is a register which will receive the value from memory + * if t==write && value_regno==-1, some unknown value is stored into memory + * if t==read && value_regno==-1, don't care what we read from memory + */ +static int check_mem_access(struct verifier_env *env, u32 regno, int off, + int bpf_size, enum bpf_access_type t, + int value_regno) +{ + struct verifier_state *state = &env->cur_state; + int size, err = 0; + + size = bpf_size_to_bytes(bpf_size); + if (size < 0) + return size; + + if (off % size != 0) { + verbose("misaligned access off %d size %d\n", off, size); + return -EACCES; + } + + if (state->regs[regno].type == PTR_TO_MAP_VALUE) { + err = check_map_access(env, regno, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == PTR_TO_CTX) { + err = check_ctx_access(env, off, size, t); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown_value(state->regs, value_regno); + + } else if (state->regs[regno].type == FRAME_PTR) { + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + if (t == BPF_WRITE) + err = check_stack_write(state, off, size, value_regno); + else + err = check_stack_read(state, off, size, value_regno); + } else { + verbose("R%d invalid mem access '%s'\n", + regno, reg_type_str[state->regs[regno].type]); + return -EACCES; + } + return err; +} + +static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || + insn->imm != 0) { + verbose("BPF_XADD uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check whether atomic_add can read the memory */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, -1); + if (err) + return err; + + /* check whether atomic_add can write into the same memory */ + return check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1); +} + +/* when register 'regno' is passed into function that will read 'access_size' + * bytes from that pointer, make sure that it's within stack boundary + * and all elements of stack are initialized + */ +static int check_stack_boundary(struct verifier_env *env, + int regno, int access_size) +{ + struct verifier_state *state = &env->cur_state; + struct reg_state *regs = state->regs; + int off, i; + + if (regs[regno].type != PTR_TO_STACK) + return -EACCES; + + off = regs[regno].imm; + if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || + access_size <= 0) { + verbose("invalid stack type R%d off=%d access_size=%d\n", + regno, off, access_size); + return -EACCES; + } + + for (i = 0; i < access_size; i++) { + if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + verbose("invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; + } + } + return 0; +} + +static int check_func_arg(struct verifier_env *env, u32 regno, + enum bpf_arg_type arg_type, struct bpf_map **mapp) +{ + struct reg_state *reg = env->cur_state.regs + regno; + enum bpf_reg_type expected_type; + int err = 0; + + if (arg_type == ARG_DONTCARE) + return 0; + + if (reg->type == NOT_INIT) { + verbose("R%d !read_ok\n", regno); + return -EACCES; + } + + if (arg_type == ARG_ANYTHING) + return 0; + + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + arg_type == ARG_PTR_TO_MAP_VALUE) { + expected_type = PTR_TO_STACK; + } else if (arg_type == ARG_CONST_STACK_SIZE) { + expected_type = CONST_IMM; + } else if (arg_type == ARG_CONST_MAP_PTR) { + expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; + } else { + verbose("unsupported arg_type %d\n", arg_type); + return -EFAULT; + } + + if (reg->type != expected_type) { + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[reg->type], reg_type_str[expected_type]); + return -EACCES; + } + + if (arg_type == ARG_CONST_MAP_PTR) { + /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ + *mapp = reg->map_ptr; + + } else if (arg_type == ARG_PTR_TO_MAP_KEY) { + /* bpf_map_xxx(..., map_ptr, ..., key) call: + * check that [key, key + map->key_size) are within + * stack limits and initialized + */ + if (!*mapp) { + /* in function declaration map_ptr must come before + * map_key, so that it's verified and known before + * we have to check map_key here. Otherwise it means + * that kernel subsystem misconfigured verifier + */ + verbose("invalid map_ptr to access map->key\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->key_size); + + } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { + /* bpf_map_xxx(..., map_ptr, ..., value) call: + * check [value, value + map->value_size) validity + */ + if (!*mapp) { + /* kernel subsystem misconfigured verifier */ + verbose("invalid map_ptr to access map->value\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno, (*mapp)->value_size); + + } else if (arg_type == ARG_CONST_STACK_SIZE) { + /* bpf_xxx(..., buf, len) call will access 'len' bytes + * from stack pointer 'buf'. Check it + * note: regno == len, regno - 1 == buf + */ + if (regno == 0) { + /* kernel subsystem misconfigured verifier */ + verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); + return -EACCES; + } + err = check_stack_boundary(env, regno - 1, reg->imm); + } + + return err; +} + +static int check_call(struct verifier_env *env, int func_id) +{ + struct verifier_state *state = &env->cur_state; + const struct bpf_func_proto *fn = NULL; + struct reg_state *regs = state->regs; + struct bpf_map *map = NULL; + struct reg_state *reg; + int i, err; + + /* find function prototype */ + if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { + verbose("invalid func %d\n", func_id); + return -EINVAL; + } + + if (env->prog->aux->ops->get_func_proto) + fn = env->prog->aux->ops->get_func_proto(func_id); + + if (!fn) { + verbose("unknown func %d\n", func_id); + return -EINVAL; + } + + /* eBPF programs must be GPL compatible to use GPL-ed functions */ + if (!env->prog->gpl_compatible && fn->gpl_only) { + verbose("cannot call GPL only function from proprietary program\n"); + return -EINVAL; + } + + /* check args */ + err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); + if (err) + return err; + err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); + if (err) + return err; + + /* reset caller saved regs */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* update return register */ + if (fn->ret_type == RET_INTEGER) { + regs[BPF_REG_0].type = UNKNOWN_VALUE; + } else if (fn->ret_type == RET_VOID) { + regs[BPF_REG_0].type = NOT_INIT; + } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; + /* remember map_ptr, so that check_map_access() + * can check 'value_size' boundary of memory access + * to map element returned from bpf_map_lookup_elem() + */ + if (map == NULL) { + verbose("kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + regs[BPF_REG_0].map_ptr = map; + } else { + verbose("unknown return type %d of func %d\n", + fn->ret_type, func_id); + return -EINVAL; + } + return 0; +} + +/* check validity of 32-bit and 64-bit arithmetic operations */ +static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) +{ + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode == BPF_END || opcode == BPF_NEG) { + if (opcode == BPF_NEG) { + if (BPF_SRC(insn->code) != 0 || + insn->src_reg != BPF_REG_0 || + insn->off != 0 || insn->imm != 0) { + verbose("BPF_NEG uses reserved fields\n"); + return -EINVAL; + } + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0 || + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + verbose("BPF_END uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + } else if (opcode == BPF_MOV) { + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_MOV uses reserved fields\n"); + return -EINVAL; + } + } + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (BPF_SRC(insn->code) == BPF_X) { + if (BPF_CLASS(insn->code) == BPF_ALU64) { + /* case: R1 = R2 + * copy register state to dest reg + */ + regs[insn->dst_reg] = regs[insn->src_reg]; + } else { + regs[insn->dst_reg].type = UNKNOWN_VALUE; + regs[insn->dst_reg].map_ptr = NULL; + } + } else { + /* case: R = imm + * remember the value we stored into this reg + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + + } else if (opcode > BPF_END) { + verbose("invalid BPF_ALU opcode %x\n", opcode); + return -EINVAL; + + } else { /* all other ALU ops: and, sub, xor, add, ... */ + + bool stack_relative = false; + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0 || insn->off != 0) { + verbose("BPF_ALU uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + if ((opcode == BPF_MOD || opcode == BPF_DIV) && + BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { + verbose("div by zero\n"); + return -EINVAL; + } + + /* pattern match 'bpf_add Rx, imm' instruction */ + if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && + regs[insn->dst_reg].type == FRAME_PTR && + BPF_SRC(insn->code) == BPF_K) + stack_relative = true; + + /* check dest operand */ + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (stack_relative) { + regs[insn->dst_reg].type = PTR_TO_STACK; + regs[insn->dst_reg].imm = insn->imm; + } + } + + return 0; +} + +static int check_cond_jmp_op(struct verifier_env *env, + struct bpf_insn *insn, int *insn_idx) +{ + struct reg_state *regs = env->cur_state.regs; + struct verifier_state *other_branch; + u8 opcode = BPF_OP(insn->code); + int err; + + if (opcode > BPF_EXIT) { + verbose("invalid BPF_JMP opcode %x\n", opcode); + return -EINVAL; + } + + if (BPF_SRC(insn->code) == BPF_X) { + if (insn->imm != 0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } else { + if (insn->src_reg != BPF_REG_0) { + verbose("BPF_JMP uses reserved fields\n"); + return -EINVAL; + } + } + + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* detect if R == 0 where R was initialized to zero earlier */ + if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE) && + regs[insn->dst_reg].type == CONST_IMM && + regs[insn->dst_reg].imm == insn->imm) { + if (opcode == BPF_JEQ) { + /* if (imm == imm) goto pc+off; + * only follow the goto, ignore fall-through + */ + *insn_idx += insn->off; + return 0; + } else { + /* if (imm != imm) goto pc+off; + * only follow fall-through branch, since + * that's where the program will go + */ + return 0; + } + } + + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + if (!other_branch) + return -EFAULT; + + /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ + if (BPF_SRC(insn->code) == BPF_K && + insn->imm == 0 && (opcode == BPF_JEQ || + opcode == BPF_JNE) && + regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { + if (opcode == BPF_JEQ) { + /* next fallthrough insn can access memory via + * this register + */ + regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + /* branch targer cannot access it, since reg == 0 */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = 0; + } else { + other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = 0; + } + } else if (BPF_SRC(insn->code) == BPF_K && + (opcode == BPF_JEQ || opcode == BPF_JNE)) { + + if (opcode == BPF_JEQ) { + /* detect if (R == imm) goto + * and in the target state recognize that R = imm + */ + other_branch->regs[insn->dst_reg].type = CONST_IMM; + other_branch->regs[insn->dst_reg].imm = insn->imm; + } else { + /* detect if (R != imm) goto + * and in the fall-through state recognize that R = imm + */ + regs[insn->dst_reg].type = CONST_IMM; + regs[insn->dst_reg].imm = insn->imm; + } + } + if (log_level) + print_verifier_state(env); + return 0; +} + +/* return the map pointer stored inside BPF_LD_IMM64 instruction */ +static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) +{ + u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; + + return (struct bpf_map *) (unsigned long) imm64; +} + +/* verify BPF_LD_IMM64 instruction */ +static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + int err; + + if (BPF_SIZE(insn->code) != BPF_DW) { + verbose("invalid BPF_LD_IMM insn\n"); + return -EINVAL; + } + if (insn->off != 0) { + verbose("BPF_LD_IMM64 uses reserved fields\n"); + return -EINVAL; + } + + err = check_reg_arg(regs, insn->dst_reg, DST_OP); + if (err) + return err; + + if (insn->src_reg == 0) + /* generic move 64-bit immediate into a register */ + return 0; + + /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ + BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); + + regs[insn->dst_reg].type = CONST_PTR_TO_MAP; + regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); + return 0; +} + +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + return true; + default: + return false; + } +} + +/* verify safety of LD_ABS|LD_IND instructions: + * - they can only appear in the programs where ctx == skb + * - since they are wrappers of function calls, they scratch R1-R5 registers, + * preserve R6-R9, and store return value into R0 + * + * Implicit input: + * ctx == skb == R6 == CTX + * + * Explicit input: + * SRC == any register + * IMM == 32-bit immediate + * + * Output: + * R0 - 8/16/32-bit skb data converted to cpu endianness + */ +static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) +{ + struct reg_state *regs = env->cur_state.regs; + u8 mode = BPF_MODE(insn->code); + struct reg_state *reg; + int i, err; + + if (!may_access_skb(env->prog->type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); + return -EINVAL; + } + + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || + (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { + verbose("BPF_LD_ABS uses reserved fields\n"); + return -EINVAL; + } + + /* check whether implicit source operand (register R6) is readable */ + err = check_reg_arg(regs, BPF_REG_6, SRC_OP); + if (err) + return err; + + if (regs[BPF_REG_6].type != PTR_TO_CTX) { + verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); + return -EINVAL; + } + + if (mode == BPF_IND) { + /* check explicit source operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + } + + /* reset caller saved regs to unreadable */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + reg = regs + caller_saved[i]; + reg->type = NOT_INIT; + reg->imm = 0; + } + + /* mark destination R0 register as readable, since it contains + * the value fetched from the packet + */ + regs[BPF_REG_0].type = UNKNOWN_VALUE; + return 0; +} + +/* non-recursive DFS pseudo code + * 1 procedure DFS-iterative(G,v): + * 2 label v as discovered + * 3 let S be a stack + * 4 S.push(v) + * 5 while S is not empty + * 6 t <- S.pop() + * 7 if t is what we're looking for: + * 8 return t + * 9 for all edges e in G.adjacentEdges(t) do + * 10 if edge e is already labelled + * 11 continue with the next edge + * 12 w <- G.adjacentVertex(t,e) + * 13 if vertex w is not discovered and not explored + * 14 label e as tree-edge + * 15 label w as discovered + * 16 S.push(w) + * 17 continue at 5 + * 18 else if vertex w is discovered + * 19 label e as back-edge + * 20 else + * 21 // vertex w is explored + * 22 label e as forward- or cross-edge + * 23 label t as explored + * 24 S.pop() + * + * convention: + * 0x10 - discovered + * 0x11 - discovered and fall-through edge labelled + * 0x12 - discovered and fall-through and branch edges labelled + * 0x20 - explored + */ + +enum { + DISCOVERED = 0x10, + EXPLORED = 0x20, + FALLTHROUGH = 1, + BRANCH = 2, +}; + +#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) + +static int *insn_stack; /* stack of insns to process */ +static int cur_stack; /* current stack index */ +static int *insn_state; + +/* t, w, e - match pseudo-code above: + * t - index of current instruction + * w - next instruction + * e - edge + */ +static int push_insn(int t, int w, int e, struct verifier_env *env) +{ + if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) + return 0; + + if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) + return 0; + + if (w < 0 || w >= env->prog->len) { + verbose("jump out of range from insn %d to %d\n", t, w); + return -EINVAL; + } + + if (e == BRANCH) + /* mark branch target for state pruning */ + env->explored_states[w] = STATE_LIST_MARK; + + if (insn_state[w] == 0) { + /* tree-edge */ + insn_state[t] = DISCOVERED | e; + insn_state[w] = DISCOVERED; + if (cur_stack >= env->prog->len) + return -E2BIG; + insn_stack[cur_stack++] = w; + return 1; + } else if ((insn_state[w] & 0xF0) == DISCOVERED) { + verbose("back-edge from insn %d to %d\n", t, w); + return -EINVAL; + } else if (insn_state[w] == EXPLORED) { + /* forward- or cross-edge */ + insn_state[t] = DISCOVERED | e; + } else { + verbose("insn state internal bug\n"); + return -EFAULT; + } + return 0; +} + +/* non-recursive depth-first-search to detect loops in BPF program + * loop == back-edge in directed graph + */ +static int check_cfg(struct verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret = 0; + int i, t; + + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_state) + return -ENOMEM; + + insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); + if (!insn_stack) { + kfree(insn_state); + return -ENOMEM; + } + + insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ + insn_stack[0] = 0; /* 0 is the first instruction */ + cur_stack = 1; + +peek_stack: + if (cur_stack == 0) + goto check_state; + t = insn_stack[cur_stack - 1]; + + if (BPF_CLASS(insns[t].code) == BPF_JMP) { + u8 opcode = BPF_OP(insns[t].code); + + if (opcode == BPF_EXIT) { + goto mark_explored; + } else if (opcode == BPF_CALL) { + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } else if (opcode == BPF_JA) { + if (BPF_SRC(insns[t].code) != BPF_K) { + ret = -EINVAL; + goto err_free; + } + /* unconditional jump with single edge */ + ret = push_insn(t, t + insns[t].off + 1, + FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + /* tell verifier to check for equivalent states + * after every call and jump + */ + if (t + 1 < insn_cnt) + env->explored_states[t + 1] = STATE_LIST_MARK; + } else { + /* conditional jump with two edges */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + + ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + } else { + /* all other non-branch instructions with single + * fall-through edge + */ + ret = push_insn(t, t + 1, FALLTHROUGH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } + +mark_explored: + insn_state[t] = EXPLORED; + if (cur_stack-- <= 0) { + verbose("pop stack internal bug\n"); + ret = -EFAULT; + goto err_free; + } + goto peek_stack; + +check_state: + for (i = 0; i < insn_cnt; i++) { + if (insn_state[i] != EXPLORED) { + verbose("unreachable insn %d\n", i); + ret = -EINVAL; + goto err_free; + } + } + ret = 0; /* cfg looks good */ + +err_free: + kfree(insn_state); + kfree(insn_stack); + return ret; +} + +/* compare two verifier states + * + * all states stored in state_list are known to be valid, since + * verifier reached 'bpf_exit' instruction through them + * + * this function is called when verifier exploring different branches of + * execution popped from the state stack. If it sees an old state that has + * more strict register state and more strict stack state then this execution + * branch doesn't need to be explored further, since verifier already + * concluded that more strict state leads to valid finish. + * + * Therefore two states are equivalent if register state is more conservative + * and explored stack state is more conservative than the current one. + * Example: + * explored current + * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC) + * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC) + * + * In other words if current stack state (one being explored) has more + * valid slots than old one that already passed validation, it means + * the verifier can stop exploring and conclude that current state is valid too + * + * Similarly with registers. If explored state has register type as invalid + * whereas register type in current state is meaningful, it means that + * the current state will reach 'bpf_exit' instruction safely + */ +static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +{ + int i; + + for (i = 0; i < MAX_BPF_REG; i++) { + if (memcmp(&old->regs[i], &cur->regs[i], + sizeof(old->regs[0])) != 0) { + if (old->regs[i].type == NOT_INIT || + (old->regs[i].type == UNKNOWN_VALUE && + cur->regs[i].type != NOT_INIT)) + continue; + return false; + } + } + + for (i = 0; i < MAX_BPF_STACK; i++) { + if (old->stack_slot_type[i] == STACK_INVALID) + continue; + if (old->stack_slot_type[i] != cur->stack_slot_type[i]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], + &cur->spilled_regs[i / BPF_REG_SIZE], + sizeof(old->spilled_regs[0]))) + /* when explored and current stack slot types are + * the same, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} + * but current path has stored: + * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + else + continue; + } + return true; +} + +static int is_state_visited(struct verifier_env *env, int insn_idx) +{ + struct verifier_state_list *new_sl; + struct verifier_state_list *sl; + + sl = env->explored_states[insn_idx]; + if (!sl) + /* this 'insn_idx' instruction wasn't marked, so we will not + * be doing state search here + */ + return 0; + + while (sl != STATE_LIST_MARK) { + if (states_equal(&sl->state, &env->cur_state)) + /* reached equivalent register/stack state, + * prune the search + */ + return 1; + sl = sl->next; + } + + /* there were no equivalent states, remember current one. + * technically the current state is not proven to be safe yet, + * but it will either reach bpf_exit (which means it's safe) or + * it will be rejected. Since there are no loops, we won't be + * seeing this 'insn_idx' instruction again on the way to bpf_exit + */ + new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); + if (!new_sl) + return -ENOMEM; + + /* add new state to the head of linked list */ + memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + new_sl->next = env->explored_states[insn_idx]; + env->explored_states[insn_idx] = new_sl; + return 0; +} + +static int do_check(struct verifier_env *env) +{ + struct verifier_state *state = &env->cur_state; + struct bpf_insn *insns = env->prog->insnsi; + struct reg_state *regs = state->regs; + int insn_cnt = env->prog->len; + int insn_idx, prev_insn_idx = 0; + int insn_processed = 0; + bool do_print_state = false; + + init_reg_state(regs); + insn_idx = 0; + for (;;) { + struct bpf_insn *insn; + u8 class; + int err; + + if (insn_idx >= insn_cnt) { + verbose("invalid insn idx %d insn_cnt %d\n", + insn_idx, insn_cnt); + return -EFAULT; + } + + insn = &insns[insn_idx]; + class = BPF_CLASS(insn->code); + + if (++insn_processed > 32768) { + verbose("BPF program is too large. Proccessed %d insn\n", + insn_processed); + return -E2BIG; + } + + err = is_state_visited(env, insn_idx); + if (err < 0) + return err; + if (err == 1) { + /* found equivalent state, can prune the search */ + if (log_level) { + if (do_print_state) + verbose("\nfrom %d to %d: safe\n", + prev_insn_idx, insn_idx); + else + verbose("%d: safe\n", insn_idx); + } + goto process_bpf_exit; + } + + if (log_level && do_print_state) { + verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); + print_verifier_state(env); + do_print_state = false; + } + + if (log_level) { + verbose("%d: ", insn_idx); + print_bpf_insn(insn); + } + + if (class == BPF_ALU || class == BPF_ALU64) { + err = check_alu_op(regs, insn); + if (err) + return err; + + } else if (class == BPF_LDX) { + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + + /* check src operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + + err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); + if (err) + return err; + + src_reg_type = regs[insn->src_reg].type; + + /* check that memory (src_reg + off) is readable, + * the state of dst_reg will be updated by this func + */ + err = check_mem_access(env, insn->src_reg, insn->off, + BPF_SIZE(insn->code), BPF_READ, + insn->dst_reg); + if (err) + return err; + + if (BPF_SIZE(insn->code) != BPF_W) { + insn_idx++; + continue; + } + + if (insn->imm == 0) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + + } else if (class == BPF_STX) { + if (BPF_MODE(insn->code) == BPF_XADD) { + err = check_xadd(env, insn); + if (err) + return err; + insn_idx++; + continue; + } + + if (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0) { + verbose("BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check src1 operand */ + err = check_reg_arg(regs, insn->src_reg, SRC_OP); + if (err) + return err; + /* check src2 operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + insn->src_reg); + if (err) + return err; + + } else if (class == BPF_ST) { + if (BPF_MODE(insn->code) != BPF_MEM || + insn->src_reg != BPF_REG_0) { + verbose("BPF_ST uses reserved fields\n"); + return -EINVAL; + } + /* check src operand */ + err = check_reg_arg(regs, insn->dst_reg, SRC_OP); + if (err) + return err; + + /* check that memory (dst_reg + off) is writeable */ + err = check_mem_access(env, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, + -1); + if (err) + return err; + + } else if (class == BPF_JMP) { + u8 opcode = BPF_OP(insn->code); + + if (opcode == BPF_CALL) { + if (BPF_SRC(insn->code) != BPF_K || + insn->off != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_CALL uses reserved fields\n"); + return -EINVAL; + } + + err = check_call(env, insn->imm); + if (err) + return err; + + } else if (opcode == BPF_JA) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_JA uses reserved fields\n"); + return -EINVAL; + } + + insn_idx += insn->off + 1; + continue; + + } else if (opcode == BPF_EXIT) { + if (BPF_SRC(insn->code) != BPF_K || + insn->imm != 0 || + insn->src_reg != BPF_REG_0 || + insn->dst_reg != BPF_REG_0) { + verbose("BPF_EXIT uses reserved fields\n"); + return -EINVAL; + } + + /* eBPF calling convetion is such that R0 is used + * to return the value from eBPF program. + * Make sure that it's readable at this time + * of bpf_exit, which means that program wrote + * something into it earlier + */ + err = check_reg_arg(regs, BPF_REG_0, SRC_OP); + if (err) + return err; + +process_bpf_exit: + insn_idx = pop_stack(env, &prev_insn_idx); + if (insn_idx < 0) { + break; + } else { + do_print_state = true; + continue; + } + } else { + err = check_cond_jmp_op(env, insn, &insn_idx); + if (err) + return err; + } + } else if (class == BPF_LD) { + u8 mode = BPF_MODE(insn->code); + + if (mode == BPF_ABS || mode == BPF_IND) { + err = check_ld_abs(env, insn); + if (err) + return err; + + } else if (mode == BPF_IMM) { + err = check_ld_imm(env, insn); + if (err) + return err; + + insn_idx++; + } else { + verbose("invalid BPF_LD mode\n"); + return -EINVAL; + } + } else { + verbose("unknown insn class %d\n", class); + return -EINVAL; + } + + insn_idx++; + } + + return 0; +} + +/* look for pseudo eBPF instructions that access map FDs and + * replace them with actual map pointers + */ +static int replace_map_fd_with_map_ptr(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i, j; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { + struct bpf_map *map; + struct fd f; + + if (i == insn_cnt - 1 || insn[1].code != 0 || + insn[1].dst_reg != 0 || insn[1].src_reg != 0 || + insn[1].off != 0) { + verbose("invalid bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + if (insn->src_reg == 0) + /* valid generic load 64-bit imm */ + goto next_insn; + + if (insn->src_reg != BPF_PSEUDO_MAP_FD) { + verbose("unrecognized bpf_ld_imm64 insn\n"); + return -EINVAL; + } + + f = fdget(insn->imm); + + map = bpf_map_get(f); + if (IS_ERR(map)) { + verbose("fd %d is not pointing to valid bpf_map\n", + insn->imm); + fdput(f); + return PTR_ERR(map); + } + + /* store map pointer inside BPF_LD_IMM64 instruction */ + insn[0].imm = (u32) (unsigned long) map; + insn[1].imm = ((u64) (unsigned long) map) >> 32; + + /* check whether we recorded this map already */ + for (j = 0; j < env->used_map_cnt; j++) + if (env->used_maps[j] == map) { + fdput(f); + goto next_insn; + } + + if (env->used_map_cnt >= MAX_USED_MAPS) { + fdput(f); + return -E2BIG; + } + + /* remember this map */ + env->used_maps[env->used_map_cnt++] = map; + + /* hold the map. If the program is rejected by verifier, + * the map will be released by release_maps() or it + * will be used by the valid program until it's unloaded + * and all maps are released in free_bpf_prog_info() + */ + atomic_inc(&map->refcnt); + + fdput(f); +next_insn: + insn++; + i++; + } + } + + /* now all pseudo BPF_LD_IMM64 instructions load valid + * 'struct bpf_map *' into a register instead of user map_fd. + * These pointers will be used later by verifier to validate map access. + */ + return 0; +} + +/* drop refcnt of maps used by the rejected program */ +static void release_maps(struct verifier_env *env) +{ + int i; + + for (i = 0; i < env->used_map_cnt; i++) + bpf_map_put(env->used_maps[i]); +} + +/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ +static void convert_pseudo_ld_imm64(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) + if (insn->code == (BPF_LD | BPF_IMM | BPF_DW)) + insn->src_reg = 0; +} + +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + /* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + +static void free_states(struct verifier_env *env) +{ + struct verifier_state_list *sl, *sln; + int i; + + if (!env->explored_states) + return; + + for (i = 0; i < env->prog->len; i++) { + sl = env->explored_states[i]; + + if (sl) + while (sl != STATE_LIST_MARK) { + sln = sl->next; + kfree(sl); + sl = sln; + } + } + + kfree(env->explored_states); +} + +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) +{ + char __user *log_ubuf = NULL; + struct verifier_env *env; + int ret = -EINVAL; + + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) + return -E2BIG; + + /* 'struct verifier_env' can be global, but since it's not small, + * allocate/free it every time bpf_check() is called + */ + env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); + if (!env) + return -ENOMEM; + + env->prog = *prog; + + /* grab the mutex to protect few globals used by verifier */ + mutex_lock(&bpf_verifier_lock); + + if (attr->log_level || attr->log_buf || attr->log_size) { + /* user requested verbose verifier output + * and supplied buffer to store the verification trace + */ + log_level = attr->log_level; + log_ubuf = (char __user *) (unsigned long) attr->log_buf; + log_size = attr->log_size; + log_len = 0; + + ret = -EINVAL; + /* log_* values have to be sane */ + if (log_size < 128 || log_size > UINT_MAX >> 8 || + log_level == 0 || log_ubuf == NULL) + goto free_env; + + ret = -ENOMEM; + log_buf = vmalloc(log_size); + if (!log_buf) + goto free_env; + } else { + log_level = 0; + } + + ret = replace_map_fd_with_map_ptr(env); + if (ret < 0) + goto skip_full_check; + + env->explored_states = kcalloc(env->prog->len, + sizeof(struct verifier_state_list *), + GFP_USER); + ret = -ENOMEM; + if (!env->explored_states) + goto skip_full_check; + + ret = check_cfg(env); + if (ret < 0) + goto skip_full_check; + + ret = do_check(env); + +skip_full_check: + while (pop_stack(env, NULL) >= 0); + free_states(env); + + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + + if (log_level && log_len >= log_size - 1) { + BUG_ON(log_len >= log_size); + /* verifier log exceeded user supplied buffer */ + ret = -ENOSPC; + /* fall through to return what was recorded */ + } + + /* copy verifier log back to user space including trailing zero */ + if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { + ret = -EFAULT; + goto free_log_buf; + } + + if (ret == 0 && env->used_map_cnt) { + /* if program passed verifier, update used_maps in bpf_prog_info */ + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); + + if (!env->prog->aux->used_maps) { + ret = -ENOMEM; + goto free_log_buf; + } + + memcpy(env->prog->aux->used_maps, env->used_maps, + sizeof(env->used_maps[0]) * env->used_map_cnt); + env->prog->aux->used_map_cnt = env->used_map_cnt; + + /* program is valid. Convert pseudo bpf_ld_imm64 into generic + * bpf_ld_imm64 instructions + */ + convert_pseudo_ld_imm64(env); + } + +free_log_buf: + if (log_level) + vfree(log_buf); +free_env: + if (!env->prog->aux->used_maps) + /* if we didn't copy map pointers into bpf_prog_info, release + * them now. Otherwise free_bpf_prog_info() will release them. + */ + release_maps(env); + *prog = env->prog; + kfree(env); + mutex_unlock(&bpf_verifier_lock); + return ret; +} |