diff options
Diffstat (limited to 'net/netfilter')
41 files changed, 1161 insertions, 543 deletions
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 85ca189bd..096a45103 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); } +static void ip_vs_conn_expire(unsigned long data); /* * Returns hash value for IPVS connection entry @@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); +static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp) +{ + __ip_vs_conn_put(cp); + ip_vs_conn_expire((unsigned long)cp); +} + /* * Put back the conn and restart its timer with its timeout */ -void ip_vs_conn_put(struct ip_vs_conn *cp) +static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) { unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? 0 : cp->timeout; @@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) __ip_vs_conn_put(cp); } +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && + (atomic_read(&cp->refcnt) == 1) && + !timer_pending(&cp->timer)) + /* expire connection immediately */ + __ip_vs_conn_put_notimer(cp); + else + __ip_vs_conn_put_timer(cp); +} /* * Fill a no_client_port connection with a client port number @@ -745,7 +762,7 @@ static int expire_quiescent_template(struct netns_ipvs *ipvs, * If available, return 1, otherwise invalidate this connection * template and return 0. */ -int ip_vs_check_template(struct ip_vs_conn *ct) +int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest) { struct ip_vs_dest *dest = ct->dest; struct netns_ipvs *ipvs = ct->ipvs; @@ -755,7 +772,8 @@ int ip_vs_check_template(struct ip_vs_conn *ct) */ if ((dest == NULL) || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || - expire_quiescent_template(ipvs, dest)) { + expire_quiescent_template(ipvs, dest) || + (cdest && (dest != cdest))) { IP_VS_DBG_BUF(9, "check_template: dest not available for " "protocol %s s:%s:%d v:%s:%d " "-> d:%s:%d\n", @@ -819,7 +837,8 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); - if (cp->flags & IP_VS_CONN_F_NFCT) { + if ((cp->flags & IP_VS_CONN_F_NFCT) && + !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) { /* Do not access conntracks during subsys cleanup * because nf_conntrack_find_get can not be used after * conntrack cleanup for the net. @@ -834,7 +853,10 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_unbind_dest(cp); if (cp->flags & IP_VS_CONN_F_NO_CPORT) atomic_dec(&ip_vs_conn_no_cport_cnt); - call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + ip_vs_conn_rcu_free(&cp->rcu_head); + else + call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); atomic_dec(&ipvs->conn_count); return; } @@ -850,7 +872,7 @@ static void ip_vs_conn_expire(unsigned long data) if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); - ip_vs_conn_put(cp); + __ip_vs_conn_put_timer(cp); } /* Modify timer, so that it expires as soon as possible. @@ -1240,6 +1262,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp) return 1; } +static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp) +{ + struct ip_vs_service *svc; + + if (!cp->dest) + return false; + svc = rcu_dereference(cp->dest->svc); + return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET); +} + /* Called from keventd and must protect itself from softirqs */ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) { @@ -1254,11 +1286,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->flags & IP_VS_CONN_F_TEMPLATE) - /* connection template */ - continue; if (cp->ipvs != ipvs) continue; + if (cp->flags & IP_VS_CONN_F_TEMPLATE) { + if (atomic_read(&cp->n_control) || + !ip_vs_conn_ops_mode(cp)) + continue; + else + /* connection template of OPS */ + goto try_drop; + } if (cp->protocol == IPPROTO_TCP) { switch(cp->state) { case IP_VS_TCP_S_SYN_RECV: @@ -1286,6 +1323,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) continue; } } else { +try_drop: if (!todrop_entry(cp)) continue; } diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index b9a4082af..2c1b498a7 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); #ifdef CONFIG_IP_VS_DEBUG EXPORT_SYMBOL(ip_vs_get_debug_level); #endif +EXPORT_SYMBOL(ip_vs_new_conn_out); static int ip_vs_net_id __read_mostly; /* netns cnt used for uniqueness */ @@ -320,7 +321,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* Check if a template already exists */ ct = ip_vs_ct_in_get(¶m); - if (!ct || !ip_vs_check_template(ct)) { + if (!ct || !ip_vs_check_template(ct, NULL)) { struct ip_vs_scheduler *sched; /* @@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, ret = cp->packet_xmit(skb, cp, pd->pp, iph); /* do not touch skb anymore */ - atomic_inc(&cp->in_pkts); + if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + atomic_inc(&cp->control->in_pkts); + else + atomic_inc(&cp->in_pkts); ip_vs_conn_put(cp); return ret; } @@ -1100,6 +1104,144 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, } } +/* Generic function to create new connections for outgoing RS packets + * + * Pre-requisites for successful connection creation: + * 1) Virtual Service is NOT fwmark based: + * In fwmark-VS actual vaddr and vport are unknown to IPVS + * 2) Real Server and Virtual Service were NOT configured without port: + * This is to allow match of different VS to the same RS ip-addr + */ +struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + struct ip_vs_conn_param param; + struct ip_vs_conn *ct = NULL, *cp = NULL; + const union nf_inet_addr *vaddr, *daddr, *caddr; + union nf_inet_addr snet; + __be16 vport; + unsigned int flags; + + EnterFunction(12); + vaddr = &svc->addr; + vport = svc->port; + daddr = &iph->saddr; + caddr = &iph->daddr; + + /* check pre-requisites are satisfied */ + if (svc->fwmark) + return NULL; + if (!vport || !dport) + return NULL; + + /* for persistent service first create connection template */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + /* apply netmask the same way ingress-side does */ +#ifdef CONFIG_IP_VS_IPV6 + if (svc->af == AF_INET6) + ipv6_addr_prefix(&snet.in6, &caddr->in6, + (__force __u32)svc->netmask); + else +#endif + snet.ip = caddr->ip & svc->netmask; + /* fill params and create template if not existent */ + if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol, + &snet, 0, vaddr, + vport, ¶m) < 0) + return NULL; + ct = ip_vs_ct_in_get(¶m); + /* check if template exists and points to the same dest */ + if (!ct || !ip_vs_check_template(ct, dest)) { + ct = ip_vs_conn_new(¶m, dest->af, daddr, dport, + IP_VS_CONN_F_TEMPLATE, dest, 0); + if (!ct) { + kfree(param.pe_data); + return NULL; + } + ct->timeout = svc->timeout; + } else { + kfree(param.pe_data); + } + } + + /* connection flags */ + flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) && + iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0; + /* create connection */ + ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol, + caddr, cport, vaddr, vport, ¶m); + cp = ip_vs_conn_new(¶m, dest->af, daddr, dport, flags, dest, 0); + if (!cp) { + if (ct) + ip_vs_conn_put(ct); + return NULL; + } + if (ct) { + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + } + ip_vs_conn_stats(cp, svc); + + /* return connection (will be used to handle outgoing packet) */ + IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u " + "d:%s:%u conn->flags:%X conn->refcnt:%d\n", + ip_vs_fwd_tag(cp), + IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), + IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), + IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + LeaveFunction(12); + return cp; +} + +/* Handle outgoing packets which are considered requests initiated by + * real servers, so that subsequent responses from external client can be + * routed to the right real server. + * Used also for outgoing responses in OPS mode. + * + * Connection management is handled by persistent-engine specific callback. + */ +static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum, + struct netns_ipvs *ipvs, + int af, struct sk_buff *skb, + const struct ip_vs_iphdr *iph) +{ + struct ip_vs_dest *dest; + struct ip_vs_conn *cp = NULL; + __be16 _ports[2], *pptr; + + if (hooknum == NF_INET_LOCAL_IN) + return NULL; + + pptr = frag_safe_skb_hp(skb, iph->len, + sizeof(_ports), _ports, iph); + if (!pptr) + return NULL; + + rcu_read_lock(); + dest = ip_vs_find_real_service(ipvs, af, iph->protocol, + &iph->saddr, pptr[0]); + if (dest) { + struct ip_vs_service *svc; + struct ip_vs_pe *pe; + + svc = rcu_dereference(dest->svc); + if (svc) { + pe = rcu_dereference(svc->pe); + if (pe && pe->conn_out) + cp = pe->conn_out(svc, dest, skb, iph, + pptr[0], pptr[1]); + } + } + rcu_read_unlock(); + + return cp; +} + /* Handle response packets: rewrite addresses and send away... */ static unsigned int @@ -1245,6 +1387,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in if (likely(cp)) return handle_response(af, skb, pd, cp, &iph, hooknum); + + /* Check for real-server-started requests */ + if (atomic_read(&ipvs->conn_out_counter)) { + /* Currently only for UDP: + * connection oriented protocols typically use + * ephemeral ports for outgoing connections, so + * related incoming responses would not match any VS + */ + if (pp->protocol == IPPROTO_UDP) { + cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph); + if (likely(cp)) + return handle_response(af, skb, pd, cp, &iph, + hooknum); + } + } + if (sysctl_nat_icmp_send(ipvs) && (pp->protocol == IPPROTO_TCP || pp->protocol == IPPROTO_UDP || @@ -1837,6 +1995,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int if (ipvs->sync_state & IP_VS_STATE_MASTER) ip_vs_sync_conn(ipvs, cp, pkts); + else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control) + /* increment is done inside ip_vs_sync_conn too */ + atomic_inc(&cp->control->in_pkts); ip_vs_conn_put(cp); return ret; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 404b2a4f4..c3c809b2e 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, return false; } +/* Find real service record by <proto,addr,port>. + * In case of multiple records with the same <proto,addr,port>, only + * the first found record is returned. + * + * To be called under RCU lock. + */ +struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, + __u16 protocol, + const union nf_inet_addr *daddr, + __be16 dport) +{ + unsigned int hash; + struct ip_vs_dest *dest; + + /* Check for "full" addressed entries */ + hash = ip_vs_rs_hashkey(af, daddr, dport); + + hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) { + if (dest->port == dport && + dest->af == af && + ip_vs_addr_equal(af, &dest->addr, daddr) && + (dest->protocol == protocol || dest->vfwmark)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + /* Lookup destination by {addr,port} in the given service * Called under RCU lock. */ @@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); + if (svc->pe && svc->pe->conn_out) + atomic_inc(&ipvs->conn_out_counter); ip_vs_start_estimator(ipvs, &svc->stats); @@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; + bool new_pe_conn_out, old_pe_conn_out; /* * Lookup the scheduler, by 'u->sched_name' @@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) svc->netmask = u->netmask; old_pe = rcu_dereference_protected(svc->pe, 1); - if (pe != old_pe) + if (pe != old_pe) { rcu_assign_pointer(svc->pe, pe); + /* check for optional methods in new pe */ + new_pe_conn_out = (pe && pe->conn_out) ? true : false; + old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false; + if (new_pe_conn_out && !old_pe_conn_out) + atomic_inc(&svc->ipvs->conn_out_counter); + if (old_pe_conn_out && !new_pe_conn_out) + atomic_dec(&svc->ipvs->conn_out_counter); + } out: ip_vs_scheduler_put(old_sched); @@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* Unbind persistence engine, keep svc->pe */ old_pe = rcu_dereference_protected(svc->pe, 1); + if (old_pe && old_pe->conn_out) + atomic_dec(&ipvs->conn_out_counter); ip_vs_pe_put(old_pe); /* @@ -2875,8 +2918,10 @@ static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type, if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) || nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, + IPVS_STATS_ATTR_PAD) || nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) || nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) || nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) || @@ -2900,16 +2945,26 @@ static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type, if (!nl_stats) return -EMSGSIZE; - if (nla_put_u64(skb, IPVS_STATS_ATTR_CONNS, kstats->conns) || - nla_put_u64(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes) || - nla_put_u64(skb, IPVS_STATS_ATTR_CPS, kstats->cps) || - nla_put_u64(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps) || - nla_put_u64(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps) || - nla_put_u64(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps)) + if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps, + IPVS_STATS_ATTR_PAD) || + nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps, + IPVS_STATS_ATTR_PAD)) goto nla_put_failure; nla_nest_end(skb, nl_stats); @@ -3957,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) (unsigned long) ipvs); atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0); + atomic_set(&ipvs->conn_out_counter, 0); /* procfs stats */ ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 30434fb13..f04fd8df2 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return; + /* Never alter conntrack for OPS conns (no reply is expected) */ + if (cp->flags & IP_VS_CONN_F_ONE_PACKET) + return; + /* Alter reply only in original direction */ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index 0a6eb5c0d..d07ef9e31 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) return cp->pe_data_len; } +static struct ip_vs_conn * +ip_vs_sip_conn_out(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + __be16 dport, + __be16 cport) +{ + if (likely(iph->protocol == IPPROTO_UDP)) + return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport); + /* currently no need to handle other than UDP */ + return NULL; +} + static struct ip_vs_pe ip_vs_sip_pe = { .name = "sip", @@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe = .ct_match = ip_vs_sip_ct_match, .hashkey_raw = ip_vs_sip_hashkey_raw, .show_pe_data = ip_vs_sip_show_pe_data, + .conn_out = ip_vs_sip_conn_out, }; static int __init ip_vs_sip_init(void) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 803001a45..1b07578be 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1545,7 +1545,8 @@ error: /* * Set up receiving multicast socket over UDP */ -static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) +static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id, + int ifindex) { /* multicast addr */ union ipvs_sockaddr mcast_addr; @@ -1566,6 +1567,7 @@ static struct socket *make_receive_sock(struct netns_ipvs *ipvs, int id) set_sock_size(sock->sk, 0, result); get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); + sock->sk->sk_bound_dev_if = ifindex; result = sock->ops->bind(sock, (struct sockaddr *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); @@ -1868,7 +1870,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (state == IP_VS_STATE_MASTER) sock = make_send_sock(ipvs, id); else - sock = make_receive_sock(ipvs, id); + sock = make_receive_sock(ipvs, id, dev->ifindex); if (IS_ERR(sock)) { result = PTR_ERR(sock); goto outtinfo; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index dc196a0f5..01d3d894d 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -932,17 +932,14 @@ error: static inline int __tun_gso_type_mask(int encaps_af, int orig_af) { - if (encaps_af == AF_INET) { - if (orig_af == AF_INET) - return SKB_GSO_IPIP; - - return SKB_GSO_SIT; + switch (encaps_af) { + case AF_INET: + return SKB_GSO_IPXIP4; + case AF_INET6: + return SKB_GSO_IPXIP6; + default: + return 0; } - - /* GSO: we need to provide proper SKB_GSO_ value for IPv6: - * SKB_GSO_SIT/IPV6 - */ - return 0; } /* @@ -1013,8 +1010,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af)); - if (IS_ERR(skb)) + if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af))) goto tx_error; skb->transport_header = skb->network_header; @@ -1105,8 +1101,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (IS_ERR(skb)) goto tx_error; - skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af)); - if (IS_ERR(skb)) + if (iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af))) goto tx_error; skb->transport_header = skb->network_header; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e27fd17c6..9f530adad 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -12,6 +12,8 @@ * published by the Free Software Foundation. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/netfilter.h> #include <linux/module.h> @@ -52,6 +54,7 @@ #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_core.h> #include <net/netfilter/nf_nat_helper.h> +#include <net/netns/hash.h> #define NF_CONNTRACK_VERSION "0.5.0" @@ -66,6 +69,12 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks); __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock); EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock); +struct hlist_nulls_head *nf_conntrack_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_conntrack_hash); + +static __read_mostly struct kmem_cache *nf_conntrack_cachep; +static __read_mostly spinlock_t nf_conntrack_locks_all_lock; +static __read_mostly seqcount_t nf_conntrack_generation; static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; @@ -105,7 +114,7 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1, spin_lock_nested(&nf_conntrack_locks[h1], SINGLE_DEPTH_NESTING); } - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { nf_conntrack_double_unlock(h1, h2); return true; } @@ -139,43 +148,43 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -unsigned int nf_conntrack_hash_rnd __read_mostly; -EXPORT_SYMBOL_GPL(nf_conntrack_hash_rnd); +static unsigned int nf_conntrack_hash_rnd __read_mostly; -static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, + const struct net *net) { unsigned int n; + u32 seed; + + get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); /* The direction must be ignored, so we hash everything up to the * destination ports (which is a multiple of 4) and treat the last * three bytes manually. */ + seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, nf_conntrack_hash_rnd ^ + return jhash2((u32 *)tuple, n, seed ^ (((__force __u16)tuple->dst.u.all << 16) | tuple->dst.protonum)); } -static u32 __hash_bucket(u32 hash, unsigned int size) -{ - return reciprocal_scale(hash, size); -} - -static u32 hash_bucket(u32 hash, const struct net *net) +static u32 scale_hash(u32 hash) { - return __hash_bucket(hash, net->ct.htable_size); + return reciprocal_scale(hash, nf_conntrack_htable_size); } -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - unsigned int size) +static u32 __hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple, + unsigned int size) { - return __hash_bucket(hash_conntrack_raw(tuple), size); + return reciprocal_scale(hash_conntrack_raw(tuple, net), size); } -static inline u_int32_t hash_conntrack(const struct net *net, - const struct nf_conntrack_tuple *tuple) +static u32 hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, net->ct.htable_size); + return scale_hash(hash_conntrack_raw(tuple, net)); } bool @@ -356,7 +365,7 @@ destroy_conntrack(struct nf_conntrack *nfct) } rcu_read_lock(); l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); - if (l4proto && l4proto->destroy) + if (l4proto->destroy) l4proto->destroy(ct); rcu_read_unlock(); @@ -391,7 +400,7 @@ static void nf_ct_delete_from_lists(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -443,7 +452,8 @@ static void death_by_timeout(unsigned long ul_conntrack) static inline bool nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct nf_conntrack_zone *zone, + const struct net *net) { struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); @@ -452,7 +462,8 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h, */ return nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h)) && - nf_ct_is_confirmed(ct); + nf_ct_is_confirmed(ct) && + net_eq(net, nf_ct_net(ct)); } /* @@ -465,21 +476,23 @@ ____nf_conntrack_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int bucket = hash_bucket(hash, net); + unsigned int bucket, sequence; - /* Disable BHs the entire time since we normally need to disable them - * at least once for the stats anyway. - */ - local_bh_disable(); begin: - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { - if (nf_ct_key_equal(h, tuple, zone)) { - NF_CT_STAT_INC(net, found); - local_bh_enable(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + bucket = scale_hash(hash); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[bucket], hnnode) { + if (nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); return h; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } /* * if the nulls value we got at the end of this lookup is @@ -487,10 +500,9 @@ begin: * We probably met an item that was moved to another chain. */ if (get_nulls_value(n) != bucket) { - NF_CT_STAT_INC(net, search_restart); + NF_CT_STAT_INC_ATOMIC(net, search_restart); goto begin; } - local_bh_enable(); return NULL; } @@ -512,7 +524,7 @@ begin: !atomic_inc_not_zero(&ct->ct_general.use))) h = NULL; else { - if (unlikely(!nf_ct_key_equal(h, tuple, zone))) { + if (unlikely(!nf_ct_key_equal(h, tuple, zone, net))) { nf_ct_put(ct); goto begin; } @@ -528,7 +540,7 @@ nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { return __nf_conntrack_find_get(net, zone, tuple, - hash_conntrack_raw(tuple)); + hash_conntrack_raw(tuple, net)); } EXPORT_SYMBOL_GPL(nf_conntrack_find_get); @@ -536,12 +548,10 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, unsigned int hash, unsigned int reply_hash) { - struct net *net = nf_ct_net(ct); - hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode, - &net->ct.hash[hash]); + &nf_conntrack_hash[hash]); hlist_nulls_add_head_rcu(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode, - &net->ct.hash[reply_hash]); + &nf_conntrack_hash[reply_hash]); } int @@ -558,7 +568,7 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); reply_hash = hash_conntrack(net, @@ -566,17 +576,14 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; add_timer(&ct->timeout); @@ -597,6 +604,63 @@ out: } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); +static inline void nf_ct_acct_update(struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int len) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + + atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); + atomic64_add(len, &counter[CTINFO2DIR(ctinfo)].bytes); + } +} + +static void nf_ct_acct_merge(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + const struct nf_conn *loser_ct) +{ + struct nf_conn_acct *acct; + + acct = nf_conn_acct_find(loser_ct); + if (acct) { + struct nf_conn_counter *counter = acct->counter; + unsigned int bytes; + + /* u32 should be fine since we must have seen one packet. */ + bytes = atomic64_read(&counter[CTINFO2DIR(ctinfo)].bytes); + nf_ct_acct_update(ct, ctinfo, bytes); + } +} + +/* Resolve race on insertion if this protocol allows this. */ +static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, + enum ip_conntrack_info ctinfo, + struct nf_conntrack_tuple_hash *h) +{ + /* This is the conntrack entry already in hashes that won race. */ + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + struct nf_conntrack_l4proto *l4proto; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->allow_clash && + !nfct_nat(ct) && + !nf_ct_is_dying(ct) && + atomic_inc_not_zero(&ct->ct_general.use)) { + nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct); + nf_conntrack_put(skb->nfct); + /* Assign conntrack already in hashes to this skbuff. Don't + * modify skb->nfctinfo to ensure consistent stateful filtering. + */ + skb->nfct = &ct->ct_general; + return NF_ACCEPT; + } + NF_CT_STAT_INC(net, drop); + return NF_DROP; +} + /* Confirm a connection given skb; places it in hash table */ int __nf_conntrack_confirm(struct sk_buff *skb) @@ -611,6 +675,7 @@ __nf_conntrack_confirm(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct net *net; unsigned int sequence; + int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); net = nf_ct_net(ct); @@ -626,10 +691,10 @@ __nf_conntrack_confirm(struct sk_buff *skb) local_bh_disable(); do { - sequence = read_seqcount_begin(&net->ct.generation); + sequence = read_seqcount_begin(&nf_conntrack_generation); /* reuse the hash saved before */ hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; - hash = hash_bucket(hash, net); + hash = scale_hash(hash); reply_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -653,23 +718,22 @@ __nf_conntrack_confirm(struct sk_buff *skb) */ nf_ct_del_from_dying_or_unconfirmed_list(ct); - if (unlikely(nf_ct_is_dying(ct))) - goto out; + if (unlikely(nf_ct_is_dying(ct))) { + nf_ct_add_to_dying_list(ct); + goto dying; + } /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &net->ct.hash[reply_hash], hnnode) - if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, - &h->tuple) && - nf_ct_zone_equal(nf_ct_tuplehash_to_ctrack(h), zone, - NF_CT_DIRECTION(h))) + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, + zone, net)) goto out; /* Timer relative to confirmation time, not original @@ -708,10 +772,12 @@ __nf_conntrack_confirm(struct sk_buff *skb) out: nf_ct_add_to_dying_list(ct); + ret = nf_ct_resolve_clash(net, skb, ctinfo, h); +dying: nf_conntrack_double_unlock(hash, reply_hash); NF_CT_STAT_INC(net, insert_failed); local_bh_enable(); - return NF_DROP; + return ret; } EXPORT_SYMBOL_GPL(__nf_conntrack_confirm); @@ -724,29 +790,31 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, struct net *net = nf_ct_net(ignored_conntrack); const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_head *ct_hash; + unsigned int hash, sequence; struct hlist_nulls_node *n; struct nf_conn *ct; - unsigned int hash; zone = nf_ct_zone(ignored_conntrack); - hash = hash_conntrack(net, tuple); - /* Disable BHs the entire time since we need to disable them at - * least once for the stats anyway. - */ - rcu_read_lock_bh(); - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + rcu_read_lock(); + do { + sequence = read_seqcount_begin(&nf_conntrack_generation); + hash = hash_conntrack(net, tuple); + ct_hash = nf_conntrack_hash; + } while (read_seqcount_retry(&nf_conntrack_generation, sequence)); + + hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[hash], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (ct != ignored_conntrack && - nf_ct_tuple_equal(tuple, &h->tuple) && - nf_ct_zone_equal(ct, zone, NF_CT_DIRECTION(h))) { - NF_CT_STAT_INC(net, found); - rcu_read_unlock_bh(); + nf_ct_key_equal(h, tuple, zone, net)) { + NF_CT_STAT_INC_ATOMIC(net, found); + rcu_read_unlock(); return 1; } - NF_CT_STAT_INC(net, searched); + NF_CT_STAT_INC_ATOMIC(net, searched); } - rcu_read_unlock_bh(); + rcu_read_unlock(); return 0; } @@ -760,71 +828,63 @@ static noinline int early_drop(struct net *net, unsigned int _hash) { /* Use oldest entry, which is roughly LRU */ struct nf_conntrack_tuple_hash *h; - struct nf_conn *ct = NULL, *tmp; + struct nf_conn *tmp; struct hlist_nulls_node *n; - unsigned int i = 0, cnt = 0; - int dropped = 0; - unsigned int hash, sequence; + unsigned int i, hash, sequence; + struct nf_conn *ct = NULL; spinlock_t *lockp; + bool ret = false; + + i = 0; local_bh_disable(); restart: - sequence = read_seqcount_begin(&net->ct.generation); - hash = hash_bucket(_hash, net); - for (; i < net->ct.htable_size; i++) { + sequence = read_seqcount_begin(&nf_conntrack_generation); + for (; i < NF_CT_EVICTION_RANGE; i++) { + hash = scale_hash(_hash++); lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (read_seqcount_retry(&net->ct.generation, sequence)) { + if (read_seqcount_retry(&nf_conntrack_generation, sequence)) { spin_unlock(lockp); goto restart; } - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], - hnnode) { + hlist_nulls_for_each_entry_rcu(h, n, &nf_conntrack_hash[hash], + hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); - if (!test_bit(IPS_ASSURED_BIT, &tmp->status) && - !nf_ct_is_dying(tmp) && - atomic_inc_not_zero(&tmp->ct_general.use)) { + + if (test_bit(IPS_ASSURED_BIT, &tmp->status) || + !net_eq(nf_ct_net(tmp), net) || + nf_ct_is_dying(tmp)) + continue; + + if (atomic_inc_not_zero(&tmp->ct_general.use)) { ct = tmp; break; } - cnt++; } - hash = (hash + 1) % net->ct.htable_size; spin_unlock(lockp); - - if (ct || cnt >= NF_CT_EVICTION_RANGE) + if (ct) break; - } + local_bh_enable(); if (!ct) - return dropped; + return false; - if (del_timer(&ct->timeout)) { + /* kill only if in same netns -- might have moved due to + * SLAB_DESTROY_BY_RCU rules + */ + if (net_eq(nf_ct_net(ct), net) && del_timer(&ct->timeout)) { if (nf_ct_delete(ct, 0, 0)) { - dropped = 1; NF_CT_STAT_INC_ATOMIC(net, early_drop); + ret = true; } } - nf_ct_put(ct); - return dropped; -} -void init_nf_conntrack_hash_rnd(void) -{ - unsigned int rand; - - /* - * Why not initialize nf_conntrack_rnd in a "init()" function ? - * Because there isn't enough entropy when system initializing, - * and we initialize it as late as possible. - */ - do { - get_random_bytes(&rand, sizeof(rand)); - } while (!rand); - cmpxchg(&nf_conntrack_hash_rnd, 0, rand); + nf_ct_put(ct); + return ret; } static struct nf_conn * @@ -836,12 +896,6 @@ __nf_conntrack_alloc(struct net *net, { struct nf_conn *ct; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - /* recompute the hash as nf_conntrack_hash_rnd is initialized */ - hash = hash_conntrack_raw(orig); - } - /* We don't want any race condition at early drop stage */ atomic_inc(&net->ct.count); @@ -858,7 +912,7 @@ __nf_conntrack_alloc(struct net *net, * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_DESTROY_BY_RCU. */ - ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); + ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) goto out; @@ -885,7 +939,7 @@ __nf_conntrack_alloc(struct net *net, atomic_set(&ct->ct_general.use, 0); return ct; out_free: - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); out: atomic_dec(&net->ct.count); return ERR_PTR(-ENOMEM); @@ -912,7 +966,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); nf_ct_ext_free(ct); - kmem_cache_free(net->ct.nf_conntrack_cachep, ct); + kmem_cache_free(nf_conntrack_cachep, ct); smp_mb__before_atomic(); atomic_dec(&net->ct.count); } @@ -966,7 +1020,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, if (!l4proto->new(ct, skb, dataoff, timeouts)) { nf_conntrack_free(ct); - pr_debug("init conntrack: can't track with proto module\n"); + pr_debug("can't track with proto module\n"); return NULL; } @@ -988,7 +1042,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, spin_lock(&nf_conntrack_expect_lock); exp = nf_ct_find_expectation(net, zone, tuple); if (exp) { - pr_debug("conntrack: expectation arrives ct=%p exp=%p\n", + pr_debug("expectation arrives ct=%p exp=%p\n", ct, exp); /* Welcome, Mr. Bond. We've been expecting you... */ __set_bit(IPS_EXPECTED_BIT, &ct->status); @@ -1053,13 +1107,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { - pr_debug("resolve_normal_ct: Can't get tuple\n"); + pr_debug("Can't get tuple\n"); return NULL; } /* look for tuple match */ zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); - hash = hash_conntrack_raw(&tuple); + hash = hash_conntrack_raw(&tuple, net); h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, @@ -1079,14 +1133,13 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, } else { /* Once we've had two way comms, always ESTABLISHED. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - pr_debug("nf_conntrack_in: normal packet for %p\n", ct); + pr_debug("normal packet for %p\n", ct); *ctinfo = IP_CT_ESTABLISHED; } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { - pr_debug("nf_conntrack_in: related packet for %p\n", - ct); + pr_debug("related packet for %p\n", ct); *ctinfo = IP_CT_RELATED; } else { - pr_debug("nf_conntrack_in: new packet for %p\n", ct); + pr_debug("new packet for %p\n", ct); *ctinfo = IP_CT_NEW; } *set_reply = 0; @@ -1269,17 +1322,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, } acct: - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len, &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); } EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct); @@ -1288,18 +1332,8 @@ bool __nf_ct_kill_acct(struct nf_conn *ct, const struct sk_buff *skb, int do_acct) { - if (do_acct) { - struct nf_conn_acct *acct; - - acct = nf_conn_acct_find(ct); - if (acct) { - struct nf_conn_counter *counter = acct->counter; - - atomic64_inc(&counter[CTINFO2DIR(ctinfo)].packets); - atomic64_add(skb->len - skb_network_offset(skb), - &counter[CTINFO2DIR(ctinfo)].bytes); - } - } + if (do_acct) + nf_ct_acct_update(ct, ctinfo, skb->len); if (del_timer(&ct->timeout)) { ct->timeout.function((unsigned long)ct); @@ -1395,16 +1429,17 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), int cpu; spinlock_t *lockp; - for (; *bucket < net->ct.htable_size; (*bucket)++) { + for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS]; local_bh_disable(); nf_conntrack_lock(lockp); - if (*bucket < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { + if (*bucket < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); - if (iter(ct, data)) + if (net_eq(nf_ct_net(ct), net) && + iter(ct, data)) goto found; } } @@ -1442,6 +1477,9 @@ void nf_ct_iterate_cleanup(struct net *net, might_sleep(); + if (atomic_read(&net->ct.count) == 0) + return; + while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) { /* Time to push up daises... */ if (del_timer(&ct->timeout)) @@ -1493,6 +1531,8 @@ void nf_conntrack_cleanup_end(void) while (untrack_refs() > 0) schedule(); + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); + #ifdef CONFIG_NF_CONNTRACK_ZONES nf_ct_extend_unregister(&nf_ct_zone_extend); #endif @@ -1505,6 +1545,8 @@ void nf_conntrack_cleanup_end(void) nf_conntrack_tstamp_fini(); nf_conntrack_acct_fini(); nf_conntrack_expect_fini(); + + kmem_cache_destroy(nf_conntrack_cachep); } /* @@ -1543,15 +1585,12 @@ i_see_dead_people: } list_for_each_entry(net, net_exit_list, exit_list) { - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); nf_conntrack_proto_pernet_fini(net); nf_conntrack_helper_pernet_fini(net); nf_conntrack_ecache_pernet_fini(net); nf_conntrack_tstamp_pernet_fini(net); nf_conntrack_acct_pernet_fini(net); nf_conntrack_expect_pernet_fini(net); - kmem_cache_destroy(net->ct.nf_conntrack_cachep); - kfree(net->ct.slabname); free_percpu(net->ct.stat); free_percpu(net->ct.pcpu_lists); } @@ -1563,8 +1602,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) unsigned int nr_slots, i; size_t sz; + if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + + if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + sz = nr_slots * sizeof(struct hlist_nulls_head); hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, get_order(sz)); @@ -1606,7 +1652,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) local_bh_disable(); nf_conntrack_all_lock(); - write_seqcount_begin(&init_net.ct.generation); + write_seqcount_begin(&nf_conntrack_generation); /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections @@ -1614,26 +1660,28 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) * though since that required taking the locks. */ - for (i = 0; i < init_net.ct.htable_size; i++) { - while (!hlist_nulls_empty(&init_net.ct.hash[i])) { - h = hlist_nulls_entry(init_net.ct.hash[i].first, - struct nf_conntrack_tuple_hash, hnnode); + for (i = 0; i < nf_conntrack_htable_size; i++) { + while (!hlist_nulls_empty(&nf_conntrack_hash[i])) { + h = hlist_nulls_entry(nf_conntrack_hash[i].first, + struct nf_conntrack_tuple_hash, hnnode); ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, hashsize); + bucket = __hash_conntrack(nf_ct_net(ct), + &h->tuple, hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = init_net.ct.htable_size; - old_hash = init_net.ct.hash; + old_size = nf_conntrack_htable_size; + old_hash = nf_conntrack_hash; - init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; - init_net.ct.hash = hash; + nf_conntrack_hash = hash; + nf_conntrack_htable_size = hashsize; - write_seqcount_end(&init_net.ct.generation); + write_seqcount_end(&nf_conntrack_generation); nf_conntrack_all_unlock(); local_bh_enable(); + synchronize_net(); nf_ct_free_hashtable(old_hash, old_size); return 0; } @@ -1654,7 +1702,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); int nf_conntrack_init_start(void) { int max_factor = 8; - int i, ret, cpu; + int ret = -ENOMEM; + int i, cpu; + + seqcount_init(&nf_conntrack_generation); for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_conntrack_locks[i]); @@ -1681,8 +1732,19 @@ int nf_conntrack_init_start(void) * entries. */ max_factor = 4; } + + nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); + if (!nf_conntrack_hash) + return -ENOMEM; + nf_conntrack_max = max_factor * nf_conntrack_htable_size; + nf_conntrack_cachep = kmem_cache_create("nf_conntrack", + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!nf_conntrack_cachep) + goto err_cachep; + printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n", NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); @@ -1759,6 +1821,9 @@ err_tstamp: err_acct: nf_conntrack_expect_fini(); err_expect: + kmem_cache_destroy(nf_conntrack_cachep); +err_cachep: + nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); return ret; } @@ -1778,12 +1843,10 @@ void nf_conntrack_init_end(void) int nf_conntrack_init_net(struct net *net) { - static atomic64_t unique_id; int ret = -ENOMEM; int cpu; atomic_set(&net->ct.count, 0); - seqcount_init(&net->ct.generation); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); if (!net->ct.pcpu_lists) @@ -1801,25 +1864,6 @@ int nf_conntrack_init_net(struct net *net) if (!net->ct.stat) goto err_pcpu_lists; - net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%llu", - (u64)atomic64_inc_return(&unique_id)); - if (!net->ct.slabname) - goto err_slabname; - - net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, - sizeof(struct nf_conn), 0, - SLAB_DESTROY_BY_RCU, NULL); - if (!net->ct.nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - goto err_cache; - } - - net->ct.htable_size = nf_conntrack_htable_size; - net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1); - if (!net->ct.hash) { - printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); - goto err_hash; - } ret = nf_conntrack_expect_pernet_init(net); if (ret < 0) goto err_expect; @@ -1851,12 +1895,6 @@ err_tstamp: err_acct: nf_conntrack_expect_pernet_fini(net); err_expect: - nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size); -err_hash: - kmem_cache_destroy(net->ct.nf_conntrack_cachep); -err_cache: - kfree(net->ct.slabname); -err_slabname: free_percpu(net->ct.stat); err_pcpu_lists: free_percpu(net->ct.pcpu_lists); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 4e78c57b8..d28011b42 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -113,6 +113,60 @@ static void ecache_work(struct work_struct *work) schedule_delayed_work(&ctnet->ecache_dwork, delay); } +int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, + u32 portid, int report) +{ + int ret = 0; + struct net *net = nf_ct_net(ct); + struct nf_ct_event_notifier *notify; + struct nf_conntrack_ecache *e; + + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_conntrack_event_cb); + if (!notify) + goto out_unlock; + + e = nf_ct_ecache_find(ct); + if (!e) + goto out_unlock; + + if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { + struct nf_ct_event item = { + .ct = ct, + .portid = e->portid ? e->portid : portid, + .report = report + }; + /* This is a resent of a destroy event? If so, skip missed */ + unsigned long missed = e->portid ? 0 : e->missed; + + if (!((eventmask | missed) & e->ctmask)) + goto out_unlock; + + ret = notify->fcn(eventmask | missed, &item); + if (unlikely(ret < 0 || missed)) { + spin_lock_bh(&ct->lock); + if (ret < 0) { + /* This is a destroy event that has been + * triggered by a process, we store the PORTID + * to include it in the retransmission. + */ + if (eventmask & (1 << IPCT_DESTROY) && + e->portid == 0 && portid != 0) + e->portid = portid; + else + e->missed |= eventmask; + } else { + e->missed &= ~missed; + } + spin_unlock_bh(&ct->lock); + } + } +out_unlock: + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); + /* deliver cached events and clear cache entry - must be called with locally * disabled softirqs */ void nf_ct_deliver_cached_events(struct nf_conn *ct) @@ -167,6 +221,36 @@ out_unlock: } EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); +void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, + struct nf_conntrack_expect *exp, + u32 portid, int report) + +{ + struct net *net = nf_ct_exp_net(exp); + struct nf_exp_event_notifier *notify; + struct nf_conntrack_ecache *e; + + rcu_read_lock(); + notify = rcu_dereference(net->ct.nf_expect_event_cb); + if (!notify) + goto out_unlock; + + e = nf_ct_ecache_find(exp->master); + if (!e) + goto out_unlock; + + if (e->expmask & (1 << event)) { + struct nf_exp_event item = { + .exp = exp, + .portid = portid, + .report = report + }; + notify->fcn(1 << event, &item); + } +out_unlock: + rcu_read_unlock(); +} + int nf_conntrack_register_notifier(struct net *net, struct nf_ct_event_notifier *new) { diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 278927ab0..9e3693128 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -24,6 +24,7 @@ #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> +#include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> @@ -35,9 +36,13 @@ unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); +struct hlist_head *nf_ct_expect_hash __read_mostly; +EXPORT_SYMBOL_GPL(nf_ct_expect_hash); + unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; +static unsigned int nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect) nf_ct_expect_put(exp); } -static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple) +static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash; + unsigned int hash, seed; - if (unlikely(!nf_conntrack_hash_rnd)) { - init_nf_conntrack_hash_rnd(); - } + get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); + + seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd); + (__force __u16)tuple->dst.u.all) ^ seed); return reciprocal_scale(hash, nf_ct_expect_hsize); } +static bool +nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_expect *i, + const struct nf_conntrack_zone *zone, + const struct net *net) +{ + return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && + net_eq(net, nf_ct_net(i->master)) && + nf_ct_zone_equal_any(i->master, zone); +} + struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, @@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) { - if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { + if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; @@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net, if (!net->ct.expect_count) return NULL; - h = nf_ct_expect_dst_hash(tuple); - hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, tuple); + hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && - nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && - nf_ct_zone_equal_any(i->master, zone)) { + nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } @@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a, } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, return a->master == b->master && a->class == b->class && nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && + net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } @@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); - unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); + unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ atomic_add(2, &exp->use); @@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp) hlist_add_head(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; - hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); + hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, @@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) ret = -ESHUTDOWN; goto out; } - h = nf_ct_expect_dst_hash(&expect->tuple); - hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) { + h = nf_ct_expect_dst_hash(net, &expect->tuple); + hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { if (del_timer(&i->timeout)) { nf_ct_unlink_expect(i); @@ -468,12 +484,11 @@ struct ct_expect_iter_state { static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { - n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } @@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq) static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { - struct net *net = seq_file_net(seq); struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; - head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket])); + head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } @@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { - int err = -ENOMEM; - net->ct.expect_count = 0; - net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); - if (net->ct.expect_hash == NULL) - goto err1; - - err = exp_proc_init(net); - if (err < 0) - goto err2; - - return 0; -err2: - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); -err1: - return err; + return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); - nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize); } int nf_conntrack_expect_init(void) @@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void) 0, 0, NULL); if (!nf_ct_expect_cachep) return -ENOMEM; + + nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); + if (!nf_ct_expect_hash) { + kmem_cache_destroy(nf_ct_expect_cachep); + return -ENOMEM; + } + return 0; } @@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); + nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 883c691ec..19efeba02 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -632,6 +632,7 @@ static int __init nf_conntrack_ftp_init(void) if (ret) { pr_err("failed to register helper for pf: %d port: %d\n", ftp[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_ftp_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 3b40ec575..196cb3964 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = true; +static bool nf_ct_auto_assign_helper __read_mostly = false; module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 1)"); + "Enable automatic conntrack helper assignment (default 0)"); #ifdef CONFIG_SYSCTL static struct ctl_table helper_sysctl_table[] = { @@ -361,9 +361,10 @@ EXPORT_SYMBOL_GPL(nf_ct_helper_log); int nf_conntrack_helper_register(struct nf_conntrack_helper *me) { - int ret = 0; - struct nf_conntrack_helper *cur; + struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); + struct nf_conntrack_helper *cur; + int ret = 0; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); @@ -371,9 +372,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) mutex_lock(&nf_ct_helper_mutex); hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { - if (strncmp(cur->name, me->name, NF_CT_HELPER_NAME_LEN) == 0 && - cur->tuple.src.l3num == me->tuple.src.l3num && - cur->tuple.dst.protonum == me->tuple.dst.protonum) { + if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { ret = -EEXIST; goto out; } @@ -400,7 +399,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], hnode) { + &nf_ct_expect_hash[i], hnode) { struct nf_conn_help *help = nfct_help(exp->master); if ((rcu_dereference_protected( help->helper, @@ -424,10 +423,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, spin_unlock_bh(&pcpu->lock); } local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) unhelp(h, me); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 8b6da2719..f97ac61d2 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -271,6 +271,7 @@ static int __init nf_conntrack_irc_init(void) if (ret) { pr_err("failed to register helper for pf: %u port: %u\n", irc[i].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_irc_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index 3ce5c314e..252e6a7cd 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -16,28 +16,11 @@ static spinlock_t nf_connlabels_lock; -static unsigned int label_bits(const struct nf_conn_labels *l) -{ - unsigned int longs = l->words; - return longs * BITS_PER_LONG; -} - -bool nf_connlabel_match(const struct nf_conn *ct, u16 bit) -{ - struct nf_conn_labels *labels = nf_ct_labels_find(ct); - - if (!labels) - return false; - - return bit < label_bits(labels) && test_bit(bit, labels->bits); -} -EXPORT_SYMBOL_GPL(nf_connlabel_match); - int nf_connlabel_set(struct nf_conn *ct, u16 bit) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); - if (!labels || bit >= label_bits(labels)) + if (!labels || BIT_WORD(bit) >= labels->words) return -ENOSPC; if (test_bit(bit, labels->bits)) @@ -50,14 +33,18 @@ int nf_connlabel_set(struct nf_conn *ct, u16 bit) } EXPORT_SYMBOL_GPL(nf_connlabel_set); -static void replace_u32(u32 *address, u32 mask, u32 new) +static int replace_u32(u32 *address, u32 mask, u32 new) { u32 old, tmp; do { old = *address; tmp = (old & mask) ^ new; + if (old == tmp) + return 0; } while (cmpxchg(address, old, tmp) != old); + + return 1; } int nf_connlabels_replace(struct nf_conn *ct, @@ -66,6 +53,7 @@ int nf_connlabels_replace(struct nf_conn *ct, { struct nf_conn_labels *labels; unsigned int size, i; + int changed = 0; u32 *dst; labels = nf_ct_labels_find(ct); @@ -77,29 +65,27 @@ int nf_connlabels_replace(struct nf_conn *ct, words32 = size / sizeof(u32); dst = (u32 *) labels->bits; - if (words32) { - for (i = 0; i < words32; i++) - replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); - } + for (i = 0; i < words32; i++) + changed |= replace_u32(&dst[i], mask ? ~mask[i] : 0, data[i]); size /= sizeof(u32); for (i = words32; i < size; i++) /* pad */ replace_u32(&dst[i], 0, 0); - nf_conntrack_event_cache(IPCT_LABEL, ct); + if (changed) + nf_conntrack_event_cache(IPCT_LABEL, ct); return 0; } EXPORT_SYMBOL_GPL(nf_connlabels_replace); -int nf_connlabels_get(struct net *net, unsigned int n_bits) +int nf_connlabels_get(struct net *net, unsigned int bits) { size_t words; - if (n_bits > (NF_CT_LABELS_MAX_SIZE * BITS_PER_BYTE)) + words = BIT_WORD(bits) + 1; + if (words > NF_CT_LABELS_MAX_SIZE / sizeof(long)) return -ERANGE; - words = BITS_TO_LONGS(n_bits); - spin_lock(&nf_connlabels_lock); net->ct.labels_used++; if (words > net->ct.label_words) @@ -128,6 +114,8 @@ static struct nf_ct_ext_type labels_extend __read_mostly = { int nf_conntrack_labels_init(void) { + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE / sizeof(long) >= U8_MAX); + spin_lock_init(&nf_connlabels_lock); return nf_ct_extend_register(&labels_extend); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 355e8552f..a18d1ceab 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -58,10 +58,9 @@ MODULE_LICENSE("GPL"); static char __initdata version[] = "0.93"; -static inline int -ctnetlink_dump_tuples_proto(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l4proto *l4proto) +static int ctnetlink_dump_tuples_proto(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l4proto *l4proto) { int ret = 0; struct nlattr *nest_parms; @@ -83,10 +82,9 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_tuples_ip(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - struct nf_conntrack_l3proto *l3proto) +static int ctnetlink_dump_tuples_ip(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + struct nf_conntrack_l3proto *l3proto) { int ret = 0; struct nlattr *nest_parms; @@ -106,9 +104,8 @@ nla_put_failure: return -1; } -static int -ctnetlink_dump_tuples(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple) +static int ctnetlink_dump_tuples(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple) { int ret; struct nf_conntrack_l3proto *l3proto; @@ -127,9 +124,8 @@ ctnetlink_dump_tuples(struct sk_buff *skb, return ret; } -static inline int -ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, - const struct nf_conntrack_zone *zone, int dir) +static int ctnetlink_dump_zone_id(struct sk_buff *skb, int attrtype, + const struct nf_conntrack_zone *zone, int dir) { if (zone->id == NF_CT_DEFAULT_ZONE_ID || zone->dir != dir) return 0; @@ -141,8 +137,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_status(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_STATUS, htonl(ct->status))) goto nla_put_failure; @@ -152,8 +147,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_timeout(struct sk_buff *skb, const struct nf_conn *ct) { long timeout = ((long)ct->timeout.expires - (long)jiffies) / HZ; @@ -168,8 +162,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) +static int ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -193,8 +186,8 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_helpinfo(struct sk_buff *skb, + const struct nf_conn *ct) { struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); @@ -245,8 +238,10 @@ dump_counters(struct sk_buff *skb, struct nf_conn_acct *acct, if (!nest_count) goto nla_put_failure; - if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts)) || - nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes))) + if (nla_put_be64(skb, CTA_COUNTERS_PACKETS, cpu_to_be64(pkts), + CTA_COUNTERS_PAD) || + nla_put_be64(skb, CTA_COUNTERS_BYTES, cpu_to_be64(bytes), + CTA_COUNTERS_PAD)) goto nla_put_failure; nla_nest_end(skb, nest_count); @@ -287,9 +282,11 @@ ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct) if (!nest_count) goto nla_put_failure; - if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start)) || + if (nla_put_be64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start), + CTA_TIMESTAMP_PAD) || (tstamp->stop != 0 && nla_put_be64(skb, CTA_TIMESTAMP_STOP, - cpu_to_be64(tstamp->stop)))) + cpu_to_be64(tstamp->stop), + CTA_TIMESTAMP_PAD))) goto nla_put_failure; nla_nest_end(skb, nest_count); @@ -300,8 +297,7 @@ nla_put_failure: } #ifdef CONFIG_NF_CONNTRACK_MARK -static inline int -ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_MARK, htonl(ct->mark))) goto nla_put_failure; @@ -315,8 +311,7 @@ nla_put_failure: #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK -static inline int -ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_secctx; int len, ret; @@ -345,7 +340,7 @@ nla_put_failure: #endif #ifdef CONFIG_NF_CONNTRACK_LABELS -static int ctnetlink_label_size(const struct nf_conn *ct) +static inline int ctnetlink_label_size(const struct nf_conn *ct) { struct nf_conn_labels *labels = nf_ct_labels_find(ct); @@ -380,8 +375,7 @@ ctnetlink_dump_labels(struct sk_buff *skb, const struct nf_conn *ct) #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) -static inline int -ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_master(struct sk_buff *skb, const struct nf_conn *ct) { struct nlattr *nest_parms; @@ -426,8 +420,8 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, + const struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; @@ -446,8 +440,7 @@ ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, const struct nf_conn *ct) return 0; } -static inline int -ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) goto nla_put_failure; @@ -457,8 +450,7 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) +static int ctnetlink_dump_use(struct sk_buff *skb, const struct nf_conn *ct) { if (nla_put_be32(skb, CTA_USE, htonl(atomic_read(&ct->ct_general.use)))) goto nla_put_failure; @@ -538,8 +530,7 @@ nla_put_failure: return -1; } -static inline size_t -ctnetlink_proto_size(const struct nf_conn *ct) +static inline size_t ctnetlink_proto_size(const struct nf_conn *ct) { struct nf_conntrack_l3proto *l3proto; struct nf_conntrack_l4proto *l4proto; @@ -556,19 +547,17 @@ ctnetlink_proto_size(const struct nf_conn *ct) return len; } -static inline size_t -ctnetlink_acct_size(const struct nf_conn *ct) +static inline size_t ctnetlink_acct_size(const struct nf_conn *ct) { if (!nf_ct_ext_exist(ct, NF_CT_EXT_ACCT)) return 0; return 2 * nla_total_size(0) /* CTA_COUNTERS_ORIG|REPL */ - + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ - + 2 * nla_total_size(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ + + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_PACKETS */ + + 2 * nla_total_size_64bit(sizeof(uint64_t)) /* CTA_COUNTERS_BYTES */ ; } -static inline int -ctnetlink_secctx_size(const struct nf_conn *ct) +static inline int ctnetlink_secctx_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_SECMARK int len, ret; @@ -584,20 +573,19 @@ ctnetlink_secctx_size(const struct nf_conn *ct) #endif } -static inline size_t -ctnetlink_timestamp_size(const struct nf_conn *ct) +static inline size_t ctnetlink_timestamp_size(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_TIMESTAMP if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP)) return 0; - return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)); + return nla_total_size(0) + 2 * nla_total_size_64bit(sizeof(uint64_t)); #else return 0; #endif } -static inline size_t -ctnetlink_nlmsg_size(const struct nf_conn *ct) +#ifdef CONFIG_NF_CONNTRACK_EVENTS +static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) { return NLMSG_ALIGN(sizeof(struct nfgenmsg)) + 3 * nla_total_size(0) /* CTA_TUPLE_ORIG|REPL|MASTER */ @@ -628,7 +616,6 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) ; } -#ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) { @@ -837,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conn *)cb->args[1]; local_bh_disable(); - for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { restart: lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS]; nf_conntrack_lock(lockp); - if (cb->args[0] >= net->ct.htable_size) { + if (cb->args[0] >= nf_conntrack_htable_size) { spin_unlock(lockp); goto out; } - hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]], - hnnode) { + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]], + hnnode) { if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = nf_ct_tuplehash_to_ctrack(h); + if (!net_eq(net, nf_ct_net(ct))) + continue; + /* Dump entries of a given L3 protocol number. * If it is not specified, ie. l3proto == 0, * then dump everything. */ @@ -891,8 +881,8 @@ out: return skb->len; } -static inline int -ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_tuple *tuple) +static int ctnetlink_parse_tuple_ip(struct nlattr *attr, + struct nf_conntrack_tuple *tuple) { struct nlattr *tb[CTA_IP_MAX+1]; struct nf_conntrack_l3proto *l3proto; @@ -921,9 +911,8 @@ static const struct nla_policy proto_nla_policy[CTA_PROTO_MAX+1] = { [CTA_PROTO_NUM] = { .type = NLA_U8 }, }; -static inline int -ctnetlink_parse_tuple_proto(struct nlattr *attr, - struct nf_conntrack_tuple *tuple) +static int ctnetlink_parse_tuple_proto(struct nlattr *attr, + struct nf_conntrack_tuple *tuple) { struct nlattr *tb[CTA_PROTO_MAX+1]; struct nf_conntrack_l4proto *l4proto; @@ -1050,9 +1039,8 @@ static const struct nla_policy help_nla_policy[CTA_HELP_MAX+1] = { .len = NF_CT_HELPER_NAME_LEN - 1 }, }; -static inline int -ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, - struct nlattr **helpinfo) +static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, + struct nlattr **helpinfo) { int err; struct nlattr *tb[CTA_HELP_MAX+1]; @@ -1463,8 +1451,8 @@ ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[]) #endif } -static inline int -ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) +static int ctnetlink_change_helper(struct nf_conn *ct, + const struct nlattr * const cda[]) { struct nf_conntrack_helper *helper; struct nf_conn_help *help = nfct_help(ct); @@ -1524,8 +1512,8 @@ ctnetlink_change_helper(struct nf_conn *ct, const struct nlattr * const cda[]) return -EOPNOTSUPP; } -static inline int -ctnetlink_change_timeout(struct nf_conn *ct, const struct nlattr * const cda[]) +static int ctnetlink_change_timeout(struct nf_conn *ct, + const struct nlattr * const cda[]) { u_int32_t timeout = ntohl(nla_get_be32(cda[CTA_TIMEOUT])); @@ -1544,8 +1532,8 @@ static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; -static inline int -ctnetlink_change_protoinfo(struct nf_conn *ct, const struct nlattr * const cda[]) +static int ctnetlink_change_protoinfo(struct nf_conn *ct, + const struct nlattr * const cda[]) { const struct nlattr *attr = cda[CTA_PROTOINFO]; struct nlattr *tb[CTA_PROTOINFO_MAX+1]; @@ -1571,8 +1559,8 @@ static const struct nla_policy seqadj_policy[CTA_SEQADJ_MAX+1] = { [CTA_SEQADJ_OFFSET_AFTER] = { .type = NLA_U32 }, }; -static inline int -change_seq_adj(struct nf_ct_seqadj *seq, const struct nlattr * const attr) +static int change_seq_adj(struct nf_ct_seqadj *seq, + const struct nlattr * const attr) { int err; struct nlattr *cda[CTA_SEQADJ_MAX+1]; @@ -2405,10 +2393,9 @@ static struct nfnl_ct_hook ctnetlink_glue_hook = { * EXPECT ***********************************************************************/ -static inline int -ctnetlink_exp_dump_tuple(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - enum ctattr_expect type) +static int ctnetlink_exp_dump_tuple(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + enum ctattr_expect type) { struct nlattr *nest_parms; @@ -2425,10 +2412,9 @@ nla_put_failure: return -1; } -static inline int -ctnetlink_exp_dump_mask(struct sk_buff *skb, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple_mask *mask) +static int ctnetlink_exp_dump_mask(struct sk_buff *skb, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple_mask *mask) { int ret; struct nf_conntrack_l3proto *l3proto; @@ -2646,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: - hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]], + hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]], hnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; + + if (!net_eq(nf_ct_net(exp->master), net)) + continue; + if (cb->args[1]) { if (exp != last) continue; @@ -2900,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { @@ -2918,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl, spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, - &net->ct.expect_hash[i], + &nf_ct_expect_hash[i], hnode) { + + if (!net_eq(nf_ct_exp_net(exp), net)) + continue; + if (del_timer(&exp->timeout)) { nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).portid, diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index fce1b1cca..399a38fd6 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -645,7 +645,8 @@ static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, - cpu_to_be64(ct->proto.dccp.handshake_seq))) + cpu_to_be64(ct->proto.dccp.handshake_seq), + CTA_PROTOINFO_DCCP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest_parms); spin_unlock_bh(&ct->lock); @@ -660,6 +661,7 @@ static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, + [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, }; static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 9578a7c37..1d7ab960a 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -191,13 +191,7 @@ static void sctp_print_tuple(struct seq_file *s, /* Print out the private part of the conntrack. */ static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { - enum sctp_conntrack state; - - spin_lock_bh(&ct->lock); - state = ct->proto.sctp.state; - spin_unlock_bh(&ct->lock); - - seq_printf(s, "%s ", sctp_conntrack_names[state]); + seq_printf(s, "%s ", sctp_conntrack_names[ct->proto.sctp.state]); } #define for_each_sctp_chunk(skb, sch, _sch, offset, dataoff, count) \ diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7cc1d9c22..70c838164 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -313,13 +313,7 @@ static void tcp_print_tuple(struct seq_file *s, /* Print out the private part of the conntrack. */ static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { - enum tcp_conntrack state; - - spin_lock_bh(&ct->lock); - state = ct->proto.tcp.state; - spin_unlock_bh(&ct->lock); - - seq_printf(s, "%s ", tcp_conntrack_names[state]); + seq_printf(s, "%s ", tcp_conntrack_names[ct->proto.tcp.state]); } static unsigned int get_conntrack_index(const struct tcphdr *tcph) diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 478f92f83..4fd040575 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, @@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDP, .name = "udp", + .allow_clash = true, .pkt_to_tuple = udp_pkt_to_tuple, .invert_tuple = udp_invert_tuple, .print_tuple = udp_print_tuple, diff --git a/net/netfilter/nf_conntrack_proto_udplite.c b/net/netfilter/nf_conntrack_proto_udplite.c index 1ac8ee13a..9d692f5ad 100644 --- a/net/netfilter/nf_conntrack_proto_udplite.c +++ b/net/netfilter/nf_conntrack_proto_udplite.c @@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly = .l3proto = PF_INET, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, @@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly = .l3proto = PF_INET6, .l4proto = IPPROTO_UDPLITE, .name = "udplite", + .allow_clash = true, .pkt_to_tuple = udplite_pkt_to_tuple, .invert_tuple = udplite_invert_tuple, .print_tuple = udplite_print_tuple, diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 7523a575f..3fcbaab83 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -223,6 +223,7 @@ static int __init nf_conntrack_sane_init(void) if (ret) { pr_err("failed to register helper for pf: %d port: %d\n", sane[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_sane_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 3e0640273..f72ba5587 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1669,6 +1669,7 @@ static int __init nf_conntrack_sip_init(void) if (ret) { pr_err("failed to register helper for pf: %u port: %u\n", sip[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_sip_fini(); return ret; } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 0f1a45bca..c026c472e 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -54,14 +54,13 @@ struct ct_iter_state { static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < net->ct.htable_size; + st->bucket < nf_conntrack_htable_size; st->bucket++) { - n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket])); + n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket])); if (!is_a_nulls(n)) return n; } @@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, struct hlist_nulls_node *head) { - struct net *net = seq_file_net(seq); struct ct_iter_state *st = seq->private; head = rcu_dereference(hlist_nulls_next_rcu(head)); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= net->ct.htable_size) + if (++st->bucket >= nf_conntrack_htable_size) return NULL; } head = rcu_dereference( hlist_nulls_first_rcu( - &net->ct.hash[st->bucket])); + &nf_conntrack_hash[st->bucket])); } return head; } @@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &init_net.ct.htable_size, + .data = &nf_conntrack_htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, @@ -489,8 +487,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { { } }; -#define NET_NF_CONNTRACK_MAX 2089 - static struct ctl_table nf_ct_netfilter_table[] = { { .procname = "nf_conntrack_max", @@ -512,7 +508,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; - table[2].data = &net->ct.htable_size; table[3].data = &net->ct.sysctl_checksum; table[4].data = &net->ct.sysctl_log_invalid; diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index 36f964066..2e65b5430 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -142,6 +142,7 @@ static int __init nf_conntrack_tftp_init(void) if (ret) { pr_err("failed to register helper for pf: %u port: %u\n", tftp[i][j].tuple.src.l3num, ports[i]); + ports_c = i; nf_conntrack_tftp_fini(); return ret; } diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 06a9f4577..6877a396f 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO] static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO] __read_mostly; +static struct hlist_head *nf_nat_bysource __read_mostly; +static unsigned int nf_nat_htable_size __read_mostly; +static unsigned int nf_nat_hash_rnd __read_mostly; inline const struct nf_nat_l3proto * __nf_nat_l3proto_find(u8 family) @@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + /* Original src, to ensure we map it consistently if poss. */ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_conntrack_hash_rnd); + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); - return reciprocal_scale(hash, net->ct.nat_htable_size); + return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ @@ -196,9 +201,10 @@ find_appropriate_src(struct net *net, const struct nf_conn_nat *nat; const struct nf_conn *ct; - hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) { + hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) { ct = nat->ct; if (same_src(ct, tuple) && + net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, @@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct, nat = nfct_nat(ct); nat->ct = ct; hlist_add_head_rcu(&nat->bysource, - &net->ct.nat_bysource[srchash]); + &nf_nat_bysource[srchash]); spin_unlock_bh(&nf_nat_lock); } @@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, } #endif -static int __net_init nf_nat_net_init(struct net *net) -{ - /* Leave them the same for the moment. */ - net->ct.nat_htable_size = net->ct.htable_size; - net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0); - if (!net->ct.nat_bysource) - return -ENOMEM; - return 0; -} - static void __net_exit nf_nat_net_exit(struct net *net) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0); - synchronize_rcu(); - nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { - .init = nf_nat_net_init, .exit = nf_nat_net_exit, }; @@ -852,8 +845,16 @@ static int __init nf_nat_init(void) { int ret; + /* Leave them the same for the moment. */ + nf_nat_htable_size = nf_conntrack_htable_size; + + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); + if (!nf_nat_bysource) + return -ENOMEM; + ret = nf_ct_extend_register(&nat_extend); if (ret < 0) { + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); printk(KERN_ERR "nf_nat_core: Unable to register extension\n"); return ret; } @@ -877,6 +878,7 @@ static int __init nf_nat_init(void) return 0; cleanup_extend: + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); nf_ct_extend_unregister(&nat_extend); return ret; } @@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void) for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size); } MODULE_LICENSE("GPL"); diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 5baa8e24e..b19ad20a7 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -26,23 +26,21 @@ * Once the queue is registered it must reinject all packets it * receives, no matter what. */ -static const struct nf_queue_handler __rcu *queue_handler __read_mostly; /* return EBUSY when somebody else is registered, return EEXIST if the * same handler is registered, return 0 in case of success. */ -void nf_register_queue_handler(const struct nf_queue_handler *qh) +void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ - WARN_ON(rcu_access_pointer(queue_handler)); - rcu_assign_pointer(queue_handler, qh); + WARN_ON(rcu_access_pointer(net->nf.queue_handler)); + rcu_assign_pointer(net->nf.queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ -void nf_unregister_queue_handler(void) +void nf_unregister_queue_handler(struct net *net) { - RCU_INIT_POINTER(queue_handler, NULL); - synchronize_rcu(); + RCU_INIT_POINTER(net->nf.queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); @@ -103,7 +101,7 @@ void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) const struct nf_queue_handler *qh; rcu_read_lock(); - qh = rcu_dereference(queue_handler); + qh = rcu_dereference(net->nf.queue_handler); if (qh) qh->nf_hook_drop(net, ops); rcu_read_unlock(); @@ -122,9 +120,10 @@ int nf_queue(struct sk_buff *skb, struct nf_queue_entry *entry = NULL; const struct nf_afinfo *afinfo; const struct nf_queue_handler *qh; + struct net *net = state->net; /* QUEUE == DROP if no one is waiting, to be safe. */ - qh = rcu_dereference(queue_handler); + qh = rcu_dereference(net->nf.queue_handler); if (!qh) { status = -ESRCH; goto err; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2011977cd..cf7c74599 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -944,8 +944,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) if (nest == NULL) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts)) || - nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes))) + if (nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.pkts), + NFTA_COUNTER_PAD) || + nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), + NFTA_COUNTER_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); @@ -975,7 +977,8 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_TABLE, table->name)) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle))) + if (nla_put_be64(skb, NFTA_CHAIN_HANDLE, cpu_to_be64(chain->handle), + NFTA_CHAIN_PAD)) goto nla_put_failure; if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) goto nla_put_failure; @@ -1721,9 +1724,11 @@ struct nft_expr *nft_expr_init(const struct nft_ctx *ctx, err = nf_tables_newexpr(ctx, &info, expr); if (err < 0) - goto err2; + goto err3; return expr; +err3: + kfree(expr); err2: module_put(info.ops->type->owner); err1: @@ -1803,13 +1808,15 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; if (nla_put_string(skb, NFTA_RULE_CHAIN, chain->name)) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle))) + if (nla_put_be64(skb, NFTA_RULE_HANDLE, cpu_to_be64(rule->handle), + NFTA_RULE_PAD)) goto nla_put_failure; if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { prule = list_entry(rule->list.prev, struct nft_rule, list); if (nla_put_be64(skb, NFTA_RULE_POSITION, - cpu_to_be64(prule->handle))) + cpu_to_be64(prule->handle), + NFTA_RULE_PAD)) goto nla_put_failure; } @@ -2312,7 +2319,7 @@ nft_select_set_ops(const struct nlattr * const nla[], static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, [NFTA_SET_NAME] = { .type = NLA_STRING, - .len = IFNAMSIZ - 1 }, + .len = NFT_SET_MAXNAMELEN - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, @@ -2396,7 +2403,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, IFNAMSIZ, '%'); + p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2473,7 +2480,8 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, } if (set->timeout && - nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout))) + nla_put_be64(skb, NFTA_SET_TIMEOUT, cpu_to_be64(set->timeout), + NFTA_SET_PAD)) goto nla_put_failure; if (set->gc_int && nla_put_be32(skb, NFTA_SET_GC_INTERVAL, htonl(set->gc_int))) @@ -2641,6 +2649,8 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk, /* Only accept unspec with dump */ if (nfmsg->nfgen_family == NFPROTO_UNSPEC) return -EAFNOSUPPORT; + if (!nla[NFTA_SET_TABLE]) + return -EINVAL; set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]); if (IS_ERR(set)) @@ -2690,7 +2700,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[IFNAMSIZ]; + char name[NFT_SET_MAXNAMELEN]; unsigned int size; bool create; u64 timeout; @@ -2938,24 +2948,20 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, * jumps are already validated for that chain. */ list_for_each_entry(i, &set->bindings, list) { - if (binding->flags & NFT_SET_MAP && + if (i->flags & NFT_SET_MAP && i->chain == binding->chain) goto bind; } + iter.genmask = nft_genmask_next(ctx->net); iter.skip = 0; iter.count = 0; iter.err = 0; iter.fn = nf_tables_bind_check_setelem; set->ops->walk(ctx, set, &iter); - if (iter.err < 0) { - /* Destroy anonymous sets if binding fails */ - if (set->flags & NFT_SET_ANONYMOUS) - nf_tables_set_destroy(ctx, set); - + if (iter.err < 0) return iter.err; - } } bind: binding->chain = ctx->chain; @@ -3076,7 +3082,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, if (nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && nla_put_be64(skb, NFTA_SET_ELEM_TIMEOUT, - cpu_to_be64(*nft_set_ext_timeout(ext)))) + cpu_to_be64(*nft_set_ext_timeout(ext)), + NFTA_SET_ELEM_PAD)) goto nla_put_failure; if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { @@ -3089,7 +3096,8 @@ static int nf_tables_fill_setelem(struct sk_buff *skb, expires = 0; if (nla_put_be64(skb, NFTA_SET_ELEM_EXPIRATION, - cpu_to_be64(jiffies_to_msecs(expires)))) + cpu_to_be64(jiffies_to_msecs(expires)), + NFTA_SET_ELEM_PAD)) goto nla_put_failure; } @@ -3182,12 +3190,13 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; - args.cb = cb; - args.skb = skb; - args.iter.skip = cb->args[0]; - args.iter.count = 0; - args.iter.err = 0; - args.iter.fn = nf_tables_dump_setelem; + args.cb = cb; + args.skb = skb; + args.iter.genmask = nft_genmask_cur(ctx.net); + args.iter.skip = cb->args[0]; + args.iter.count = 0; + args.iter.err = 0; + args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&ctx, set, &args.iter); nla_nest_end(skb, nest); @@ -3367,6 +3376,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem) } EXPORT_SYMBOL_GPL(nft_set_elem_destroy); +static int nft_setelem_parse_flags(const struct nft_set *set, + const struct nlattr *attr, u32 *flags) +{ + if (attr == NULL) + return 0; + + *flags = ntohl(nla_get_be32(attr)); + if (*flags & ~NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + if (!(set->flags & NFT_SET_INTERVAL) && + *flags & NFT_SET_ELEM_INTERVAL_END) + return -EINVAL; + + return 0; +} + static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { @@ -3380,8 +3405,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, struct nft_data data; enum nft_registers dreg; struct nft_trans *trans; + u32 flags = 0; u64 timeout; - u32 flags; u8 ulen; int err; @@ -3395,17 +3420,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, nft_set_ext_prepare(&tmpl); - flags = 0; - if (nla[NFTA_SET_ELEM_FLAGS] != NULL) { - flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS])); - if (flags & ~NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (!(set->flags & NFT_SET_INTERVAL) && - flags & NFT_SET_ELEM_INTERVAL_END) - return -EINVAL; - if (flags != 0) - nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); - } + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); if (set->flags & NFT_SET_MAP) { if (nla[NFTA_SET_ELEM_DATA] == NULL && @@ -3574,9 +3593,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, const struct nlattr *attr) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; + struct nft_set_ext_tmpl tmpl; struct nft_data_desc desc; struct nft_set_elem elem; + struct nft_set_ext *ext; struct nft_trans *trans; + u32 flags = 0; + void *priv; int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, @@ -3588,6 +3611,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (nla[NFTA_SET_ELEM_KEY] == NULL) goto err1; + nft_set_ext_prepare(&tmpl); + + err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags); + if (err < 0) + return err; + if (flags != 0) + nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS); + err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc, nla[NFTA_SET_ELEM_KEY]); if (err < 0) @@ -3597,24 +3628,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (desc.type != NFT_DATA_VALUE || desc.len != set->klen) goto err2; + nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len); + + err = -ENOMEM; + elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0, + GFP_KERNEL); + if (elem.priv == NULL) + goto err2; + + ext = nft_set_elem_ext(set, elem.priv); + if (flags) + *nft_set_ext_flags(ext) = flags; + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) { err = -ENOMEM; - goto err2; + goto err3; } - elem.priv = set->ops->deactivate(set, &elem); - if (elem.priv == NULL) { + priv = set->ops->deactivate(set, &elem); + if (priv == NULL) { err = -ENOENT; - goto err3; + goto err4; } + kfree(elem.priv); + elem.priv = priv; nft_trans_elem(trans) = elem; list_add_tail(&trans->list, &ctx->net->nft.commit_list); return 0; -err3: +err4: kfree(trans); +err3: + kfree(elem.priv); err2: nft_data_uninit(&elem.key.val, desc.type); err1: @@ -4236,6 +4283,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, binding->chain != chain) continue; + iter.genmask = nft_genmask_next(ctx->net); iter.skip = 0; iter.count = 0; iter.err = 0; diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index e9f8dffcc..fb8b5892b 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -143,7 +143,7 @@ next_rule: list_for_each_entry_continue_rcu(rule, &chain->rules, list) { /* This rule is not active, skip. */ - if (unlikely(rule->genmask & (1 << gencursor))) + if (unlikely(rule->genmask & gencursor)) continue; rulenum++; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index e9e959f65..39eb1cc62 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -156,7 +156,8 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb, return 0; return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE, - cpu_to_be64(info->rule->handle)); + cpu_to_be64(info->rule->handle), + NFTA_TRACE_PAD); } void nft_trace_notify(struct nft_traceinfo *info) @@ -174,7 +175,7 @@ void nft_trace_notify(struct nft_traceinfo *info) size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(NFT_TABLE_MAXNAMELEN) + nla_total_size(NFT_CHAIN_MAXNAMELEN) + - nla_total_size(sizeof(__be64)) + /* rule handle */ + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index dbd0803b1..1b4de4bd6 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -162,15 +162,18 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, pkts = atomic64_read(&acct->pkts); bytes = atomic64_read(&acct->bytes); } - if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts)) || - nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes)) || + if (nla_put_be64(skb, NFACCT_PKTS, cpu_to_be64(pkts), + NFACCT_PAD) || + nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes), + NFACCT_PAD) || nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) goto nla_put_failure; if (acct->flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)acct->data; if (nla_put_be32(skb, NFACCT_FLAGS, htonl(old_flags)) || - nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota))) + nla_put_be64(skb, NFACCT_QUOTA, cpu_to_be64(*quota), + NFACCT_PAD)) goto nla_put_failure; } nlmsg_end(skb, nlh); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 2671b9deb..3c84f1432 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout) int i; local_bh_disable(); - for (i = 0; i < net->ct.htable_size; i++) { + for (i = 0; i < nf_conntrack_htable_size; i++) { nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); - if (i < net->ct.htable_size) { - hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) + if (i < nf_conntrack_htable_size) { + hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode) untimeout(h, timeout); } spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index cb5b630a6..5d36a0926 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -295,6 +295,59 @@ static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) return seclen; } +static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) +{ + struct sk_buff *entskb = entry->skb; + u32 nlalen = 0; + + if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) + return 0; + + if (skb_vlan_tag_present(entskb)) + nlalen += nla_total_size(nla_total_size(sizeof(__be16)) + + nla_total_size(sizeof(__be16))); + + if (entskb->network_header > entskb->mac_header) + nlalen += nla_total_size((entskb->network_header - + entskb->mac_header)); + + return nlalen; +} + +static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) +{ + struct sk_buff *entskb = entry->skb; + + if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) + return 0; + + if (skb_vlan_tag_present(entskb)) { + struct nlattr *nest; + + nest = nla_nest_start(skb, NFQA_VLAN | NLA_F_NESTED); + if (!nest) + goto nla_put_failure; + + if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) || + nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto)) + goto nla_put_failure; + + nla_nest_end(skb, nest); + } + + if (entskb->mac_header < entskb->network_header) { + int len = (int)(entskb->network_header - entskb->mac_header); + + if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb))) + goto nla_put_failure; + } + + return 0; + +nla_put_failure: + return -1; +} + static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, @@ -334,6 +387,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, if (entskb->tstamp.tv64) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); + size += nfqnl_get_bridge_size(entry); + if (entry->state.hook <= NF_INET_FORWARD || (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) csum_verify = !skb_csum_unnecessary(entskb); @@ -497,9 +552,12 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } } + if (nfqnl_put_bridge(entry, skb) < 0) + goto nla_put_failure; + if (entskb->tstamp.tv64) { struct nfqnl_msg_packet_timestamp ts; - struct timespec64 kts = ktime_to_timespec64(skb->tstamp); + struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); @@ -911,12 +969,18 @@ static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; +static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = { + [NFQA_VLAN_TCI] = { .type = NLA_U16}, + [NFQA_VLAN_PROTO] = { .type = NLA_U16}, +}; + static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, [NFQA_EXP] = { .type = NLA_UNSPEC }, + [NFQA_VLAN] = { .type = NLA_NESTED }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { @@ -1030,6 +1094,40 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct, return ct; } +static int nfqa_parse_bridge(struct nf_queue_entry *entry, + const struct nlattr * const nfqa[]) +{ + if (nfqa[NFQA_VLAN]) { + struct nlattr *tb[NFQA_VLAN_MAX + 1]; + int err; + + err = nla_parse_nested(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], + nfqa_vlan_policy); + if (err < 0) + return err; + + if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) + return -EINVAL; + + entry->skb->vlan_tci = ntohs(nla_get_be16(tb[NFQA_VLAN_TCI])); + entry->skb->vlan_proto = nla_get_be16(tb[NFQA_VLAN_PROTO]); + } + + if (nfqa[NFQA_L2HDR]) { + int mac_header_len = entry->skb->network_header - + entry->skb->mac_header; + + if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) + return -EINVAL; + else if (mac_header_len > 0) + memcpy(skb_mac_header(entry->skb), + nla_data(nfqa[NFQA_L2HDR]), + mac_header_len); + } + + return 0; +} + static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, @@ -1045,6 +1143,7 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, struct nfnl_ct_hook *nfnl_ct; struct nf_conn *ct = NULL; struct nfnl_queue_net *q = nfnl_queue_pernet(net); + int err; queue = instance_lookup(q, queue_num); if (!queue) @@ -1071,6 +1170,12 @@ static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl, ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo); } + if (entry->state.pf == PF_BRIDGE) { + err = nfqa_parse_bridge(entry, nfqa); + if (err < 0) + return err; + } + if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); int diff = payload_len - entry->skb->len; @@ -1377,21 +1482,29 @@ static int __net_init nfnl_queue_net_init(struct net *net) net->nf.proc_netfilter, &nfqnl_file_ops)) return -ENOMEM; #endif + nf_register_queue_handler(net, &nfqh); return 0; } static void __net_exit nfnl_queue_net_exit(struct net *net) { + nf_unregister_queue_handler(net); #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif } +static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) +{ + synchronize_rcu(); +} + static struct pernet_operations nfnl_queue_net_ops = { - .init = nfnl_queue_net_init, - .exit = nfnl_queue_net_exit, - .id = &nfnl_queue_net_id, - .size = sizeof(struct nfnl_queue_net), + .init = nfnl_queue_net_init, + .exit = nfnl_queue_net_exit, + .exit_batch = nfnl_queue_net_exit_batch, + .id = &nfnl_queue_net_id, + .size = sizeof(struct nfnl_queue_net), }; static int __init nfnetlink_queue_init(void) @@ -1412,7 +1525,6 @@ static int __init nfnetlink_queue_init(void) } register_netdevice_notifier(&nfqnl_dev_notifier); - nf_register_queue_handler(&nfqh); return status; cleanup_netlink_notifier: @@ -1424,7 +1536,6 @@ out: static void __exit nfnetlink_queue_fini(void) { - nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index c9743f78f..77db8358a 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -76,8 +76,10 @@ static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) nft_counter_fetch(priv->counter, &total); - if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) || - nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets))) + if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes), + NFTA_COUNTER_PAD) || + nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets), + NFTA_COUNTER_PAD)) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index d4a4619fc..81fbb4507 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -54,7 +54,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr, const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; - long diff; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); @@ -94,10 +93,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr, return; #endif case NFT_CT_EXPIRATION: - diff = (long)jiffies - (long)ct->timeout.expires; - if (diff < 0) - diff = 0; - *dest = jiffies_to_msecs(diff); + *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) @@ -198,6 +194,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr, } break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_replace(ct, + ®s->data[priv->sreg], + ®s->data[priv->sreg], + NF_CT_LABELS_MAX_SIZE / sizeof(u32)); + break; +#endif default: break; } @@ -365,6 +369,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, len = FIELD_SIZEOF(struct nf_conn, mark); break; #endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = NF_CT_LABELS_MAX_SIZE; + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; + break; +#endif default: return -EOPNOTSUPP; } @@ -384,6 +398,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, static void nft_ct_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + switch (priv->key) { +#ifdef CONFIG_NF_CONNTRACK_LABELS + case NFT_CT_LABELS: + nf_connlabels_put(ctx->net); + break; +#endif + default: + break; + } + nft_ct_l3proto_module_put(ctx->afi->family); } @@ -484,6 +510,8 @@ static struct nft_expr_type nft_ct_type __read_mostly = { static int __init nft_ct_module_init(void) { + BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE); + return nft_register_expr(&nft_ct_type); } diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 9dec3bd1b..78d4914fb 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -227,7 +227,8 @@ static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) goto nla_put_failure; if (nla_put_string(skb, NFTA_DYNSET_SET_NAME, priv->set->name)) goto nla_put_failure; - if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout))) + if (nla_put_be64(skb, NFTA_DYNSET_TIMEOUT, cpu_to_be64(priv->timeout), + NFTA_DYNSET_PAD)) goto nla_put_failure; if (priv->expr && nft_expr_dump(skb, NFTA_DYNSET_EXPR, priv->expr)) goto nla_put_failure; diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index 3f9d45d3d..f39c53a15 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -189,10 +189,9 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, struct nft_hash_elem *he; struct rhashtable_iter hti; struct nft_set_elem elem; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int err; - err = rhashtable_walk_init(&priv->ht, &hti); + err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); iter->err = err; if (err) return; @@ -218,7 +217,7 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set, goto cont; if (nft_set_elem_expired(&he->ext)) goto cont; - if (!nft_set_elem_active(&he->ext, genmask)) + if (!nft_set_elem_active(&he->ext, iter->genmask)) goto cont; elem.priv = he; @@ -248,7 +247,7 @@ static void nft_hash_gc(struct work_struct *work) priv = container_of(work, struct nft_hash, gc_work.work); set = nft_set_container_of(priv); - err = rhashtable_walk_init(&priv->ht, &hti); + err = rhashtable_walk_init(&priv->ht, &hti, GFP_KERNEL); if (err) goto schedule; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 99d18578a..070b98938 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -97,8 +97,10 @@ static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit, u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC); u64 rate = limit->rate - limit->burst; - if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) || - nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) || + if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate), + NFTA_LIMIT_PAD) || + nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs), + NFTA_LIMIT_PAD) || nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) || nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) || nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags))) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 16c50b0dd..f4bad9dc1 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -227,7 +227,7 @@ void nft_meta_set_eval(const struct nft_expr *expr, skb->pkt_type = value; break; case NFT_META_NFTRACE: - skb->nf_trace = 1; + skb->nf_trace = !!value; break; default: WARN_ON(1); diff --git a/net/netfilter/nft_rbtree.c b/net/netfilter/nft_rbtree.c index 1c30f41cf..7201d57b5 100644 --- a/net/netfilter/nft_rbtree.c +++ b/net/netfilter/nft_rbtree.c @@ -29,6 +29,17 @@ struct nft_rbtree_elem { struct nft_set_ext ext; }; +static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe) +{ + return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && + (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END); +} + +static bool nft_rbtree_equal(const struct nft_set *set, const void *this, + const struct nft_rbtree_elem *interval) +{ + return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0; +} static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) @@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, const struct nft_rbtree_elem *rbe, *interval = NULL; const struct rb_node *parent; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); + const void *this; int d; spin_lock_bh(&nft_rbtree_lock); @@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key, while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); + this = nft_set_ext_key(&rbe->ext); + d = memcmp(this, key, set->klen); if (d < 0) { parent = parent->rb_left; + /* In case of adjacent ranges, we always see the high + * part of the range in first place, before the low one. + * So don't update interval if the keys are equal. + */ + if (interval && nft_rbtree_equal(set, this, interval)) + continue; interval = rbe; } else if (d > 0) parent = parent->rb_right; @@ -56,9 +75,7 @@ found: parent = parent->rb_left; continue; } - if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) && - *nft_set_ext_flags(&rbe->ext) & - NFT_SET_ELEM_INTERVAL_END) + if (nft_rbtree_interval_end(rbe)) goto out; spin_unlock_bh(&nft_rbtree_lock); @@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set, else if (d > 0) p = &parent->rb_right; else { - if (nft_set_elem_active(&rbe->ext, genmask)) - return -EEXIST; - p = &parent->rb_left; + if (nft_set_elem_active(&rbe->ext, genmask)) { + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(new)) + p = &parent->rb_left; + else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(new)) + p = &parent->rb_right; + else + return -EEXIST; + } } } rb_link_node(&new->node, parent, p); @@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, { const struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; - struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe, *this = elem->priv; u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); int d; @@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set, parent = parent->rb_left; continue; } + if (nft_rbtree_interval_end(rbe) && + !nft_rbtree_interval_end(this)) { + parent = parent->rb_left; + continue; + } else if (!nft_rbtree_interval_end(rbe) && + nft_rbtree_interval_end(this)) { + parent = parent->rb_right; + continue; + } nft_set_elem_change_active(set, &rbe->ext); return rbe; } @@ -178,7 +211,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_rbtree_elem *rbe; struct nft_set_elem elem; struct rb_node *node; - u8 genmask = nft_genmask_cur(read_pnet(&set->pnet)); spin_lock_bh(&nft_rbtree_lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { @@ -186,7 +218,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; - if (!nft_set_elem_active(&rbe->ext, genmask)) + if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; elem.priv = rbe; diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index bb9cbeb18..a79af2555 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -18,6 +18,16 @@ MODULE_DESCRIPTION("Xtables: add/match connection trackling labels"); MODULE_ALIAS("ipt_connlabel"); MODULE_ALIAS("ip6t_connlabel"); +static bool connlabel_match(const struct nf_conn *ct, u16 bit) +{ + struct nf_conn_labels *labels = nf_ct_labels_find(ct); + + if (!labels) + return false; + + return BIT_WORD(bit) < labels->words && test_bit(bit, labels->bits); +} + static bool connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) { @@ -33,7 +43,7 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->options & XT_CONNLABEL_OP_SET) return (nf_connlabel_set(ct, info->bit) == 0) ^ invert; - return nf_connlabel_match(ct, info->bit) ^ invert; + return connlabel_match(ct, info->bit) ^ invert; } static int connlabel_mt_check(const struct xt_mtchk_param *par) @@ -55,7 +65,7 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par) return ret; } - ret = nf_connlabels_get(par->net, info->bit + 1); + ret = nf_connlabels_get(par->net, info->bit); if (ret < 0) nf_ct_l3proto_module_put(par->family); return ret; diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 49d14ecad..b10ade272 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -120,9 +120,9 @@ xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return __inet_lookup(net, &tcp_hashinfo, skb, doff, - saddr, sport, daddr, dport, - in->ifindex); + return inet_lookup(net, &tcp_hashinfo, skb, doff, + saddr, sport, daddr, dport, + in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, in->ifindex); |