From 377f0a08e4cb56658d878d22c3aed4716e283c6b Mon Sep 17 00:00:00 2001 From: Rami Rosen Date: Tue, 31 Mar 2009 14:43:17 -0700 Subject: ipv4: remove unused parameter from tcp_recv_urg(). Signed-off-by: Rami Rosen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2451aeb5ac2..fafbec8b073 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1081,8 +1081,7 @@ out_err: * this, no blocking and very strange errors 8) */ -static int tcp_recv_urg(struct sock *sk, long timeo, - struct msghdr *msg, int len, int flags) +static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); @@ -1697,7 +1696,7 @@ out: return err; recv_urg: - err = tcp_recv_urg(sk, timeo, msg, len, flags); + err = tcp_recv_urg(sk, msg, len, flags); goto out; } -- cgit v1.2.3 From c9caceca25854eff4328c89045793a91bf8f9ee3 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 31 Mar 2009 15:06:26 -0700 Subject: core: remove pointless conditional before kfree() Remove pointless conditional before kfree(). Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 244ca56dffa..d9d5160610d 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -261,8 +261,7 @@ static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr) ret = 0; err_out: - if (rule_buf) - kfree(rule_buf); + kfree(rule_buf); return ret; } -- cgit v1.2.3 From f1cffcbfcc53b825da7d1d26244aabd8dccb24aa Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Apr 2009 08:20:18 +0000 Subject: RDS: Fix m_rs_lock deadlock rs_send_drop_to() is called during socket close. If it takes m_rs_lock without disabling interrupts, then rds_send_remove_from_sock() can run from the rx completion handler and thus deadlock. Signed-off-by: Andy Grover Signed-off-by: David S. Miller --- net/rds/send.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 1b37364656f..104fe033203 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -615,7 +615,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; - unsigned long flags; + unsigned long flags, flags2; LIST_HEAD(list); int wake = 0; @@ -651,9 +651,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) list_for_each_entry(rm, &list, m_sock_item) { /* We do this here rather than in the loop above, so that * we don't have to nest m_rs_lock under rs->rs_lock */ - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags2); rm->m_rs = NULL; - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags2); /* * If we see this flag cleared then we're *sure* that someone -- cgit v1.2.3 From 745cbccac3fe8cead529a1b3358e1e86a1505bfa Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Apr 2009 08:20:19 +0000 Subject: RDS: Rewrite connection cleanup, fixing oops on rmmod This fixes a bug where a connection was unexpectedly not on *any* list while being destroyed. It also cleans up some code duplication and regularizes some function names. * Grab appropriate lock in conn_free() and explain in comment * Ensure via locking that a conn is never not on either a dev's list or the nodev list * Add rds_xx_remove_conn() to match rds_xx_add_conn() * Make rds_xx_add_conn() return void * Rename remove_{,nodev_}conns() to destroy_{,nodev_}conns() and unify their implementation in a helper function * Document lock ordering as nodev conn_lock before dev_conn_lock Reported-by: Yosef Etigin Signed-off-by: Andy Grover Signed-off-by: David S. Miller --- net/rds/ib.c | 5 +++-- net/rds/ib.h | 14 +++++++++++--- net/rds/ib_cm.c | 34 +++++++++++++++++++--------------- net/rds/ib_rdma.c | 43 +++++++++++++++++++++---------------------- net/rds/iw.c | 5 +++-- net/rds/iw.h | 14 +++++++++++--- net/rds/iw_cm.c | 35 +++++++++++++++++++---------------- net/rds/iw_rdma.c | 44 ++++++++++++++++++++++---------------------- 8 files changed, 109 insertions(+), 85 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 06a7b798d9a..4933b380985 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -51,6 +51,7 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); struct list_head rds_ib_devices; +/* NOTE: if also grabbing ibdev lock, grab this first */ DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); @@ -137,7 +138,7 @@ void rds_ib_remove_one(struct ib_device *device) kfree(i_ipaddr); } - rds_ib_remove_conns(rds_ibdev); + rds_ib_destroy_conns(rds_ibdev); if (rds_ibdev->mr_pool) rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); @@ -249,7 +250,7 @@ static int rds_ib_laddr_check(__be32 addr) void rds_ib_exit(void) { rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); - rds_ib_remove_nodev_conns(); + rds_ib_destroy_nodev_conns(); ib_unregister_client(&rds_ib_client); rds_ib_sysctl_exit(); rds_ib_recv_exit(); diff --git a/net/rds/ib.h b/net/rds/ib.h index 8be563a1363..c08ffffb316 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -267,9 +267,17 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, /* ib_rdma.c */ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); -int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); -void rds_ib_remove_nodev_conns(void); -void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev); +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); +void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); +static inline void rds_ib_destroy_nodev_conns(void) +{ + __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); +} +static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev) +{ + __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); +} struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0532237bd12..889ab044135 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -126,9 +126,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); if (err) printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); - err = rds_ib_add_conn(rds_ibdev, conn); - if (err) - printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err); + rds_ib_add_conn(rds_ibdev, conn); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ @@ -616,18 +614,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) /* * Move connection back to the nodev list. */ - if (ic->rds_ibdev) { - - spin_lock_irq(&ic->rds_ibdev->spinlock); - BUG_ON(list_empty(&ic->ib_node)); - list_del(&ic->ib_node); - spin_unlock_irq(&ic->rds_ibdev->spinlock); - - spin_lock_irq(&ib_nodev_conns_lock); - list_add_tail(&ic->ib_node, &ib_nodev_conns); - spin_unlock_irq(&ib_nodev_conns_lock); - ic->rds_ibdev = NULL; - } + if (ic->rds_ibdev) + rds_ib_remove_conn(ic->rds_ibdev, conn); ic->i_cm_id = NULL; ic->i_pd = NULL; @@ -701,11 +689,27 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) return 0; } +/* + * Free a connection. Connection must be shut down and not set for reconnect. + */ void rds_ib_conn_free(void *arg) { struct rds_ib_connection *ic = arg; + spinlock_t *lock_ptr; + rdsdebug("ic %p\n", ic); + + /* + * Conn is either on a dev's list or on the nodev list. + * A race with shutdown() or connect() would cause problems + * (since rds_ibdev would change) but that should never happen. + */ + lock_ptr = ic->rds_ibdev ? &ic->rds_ibdev->spinlock : &ib_nodev_conns_lock; + + spin_lock_irq(lock_ptr); list_del(&ic->ib_node); + spin_unlock_irq(lock_ptr); + kfree(ic); } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 69a6289ed67..81033af9302 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -139,7 +139,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } -int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) +void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; @@ -148,45 +148,44 @@ int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn BUG_ON(list_empty(&ib_nodev_conns)); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); - spin_unlock_irq(&ib_nodev_conns_lock); spin_lock_irq(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); spin_unlock_irq(&rds_ibdev->spinlock); + spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; - - return 0; } -void rds_ib_remove_nodev_conns(void) +void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { - struct rds_ib_connection *ic, *_ic; - LIST_HEAD(tmp_list); + struct rds_ib_connection *ic = conn->c_transport_data; - /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(&ib_nodev_conns_lock); - list_splice(&ib_nodev_conns, &tmp_list); - INIT_LIST_HEAD(&ib_nodev_conns); - spin_unlock_irq(&ib_nodev_conns_lock); + /* place conn on nodev_conns_list */ + spin_lock(&ib_nodev_conns_lock); - list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { - if (ic->conn->c_passive) - rds_conn_destroy(ic->conn->c_passive); - rds_conn_destroy(ic->conn); - } + spin_lock_irq(&rds_ibdev->spinlock); + BUG_ON(list_empty(&ic->ib_node)); + list_del(&ic->ib_node); + spin_unlock_irq(&rds_ibdev->spinlock); + + list_add_tail(&ic->ib_node, &ib_nodev_conns); + + spin_unlock(&ib_nodev_conns_lock); + + ic->rds_ibdev = NULL; } -void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev) +void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(&rds_ibdev->spinlock); - list_splice(&rds_ibdev->conn_list, &tmp_list); - INIT_LIST_HEAD(&rds_ibdev->conn_list); - spin_unlock_irq(&rds_ibdev->spinlock); + spin_lock_irq(list_lock); + list_splice(list, &tmp_list); + INIT_LIST_HEAD(list); + spin_unlock_irq(list_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) { if (ic->conn->c_passive) diff --git a/net/rds/iw.c b/net/rds/iw.c index 1b56905c4c0..b732efb5b63 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -51,6 +51,7 @@ MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MR struct list_head rds_iw_devices; +/* NOTE: if also grabbing iwdev lock, grab this first */ DEFINE_SPINLOCK(iw_nodev_conns_lock); LIST_HEAD(iw_nodev_conns); @@ -145,7 +146,7 @@ void rds_iw_remove_one(struct ib_device *device) } spin_unlock_irq(&rds_iwdev->spinlock); - rds_iw_remove_conns(rds_iwdev); + rds_iw_destroy_conns(rds_iwdev); if (rds_iwdev->mr_pool) rds_iw_destroy_mr_pool(rds_iwdev->mr_pool); @@ -258,7 +259,7 @@ static int rds_iw_laddr_check(__be32 addr) void rds_iw_exit(void) { rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info); - rds_iw_remove_nodev_conns(); + rds_iw_destroy_nodev_conns(); ib_unregister_client(&rds_iw_client); rds_iw_sysctl_exit(); rds_iw_recv_exit(); diff --git a/net/rds/iw.h b/net/rds/iw.h index 0ddda34f2a1..70eb948f42f 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -294,9 +294,17 @@ void rds_iw_cm_connect_complete(struct rds_connection *conn, /* ib_rdma.c */ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); -int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); -void rds_iw_remove_nodev_conns(void); -void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev); +void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn); +void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock); +static inline void rds_iw_destroy_nodev_conns(void) +{ + __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock); +} +static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev) +{ + __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock); +} struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *); void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo); void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *); diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 57ecb3d4b8a..0ffaa3e97ad 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -86,9 +86,7 @@ void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id); if (err) printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err); - err = rds_iw_add_conn(rds_iwdev, conn); - if (err) - printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err); + rds_iw_add_conn(rds_iwdev, conn); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ @@ -637,19 +635,8 @@ void rds_iw_conn_shutdown(struct rds_connection *conn) * Move connection back to the nodev list. * Remove cm_id from the device cm_id list. */ - if (ic->rds_iwdev) { - - spin_lock_irq(&ic->rds_iwdev->spinlock); - BUG_ON(list_empty(&ic->iw_node)); - list_del(&ic->iw_node); - spin_unlock_irq(&ic->rds_iwdev->spinlock); - - spin_lock_irq(&iw_nodev_conns_lock); - list_add_tail(&ic->iw_node, &iw_nodev_conns); - spin_unlock_irq(&iw_nodev_conns_lock); - rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); - ic->rds_iwdev = NULL; - } + if (ic->rds_iwdev) + rds_iw_remove_conn(ic->rds_iwdev, conn); rdma_destroy_id(ic->i_cm_id); @@ -726,11 +713,27 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) return 0; } +/* + * Free a connection. Connection must be shut down and not set for reconnect. + */ void rds_iw_conn_free(void *arg) { struct rds_iw_connection *ic = arg; + spinlock_t *lock_ptr; + rdsdebug("ic %p\n", ic); + + /* + * Conn is either on a dev's list or on the nodev list. + * A race with shutdown() or connect() would cause problems + * (since rds_iwdev would change) but that should never happen. + */ + lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock; + + spin_lock_irq(lock_ptr); list_del(&ic->iw_node); + spin_unlock_irq(lock_ptr); + kfree(ic); } diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 1c02a8f952d..dcdb37da80f 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -196,7 +196,7 @@ int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_i return rds_iw_add_cm_id(rds_iwdev, cm_id); } -int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) +void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) { struct rds_iw_connection *ic = conn->c_transport_data; @@ -205,45 +205,45 @@ int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn BUG_ON(list_empty(&iw_nodev_conns)); BUG_ON(list_empty(&ic->iw_node)); list_del(&ic->iw_node); - spin_unlock_irq(&iw_nodev_conns_lock); spin_lock_irq(&rds_iwdev->spinlock); list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); spin_unlock_irq(&rds_iwdev->spinlock); + spin_unlock_irq(&iw_nodev_conns_lock); ic->rds_iwdev = rds_iwdev; - - return 0; } -void rds_iw_remove_nodev_conns(void) +void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn) { - struct rds_iw_connection *ic, *_ic; - LIST_HEAD(tmp_list); + struct rds_iw_connection *ic = conn->c_transport_data; - /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(&iw_nodev_conns_lock); - list_splice(&iw_nodev_conns, &tmp_list); - INIT_LIST_HEAD(&iw_nodev_conns); - spin_unlock_irq(&iw_nodev_conns_lock); + /* place conn on nodev_conns_list */ + spin_lock(&iw_nodev_conns_lock); - list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) { - if (ic->conn->c_passive) - rds_conn_destroy(ic->conn->c_passive); - rds_conn_destroy(ic->conn); - } + spin_lock_irq(&rds_iwdev->spinlock); + BUG_ON(list_empty(&ic->iw_node)); + list_del(&ic->iw_node); + spin_unlock_irq(&rds_iwdev->spinlock); + + list_add_tail(&ic->iw_node, &iw_nodev_conns); + + spin_unlock(&iw_nodev_conns_lock); + + rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id); + ic->rds_iwdev = NULL; } -void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev) +void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock) { struct rds_iw_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(&rds_iwdev->spinlock); - list_splice(&rds_iwdev->conn_list, &tmp_list); - INIT_LIST_HEAD(&rds_iwdev->conn_list); - spin_unlock_irq(&rds_iwdev->spinlock); + spin_lock_irq(list_lock); + list_splice(list, &tmp_list); + INIT_LIST_HEAD(list); + spin_unlock_irq(list_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) { if (ic->conn->c_passive) -- cgit v1.2.3 From 8cbd9606a6367c221a7bbcc47f3ab1a8c31b6437 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Apr 2009 08:20:20 +0000 Subject: RDS: Use spinlock to protect 64b value update on 32b archs We have a 64bit value that needs to be set atomically. This is easy and quick on all 64bit archs, and can also be done on x86/32 with set_64bit() (uses cmpxchg8b). However other 32b archs don't have this. I actually changed this to the current state in preparation for mainline because the old way (using a spinlock on 32b) resulted in unsightly #ifdefs in the code. But obviously, being correct takes precedence. Signed-off-by: Andy Grover Signed-off-by: David S. Miller --- net/rds/ib.h | 14 +++++--------- net/rds/ib_cm.c | 9 ++++++++- net/rds/ib_recv.c | 37 +++++++++++++++++++++++++++++++++++-- net/rds/iw.h | 14 +++++--------- net/rds/iw_cm.c | 9 ++++++++- net/rds/iw_recv.c | 37 +++++++++++++++++++++++++++++++++++-- net/rds/rds.h | 4 ++++ 7 files changed, 100 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index c08ffffb316..069206cae73 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -108,7 +108,12 @@ struct rds_ib_connection { /* sending acks */ unsigned long i_ack_flags; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t i_ack_next; /* next ACK to send */ +#else + spinlock_t i_ack_lock; /* protect i_ack_next */ u64 i_ack_next; /* next ACK to send */ +#endif struct rds_header *i_ack; struct ib_send_wr i_ack_wr; struct ib_sge i_ack_sge; @@ -363,13 +368,4 @@ rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) return &sge[1]; } -static inline void rds_ib_set_64bit(u64 *ptr, u64 val) -{ -#if BITS_PER_LONG == 64 - *ptr = val; -#else - set_64bit(ptr, val); -#endif -} - #endif diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 889ab044135..f8e40e1a603 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -636,7 +636,11 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) /* Clear the ACK state */ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - rds_ib_set_64bit(&ic->i_ack_next, 0); +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_set(&ic->i_ack_next, 0); +#else + ic->i_ack_next = 0; +#endif ic->i_ack_recv = 0; /* Clear flow control state */ @@ -669,6 +673,9 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) INIT_LIST_HEAD(&ic->ib_node); mutex_init(&ic->i_recv_mutex); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&ic->i_ack_lock); +#endif /* * rds_ib_conn_shutdown() waits for these to be emptied so they diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 5061b550216..36d931573ff 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -395,10 +395,37 @@ void rds_ib_recv_init_ack(struct rds_ib_connection *ic) * room for it beyond the ring size. Send completion notices its special * wr_id and avoids working with the ring in that case. */ +#ifndef KERNEL_HAS_ATOMIC64 static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, int ack_required) { - rds_ib_set_64bit(&ic->i_ack_next, seq); + unsigned long flags; + + spin_lock_irqsave(&ic->i_ack_lock, flags); + ic->i_ack_next = seq; + if (ack_required) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + spin_unlock_irqrestore(&ic->i_ack_lock, flags); +} + +static u64 rds_ib_get_ack(struct rds_ib_connection *ic) +{ + unsigned long flags; + u64 seq; + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + spin_lock_irqsave(&ic->i_ack_lock, flags); + seq = ic->i_ack_next; + spin_unlock_irqrestore(&ic->i_ack_lock, flags); + + return seq; +} +#else +static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq, + int ack_required) +{ + atomic64_set(&ic->i_ack_next, seq); if (ack_required) { smp_mb__before_clear_bit(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); @@ -410,8 +437,10 @@ static u64 rds_ib_get_ack(struct rds_ib_connection *ic) clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); smp_mb__after_clear_bit(); - return ic->i_ack_next; + return atomic64_read(&ic->i_ack_next); } +#endif + static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits) { @@ -464,6 +493,10 @@ static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credi * - i_ack_next, which is the last sequence number we received * * Potentially, send queue and receive queue handlers can run concurrently. + * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. * * Reconnecting complicates this picture just slightly. When we * reconnect, we may be seeing duplicate packets. The peer diff --git a/net/rds/iw.h b/net/rds/iw.h index 70eb948f42f..b4fb2725289 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -131,7 +131,12 @@ struct rds_iw_connection { /* sending acks */ unsigned long i_ack_flags; +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_t i_ack_next; /* next ACK to send */ +#else + spinlock_t i_ack_lock; /* protect i_ack_next */ u64 i_ack_next; /* next ACK to send */ +#endif struct rds_header *i_ack; struct ib_send_wr i_ack_wr; struct ib_sge i_ack_sge; @@ -391,13 +396,4 @@ rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge) return &sge[1]; } -static inline void rds_iw_set_64bit(u64 *ptr, u64 val) -{ -#if BITS_PER_LONG == 64 - *ptr = val; -#else - set_64bit(ptr, val); -#endif -} - #endif diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index 0ffaa3e97ad..a416b0d492b 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -659,7 +659,11 @@ void rds_iw_conn_shutdown(struct rds_connection *conn) /* Clear the ACK state */ clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags); - rds_iw_set_64bit(&ic->i_ack_next, 0); +#ifdef KERNEL_HAS_ATOMIC64 + atomic64_set(&ic->i_ack_next, 0); +#else + ic->i_ack_next = 0; +#endif ic->i_ack_recv = 0; /* Clear flow control state */ @@ -693,6 +697,9 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) INIT_LIST_HEAD(&ic->iw_node); mutex_init(&ic->i_recv_mutex); +#ifndef KERNEL_HAS_ATOMIC64 + spin_lock_init(&ic->i_ack_lock); +#endif /* * rds_iw_conn_shutdown() waits for these to be emptied so they diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index a1931f0027a..fde470fa50d 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -395,10 +395,37 @@ void rds_iw_recv_init_ack(struct rds_iw_connection *ic) * room for it beyond the ring size. Send completion notices its special * wr_id and avoids working with the ring in that case. */ +#ifndef KERNEL_HAS_ATOMIC64 static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, int ack_required) { - rds_iw_set_64bit(&ic->i_ack_next, seq); + unsigned long flags; + + spin_lock_irqsave(&ic->i_ack_lock, flags); + ic->i_ack_next = seq; + if (ack_required) + set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + spin_unlock_irqrestore(&ic->i_ack_lock, flags); +} + +static u64 rds_iw_get_ack(struct rds_iw_connection *ic) +{ + unsigned long flags; + u64 seq; + + clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); + + spin_lock_irqsave(&ic->i_ack_lock, flags); + seq = ic->i_ack_next; + spin_unlock_irqrestore(&ic->i_ack_lock, flags); + + return seq; +} +#else +static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq, + int ack_required) +{ + atomic64_set(&ic->i_ack_next, seq); if (ack_required) { smp_mb__before_clear_bit(); set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); @@ -410,8 +437,10 @@ static u64 rds_iw_get_ack(struct rds_iw_connection *ic) clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); smp_mb__after_clear_bit(); - return ic->i_ack_next; + return atomic64_read(&ic->i_ack_next); } +#endif + static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits) { @@ -464,6 +493,10 @@ static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credi * - i_ack_next, which is the last sequence number we received * * Potentially, send queue and receive queue handlers can run concurrently. + * It would be nice to not have to use a spinlock to synchronize things, + * but the one problem that rules this out is that 64bit updates are + * not atomic on all platforms. Things would be a lot simpler if + * we had atomic64 or maybe cmpxchg64 everywhere. * * Reconnecting complicates this picture just slightly. When we * reconnect, we may be seeing duplicate packets. The peer diff --git a/net/rds/rds.h b/net/rds/rds.h index 06040070497..619f0a30a4e 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -28,6 +28,10 @@ */ #define RDS_PORT 18634 +#ifdef ATOMIC64_INIT +#define KERNEL_HAS_ATOMIC64 +#endif + #ifdef DEBUG #define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args) #else -- cgit v1.2.3 From fa9a86ddc8ecd2830a5e773facc250f110300ae7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Apr 2009 00:53:49 -0700 Subject: netfilter: use rcu_read_bh() in ipt_do_table() Commit 784544739a25c30637397ace5489eeb6e15d7d49 (netfilter: iptables: lock free counters) forgot to disable BH in arpt_do_table(), ipt_do_table() and ip6t_do_table() Use rcu_read_lock_bh() instead of rcu_read_lock() cures the problem. Reported-and-bisected-by: Roman Mindalev Signed-off-by: Eric Dumazet Acked-by: Patrick McHardy Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 4 ++-- net/ipv4/netfilter/ip_tables.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 35c5f6a5cb7..5ba533d234d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -253,7 +253,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - rcu_read_lock(); + rcu_read_lock_bh(); private = rcu_dereference(table->private); table_base = rcu_dereference(private->entries[smp_processor_id()]); @@ -329,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, } } while (!hotdrop); - rcu_read_unlock(); + rcu_read_unlock_bh(); if (hotdrop) return NF_DROP; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 82ee7c9049f..810c0b62c7d 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -339,7 +339,7 @@ ipt_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock(); + rcu_read_lock_bh(); private = rcu_dereference(table->private); table_base = rcu_dereference(private->entries[smp_processor_id()]); @@ -437,7 +437,7 @@ ipt_do_table(struct sk_buff *skb, } } while (!hotdrop); - rcu_read_unlock(); + rcu_read_unlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index e89cfa3a8f2..dfed176aed3 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -365,7 +365,7 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock(); + rcu_read_lock_bh(); private = rcu_dereference(table->private); table_base = rcu_dereference(private->entries[smp_processor_id()]); @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - rcu_read_unlock(); + rcu_read_unlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; -- cgit v1.2.3 From f2bde7328633269ee935d9ed96535ade15cc348f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 1 Apr 2009 11:20:20 +0000 Subject: net: allow multiple dev per napi with GRO GRO assumes that there is a one-to-one relationship between NAPI structure and network device. Some devices like sky2 share multiple devices on a single interrupt so only have one NAPI handler. Rather than split GRO from NAPI, just have GRO assume if device changes that it is a different flow. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 52fea5b28ca..91d792d17e0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2472,8 +2472,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) return GRO_NORMAL; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = !compare_ether_header( - skb_mac_header(p), skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = (p->dev == skb->dev) + && !compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.2.3 From 797108d134a91afca9fa59c572336b279bc66afb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 1 Apr 2009 23:15:17 +0000 Subject: tcp: add helper for counter tweaking due mid-wq change MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We need full-scale adjustment to fix a TCP miscount in the next patch, so just move it into a helper and call for that from the other places. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 66 ++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c1f259d2d33..f1db89bb3aa 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -754,6 +754,36 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, tp->fackets_out -= decr; } +/* Pcount in the middle of the write queue got changed, we need to do various + * tweaks to fix counters + */ +static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->packets_out -= decr; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= decr; + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= decr; + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) + tp->lost_out -= decr; + + /* Reno case is special. Sigh... */ + if (tcp_is_reno(tp) && decr > 0) + tp->sacked_out -= min_t(u32, tp->sacked_out, decr); + + tcp_adjust_fackets_out(sk, skb, decr); + + if (tp->lost_skb_hint && + before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && + (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked)) + tp->lost_cnt_hint -= decr; + + tcp_verify_left_out(tp); +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -836,28 +866,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, int diff = old_factor - tcp_skb_pcount(skb) - tcp_skb_pcount(buff); - tp->packets_out -= diff; - - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= diff; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= diff; - - if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) - tp->lost_out -= diff; - - /* Adjust Reno SACK estimate. */ - if (tcp_is_reno(tp) && diff > 0) { - tcp_dec_pcount_approx_int(&tp->sacked_out, diff); - tcp_verify_left_out(tp); - } - tcp_adjust_fackets_out(sk, skb, diff); - - if (tp->lost_skb_hint && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked)) - tp->lost_cnt_hint -= diff; + if (diff) + tcp_adjust_pcount(sk, skb, diff); } /* Link BUFF into the send queue. */ @@ -1768,22 +1778,14 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) * packet counting does not break. */ TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; - if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(next_skb); - if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(next_skb); - /* Reno case is special. Sigh... */ - if (tcp_is_reno(tp) && tp->sacked_out) - tcp_dec_pcount_approx(&tp->sacked_out, next_skb); - - tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb)); - tp->packets_out -= tcp_skb_pcount(next_skb); /* changed transmit queue under us so clear hints */ tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; + tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb)); + sk_wmem_free_skb(sk, next_skb); } -- cgit v1.2.3 From 9eb9362e569062e2f841b7a023e5fcde10ed63b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 1 Apr 2009 23:18:20 +0000 Subject: tcp: miscounts due to tcp_fragment pcount reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems that trivial reset of pcount to one was not sufficient in tcp_retransmit_skb. Multiple counters experience a positive miscount when skb's pcount gets lowered without the necessary adjustments (depending on skb's sacked bits which exactly), at worst a packets_out miscount can crash at RTO if the write queue is empty! Triggering this requires mss change, so bidir tcp or mtu probe or like. Signed-off-by: Ilpo Järvinen Reported-by: Markus Trippelsdorf Tested-by: Uwe Bugla Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f1db89bb3aa..53300fa2359 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1893,7 +1893,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (tcp_fragment(sk, skb, cur_mss, cur_mss)) return -ENOMEM; /* We'll try again later. */ } else { - tcp_init_tso_segs(sk, skb, cur_mss); + int oldpcount = tcp_skb_pcount(skb); + + if (unlikely(oldpcount > 1)) { + tcp_init_tso_segs(sk, skb, cur_mss); + tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb)); + } } tcp_retrans_try_collapse(sk, skb, cur_mss); -- cgit v1.2.3