aboutsummaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorDavid Miller <davem@davemloft.net>2007-03-26 18:56:59 -0700
committerGreg Kroah-Hartman <gregkh@suse.de>2007-04-06 03:43:13 -0700
commit00d3a8023ed6cb6c91004b38b50758a542fac645 (patch)
tree04abb371463865c12c5d81fab4bd3ce9989f39cd /net
parent5d50dceccd79e5890d08adb40a538f1ac6dfe89d (diff)
IPV6: Fix ipv6 round-robin locking.
[IPV6]: Fix routing round-robin locking. As per RFC2461, section 6.3.6, item #2, when no routers on the matching list are known to be reachable or probably reachable we do round robin on those available routes so that we make sure to probe as many of them as possible to detect when one becomes reachable faster. Each routing table has a rwlock protecting the tree and the linked list of routes at each leaf. The round robin code executes during lookup and thus with the rwlock taken as a reader. A small local spinlock tries to provide protection but this does not work at all for two reasons: 1) The round-robin list manipulation, as coded, goes like this (with read lock held): walk routes finding head and tail spin_lock(); rotate list using head and tail spin_unlock(); While one thread is rotating the list, another thread can end up with stale values of head and tail and then proceed to corrupt the list when it gets the lock. This ends up causing the OOPS in fib6_add() later onthat many people have been hitting. 2) All the other code paths that run with the rwlock held as a reader do not expect the list to change on them, they expect it to remain completely fixed while they hold the lock in that way. So, simply stated, it is impossible to implement this correctly using a manipulation of the list without violating the rwlock locking semantics. Reimplement using a per-fib6_node round-robin pointer. This way we don't need to manipulate the list at all, and since the round-robin pointer can only ever point to real existing entries we don't need to perform any locking on the changing of the round-robin pointer itself. We only need to reset the round-robin pointer to NULL when the entry it is pointing to is removed. The idea is from Thomas Graf and it is very similar to how this was implemented before the advanced router selection code when in. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/ipv6/ip6_fib.c8
-rw-r--r--net/ipv6/route.c97
2 files changed, 67 insertions, 38 deletions
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 96d8310ae9c8..2d9c425f1c6a 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -659,6 +659,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
ins = &iter->u.next;
}
+ /* Reset round-robin state, if necessary */
+ if (ins == &fn->leaf)
+ fn->rr_ptr = NULL;
+
/*
* insert node
*/
@@ -1110,6 +1114,10 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
rt6_stats.fib_rt_entries--;
rt6_stats.fib_discarded_routes++;
+ /* Reset round-robin state, if necessary */
+ if (fn->rr_ptr == rt)
+ fn->rr_ptr = NULL;
+
/* Adjust walkers */
read_lock(&fib6_walker_lock);
FOR_WALKERS(w) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 5f0043c30b70..01bcf4ab1446 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -354,55 +354,76 @@ static int rt6_score_route(struct rt6_info *rt, int oif,
return m;
}
-static struct rt6_info *rt6_select(struct rt6_info **head, int oif,
- int strict)
+static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
+ int *mpri, struct rt6_info *match)
{
- struct rt6_info *match = NULL, *last = NULL;
- struct rt6_info *rt, *rt0 = *head;
- u32 metric;
+ int m;
+
+ if (rt6_check_expired(rt))
+ goto out;
+
+ m = rt6_score_route(rt, oif, strict);
+ if (m < 0)
+ goto out;
+
+ if (m > *mpri) {
+ if (strict & RT6_LOOKUP_F_REACHABLE)
+ rt6_probe(match);
+ *mpri = m;
+ match = rt;
+ } else if (strict & RT6_LOOKUP_F_REACHABLE) {
+ rt6_probe(rt);
+ }
+
+out:
+ return match;
+}
+
+static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
+ struct rt6_info *rr_head,
+ u32 metric, int oif, int strict)
+{
+ struct rt6_info *rt, *match;
int mpri = -1;
- RT6_TRACE("%s(head=%p(*head=%p), oif=%d)\n",
- __FUNCTION__, head, head ? *head : NULL, oif);
+ match = NULL;
+ for (rt = rr_head; rt && rt->rt6i_metric == metric;
+ rt = rt->u.next)
+ match = find_match(rt, oif, strict, &mpri, match);
+ for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
+ rt = rt->u.next)
+ match = find_match(rt, oif, strict, &mpri, match);
- for (rt = rt0, metric = rt0->rt6i_metric;
- rt && rt->rt6i_metric == metric && (!last || rt != rt0);
- rt = rt->u.next) {
- int m;
+ return match;
+}
- if (rt6_check_expired(rt))
- continue;
+static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
+{
+ struct rt6_info *match, *rt0;
- last = rt;
+ RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
+ __FUNCTION__, fn->leaf, oif);
- m = rt6_score_route(rt, oif, strict);
- if (m < 0)
- continue;
+ rt0 = fn->rr_ptr;
+ if (!rt0)
+ fn->rr_ptr = rt0 = fn->leaf;
- if (m > mpri) {
- if (strict & RT6_LOOKUP_F_REACHABLE)
- rt6_probe(match);
- match = rt;
- mpri = m;
- } else if (strict & RT6_LOOKUP_F_REACHABLE) {
- rt6_probe(rt);
- }
- }
+ match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
if (!match &&
- (strict & RT6_LOOKUP_F_REACHABLE) &&
- last && last != rt0) {
+ (strict & RT6_LOOKUP_F_REACHABLE)) {
+ struct rt6_info *next = rt0->u.next;
+
/* no entries matched; do round-robin */
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
- *head = rt0->u.next;
- rt0->u.next = last->u.next;
- last->u.next = rt0;
- spin_unlock(&lock);
+ if (!next || next->rt6i_metric != rt0->rt6i_metric)
+ next = fn->leaf;
+
+ if (next != rt0)
+ fn->rr_ptr = next;
}
- RT6_TRACE("%s() => %p, score=%d\n",
- __FUNCTION__, match, mpri);
+ RT6_TRACE("%s() => %p\n",
+ __FUNCTION__, match);
return (match ? match : &ip6_null_entry);
}
@@ -648,7 +669,7 @@ restart_2:
fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
restart:
- rt = rt6_select(&fn->leaf, fl->iif, strict | reachable);
+ rt = rt6_select(fn, fl->iif, strict | reachable);
BACKTRACK(&fl->fl6_src);
if (rt == &ip6_null_entry ||
rt->rt6i_flags & RTF_CACHE)
@@ -743,7 +764,7 @@ restart_2:
fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
restart:
- rt = rt6_select(&fn->leaf, fl->oif, strict | reachable);
+ rt = rt6_select(fn, fl->oif, strict | reachable);
BACKTRACK(&fl->fl6_src);
if (rt == &ip6_null_entry ||
rt->rt6i_flags & RTF_CACHE)