/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki   :	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *              Pekka Riikonen  :	Netdev boot-time settings code
 *              Andrew Morton   :       Make unregister_netdevice wait
 *              			indefinitely on dev->refcnt
 * 		J Hadi Salim	:	- Backlog queue sampling
 *				        - netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *             sure which should go first, but I bet it won't make much
 *             difference if we are running VLANs.  The good news is that
 *             this protocol won't be in the list unless compiled in, so
 *             the average user (w/out VLANs) will not be adversely affected.
 *             --BLG
 *
 *		0800	IP
 *		8100    802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

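/*
 * Illustrative sketch (not part of the original file): a "pure reader" as
 * described above takes dev_base_lock for reading (or rcu_read_lock())
 * around its walk of the device list, for example:
 *
 *	struct net_device *dev;
 *
 *	read_lock(&dev_base_lock);
 *	for_each_netdev(&init_net, dev)
 *		printk(KERN_DEBUG "found %s\n", dev->name);
 *	read_unlock(&dev_base_lock);
 *
 * Writers instead hold the RTNL and take dev_base_lock for writing while
 * updating the list, as register_netdevice()/unregister_netdevice() do.
 */
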
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all
 *	CPUs which are in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

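/*
 * Illustrative sketch (not part of the original file): a tap that wants to
 * see every received frame would typically pair dev_add_pack() with
 * dev_remove_pack() along these lines; the handler and variable names here
 * are hypothetical.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		(consume the clone we were given)
 *		return 0;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = htons(ETH_P_ALL),
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);		(module init)
 *	dev_remove_pack(&my_tap);	(module exit)
 */
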
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

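/*
 * For illustration (not part of the original file): with the parsing above,
 * a kernel command line entry of the form
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * for example "netdev=9,0x300,0,0,eth1", records irq 9 and I/O base 0x300
 * for the interface that will later probe as "eth1"; netdev_boot_setup_check()
 * then copies those values into that driver's net_device.
 */
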
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

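/*
 * Illustrative sketch (not from the original file): the refcounted lookup
 * above is normally paired with dev_put() once the caller is done, e.g.:
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 *
 * Callers that already run under rcu_read_lock() and only need the pointer
 * inside the critical section can use dev_get_by_name_rcu() instead and
 * skip the hold/put pair.
 */
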
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);

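/*
 * Illustrative sketch (not part of the original file): a driver that wants
 * the next free "eth%d" style name resolved before registration could do
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *
 * On success dev->name has been filled in (e.g. "eth2") and the unit number
 * is returned; in practice most drivers simply pass the format string to
 * alloc_netdev() and let the core resolve it at registration time.
 */
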
static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare to death, when device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 * Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);


1310 * dev_disable_lro - disable Large Receive Offload on a device
1311 * @dev: device
1312 *
1313 * Disable Large Receive Offload (LRO) on a net device. Must be
1314 * called under RTNL. This is needed if received packets may be
1315 * forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
1319 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1320 dev->ethtool_ops->set_flags) {
1321 u32 flags = dev->ethtool_ops->get_flags(dev);
1322 if (flags & ETH_FLAG_LRO) {
1323 flags &= ~ETH_FLAG_LRO;
1324 dev->ethtool_ops->set_flags(dev, flags);
1325 }
1326 }
1327 WARN_ON(dev->features & NETIF_F_LRO);
1328}
1329EXPORT_SYMBOL(dev_disable_lro);
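
/*
 * A minimal usage sketch (editorial illustration; "example_enable_forwarding"
 * is hypothetical): code that starts forwarding packets received on a
 * device, e.g. when enslaving it to a bridge, is expected to turn LRO
 * off first, under RTNL, so merged super-frames are never forwarded.
 */
static inline void example_enable_forwarding(struct net_device *dev)
{
        ASSERT_RTNL();          /* dev_disable_lro() must be called under RTNL */
        dev_disable_lro(dev);
}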
1330
1331
Eric W. Biederman881d9662007-09-17 11:56:21 -07001332static int dev_boot_phase = 1;
1333
Linus Torvalds1da177e2005-04-16 15:20:36 -07001334/*
1335 * Device change register/unregister. These are not inline or static
1336 * as we export them to the world.
1337 */
1338
1339/**
1340 * register_netdevice_notifier - register a network notifier block
1341 * @nb: notifier
1342 *
1343 * Register a notifier to be called when network device events occur.
1344 * The notifier passed is linked into the kernel structures and must
1345 * not be reused until it has been unregistered. A negative errno code
1346 * is returned on a failure.
1347 *
1348 * When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001349 * to the new notifier to allow the device to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001350 * view of the network device list.
1351 */
1352
1353int register_netdevice_notifier(struct notifier_block *nb)
1354{
1355 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001356 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001357 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001358 int err;
1359
1360 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001361 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001362 if (err)
1363 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001364 if (dev_boot_phase)
1365 goto unlock;
1366 for_each_net(net) {
1367 for_each_netdev(net, dev) {
1368 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1369 err = notifier_to_errno(err);
1370 if (err)
1371 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001372
Eric W. Biederman881d9662007-09-17 11:56:21 -07001373 if (!(dev->flags & IFF_UP))
1374 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001375
Eric W. Biederman881d9662007-09-17 11:56:21 -07001376 nb->notifier_call(nb, NETDEV_UP, dev);
1377 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001378 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001379
1380unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001381 rtnl_unlock();
1382 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001383
1384rollback:
1385 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001386 for_each_net(net) {
1387 for_each_netdev(net, dev) {
1388 if (dev == last)
1389 break;
Herbert Xufcc5a032007-07-30 17:03:38 -07001390
Eric W. Biederman881d9662007-09-17 11:56:21 -07001391 if (dev->flags & IFF_UP) {
1392 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1393 nb->notifier_call(nb, NETDEV_DOWN, dev);
1394 }
1395 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00001396 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001397 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001398 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001399
1400 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001401 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001402}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001403EXPORT_SYMBOL(register_netdevice_notifier);
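
/*
 * A minimal usage sketch (editorial illustration; "example_netdev_event"
 * and "example_nb" are hypothetical): in this kernel the notifier's
 * third argument is the struct net_device itself.  A module registers
 * the block once and has existing devices replayed as NETDEV_REGISTER
 * and NETDEV_UP events.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;

        switch (event) {
        case NETDEV_REGISTER:
                printk(KERN_DEBUG "example: %s registered\n", dev->name);
                break;
        case NETDEV_UP:
                printk(KERN_DEBUG "example: %s is up\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_nb __maybe_unused = {
        .notifier_call = example_netdev_event,
};

/*
 * A module init path would call register_netdevice_notifier(&example_nb)
 * and its exit path unregister_netdevice_notifier(&example_nb).
 */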
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404
1405/**
1406 * unregister_netdevice_notifier - unregister a network notifier block
1407 * @nb: notifier
1408 *
1409 * Unregister a notifier previously registered by
1410 * register_netdevice_notifier(). The notifier is unlinked from the
1411 * kernel structures and may then be reused. A negative errno code
1412 * is returned on a failure.
1413 */
1414
1415int unregister_netdevice_notifier(struct notifier_block *nb)
1416{
Herbert Xu9f514952006-03-25 01:24:25 -08001417 int err;
1418
1419 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001420 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xu9f514952006-03-25 01:24:25 -08001421 rtnl_unlock();
1422 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001423}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001424EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001425
1426/**
1427 * call_netdevice_notifiers - call all network notifier blocks
1428 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001429 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430 *
1431 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001432 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001433 */
1434
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001435int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001436{
Jiri Pirkoab930472010-04-20 01:45:37 -07001437 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001438 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001439}
1440
1441/* When > 0 there are consumers of rx skb time stamps */
1442static atomic_t netstamp_needed = ATOMIC_INIT(0);
1443
1444void net_enable_timestamp(void)
1445{
1446 atomic_inc(&netstamp_needed);
1447}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001448EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449
1450void net_disable_timestamp(void)
1451{
1452 atomic_dec(&netstamp_needed);
1453}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001454EXPORT_SYMBOL(net_disable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001455
Eric Dumazet3b098e22010-05-15 23:57:10 -07001456static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001457{
1458 if (atomic_read(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001459 __net_timestamp(skb);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001460 else
1461 skb->tstamp.tv64 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001462}
1463
Eric Dumazet3b098e22010-05-15 23:57:10 -07001464static inline void net_timestamp_check(struct sk_buff *skb)
1465{
1466 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1467 __net_timestamp(skb);
1468}
1469
Arnd Bergmann44540962009-11-26 06:07:08 +00001470/**
1471 * dev_forward_skb - loopback an skb to another netif
1472 *
1473 * @dev: destination network device
1474 * @skb: buffer to forward
1475 *
1476 * return values:
1477 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001478 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001479 *
1480 * dev_forward_skb can be used for injecting an skb from the
1481 * start_xmit function of one device into the receive queue
1482 * of another device.
1483 *
1484 * The receiving device may be in another namespace, so
1485 * we have to clear all information in the skb that could
1486 * impact namespace isolation.
1487 */
1488int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1489{
1490 skb_orphan(skb);
1491
Eric Dumazet6ec82562010-05-06 00:53:53 -07001492 if (!(dev->flags & IFF_UP) ||
1493 (skb->len > (dev->mtu + dev->hard_header_len))) {
1494 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001495 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001496 }
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001497 skb_set_dev(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001498 skb->tstamp.tv64 = 0;
1499 skb->pkt_type = PACKET_HOST;
1500 skb->protocol = eth_type_trans(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001501 return netif_rx(skb);
1502}
1503EXPORT_SYMBOL_GPL(dev_forward_skb);
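
/*
 * A minimal usage sketch (editorial illustration; "example_pair_priv"
 * and "example_pair_xmit" are hypothetical): a veth-like pair device
 * can implement its transmit routine by injecting each skb into the
 * peer device's receive path.
 */
struct example_pair_priv {
        struct net_device *peer;
};

static inline netdev_tx_t example_pair_xmit(struct sk_buff *skb,
                                            struct net_device *dev)
{
        struct example_pair_priv *priv = netdev_priv(dev);

        /* dev_forward_skb() scrubs namespace state and calls netif_rx() */
        dev_forward_skb(priv->peer, skb);
        return NETDEV_TX_OK;
}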
1504
Linus Torvalds1da177e2005-04-16 15:20:36 -07001505/*
1506 * Support routine. Sends outgoing frames to any network
1507 * taps currently in use.
1508 */
1509
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001510static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001511{
1512 struct packet_type *ptype;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001513
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001514#ifdef CONFIG_NET_CLS_ACT
1515 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
Eric Dumazet3b098e22010-05-15 23:57:10 -07001516 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001517#else
Eric Dumazet3b098e22010-05-15 23:57:10 -07001518 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001519#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001520
1521 rcu_read_lock();
1522 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1523 /* Never send packets back to the socket
1524 * they originated from - MvS (miquels@drinkel.ow.org)
1525 */
1526 if ((ptype->dev == dev || !ptype->dev) &&
1527 (ptype->af_packet_priv == NULL ||
1528 (struct sock *)ptype->af_packet_priv != skb->sk)) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001529 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001530 if (!skb2)
1531 break;
1532
1533 /* skb->nh should be correctly
1534 set by the sender, so that the second statement is
1535 just protection against buggy protocols.
1536 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001537 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001539 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001540 skb2->network_header > skb2->tail) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001541 if (net_ratelimit())
1542 printk(KERN_CRIT "protocol %04x is "
1543 "buggy, dev %s\n",
1544 skb2->protocol, dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001545 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001546 }
1547
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001548 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 skb2->pkt_type = PACKET_OUTGOING;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07001550 ptype->func(skb2, skb->dev, ptype, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551 }
1552 }
1553 rcu_read_unlock();
1554}
1555
Denis Vlasenko56079432006-03-29 15:57:29 -08001556
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001557static inline void __netif_reschedule(struct Qdisc *q)
1558{
1559 struct softnet_data *sd;
1560 unsigned long flags;
1561
1562 local_irq_save(flags);
1563 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00001564 q->next_sched = NULL;
1565 *sd->output_queue_tailp = q;
1566 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001567 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1568 local_irq_restore(flags);
1569}
1570
David S. Miller37437bb2008-07-16 02:15:04 -07001571void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08001572{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001573 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1574 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08001575}
1576EXPORT_SYMBOL(__netif_schedule);
1577
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001578void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08001579{
Eric Dumazet15e83ed2010-05-19 23:16:03 +00001580 if (!skb->destructor)
1581 dev_kfree_skb(skb);
1582 else if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001583 struct softnet_data *sd;
1584 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001585
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001586 local_irq_save(flags);
1587 sd = &__get_cpu_var(softnet_data);
1588 skb->next = sd->completion_queue;
1589 sd->completion_queue = skb;
1590 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1591 local_irq_restore(flags);
1592 }
Denis Vlasenko56079432006-03-29 15:57:29 -08001593}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001594EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08001595
1596void dev_kfree_skb_any(struct sk_buff *skb)
1597{
1598 if (in_irq() || irqs_disabled())
1599 dev_kfree_skb_irq(skb);
1600 else
1601 dev_kfree_skb(skb);
1602}
1603EXPORT_SYMBOL(dev_kfree_skb_any);
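
/*
 * A minimal usage sketch (editorial illustration; "example_clean_tx_ring"
 * is hypothetical): a driver that may free transmitted skbs both from
 * its interrupt handler and from a reset path uses dev_kfree_skb_any(),
 * which picks dev_kfree_skb_irq() or dev_kfree_skb() as appropriate.
 */
static inline void example_clean_tx_ring(struct sk_buff **ring, int count)
{
        int i;

        for (i = 0; i < count; i++) {
                if (!ring[i])
                        continue;
                dev_kfree_skb_any(ring[i]);     /* safe in any context */
                ring[i] = NULL;
        }
}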
1604
1605
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001606/**
1607 * netif_device_detach - mark device as removed
1608 * @dev: network device
1609 *
1610 * Mark the device as removed from the system and therefore no longer available.
1611 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001612void netif_device_detach(struct net_device *dev)
1613{
1614 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1615 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001616 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001617 }
1618}
1619EXPORT_SYMBOL(netif_device_detach);
1620
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001621/**
1622 * netif_device_attach - mark device as attached
1623 * @dev: network device
1624 *
1625 * Mark the device as attached to the system and restart it if needed.
1626 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001627void netif_device_attach(struct net_device *dev)
1628{
1629 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1630 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001631 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001632 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001633 }
1634}
1635EXPORT_SYMBOL(netif_device_attach);
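
/*
 * A minimal usage sketch (editorial illustration; the helpers are
 * hypothetical): a driver's suspend/resume path typically pairs these
 * calls so the stack stops queueing packets while the hardware is
 * powered down and resumes once it is re-initialised.
 */
static inline void example_power_down(struct net_device *dev)
{
        netif_device_detach(dev);       /* stops all TX queues if running */
        /* hypothetical driver-specific hardware shutdown goes here */
}

static inline void example_power_up(struct net_device *dev)
{
        /* hypothetical driver-specific hardware re-init goes here */
        netif_device_attach(dev);       /* restarts queues and watchdog if running */
}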
1636
Ben Hutchings6de329e2008-06-16 17:02:28 -07001637static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1638{
1639 return ((features & NETIF_F_GEN_CSUM) ||
1640 ((features & NETIF_F_IP_CSUM) &&
1641 protocol == htons(ETH_P_IP)) ||
1642 ((features & NETIF_F_IPV6_CSUM) &&
Yi Zou1c8dbcf2009-02-27 14:06:54 -08001643 protocol == htons(ETH_P_IPV6)) ||
1644 ((features & NETIF_F_FCOE_CRC) &&
1645 protocol == htons(ETH_P_FCOE)));
Ben Hutchings6de329e2008-06-16 17:02:28 -07001646}
1647
1648static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1649{
1650 if (can_checksum_protocol(dev->features, skb->protocol))
1651 return true;
1652
1653 if (skb->protocol == htons(ETH_P_8021Q)) {
1654 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1655 if (can_checksum_protocol(dev->features & dev->vlan_features,
1656 veh->h_vlan_encapsulated_proto))
1657 return true;
1658 }
1659
1660 return false;
1661}
Denis Vlasenko56079432006-03-29 15:57:29 -08001662
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001663/**
1664 * skb_set_dev - assign a new device to a buffer
1665 * @skb: buffer for the new device
1666 * @dev: network device
1667 *
1668 * If an skb is owned by a device already, we have to reset
1669 * all data private to the namespace a device belongs to
1670 * before assigning it a new device.
1671 */
1672#ifdef CONFIG_NET_NS
1673void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1674{
1675 skb_dst_drop(skb);
1676 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1677 secpath_reset(skb);
1678 nf_reset(skb);
1679 skb_init_secmark(skb);
1680 skb->mark = 0;
1681 skb->priority = 0;
1682 skb->nf_trace = 0;
1683 skb->ipvs_property = 0;
1684#ifdef CONFIG_NET_SCHED
1685 skb->tc_index = 0;
1686#endif
1687 }
1688 skb->dev = dev;
1689}
1690EXPORT_SYMBOL(skb_set_dev);
1691#endif /* CONFIG_NET_NS */
1692
Linus Torvalds1da177e2005-04-16 15:20:36 -07001693/*
1694 * Invalidate hardware checksum when packet is to be mangled, and
1695 * complete checksum manually on outgoing path.
1696 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07001697int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001698{
Al Virod3bc23e2006-11-14 21:24:49 -08001699 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07001700 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701
Patrick McHardy84fa7932006-08-29 16:44:56 -07001702 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07001703 goto out_set_summed;
1704
1705 if (unlikely(skb_shinfo(skb)->gso_size)) {
Herbert Xua430a432006-07-08 13:34:56 -07001706 /* Let GSO fix up the checksum. */
1707 goto out_set_summed;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708 }
1709
Herbert Xua0308472007-10-15 01:47:15 -07001710 offset = skb->csum_start - skb_headroom(skb);
1711 BUG_ON(offset >= skb_headlen(skb));
1712 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1713
1714 offset += skb->csum_offset;
1715 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1716
1717 if (skb_cloned(skb) &&
1718 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001719 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1720 if (ret)
1721 goto out;
1722 }
1723
Herbert Xua0308472007-10-15 01:47:15 -07001724 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07001725out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001726 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001727out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001728 return ret;
1729}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001730EXPORT_SYMBOL(skb_checksum_help);
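
/*
 * A minimal usage sketch (editorial illustration; "example_tx_checksum"
 * is hypothetical): a transmit path that finds the device cannot
 * offload the checksum of this particular packet falls back to
 * completing it in software, much as dev_queue_xmit() does below.
 */
static inline int example_tx_checksum(struct sk_buff *skb,
                                      struct net_device *dev)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            !dev_can_checksum(dev, skb) &&
            skb_checksum_help(skb))
                return -EIO;    /* software checksum failed; drop the packet */
        return 0;
}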
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001732/**
1733 * skb_gso_segment - Perform segmentation on skb.
1734 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07001735 * @features: features for the output path (see dev->features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001736 *
1737 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07001738 *
1739 * It may return NULL if the skb requires no segmentation. This is
1740 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001741 */
Herbert Xu576a30e2006-06-27 13:22:38 -07001742struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001743{
1744 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1745 struct packet_type *ptype;
Al Viro252e33462006-11-14 20:48:11 -08001746 __be16 type = skb->protocol;
Herbert Xua430a432006-07-08 13:34:56 -07001747 int err;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001748
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001749 skb_reset_mac_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001750 skb->mac_len = skb->network_header - skb->mac_header;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001751 __skb_pull(skb, skb->mac_len);
1752
Herbert Xu67fd1a72009-01-19 16:26:44 -08001753 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1754 struct net_device *dev = skb->dev;
1755 struct ethtool_drvinfo info = {};
1756
1757 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1758 dev->ethtool_ops->get_drvinfo(dev, &info);
1759
1760 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1761 "ip_summed=%d",
1762 info.driver, dev ? dev->features : 0L,
1763 skb->sk ? skb->sk->sk_route_caps : 0L,
1764 skb->len, skb->data_len, skb->ip_summed);
1765
Herbert Xua430a432006-07-08 13:34:56 -07001766 if (skb_header_cloned(skb) &&
1767 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1768 return ERR_PTR(err);
1769 }
1770
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001771 rcu_read_lock();
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08001772 list_for_each_entry_rcu(ptype,
1773 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001774 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
Patrick McHardy84fa7932006-08-29 16:44:56 -07001775 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
Herbert Xua430a432006-07-08 13:34:56 -07001776 err = ptype->gso_send_check(skb);
1777 segs = ERR_PTR(err);
1778 if (err || skb_gso_ok(skb, features))
1779 break;
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001780 __skb_push(skb, (skb->data -
1781 skb_network_header(skb)));
Herbert Xua430a432006-07-08 13:34:56 -07001782 }
Herbert Xu576a30e2006-06-27 13:22:38 -07001783 segs = ptype->gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001784 break;
1785 }
1786 }
1787 rcu_read_unlock();
1788
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001789 __skb_push(skb, skb->data - skb_mac_header(skb));
Herbert Xu576a30e2006-06-27 13:22:38 -07001790
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001791 return segs;
1792}
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001793EXPORT_SYMBOL(skb_gso_segment);
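
/*
 * A minimal usage sketch (editorial illustration; "example_do_gso" is
 * hypothetical): callers must distinguish an error (ERR_PTR) from
 * "no segmentation needed" (NULL) and then walk the returned list,
 * which is what dev_gso_segment() below does for the transmit path.
 */
static inline int example_do_gso(struct sk_buff *skb, int features)
{
        struct sk_buff *segs = skb_gso_segment(skb, features);

        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)
                return 0;       /* header verification only; skb unchanged */

        while (segs) {
                struct sk_buff *nskb = segs;

                segs = segs->next;
                nskb->next = NULL;
                /* a real caller would transmit nskb here; this sketch frees it */
                kfree_skb(nskb);
        }
        consume_skb(skb);       /* the original skb is no longer needed */
        return 0;
}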
1794
Herbert Xufb286bb2005-11-10 13:01:24 -08001795/* Take action when hardware reception checksum errors are detected. */
1796#ifdef CONFIG_BUG
1797void netdev_rx_csum_fault(struct net_device *dev)
1798{
1799 if (net_ratelimit()) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001800 printk(KERN_ERR "%s: hw csum failure.\n",
Stephen Hemminger246a4212005-12-08 15:21:39 -08001801 dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08001802 dump_stack();
1803 }
1804}
1805EXPORT_SYMBOL(netdev_rx_csum_fault);
1806#endif
1807
Linus Torvalds1da177e2005-04-16 15:20:36 -07001808/* Actually, we should eliminate this check as soon as we know that:
1809 * 1. An IOMMU is present and allows mapping all the memory, or
1810 * 2. No high memory really exists on this machine.
1811 */
1812
Eric Dumazet9092c652010-04-02 13:34:49 -07001813static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001814{
Herbert Xu3d3a8532006-06-27 13:33:10 -07001815#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001816 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001817 if (!(dev->features & NETIF_F_HIGHDMA)) {
1818 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1819 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1820 return 1;
1821 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001822
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001823 if (PCI_DMA_BUS_IS_PHYS) {
1824 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825
Eric Dumazet9092c652010-04-02 13:34:49 -07001826 if (!pdev)
1827 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001828 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1829 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1830 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1831 return 1;
1832 }
1833 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07001834#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001835 return 0;
1836}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001837
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001838struct dev_gso_cb {
1839 void (*destructor)(struct sk_buff *skb);
1840};
1841
1842#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1843
1844static void dev_gso_skb_destructor(struct sk_buff *skb)
1845{
1846 struct dev_gso_cb *cb;
1847
1848 do {
1849 struct sk_buff *nskb = skb->next;
1850
1851 skb->next = nskb->next;
1852 nskb->next = NULL;
1853 kfree_skb(nskb);
1854 } while (skb->next);
1855
1856 cb = DEV_GSO_CB(skb);
1857 if (cb->destructor)
1858 cb->destructor(skb);
1859}
1860
1861/**
1862 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1863 * @skb: buffer to segment
1864 *
1865 * This function segments the given skb and stores the list of segments
1866 * in skb->next.
1867 */
1868static int dev_gso_segment(struct sk_buff *skb)
1869{
1870 struct net_device *dev = skb->dev;
1871 struct sk_buff *segs;
Herbert Xu576a30e2006-06-27 13:22:38 -07001872 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1873 NETIF_F_SG : 0);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001874
Herbert Xu576a30e2006-06-27 13:22:38 -07001875 segs = skb_gso_segment(skb, features);
1876
1877 /* Verifying header integrity only. */
1878 if (!segs)
1879 return 0;
1880
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07001881 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001882 return PTR_ERR(segs);
1883
1884 skb->next = segs;
1885 DEV_GSO_CB(skb)->destructor = skb->destructor;
1886 skb->destructor = dev_gso_skb_destructor;
1887
1888 return 0;
1889}
1890
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001891/*
1892 * Try to orphan skb early, right before transmission by the device.
1893 * We cannot orphan skb if tx timestamp is requested, since
1894 * drivers need to call skb_tstamp_tx() to send the timestamp.
1895 */
1896static inline void skb_orphan_try(struct sk_buff *skb)
1897{
1898 if (!skb_tx(skb)->flags)
1899 skb_orphan(skb);
1900}
1901
David S. Millerfd2ea0a2008-07-17 01:56:23 -07001902int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1903 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001904{
Stephen Hemminger00829822008-11-20 20:14:53 -08001905 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00001906 int rc = NETDEV_TX_OK;
Stephen Hemminger00829822008-11-20 20:14:53 -08001907
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001908 if (likely(!skb->next)) {
Stephen Hemminger9be9a6b2007-04-20 17:02:45 -07001909 if (!list_empty(&ptype_all))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001910 dev_queue_xmit_nit(skb, dev);
1911
Eric Dumazet93f154b2009-05-18 22:19:19 -07001912 /*
1913 * If the device doesn't need skb->dst, release it right now while
1914 * it's hot in this CPU's cache
1915 */
Eric Dumazetadf30902009-06-02 05:19:30 +00001916 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1917 skb_dst_drop(skb);
1918
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001919 skb_orphan_try(skb);
David S. Miller9ccb8972010-04-22 01:02:07 -07001920
1921 if (netif_needs_gso(dev, skb)) {
1922 if (unlikely(dev_gso_segment(skb)))
1923 goto out_kfree_skb;
1924 if (skb->next)
1925 goto gso;
1926 }
1927
Patrick Ohlyac45f602009-02-12 05:03:37 +00001928 rc = ops->ndo_start_xmit(skb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001929 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07001930 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00001931 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001932 }
1933
Herbert Xu576a30e2006-06-27 13:22:38 -07001934gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001935 do {
1936 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001937
1938 skb->next = nskb->next;
1939 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00001940
1941 /*
1942 * If the device doesn't need nskb->dst, release it right now while
1943 * it's hot in this CPU's cache
1944 */
1945 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1946 skb_dst_drop(nskb);
1947
Stephen Hemminger00829822008-11-20 20:14:53 -08001948 rc = ops->ndo_start_xmit(nskb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001949 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00001950 if (rc & ~NETDEV_TX_MASK)
1951 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07001952 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001953 skb->next = nskb;
1954 return rc;
1955 }
Eric Dumazet08baf562009-05-25 22:58:01 -07001956 txq_trans_update(txq);
David S. Millerfd2ea0a2008-07-17 01:56:23 -07001957 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07001958 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001959 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001960
Patrick McHardy572a9d72009-11-10 06:14:14 +00001961out_kfree_gso_skb:
1962 if (likely(skb->next == NULL))
1963 skb->destructor = DEV_GSO_CB(skb)->destructor;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001964out_kfree_skb:
1965 kfree_skb(skb);
Patrick McHardy572a9d72009-11-10 06:14:14 +00001966 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001967}
1968
Tom Herbert0a9627f2010-03-16 08:03:29 +00001969static u32 hashrnd __read_mostly;
David S. Millerb6b2fed2008-07-21 09:48:06 -07001970
Stephen Hemminger92477442009-03-21 13:39:26 -07001971u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
David S. Miller8f0f2222008-07-15 03:47:03 -07001972{
David S. Miller70192982009-01-27 16:34:47 -08001973 u32 hash;
David S. Millerb6b2fed2008-07-21 09:48:06 -07001974
David S. Miller513de112009-05-03 14:43:10 -07001975 if (skb_rx_queue_recorded(skb)) {
1976 hash = skb_get_rx_queue(skb);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001977 while (unlikely(hash >= dev->real_num_tx_queues))
David S. Miller513de112009-05-03 14:43:10 -07001978 hash -= dev->real_num_tx_queues;
1979 return hash;
1980 }
Eric Dumazetec581f62009-05-01 09:05:06 -07001981
1982 if (skb->sk && skb->sk->sk_hash)
David S. Miller70192982009-01-27 16:34:47 -08001983 hash = skb->sk->sk_hash;
Eric Dumazetec581f62009-05-01 09:05:06 -07001984 else
Eric Dumazetb249dcb2010-04-19 21:56:38 +00001985 hash = (__force u16) skb->protocol;
David S. Millerd5a9e242009-01-27 16:22:11 -08001986
Tom Herbert0a9627f2010-03-16 08:03:29 +00001987 hash = jhash_1word(hash, hashrnd);
David S. Millerd5a9e242009-01-27 16:22:11 -08001988
David S. Millerb6b2fed2008-07-21 09:48:06 -07001989 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
David S. Miller8f0f2222008-07-15 03:47:03 -07001990}
Stephen Hemminger92477442009-03-21 13:39:26 -07001991EXPORT_SYMBOL(skb_tx_hash);
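
/*
 * A minimal usage sketch (editorial illustration; "example_select_queue"
 * is hypothetical): a multiqueue driver that just wants the stack's
 * default flow spreading can implement ndo_select_queue() as a thin
 * wrapper around skb_tx_hash().
 */
static inline u16 example_select_queue(struct net_device *dev,
                                       struct sk_buff *skb)
{
        return skb_tx_hash(dev, skb);   /* spread flows over real_num_tx_queues */
}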
David S. Miller8f0f2222008-07-15 03:47:03 -07001992
Eric Dumazeted046422009-11-13 21:54:04 +00001993static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1994{
1995 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1996 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00001997 pr_warning("%s selects TX queue %d, but "
1998 "real number of TX queues is %d\n",
1999 dev->name, queue_index, dev->real_num_tx_queues);
Eric Dumazeted046422009-11-13 21:54:04 +00002000 }
2001 return 0;
2002 }
2003 return queue_index;
2004}
2005
David S. Millere8a04642008-07-17 00:34:19 -07002006static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2007 struct sk_buff *skb)
2008{
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002009 u16 queue_index;
2010 struct sock *sk = skb->sk;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002011
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002012 if (sk_tx_queue_recorded(sk)) {
2013 queue_index = sk_tx_queue_get(sk);
2014 } else {
2015 const struct net_device_ops *ops = dev->netdev_ops;
2016
2017 if (ops->ndo_select_queue) {
2018 queue_index = ops->ndo_select_queue(dev, skb);
Eric Dumazeted046422009-11-13 21:54:04 +00002019 queue_index = dev_cap_txqueue(dev, queue_index);
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002020 } else {
2021 queue_index = 0;
2022 if (dev->real_num_tx_queues > 1)
2023 queue_index = skb_tx_hash(dev, skb);
2024
Eric Dumazet8728c542010-04-11 21:18:17 +00002025 if (sk) {
David S. Miller87eb3672010-04-21 01:14:25 -07002026 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
Eric Dumazet8728c542010-04-11 21:18:17 +00002027
2028 if (dst && skb_dst(skb) == dst)
2029 sk_tx_queue_set(sk, queue_index);
2030 }
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002031 }
2032 }
David S. Millereae792b2008-07-15 03:03:33 -07002033
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002034 skb_set_queue_mapping(skb, queue_index);
2035 return netdev_get_tx_queue(dev, queue_index);
David S. Millere8a04642008-07-17 00:34:19 -07002036}
2037
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002038static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2039 struct net_device *dev,
2040 struct netdev_queue *txq)
2041{
2042 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002043 bool contended = qdisc_is_running(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002044 int rc;
2045
Eric Dumazet79640a42010-06-02 05:09:29 -07002046 /*
2047 * Heuristic to force contended enqueues to serialize on a
2048 * separate lock before trying to get qdisc main lock.
2049 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2050 * and dequeue packets faster.
2051 */
2052 if (unlikely(contended))
2053 spin_lock(&q->busylock);
2054
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002055 spin_lock(root_lock);
2056 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2057 kfree_skb(skb);
2058 rc = NET_XMIT_DROP;
2059 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002060 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002061 /*
2062 * This is a work-conserving queue; there are no old skbs
2063 * waiting to be sent out; and the qdisc is not running -
2064 * xmit the skb directly.
2065 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002066 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2067 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002068 __qdisc_update_bstats(q, skb->len);
Eric Dumazet79640a42010-06-02 05:09:29 -07002069 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2070 if (unlikely(contended)) {
2071 spin_unlock(&q->busylock);
2072 contended = false;
2073 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002074 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002075 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002076 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002077
2078 rc = NET_XMIT_SUCCESS;
2079 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002080 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002081 rc = qdisc_enqueue_root(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002082 if (qdisc_run_begin(q)) {
2083 if (unlikely(contended)) {
2084 spin_unlock(&q->busylock);
2085 contended = false;
2086 }
2087 __qdisc_run(q);
2088 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002089 }
2090 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002091 if (unlikely(contended))
2092 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002093 return rc;
2094}
2095
Krishna Kumar4b258462010-01-21 01:26:29 -08002096/*
2097 * Returns true if either:
2098 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2099 * 2. skb is fragmented and the device does not support SG, or if
2100 * at least one of the fragments is in highmem and the device does not
2101 * support DMA from it.
2102 */
2103static inline int skb_needs_linearize(struct sk_buff *skb,
2104 struct net_device *dev)
2105{
2106 return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2107 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2108 illegal_highdma(dev, skb)));
2109}
2110
Dave Jonesd29f7492008-07-22 14:09:06 -07002111/**
2112 * dev_queue_xmit - transmit a buffer
2113 * @skb: buffer to transmit
2114 *
2115 * Queue a buffer for transmission to a network device. The caller must
2116 * have set the device and priority and built the buffer before calling
2117 * this function. The function can be called from an interrupt.
2118 *
2119 * A negative errno code is returned on a failure. A success does not
2120 * guarantee the frame will be transmitted as it may be dropped due
2121 * to congestion or traffic shaping.
2122 *
2123 * -----------------------------------------------------------------------------------
2124 * I notice this method can also return errors from the queue disciplines,
2125 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2126 * be positive.
2127 *
2128 * Regardless of the return value, the skb is consumed, so it is currently
2129 * difficult to retry a send to this method. (You can bump the ref count
2130 * before sending to hold a reference for retry if you are careful.)
2131 *
2132 * When calling this method, interrupts MUST be enabled. This is because
2133 * the BH enable code must have IRQs enabled so that it will not deadlock.
2134 * --BLG
2135 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002136int dev_queue_xmit(struct sk_buff *skb)
2137{
2138 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002139 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002140 struct Qdisc *q;
2141 int rc = -ENOMEM;
2142
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002143 /* GSO will handle the following emulations directly. */
2144 if (netif_needs_gso(dev, skb))
2145 goto gso;
2146
Krishna Kumar4b258462010-01-21 01:26:29 -08002147 /* Convert a paged skb to linear, if required */
2148 if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002149 goto out_kfree_skb;
2150
2151 /* If packet is not checksummed and device does not support
2152 * checksumming for this protocol, complete checksumming here.
2153 */
Herbert Xu663ead32007-04-09 11:59:07 -07002154 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2155 skb_set_transport_header(skb, skb->csum_start -
2156 skb_headroom(skb));
Ben Hutchings6de329e2008-06-16 17:02:28 -07002157 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2158 goto out_kfree_skb;
Herbert Xu663ead32007-04-09 11:59:07 -07002159 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002161gso:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002162 /* Disable soft irqs for various locks below. Also
2163 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002164 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002165 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002166
David S. Millereae792b2008-07-15 03:03:33 -07002167 txq = dev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002168 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002169
Linus Torvalds1da177e2005-04-16 15:20:36 -07002170#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002171 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002172#endif
2173 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002174 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002175 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002176 }
2177
2178 /* The device has no queue. Common case for software devices:
2179 loopback, all the sorts of tunnels...
2180
Herbert Xu932ff272006-06-09 12:20:56 -07002181 Really, it is unlikely that netif_tx_lock protection is necessary
2182 here. (f.e. loopback and IP tunnels are clean ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002183 counters.)
2184 However, it is possible that they rely on the protection
2185 made by us here.
2186
2187 Check this and take the lock. It is not prone to deadlocks.
2188 Either that, or use the noqueue qdisc; it is even simpler 8)
2189 */
2190 if (dev->flags & IFF_UP) {
2191 int cpu = smp_processor_id(); /* ok because BHs are off */
2192
David S. Millerc773e842008-07-08 23:13:53 -07002193 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002194
David S. Millerc773e842008-07-08 23:13:53 -07002195 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002196
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002197 if (!netif_tx_queue_stopped(txq)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002198 rc = dev_hard_start_xmit(skb, dev, txq);
2199 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002200 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201 goto out;
2202 }
2203 }
David S. Millerc773e842008-07-08 23:13:53 -07002204 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205 if (net_ratelimit())
2206 printk(KERN_CRIT "Virtual device %s asks to "
2207 "queue packet!\n", dev->name);
2208 } else {
2209 /* Recursion is detected! It is possible,
2210 * unfortunately */
2211 if (net_ratelimit())
2212 printk(KERN_CRIT "Dead loop on virtual device "
2213 "%s, fix it urgently!\n", dev->name);
2214 }
2215 }
2216
2217 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002218 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002219
2220out_kfree_skb:
2221 kfree_skb(skb);
2222 return rc;
2223out:
Herbert Xud4828d82006-06-22 02:28:18 -07002224 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225 return rc;
2226}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002227EXPORT_SYMBOL(dev_queue_xmit);
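
/*
 * A minimal usage sketch (editorial illustration; "example_send_frame"
 * is hypothetical and the caller is assumed to supply a complete
 * link-layer frame): the sender sets skb->dev and skb->priority and
 * then hands the buffer to dev_queue_xmit() with interrupts enabled.
 */
static inline int example_send_frame(struct net_device *dev,
                                     const void *frame, unsigned int len)
{
        struct sk_buff *skb = alloc_skb(len + LL_RESERVED_SPACE(dev), GFP_KERNEL);

        if (!skb)
                return -ENOMEM;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        memcpy(skb_put(skb, len), frame, len);

        skb->dev = dev;
        skb->priority = 0;

        /* consumes the skb; may return NET_XMIT_* codes from the qdisc */
        return dev_queue_xmit(skb);
}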
Linus Torvalds1da177e2005-04-16 15:20:36 -07002228
2229
2230/*=======================================================================
2231 Receiver routines
2232 =======================================================================*/
2233
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002234int netdev_max_backlog __read_mostly = 1000;
Eric Dumazet3b098e22010-05-15 23:57:10 -07002235int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002236int netdev_budget __read_mostly = 300;
2237int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002238
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002239/* Called with irq disabled */
2240static inline void ____napi_schedule(struct softnet_data *sd,
2241 struct napi_struct *napi)
2242{
2243 list_add_tail(&napi->poll_list, &sd->poll_list);
2244 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2245}
2246
Eric Dumazetdf334542010-03-24 19:13:54 +00002247#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002248
2249/* One global table that all flow-based protocols share. */
Eric Dumazet8770acf2010-04-17 00:54:36 -07002250struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002251EXPORT_SYMBOL(rps_sock_flow_table);
2252
Tom Herbert0a9627f2010-03-16 08:03:29 +00002253/*
2254 * get_rps_cpu is called from netif_receive_skb and returns the target
2255 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002256 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002257 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002258static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2259 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002260{
2261 struct ipv6hdr *ip6;
2262 struct iphdr *ip;
2263 struct netdev_rx_queue *rxqueue;
2264 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002265 struct rps_dev_flow_table *flow_table;
2266 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002267 int cpu = -1;
2268 u8 ip_proto;
Tom Herbertfec5e652010-04-16 16:01:27 -07002269 u16 tcpu;
Changli Gao8c52d502010-04-24 22:50:10 -07002270 u32 addr1, addr2, ihl;
2271 union {
2272 u32 v32;
2273 u16 v16[2];
2274 } ports;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002275
Tom Herbert0a9627f2010-03-16 08:03:29 +00002276 if (skb_rx_queue_recorded(skb)) {
2277 u16 index = skb_get_rx_queue(skb);
2278 if (unlikely(index >= dev->num_rx_queues)) {
2279 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00002280 pr_warning("%s received packet on queue "
2281 "%u, but number of RX queues is %u\n",
2282 dev->name, index, dev->num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002283 }
2284 goto done;
2285 }
2286 rxqueue = dev->_rx + index;
2287 } else
2288 rxqueue = dev->_rx;
2289
Tom Herbertfec5e652010-04-16 16:01:27 -07002290 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002291 goto done;
2292
2293 if (skb->rxhash)
2294 goto got_hash; /* Skip hash computation on packet header */
2295
2296 switch (skb->protocol) {
2297 case __constant_htons(ETH_P_IP):
2298 if (!pskb_may_pull(skb, sizeof(*ip)))
2299 goto done;
2300
2301 ip = (struct iphdr *) skb->data;
2302 ip_proto = ip->protocol;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002303 addr1 = (__force u32) ip->saddr;
2304 addr2 = (__force u32) ip->daddr;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002305 ihl = ip->ihl;
2306 break;
2307 case __constant_htons(ETH_P_IPV6):
2308 if (!pskb_may_pull(skb, sizeof(*ip6)))
2309 goto done;
2310
2311 ip6 = (struct ipv6hdr *) skb->data;
2312 ip_proto = ip6->nexthdr;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002313 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2314 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002315 ihl = (40 >> 2);
2316 break;
2317 default:
2318 goto done;
2319 }
Tom Herbert0a9627f2010-03-16 08:03:29 +00002320 switch (ip_proto) {
2321 case IPPROTO_TCP:
2322 case IPPROTO_UDP:
2323 case IPPROTO_DCCP:
2324 case IPPROTO_ESP:
2325 case IPPROTO_AH:
2326 case IPPROTO_SCTP:
2327 case IPPROTO_UDPLITE:
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002328 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
Changli Gao8c52d502010-04-24 22:50:10 -07002329 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2330 if (ports.v16[1] < ports.v16[0])
2331 swap(ports.v16[0], ports.v16[1]);
2332 break;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002333 }
Tom Herbert0a9627f2010-03-16 08:03:29 +00002334 default:
Changli Gao8c52d502010-04-24 22:50:10 -07002335 ports.v32 = 0;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002336 break;
2337 }
2338
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002339 /* get a consistent hash (same value on both flow directions) */
2340 if (addr2 < addr1)
2341 swap(addr1, addr2);
Changli Gao8c52d502010-04-24 22:50:10 -07002342 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002343 if (!skb->rxhash)
2344 skb->rxhash = 1;
2345
2346got_hash:
Tom Herbertfec5e652010-04-16 16:01:27 -07002347 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2348 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2349 if (flow_table && sock_flow_table) {
2350 u16 next_cpu;
2351 struct rps_dev_flow *rflow;
2352
2353 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2354 tcpu = rflow->cpu;
2355
2356 next_cpu = sock_flow_table->ents[skb->rxhash &
2357 sock_flow_table->mask];
2358
2359 /*
2360 * If the desired CPU (where last recvmsg was done) is
2361 * different from current CPU (one in the rx-queue flow
2362 * table entry), switch if one of the following holds:
2363 * - Current CPU is unset (equal to RPS_NO_CPU).
2364 * - Current CPU is offline.
2365 * - The current CPU's queue tail has advanced beyond the
2366 * last packet that was enqueued using this table entry.
2367 * This guarantees that all previous packets for the flow
2368 * have been dequeued, thus preserving in order delivery.
2369 */
2370 if (unlikely(tcpu != next_cpu) &&
2371 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2372 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2373 rflow->last_qtail)) >= 0)) {
2374 tcpu = rflow->cpu = next_cpu;
2375 if (tcpu != RPS_NO_CPU)
2376 rflow->last_qtail = per_cpu(softnet_data,
2377 tcpu).input_queue_head;
2378 }
2379 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2380 *rflowp = rflow;
2381 cpu = tcpu;
2382 goto done;
2383 }
2384 }
2385
Tom Herbert0a9627f2010-03-16 08:03:29 +00002386 map = rcu_dereference(rxqueue->rps_map);
2387 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07002388 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002389
2390 if (cpu_online(tcpu)) {
2391 cpu = tcpu;
2392 goto done;
2393 }
2394 }
2395
2396done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00002397 return cpu;
2398}
2399
Tom Herbert0a9627f2010-03-16 08:03:29 +00002400/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002401static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002402{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002403 struct softnet_data *sd = data;
2404
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002405 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00002406 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002407}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002408
Tom Herbertfec5e652010-04-16 16:01:27 -07002409#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00002410
2411/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002412 * Check whether this softnet_data structure belongs to another CPU.
2413 * If yes, queue it to our IPI list and return 1
2414 * If no, return 0
2415 */
2416static int rps_ipi_queued(struct softnet_data *sd)
2417{
2418#ifdef CONFIG_RPS
2419 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2420
2421 if (sd != mysd) {
2422 sd->rps_ipi_next = mysd->rps_ipi_list;
2423 mysd->rps_ipi_list = sd;
2424
2425 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2426 return 1;
2427 }
2428#endif /* CONFIG_RPS */
2429 return 0;
2430}
2431
2432/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00002433 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2434 * queue (may be a remote CPU queue).
2435 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002436static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2437 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002438{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002439 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002440 unsigned long flags;
2441
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002442 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002443
2444 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002445
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002446 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002447 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2448 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002449enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002450 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002451 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002452 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00002453 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002454 return NET_RX_SUCCESS;
2455 }
2456
Eric Dumazetebda37c22010-05-06 23:51:21 +00002457 /* Schedule NAPI for backlog device
2458 * We can use a non-atomic operation since we own the queue lock
2459 */
2460 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002461 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002462 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002463 }
2464 goto enqueue;
2465 }
2466
Changli Gaodee42872010-05-02 05:42:16 +00002467 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002468 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002469
Tom Herbert0a9627f2010-03-16 08:03:29 +00002470 local_irq_restore(flags);
2471
2472 kfree_skb(skb);
2473 return NET_RX_DROP;
2474}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002475
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476/**
2477 * netif_rx - post buffer to the network code
2478 * @skb: buffer to post
2479 *
2480 * This function receives a packet from a device driver and queues it for
2481 * the upper (protocol) levels to process. It always succeeds. The buffer
2482 * may be dropped during processing for congestion control or by the
2483 * protocol layers.
2484 *
2485 * return values:
2486 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002487 * NET_RX_DROP (packet was dropped)
2488 *
2489 */
2490
2491int netif_rx(struct sk_buff *skb)
2492{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002493 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002494
2495 /* if netpoll wants it, pretend we never saw it */
2496 if (netpoll_rx(skb))
2497 return NET_RX_DROP;
2498
Eric Dumazet3b098e22010-05-15 23:57:10 -07002499 if (netdev_tstamp_prequeue)
2500 net_timestamp_check(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002501
Eric Dumazetdf334542010-03-24 19:13:54 +00002502#ifdef CONFIG_RPS
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002503 {
Tom Herbertfec5e652010-04-16 16:01:27 -07002504 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002505 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002507 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07002508
2509 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002510 if (cpu < 0)
2511 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07002512
2513 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2514
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002515 rcu_read_unlock();
2516 }
2517#else
Tom Herbertfec5e652010-04-16 16:01:27 -07002518 {
2519 unsigned int qtail;
2520 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2521 put_cpu();
2522 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002523#endif
2524 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002525}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002526EXPORT_SYMBOL(netif_rx);
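
/*
 * A minimal usage sketch (editorial illustration; "example_rx_one" and
 * its frame/len arguments are hypothetical): a classic non-NAPI driver
 * copies the received frame into a fresh skb, fills in the protocol
 * and queues it to the backlog with netif_rx() from its interrupt.
 */
static inline void example_rx_one(struct net_device *dev,
                                  const void *frame, unsigned int len)
{
        struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }

        memcpy(skb_put(skb, len), frame, len);
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev, pkt_type */

        netif_rx(skb);
}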
Linus Torvalds1da177e2005-04-16 15:20:36 -07002527
2528int netif_rx_ni(struct sk_buff *skb)
2529{
2530 int err;
2531
2532 preempt_disable();
2533 err = netif_rx(skb);
2534 if (local_softirq_pending())
2535 do_softirq();
2536 preempt_enable();
2537
2538 return err;
2539}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002540EXPORT_SYMBOL(netif_rx_ni);
2541
Linus Torvalds1da177e2005-04-16 15:20:36 -07002542static void net_tx_action(struct softirq_action *h)
2543{
2544 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2545
2546 if (sd->completion_queue) {
2547 struct sk_buff *clist;
2548
2549 local_irq_disable();
2550 clist = sd->completion_queue;
2551 sd->completion_queue = NULL;
2552 local_irq_enable();
2553
2554 while (clist) {
2555 struct sk_buff *skb = clist;
2556 clist = clist->next;
2557
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002558 WARN_ON(atomic_read(&skb->users));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002559 __kfree_skb(skb);
2560 }
2561 }
2562
2563 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07002564 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002565
2566 local_irq_disable();
2567 head = sd->output_queue;
2568 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00002569 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002570 local_irq_enable();
2571
2572 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07002573 struct Qdisc *q = head;
2574 spinlock_t *root_lock;
2575
Linus Torvalds1da177e2005-04-16 15:20:36 -07002576 head = head->next_sched;
2577
David S. Miller5fb66222008-08-02 20:02:43 -07002578 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07002579 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002580 smp_mb__before_clear_bit();
2581 clear_bit(__QDISC_STATE_SCHED,
2582 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07002583 qdisc_run(q);
2584 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002585 } else {
David S. Miller195648b2008-08-19 04:00:36 -07002586 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002587 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07002588 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002589 } else {
2590 smp_mb__before_clear_bit();
2591 clear_bit(__QDISC_STATE_SCHED,
2592 &q->state);
2593 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002594 }
2595 }
2596 }
2597}
2598
Stephen Hemminger6f05f622007-03-08 20:46:03 -08002599static inline int deliver_skb(struct sk_buff *skb,
2600 struct packet_type *pt_prev,
2601 struct net_device *orig_dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602{
2603 atomic_inc(&skb->users);
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002604 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002605}
2606
2607#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
Michał Mirosławda678292009-06-05 05:35:28 +00002608
2609#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2610/* This hook is defined here for ATM LANE */
2611int (*br_fdb_test_addr_hook)(struct net_device *dev,
2612 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002613EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002614#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002615
Stephen Hemminger6229e362007-03-21 13:38:47 -07002616/*
2617 * If the bridge module is loaded, call the bridging hook.
2618 * Returns NULL if the packet was consumed.
2619 */
2620struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2621 struct sk_buff *skb) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002622EXPORT_SYMBOL_GPL(br_handle_frame_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002623
Stephen Hemminger6229e362007-03-21 13:38:47 -07002624static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2625 struct packet_type **pt_prev, int *ret,
2626 struct net_device *orig_dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002627{
2628 struct net_bridge_port *port;
2629
Stephen Hemminger6229e362007-03-21 13:38:47 -07002630 if (skb->pkt_type == PACKET_LOOPBACK ||
2631 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2632 return skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002633
2634 if (*pt_prev) {
Stephen Hemminger6229e362007-03-21 13:38:47 -07002635 *ret = deliver_skb(skb, *pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002636 *pt_prev = NULL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002637 }
2638
Stephen Hemminger6229e362007-03-21 13:38:47 -07002639 return br_handle_frame_hook(port, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002640}
2641#else
Stephen Hemminger6229e362007-03-21 13:38:47 -07002642#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002643#endif
2644
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002645#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
Jiri Pirkoa14462f2010-05-06 01:33:53 +00002646struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
2647 struct sk_buff *skb) __read_mostly;
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002648EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2649
2650static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2651 struct packet_type **pt_prev,
2652 int *ret,
2653 struct net_device *orig_dev)
2654{
Jiri Pirkoa14462f2010-05-06 01:33:53 +00002655 struct macvlan_port *port;
2656
2657 port = rcu_dereference(skb->dev->macvlan_port);
2658 if (!port)
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002659 return skb;
2660
2661 if (*pt_prev) {
2662 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2663 *pt_prev = NULL;
2664 }
Jiri Pirkoa14462f2010-05-06 01:33:53 +00002665 return macvlan_handle_frame_hook(port, skb);
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002666}
2667#else
2668#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2669#endif
2670
Linus Torvalds1da177e2005-04-16 15:20:36 -07002671#ifdef CONFIG_NET_CLS_ACT
2672/* TODO: Maybe we should just force sch_ingress to be compiled in
2673 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
2674 * instructions (a compare and two extra stores) when it is not on
2675 * but CONFIG_NET_CLS_ACT is.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002676 * NOTE: This doesn't remove any functionality; if you don't have
Linus Torvalds1da177e2005-04-16 15:20:36 -07002677 * the ingress scheduler, you just can't add policies on ingress.
2678 *
2679 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002680static int ing_filter(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002681{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002682 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002683 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07002684 struct netdev_queue *rxq;
2685 int result = TC_ACT_OK;
2686 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002687
Herbert Xuf697c3e2007-10-14 00:38:47 -07002688 if (MAX_RED_LOOP < ttl++) {
2689 printk(KERN_WARNING
2690		       "Redir loop detected, dropping packet (%d->%d)\n",
Eric Dumazet8964be42009-11-20 15:35:04 -08002691 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07002692 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002693 }
2694
Herbert Xuf697c3e2007-10-14 00:38:47 -07002695 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2696 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2697
David S. Miller555353c2008-07-08 17:33:13 -07002698 rxq = &dev->rx_queue;
2699
David S. Miller83874002008-07-17 00:53:03 -07002700 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07002701 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07002702 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07002703 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2704 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07002705 spin_unlock(qdisc_lock(q));
2706 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07002707
Linus Torvalds1da177e2005-04-16 15:20:36 -07002708 return result;
2709}
Herbert Xuf697c3e2007-10-14 00:38:47 -07002710
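/*
 * Run the ingress qdisc, if one is attached, before normal protocol
 * delivery; packets shot or stolen by the ingress filter are freed here
 * and never reach the upper layers.
 */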
2711static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2712 struct packet_type **pt_prev,
2713 int *ret, struct net_device *orig_dev)
2714{
David S. Miller8d50b532008-07-30 02:37:46 -07002715 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07002716 goto out;
2717
2718 if (*pt_prev) {
2719 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2720 *pt_prev = NULL;
2721 } else {
2722 /* Huh? Why does turning on AF_PACKET affect this? */
2723 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2724 }
2725
2726 switch (ing_filter(skb)) {
2727 case TC_ACT_SHOT:
2728 case TC_ACT_STOLEN:
2729 kfree_skb(skb);
2730 return NULL;
2731 }
2732
2733out:
2734 skb->tc_verd = 0;
2735 return skb;
2736}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002737#endif
2738
Patrick McHardybc1d0412008-07-14 22:49:30 -07002739/*
2740 * netif_nit_deliver - deliver received packets to network taps
2741 * @skb: buffer
2742 *
2743 * This function is used to deliver incoming packets to network
2744 * taps. It should be used when the normal netif_receive_skb path
2745 * is bypassed, for example because of VLAN acceleration.
2746 */
2747void netif_nit_deliver(struct sk_buff *skb)
2748{
2749 struct packet_type *ptype;
2750
2751 if (list_empty(&ptype_all))
2752 return;
2753
2754 skb_reset_network_header(skb);
2755 skb_reset_transport_header(skb);
2756 skb->mac_len = skb->network_header - skb->mac_header;
2757
2758 rcu_read_lock();
2759 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2760 if (!ptype->dev || ptype->dev == skb->dev)
2761 deliver_skb(skb, ptype, skb->dev);
2762 }
2763 rcu_read_unlock();
2764}
2765
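/* For frames destined to this host, rewrite the destination MAC address
 * with the bonding master's address so upper layers see a consistent one.
 */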
Eric Dumazetacbbc072010-04-11 06:56:11 +00002766static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2767 struct net_device *master)
2768{
2769 if (skb->pkt_type == PACKET_HOST) {
2770 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2771
2772 memcpy(dest, master->dev_addr, ETH_ALEN);
2773 }
2774}
2775
2776/* On bonding slaves other than the currently active slave, suppress
2777 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2778 * ARP on active-backup slaves with arp_validate enabled.
2779 */
2780int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2781{
2782 struct net_device *dev = skb->dev;
2783
2784 if (master->priv_flags & IFF_MASTER_ARPMON)
2785 dev->last_rx = jiffies;
2786
2787 if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
2788		/* Undo address mangling. The local destination address
2789		 * will always be the one the master has. This provides the
2790		 * right functionality in a bridge.
2791 */
2792 skb_bond_set_mac_by_master(skb, master);
2793 }
2794
2795 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2796 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2797 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2798 return 0;
2799
2800 if (master->priv_flags & IFF_MASTER_ALB) {
2801 if (skb->pkt_type != PACKET_BROADCAST &&
2802 skb->pkt_type != PACKET_MULTICAST)
2803 return 0;
2804 }
2805 if (master->priv_flags & IFF_MASTER_8023AD &&
2806 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2807 return 0;
2808
2809 return 1;
2810 }
2811 return 0;
2812}
2813EXPORT_SYMBOL(__skb_bond_should_drop);
2814
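/*
 * __netif_receive_skb does the actual protocol demultiplexing for one skb:
 * taps on ptype_all, the ingress qdisc, the bridge and macvlan hooks, and
 * finally the per-protocol handlers hashed in ptype_base, all under
 * rcu_read_lock().
 */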
Eric Dumazet10f744d2010-03-28 23:07:20 -07002815static int __netif_receive_skb(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002816{
2817 struct packet_type *ptype, *pt_prev;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002818 struct net_device *orig_dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002819 struct net_device *master;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002820 struct net_device *null_or_orig;
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002821 struct net_device *null_or_bond;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822 int ret = NET_RX_DROP;
Al Viro252e33462006-11-14 20:48:11 -08002823 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002824
Eric Dumazet3b098e22010-05-15 23:57:10 -07002825 if (!netdev_tstamp_prequeue)
2826 net_timestamp_check(skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07002827
Eric Dumazet05423b22009-10-26 18:40:35 -07002828 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
Patrick McHardy9b22ea52008-11-04 14:49:57 -08002829 return NET_RX_SUCCESS;
2830
Linus Torvalds1da177e2005-04-16 15:20:36 -07002831 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002832 if (netpoll_receive_skb(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002833 return NET_RX_DROP;
2834
Eric Dumazet8964be42009-11-20 15:35:04 -08002835 if (!skb->skb_iif)
2836 skb->skb_iif = skb->dev->ifindex;
David S. Miller86e65da2005-08-09 19:36:29 -07002837
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002838 null_or_orig = NULL;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002839 orig_dev = skb->dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002840 master = ACCESS_ONCE(orig_dev->master);
2841 if (master) {
2842 if (skb_bond_should_drop(skb, master))
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002843 null_or_orig = orig_dev; /* deliver only exact match */
2844 else
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002845 skb->dev = master;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002846 }
Jay Vosburgh8f903c72006-02-21 16:36:44 -08002847
Eric Dumazet27f39c73e2010-05-19 22:07:23 +00002848 __this_cpu_inc(softnet_data.processed);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002849 skb_reset_network_header(skb);
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -03002850 skb_reset_transport_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07002851 skb->mac_len = skb->network_header - skb->mac_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002852
2853 pt_prev = NULL;
2854
2855 rcu_read_lock();
2856
2857#ifdef CONFIG_NET_CLS_ACT
2858 if (skb->tc_verd & TC_NCLS) {
2859 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2860 goto ncls;
2861 }
2862#endif
2863
2864 list_for_each_entry_rcu(ptype, &ptype_all, list) {
Joe Eykholtf9823072008-07-02 18:22:02 -07002865 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2866 ptype->dev == orig_dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002867 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002868 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002869 pt_prev = ptype;
2870 }
2871 }
2872
2873#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07002874 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2875 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002876 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877ncls:
2878#endif
2879
Stephen Hemminger6229e362007-03-21 13:38:47 -07002880 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2881 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002882 goto out;
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002883 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2884 if (!skb)
2885 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002886
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002887 /*
2888 * Make sure frames received on VLAN interfaces stacked on
2889 * bonding interfaces still make their way to any base bonding
2890 * device that may have registered for a specific ptype. The
2891 * handler may have to adjust skb->dev and orig_dev.
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002892 */
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002893 null_or_bond = NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002894 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2895 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002896 null_or_bond = vlan_dev_real_dev(skb->dev);
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002897 }
2898
Linus Torvalds1da177e2005-04-16 15:20:36 -07002899 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08002900 list_for_each_entry_rcu(ptype,
2901 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002902 if (ptype->type == type && (ptype->dev == null_or_orig ||
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002903 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2904 ptype->dev == null_or_bond)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002905 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002906 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002907 pt_prev = ptype;
2908 }
2909 }
2910
2911 if (pt_prev) {
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002912 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002913 } else {
2914 kfree_skb(skb);
2915		/* Jamal, now you will not be able to escape explaining
2916		 * to me how you were going to use this. :-)
2917 */
2918 ret = NET_RX_DROP;
2919 }
2920
2921out:
2922 rcu_read_unlock();
2923 return ret;
2924}
Tom Herbert0a9627f2010-03-16 08:03:29 +00002925
2926/**
2927 * netif_receive_skb - process receive buffer from network
2928 * @skb: buffer to process
2929 *
2930 * netif_receive_skb() is the main receive data processing function.
2931 * It always succeeds. The buffer may be dropped during processing
2932 * for congestion control or by the protocol layers.
2933 *
2934 * This function may only be called from softirq context and interrupts
2935 * should be enabled.
2936 *
2937 * Return values (usually ignored):
2938 * NET_RX_SUCCESS: no congestion
2939 * NET_RX_DROP: packet was dropped
2940 */
2941int netif_receive_skb(struct sk_buff *skb)
2942{
Eric Dumazet3b098e22010-05-15 23:57:10 -07002943 if (netdev_tstamp_prequeue)
2944 net_timestamp_check(skb);
2945
Eric Dumazetdf334542010-03-24 19:13:54 +00002946#ifdef CONFIG_RPS
Eric Dumazet3b098e22010-05-15 23:57:10 -07002947 {
2948 struct rps_dev_flow voidflow, *rflow = &voidflow;
2949 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002950
Eric Dumazet3b098e22010-05-15 23:57:10 -07002951 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00002952
Eric Dumazet3b098e22010-05-15 23:57:10 -07002953 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07002954
Eric Dumazet3b098e22010-05-15 23:57:10 -07002955 if (cpu >= 0) {
2956 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2957 rcu_read_unlock();
2958 } else {
2959 rcu_read_unlock();
2960 ret = __netif_receive_skb(skb);
2961 }
2962
2963 return ret;
Tom Herbertfec5e652010-04-16 16:01:27 -07002964 }
Tom Herbert1e94d722010-03-18 17:45:44 -07002965#else
2966 return __netif_receive_skb(skb);
2967#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00002968}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002969EXPORT_SYMBOL(netif_receive_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002970
Eric Dumazet88751272010-04-19 05:07:33 +00002971/* Network device is going away, flush any packets still pending.
2972 * Called with irqs disabled.
2973 */
Changli Gao152102c2010-03-30 20:16:22 +00002974static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002975{
Changli Gao152102c2010-03-30 20:16:22 +00002976 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002977 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002978 struct sk_buff *skb, *tmp;
2979
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002980 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002981 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002982 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002983 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002984 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002985 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002986 }
Changli Gao6e7676c2010-04-27 15:07:33 -07002987 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002988 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002989
2990 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2991 if (skb->dev == dev) {
2992 __skb_unlink(skb, &sd->process_queue);
2993 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002994 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002995 }
2996 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002997}
2998
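/*
 * Finish a GRO'ed skb: if it was merged from several segments, let the
 * protocol's gro_complete callback fix up the headers, then hand the skb
 * to netif_receive_skb().
 */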
Herbert Xud565b0a2008-12-15 23:38:52 -08002999static int napi_gro_complete(struct sk_buff *skb)
3000{
3001 struct packet_type *ptype;
3002 __be16 type = skb->protocol;
3003 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3004 int err = -ENOENT;
3005
Herbert Xufc59f9a2009-04-14 15:11:06 -07003006 if (NAPI_GRO_CB(skb)->count == 1) {
3007 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003008 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003009 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003010
3011 rcu_read_lock();
3012 list_for_each_entry_rcu(ptype, head, list) {
3013 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3014 continue;
3015
3016 err = ptype->gro_complete(skb);
3017 break;
3018 }
3019 rcu_read_unlock();
3020
3021 if (err) {
3022 WARN_ON(&ptype->list == head);
3023 kfree_skb(skb);
3024 return NET_RX_SUCCESS;
3025 }
3026
3027out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003028 return netif_receive_skb(skb);
3029}
3030
David S. Miller11380a42010-01-19 13:46:10 -08003031static void napi_gro_flush(struct napi_struct *napi)
Herbert Xud565b0a2008-12-15 23:38:52 -08003032{
3033 struct sk_buff *skb, *next;
3034
3035 for (skb = napi->gro_list; skb; skb = next) {
3036 next = skb->next;
3037 skb->next = NULL;
3038 napi_gro_complete(skb);
3039 }
3040
Herbert Xu4ae55442009-02-08 18:00:36 +00003041 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003042 napi->gro_list = NULL;
3043}
Herbert Xud565b0a2008-12-15 23:38:52 -08003044
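/*
 * Try to merge the incoming skb into a flow already held on napi->gro_list
 * via the protocol's gro_receive callback. The return value tells the
 * caller whether the skb was merged, held for later flushing, or must take
 * the normal receive path.
 */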
Ben Hutchings5b252f02009-10-29 07:17:09 +00003045enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003046{
3047 struct sk_buff **pp = NULL;
3048 struct packet_type *ptype;
3049 __be16 type = skb->protocol;
3050 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
Herbert Xu0da2afd52008-12-26 14:57:42 -08003051 int same_flow;
Herbert Xud565b0a2008-12-15 23:38:52 -08003052 int mac_len;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003053 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003054
3055 if (!(skb->dev->features & NETIF_F_GRO))
3056 goto normal;
3057
David S. Miller4cf704f2009-06-09 00:18:51 -07003058 if (skb_is_gso(skb) || skb_has_frags(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003059 goto normal;
3060
Herbert Xud565b0a2008-12-15 23:38:52 -08003061 rcu_read_lock();
3062 list_for_each_entry_rcu(ptype, head, list) {
Herbert Xud565b0a2008-12-15 23:38:52 -08003063 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3064 continue;
3065
Herbert Xu86911732009-01-29 14:19:50 +00003066 skb_set_network_header(skb, skb_gro_offset(skb));
Herbert Xud565b0a2008-12-15 23:38:52 -08003067 mac_len = skb->network_header - skb->mac_header;
3068 skb->mac_len = mac_len;
3069 NAPI_GRO_CB(skb)->same_flow = 0;
3070 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003071 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003072
Herbert Xud565b0a2008-12-15 23:38:52 -08003073 pp = ptype->gro_receive(&napi->gro_list, skb);
3074 break;
3075 }
3076 rcu_read_unlock();
3077
3078 if (&ptype->list == head)
3079 goto normal;
3080
Herbert Xu0da2afd52008-12-26 14:57:42 -08003081 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003082 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003083
Herbert Xud565b0a2008-12-15 23:38:52 -08003084 if (pp) {
3085 struct sk_buff *nskb = *pp;
3086
3087 *pp = nskb->next;
3088 nskb->next = NULL;
3089 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003090 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003091 }
3092
Herbert Xu0da2afd52008-12-26 14:57:42 -08003093 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003094 goto ok;
3095
Herbert Xu4ae55442009-02-08 18:00:36 +00003096 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003097 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003098
Herbert Xu4ae55442009-02-08 18:00:36 +00003099 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003100 NAPI_GRO_CB(skb)->count = 1;
Herbert Xu86911732009-01-29 14:19:50 +00003101 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003102 skb->next = napi->gro_list;
3103 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003104 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003105
Herbert Xuad0f9902009-02-01 01:24:55 -08003106pull:
Herbert Xucb189782009-05-26 18:50:31 +00003107 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3108 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3109
3110 BUG_ON(skb->end - skb->tail < grow);
3111
3112 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3113
3114 skb->tail += grow;
3115 skb->data_len -= grow;
3116
3117 skb_shinfo(skb)->frags[0].page_offset += grow;
3118 skb_shinfo(skb)->frags[0].size -= grow;
3119
3120 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3121 put_page(skb_shinfo(skb)->frags[0].page);
3122 memmove(skb_shinfo(skb)->frags,
3123 skb_shinfo(skb)->frags + 1,
3124 --skb_shinfo(skb)->nr_frags);
3125 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003126 }
3127
Herbert Xud565b0a2008-12-15 23:38:52 -08003128ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003129 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003130
3131normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003132 ret = GRO_NORMAL;
3133 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003134}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003135EXPORT_SYMBOL(dev_gro_receive);
3136
Ben Hutchings5b252f02009-10-29 07:17:09 +00003137static gro_result_t
3138__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003139{
3140 struct sk_buff *p;
3141
Herbert Xud1c76af2009-03-16 10:50:02 -07003142 if (netpoll_rx_on(skb))
3143 return GRO_NORMAL;
3144
Herbert Xu96e93ea2009-01-06 10:49:34 -08003145 for (p = napi->gro_list; p; p = p->next) {
Joe Perchesf64f9e72009-11-29 16:55:45 -08003146 NAPI_GRO_CB(p)->same_flow =
3147 (p->dev == skb->dev) &&
3148 !compare_ether_header(skb_mac_header(p),
3149 skb_gro_mac_header(skb));
Herbert Xu96e93ea2009-01-06 10:49:34 -08003150 NAPI_GRO_CB(p)->flush = 0;
3151 }
3152
3153 return dev_gro_receive(napi, skb);
3154}
Herbert Xu5d38a072009-01-04 16:13:40 -08003155
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003156gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003157{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003158 switch (ret) {
3159 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003160 if (netif_receive_skb(skb))
3161 ret = GRO_DROP;
3162 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003163
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003164 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003165 case GRO_MERGED_FREE:
Herbert Xu5d38a072009-01-04 16:13:40 -08003166 kfree_skb(skb);
3167 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003168
3169 case GRO_HELD:
3170 case GRO_MERGED:
3171 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003172 }
3173
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003174 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003175}
3176EXPORT_SYMBOL(napi_skb_finish);
3177
Herbert Xu78a478d2009-05-26 18:50:21 +00003178void skb_gro_reset_offset(struct sk_buff *skb)
3179{
3180 NAPI_GRO_CB(skb)->data_offset = 0;
3181 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003182 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003183
Herbert Xu78d3fd02009-05-26 18:50:23 +00003184 if (skb->mac_header == skb->tail &&
Herbert Xu74895942009-05-26 18:50:27 +00003185 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
Herbert Xu78a478d2009-05-26 18:50:21 +00003186 NAPI_GRO_CB(skb)->frag0 =
3187 page_address(skb_shinfo(skb)->frags[0].page) +
3188 skb_shinfo(skb)->frags[0].page_offset;
Herbert Xu74895942009-05-26 18:50:27 +00003189 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3190 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003191}
3192EXPORT_SYMBOL(skb_gro_reset_offset);
3193
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003194gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003195{
Herbert Xu86911732009-01-29 14:19:50 +00003196 skb_gro_reset_offset(skb);
3197
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003198 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003199}
3200EXPORT_SYMBOL(napi_gro_receive);
3201
Herbert Xu96e93ea2009-01-06 10:49:34 -08003202void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3203{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003204 __skb_pull(skb, skb_headlen(skb));
3205 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3206
3207 napi->skb = skb;
3208}
3209EXPORT_SYMBOL(napi_reuse_skb);
3210
Herbert Xu76620aa2009-04-16 02:02:07 -07003211struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003212{
Herbert Xu5d38a072009-01-04 16:13:40 -08003213 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003214
3215 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003216 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3217 if (skb)
3218 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003219 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003220 return skb;
3221}
Herbert Xu76620aa2009-04-16 02:02:07 -07003222EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003223
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003224gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3225 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003226{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003227 switch (ret) {
3228 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003229 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003230 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003231
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003232 if (ret == GRO_HELD)
3233 skb_gro_pull(skb, -ETH_HLEN);
3234 else if (netif_receive_skb(skb))
3235 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003236 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003237
3238 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003239 case GRO_MERGED_FREE:
3240 napi_reuse_skb(napi, skb);
3241 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003242
3243 case GRO_MERGED:
3244 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003245 }
3246
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003247 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003248}
3249EXPORT_SYMBOL(napi_frags_finish);
3250
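/*
 * Prepare the skb that was built from driver-supplied fragments: pull the
 * Ethernet header out of the GRO header area and record the protocol
 * before the skb is fed to the GRO engine.
 */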
Herbert Xu76620aa2009-04-16 02:02:07 -07003251struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003252{
Herbert Xu76620aa2009-04-16 02:02:07 -07003253 struct sk_buff *skb = napi->skb;
3254 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003255 unsigned int hlen;
3256 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003257
3258 napi->skb = NULL;
3259
3260 skb_reset_mac_header(skb);
3261 skb_gro_reset_offset(skb);
3262
Herbert Xua5b1cf22009-05-26 18:50:28 +00003263 off = skb_gro_offset(skb);
3264 hlen = off + sizeof(*eth);
3265 eth = skb_gro_header_fast(skb, off);
3266 if (skb_gro_header_hard(skb, hlen)) {
3267 eth = skb_gro_header_slow(skb, hlen, off);
3268 if (unlikely(!eth)) {
3269 napi_reuse_skb(napi, skb);
3270 skb = NULL;
3271 goto out;
3272 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003273 }
3274
3275 skb_gro_pull(skb, sizeof(*eth));
3276
3277 /*
3278 * This works because the only protocols we care about don't require
3279 * special handling. We'll fix it up properly at the end.
3280 */
3281 skb->protocol = eth->h_proto;
3282
3283out:
3284 return skb;
3285}
3286EXPORT_SYMBOL(napi_frags_skb);
3287
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003288gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003289{
3290 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003291
3292 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003293 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003294
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003295 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003296}
3297EXPORT_SYMBOL(napi_gro_frags);
3298
Eric Dumazete326bed2010-04-22 00:22:45 -07003299/*
3300 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3301 * Note: called with local irq disabled, but exits with local irq enabled.
3302 */
3303static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3304{
3305#ifdef CONFIG_RPS
3306 struct softnet_data *remsd = sd->rps_ipi_list;
3307
3308 if (remsd) {
3309 sd->rps_ipi_list = NULL;
3310
3311 local_irq_enable();
3312
3313 /* Send pending IPI's to kick RPS processing on remote cpus. */
3314 while (remsd) {
3315 struct softnet_data *next = remsd->rps_ipi_next;
3316
3317 if (cpu_online(remsd->cpu))
3318 __smp_call_function_single(remsd->cpu,
3319 &remsd->csd, 0);
3320 remsd = next;
3321 }
3322 } else
3323#endif
3324 local_irq_enable();
3325}
3326
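/*
 * process_backlog is the NAPI poll handler backing the per-cpu backlog
 * queue used by netif_rx() and RPS: it drains sd->process_queue, refilling
 * it from sd->input_pkt_queue under rps_lock, until the quota is spent or
 * both queues are empty.
 */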
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003327static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003328{
3329 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003330 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003331
Eric Dumazete326bed2010-04-22 00:22:45 -07003332#ifdef CONFIG_RPS
3333	/* Check if we have pending IPIs; it's better to send them now
3334	 * than to wait for net_rx_action() to end.
3335 */
3336 if (sd->rps_ipi_list) {
3337 local_irq_disable();
3338 net_rps_action_and_irq_enable(sd);
3339 }
3340#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003341 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07003342 local_irq_disable();
3343 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003344 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07003345 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003346
Changli Gao6e7676c2010-04-27 15:07:33 -07003347 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07003348 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07003349 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07003350 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00003351 input_queue_head_incr(sd);
3352 if (++work >= quota) {
3353 local_irq_enable();
3354 return work;
3355 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003356 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003357
Changli Gao6e7676c2010-04-27 15:07:33 -07003358 rps_lock(sd);
3359 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003360 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07003361 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3362 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003363
Changli Gao6e7676c2010-04-27 15:07:33 -07003364 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003365 /*
3366 * Inline a custom version of __napi_complete().
3367			 * Only the current cpu owns and manipulates this napi,
3368			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3369			 * so we can use a plain write instead of clear_bit(),
3370			 * and we don't need an smp_mb() memory barrier.
3371 */
3372 list_del(&napi->poll_list);
3373 napi->state = 0;
3374
Changli Gao6e7676c2010-04-27 15:07:33 -07003375 quota = work + qlen;
3376 }
3377 rps_unlock(sd);
3378 }
3379 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003380
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003381 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003382}
3383
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003384/**
3385 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003386 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003387 *
3388 * The entry's receive function will be scheduled to run
3389 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08003390void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003391{
3392 unsigned long flags;
3393
3394 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003395 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003396 local_irq_restore(flags);
3397}
3398EXPORT_SYMBOL(__napi_schedule);
3399
Herbert Xud565b0a2008-12-15 23:38:52 -08003400void __napi_complete(struct napi_struct *n)
3401{
3402 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3403 BUG_ON(n->gro_list);
3404
3405 list_del(&n->poll_list);
3406 smp_mb__before_clear_bit();
3407 clear_bit(NAPI_STATE_SCHED, &n->state);
3408}
3409EXPORT_SYMBOL(__napi_complete);
3410
3411void napi_complete(struct napi_struct *n)
3412{
3413 unsigned long flags;
3414
3415 /*
3416	 * Don't let napi dequeue from the cpu poll list,
3417	 * just in case it's running on a different cpu.
3418 */
3419 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3420 return;
3421
3422 napi_gro_flush(n);
3423 local_irq_save(flags);
3424 __napi_complete(n);
3425 local_irq_restore(flags);
3426}
3427EXPORT_SYMBOL(napi_complete);
3428
3429void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3430 int (*poll)(struct napi_struct *, int), int weight)
3431{
3432 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00003433 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003434 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08003435 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003436 napi->poll = poll;
3437 napi->weight = weight;
3438 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08003439 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08003440#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08003441 spin_lock_init(&napi->poll_lock);
3442 napi->poll_owner = -1;
3443#endif
3444 set_bit(NAPI_STATE_SCHED, &napi->state);
3445}
3446EXPORT_SYMBOL(netif_napi_add);
3447
3448void netif_napi_del(struct napi_struct *napi)
3449{
3450 struct sk_buff *skb, *next;
3451
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08003452 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07003453 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08003454
3455 for (skb = napi->gro_list; skb; skb = next) {
3456 next = skb->next;
3457 skb->next = NULL;
3458 kfree_skb(skb);
3459 }
3460
3461 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00003462 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003463}
3464EXPORT_SYMBOL(netif_napi_del);
3465
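/*
 * net_rx_action is the NET_RX_SOFTIRQ handler: it polls the NAPI instances
 * queued on this cpu's poll_list within a global budget and a two-jiffy
 * time limit, and re-raises the softirq if work remains.
 */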
Linus Torvalds1da177e2005-04-16 15:20:36 -07003466static void net_rx_action(struct softirq_action *h)
3467{
Eric Dumazete326bed2010-04-22 00:22:45 -07003468 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003469 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07003470 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07003471 void *have;
3472
Linus Torvalds1da177e2005-04-16 15:20:36 -07003473 local_irq_disable();
3474
Eric Dumazete326bed2010-04-22 00:22:45 -07003475 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003476 struct napi_struct *n;
3477 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003478
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003479		/* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003480		 * Allow this to run for 2 jiffies, which allows
3481		 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003482 */
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003483 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003484 goto softnet_break;
3485
3486 local_irq_enable();
3487
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003488 /* Even though interrupts have been re-enabled, this
3489 * access is safe because interrupts can only add new
3490 * entries to the tail of this list, and only ->poll()
3491 * calls can remove this head entry from the list.
3492 */
Eric Dumazete326bed2010-04-22 00:22:45 -07003493 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003494
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003495 have = netpoll_poll_lock(n);
3496
3497 weight = n->weight;
3498
David S. Miller0a7606c2007-10-29 21:28:47 -07003499 /* This NAPI_STATE_SCHED test is for avoiding a race
3500 * with netpoll's poll_napi(). Only the entity which
3501 * obtains the lock and sees NAPI_STATE_SCHED set will
3502 * actually make the ->poll() call. Therefore we avoid
3503		 * accidentally calling ->poll() when NAPI is not scheduled.
3504 */
3505 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00003506 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07003507 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00003508 trace_napi_poll(n);
3509 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003510
3511 WARN_ON_ONCE(work > weight);
3512
3513 budget -= work;
3514
3515 local_irq_disable();
3516
3517 /* Drivers must not modify the NAPI state if they
3518 * consume the entire weight. In such cases this code
3519 * still "owns" the NAPI instance and therefore can
3520 * move the instance around on the list at-will.
3521 */
David S. Millerfed17f32008-01-07 21:00:40 -08003522 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07003523 if (unlikely(napi_disable_pending(n))) {
3524 local_irq_enable();
3525 napi_complete(n);
3526 local_irq_disable();
3527 } else
Eric Dumazete326bed2010-04-22 00:22:45 -07003528 list_move_tail(&n->poll_list, &sd->poll_list);
David S. Millerfed17f32008-01-07 21:00:40 -08003529 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003530
3531 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003532 }
3533out:
Eric Dumazete326bed2010-04-22 00:22:45 -07003534 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003535
Chris Leechdb217332006-06-17 21:24:58 -07003536#ifdef CONFIG_NET_DMA
3537 /*
3538 * There may not be any more sk_buffs coming right now, so push
3539 * any pending DMA copies to hardware
3540 */
Dan Williams2ba05622009-01-06 11:38:14 -07003541 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07003542#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003543
Linus Torvalds1da177e2005-04-16 15:20:36 -07003544 return;
3545
3546softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00003547 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003548 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3549 goto out;
3550}
3551
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003552static gifconf_func_t *gifconf_list[NPROTO];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003553
3554/**
3555 * register_gifconf - register a SIOCGIF handler
3556 * @family: Address family
3557 * @gifconf: Function handler
3558 *
3559 * Register protocol dependent address dumping routines. The handler
3560 * that is passed must not be freed or reused until it has been replaced
3561 * by another handler.
3562 */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003563int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003564{
3565 if (family >= NPROTO)
3566 return -EINVAL;
3567 gifconf_list[family] = gifconf;
3568 return 0;
3569}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003570EXPORT_SYMBOL(register_gifconf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003571
3572
3573/*
3574 * Map an interface index to its name (SIOCGIFNAME)
3575 */
3576
3577/*
3578 * We need this ioctl for efficient implementation of the
3579 * if_indextoname() function required by the IPv6 API. Without
3580 * it, we would have to search all the interfaces to find a
3581 * match. --pb
3582 */
3583
Eric W. Biederman881d9662007-09-17 11:56:21 -07003584static int dev_ifname(struct net *net, struct ifreq __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003585{
3586 struct net_device *dev;
3587 struct ifreq ifr;
3588
3589 /*
3590 * Fetch the caller's info block.
3591 */
3592
3593 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3594 return -EFAULT;
3595
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003596 rcu_read_lock();
3597 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003598 if (!dev) {
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003599 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003600 return -ENODEV;
3601 }
3602
3603 strcpy(ifr.ifr_name, dev->name);
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003604 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003605
3606 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3607 return -EFAULT;
3608 return 0;
3609}
3610
3611/*
3612 * Perform a SIOCGIFCONF call. This structure will change
3613 * size eventually, and there is nothing I can do about it.
3614 * Thus we will need a 'compatibility mode'.
3615 */
3616
Eric W. Biederman881d9662007-09-17 11:56:21 -07003617static int dev_ifconf(struct net *net, char __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003618{
3619 struct ifconf ifc;
3620 struct net_device *dev;
3621 char __user *pos;
3622 int len;
3623 int total;
3624 int i;
3625
3626 /*
3627 * Fetch the caller's info block.
3628 */
3629
3630 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3631 return -EFAULT;
3632
3633 pos = ifc.ifc_buf;
3634 len = ifc.ifc_len;
3635
3636 /*
3637 * Loop over the interfaces, and write an info block for each.
3638 */
3639
3640 total = 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003641 for_each_netdev(net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003642 for (i = 0; i < NPROTO; i++) {
3643 if (gifconf_list[i]) {
3644 int done;
3645 if (!pos)
3646 done = gifconf_list[i](dev, NULL, 0);
3647 else
3648 done = gifconf_list[i](dev, pos + total,
3649 len - total);
3650 if (done < 0)
3651 return -EFAULT;
3652 total += done;
3653 }
3654 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003655 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003656
3657 /*
3658 * All done. Write the updated control block back to the caller.
3659 */
3660 ifc.ifc_len = total;
3661
3662 /*
3663 * Both BSD and Solaris return 0 here, so we do too.
3664 */
3665 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3666}
3667
3668#ifdef CONFIG_PROC_FS
3669/*
3670 * This is invoked by the /proc filesystem handler to display a device
3671 * in detail.
3672 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003673void *dev_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003674 __acquires(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003675{
Denis V. Luneve372c412007-11-19 22:31:54 -08003676 struct net *net = seq_file_net(seq);
Pavel Emelianov7562f872007-05-03 15:13:45 -07003677 loff_t off;
3678 struct net_device *dev;
3679
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003680 rcu_read_lock();
Pavel Emelianov7562f872007-05-03 15:13:45 -07003681 if (!*pos)
3682 return SEQ_START_TOKEN;
3683
3684 off = 1;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003685 for_each_netdev_rcu(net, dev)
Pavel Emelianov7562f872007-05-03 15:13:45 -07003686 if (off++ == *pos)
3687 return dev;
3688
3689 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003690}
3691
3692void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3693{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003694 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3695 first_net_device(seq_file_net(seq)) :
3696 next_net_device((struct net_device *)v);
3697
Linus Torvalds1da177e2005-04-16 15:20:36 -07003698 ++*pos;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003699 return rcu_dereference(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003700}
3701
3702void dev_seq_stop(struct seq_file *seq, void *v)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003703 __releases(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003704{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003705 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003706}
3707
3708static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3709{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08003710 const struct net_device_stats *stats = dev_get_stats(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003711
Jesper Dangaard Brouer2d13baf2010-01-05 05:50:52 +00003712 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
Rusty Russell5a1b5892007-04-28 21:04:03 -07003713 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3714 dev->name, stats->rx_bytes, stats->rx_packets,
3715 stats->rx_errors,
3716 stats->rx_dropped + stats->rx_missed_errors,
3717 stats->rx_fifo_errors,
3718 stats->rx_length_errors + stats->rx_over_errors +
3719 stats->rx_crc_errors + stats->rx_frame_errors,
3720 stats->rx_compressed, stats->multicast,
3721 stats->tx_bytes, stats->tx_packets,
3722 stats->tx_errors, stats->tx_dropped,
3723 stats->tx_fifo_errors, stats->collisions,
3724 stats->tx_carrier_errors +
3725 stats->tx_aborted_errors +
3726 stats->tx_window_errors +
3727 stats->tx_heartbeat_errors,
3728 stats->tx_compressed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003729}
3730
3731/*
3732 * Called from the PROCfs module. This now uses the new arbitrary sized
3733 * /proc/net interface to create /proc/net/dev
3734 */
3735static int dev_seq_show(struct seq_file *seq, void *v)
3736{
3737 if (v == SEQ_START_TOKEN)
3738 seq_puts(seq, "Inter-| Receive "
3739 " | Transmit\n"
3740 " face |bytes packets errs drop fifo frame "
3741 "compressed multicast|bytes packets errs "
3742 "drop fifo colls carrier compressed\n");
3743 else
3744 dev_seq_printf_stats(seq, v);
3745 return 0;
3746}
3747
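/*
 * Return the softnet_data of the next online cpu at or after *pos,
 * advancing *pos past offline cpus; used by the softnet_stat seq_file.
 */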
Changli Gaodee42872010-05-02 05:42:16 +00003748static struct softnet_data *softnet_get_online(loff_t *pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003749{
Changli Gaodee42872010-05-02 05:42:16 +00003750 struct softnet_data *sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003751
Mike Travis0c0b0ac2008-05-02 16:43:08 -07003752 while (*pos < nr_cpu_ids)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003753 if (cpu_online(*pos)) {
Changli Gaodee42872010-05-02 05:42:16 +00003754 sd = &per_cpu(softnet_data, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003755 break;
3756 } else
3757 ++*pos;
Changli Gaodee42872010-05-02 05:42:16 +00003758 return sd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003759}
3760
3761static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3762{
3763 return softnet_get_online(pos);
3764}
3765
3766static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3767{
3768 ++*pos;
3769 return softnet_get_online(pos);
3770}
3771
3772static void softnet_seq_stop(struct seq_file *seq, void *v)
3773{
3774}
3775
3776static int softnet_seq_show(struct seq_file *seq, void *v)
3777{
Changli Gaodee42872010-05-02 05:42:16 +00003778 struct softnet_data *sd = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003779
Tom Herbert0a9627f2010-03-16 08:03:29 +00003780 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
Changli Gaodee42872010-05-02 05:42:16 +00003781 sd->processed, sd->dropped, sd->time_squeeze, 0,
Stephen Hemmingerc1ebcdb2005-06-23 20:08:59 -07003782 0, 0, 0, 0, /* was fastroute */
Changli Gaodee42872010-05-02 05:42:16 +00003783 sd->cpu_collision, sd->received_rps);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003784 return 0;
3785}
3786
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003787static const struct seq_operations dev_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003788 .start = dev_seq_start,
3789 .next = dev_seq_next,
3790 .stop = dev_seq_stop,
3791 .show = dev_seq_show,
3792};
3793
3794static int dev_seq_open(struct inode *inode, struct file *file)
3795{
Denis V. Luneve372c412007-11-19 22:31:54 -08003796 return seq_open_net(inode, file, &dev_seq_ops,
3797 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003798}
3799
Arjan van de Ven9a321442007-02-12 00:55:35 -08003800static const struct file_operations dev_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003801 .owner = THIS_MODULE,
3802 .open = dev_seq_open,
3803 .read = seq_read,
3804 .llseek = seq_lseek,
Denis V. Luneve372c412007-11-19 22:31:54 -08003805 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003806};
3807
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003808static const struct seq_operations softnet_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003809 .start = softnet_seq_start,
3810 .next = softnet_seq_next,
3811 .stop = softnet_seq_stop,
3812 .show = softnet_seq_show,
3813};
3814
3815static int softnet_seq_open(struct inode *inode, struct file *file)
3816{
3817 return seq_open(file, &softnet_seq_ops);
3818}
3819
Arjan van de Ven9a321442007-02-12 00:55:35 -08003820static const struct file_operations softnet_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003821 .owner = THIS_MODULE,
3822 .open = softnet_seq_open,
3823 .read = seq_read,
3824 .llseek = seq_lseek,
3825 .release = seq_release,
3826};
3827
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003828static void *ptype_get_idx(loff_t pos)
3829{
3830 struct packet_type *pt = NULL;
3831 loff_t i = 0;
3832 int t;
3833
3834 list_for_each_entry_rcu(pt, &ptype_all, list) {
3835 if (i == pos)
3836 return pt;
3837 ++i;
3838 }
3839
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003840 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003841 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3842 if (i == pos)
3843 return pt;
3844 ++i;
3845 }
3846 }
3847 return NULL;
3848}
3849
3850static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003851 __acquires(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003852{
3853 rcu_read_lock();
3854 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3855}
3856
3857static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3858{
3859 struct packet_type *pt;
3860 struct list_head *nxt;
3861 int hash;
3862
3863 ++*pos;
3864 if (v == SEQ_START_TOKEN)
3865 return ptype_get_idx(0);
3866
3867 pt = v;
3868 nxt = pt->list.next;
3869 if (pt->type == htons(ETH_P_ALL)) {
3870 if (nxt != &ptype_all)
3871 goto found;
3872 hash = 0;
3873 nxt = ptype_base[0].next;
3874 } else
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003875 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003876
3877 while (nxt == &ptype_base[hash]) {
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003878 if (++hash >= PTYPE_HASH_SIZE)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003879 return NULL;
3880 nxt = ptype_base[hash].next;
3881 }
3882found:
3883 return list_entry(nxt, struct packet_type, list);
3884}
3885
3886static void ptype_seq_stop(struct seq_file *seq, void *v)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003887 __releases(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003888{
3889 rcu_read_unlock();
3890}
3891
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003892static int ptype_seq_show(struct seq_file *seq, void *v)
3893{
3894 struct packet_type *pt = v;
3895
3896 if (v == SEQ_START_TOKEN)
3897 seq_puts(seq, "Type Device Function\n");
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09003898 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003899 if (pt->type == htons(ETH_P_ALL))
3900 seq_puts(seq, "ALL ");
3901 else
3902 seq_printf(seq, "%04x", ntohs(pt->type));
3903
Alexey Dobriyan908cd2d2008-11-16 19:50:35 -08003904 seq_printf(seq, " %-8s %pF\n",
3905 pt->dev ? pt->dev->name : "", pt->func);
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003906 }
3907
3908 return 0;
3909}
3910
3911static const struct seq_operations ptype_seq_ops = {
3912 .start = ptype_seq_start,
3913 .next = ptype_seq_next,
3914 .stop = ptype_seq_stop,
3915 .show = ptype_seq_show,
3916};
3917
3918static int ptype_seq_open(struct inode *inode, struct file *file)
3919{
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003920 return seq_open_net(inode, file, &ptype_seq_ops,
3921 sizeof(struct seq_net_private));
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003922}
3923
3924static const struct file_operations ptype_seq_fops = {
3925 .owner = THIS_MODULE,
3926 .open = ptype_seq_open,
3927 .read = seq_read,
3928 .llseek = seq_lseek,
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003929 .release = seq_release_net,
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003930};
3931
3932
Pavel Emelyanov46650792007-10-08 20:38:39 -07003933static int __net_init dev_proc_net_init(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003934{
3935 int rc = -ENOMEM;
3936
Eric W. Biederman881d9662007-09-17 11:56:21 -07003937 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003938 goto out;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003939 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003940 goto out_dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003941 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003942 goto out_softnet;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003943
Eric W. Biederman881d9662007-09-17 11:56:21 -07003944 if (wext_proc_init(net))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003945 goto out_ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003946 rc = 0;
3947out:
3948 return rc;
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003949out_ptype:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003950 proc_net_remove(net, "ptype");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003951out_softnet:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003952 proc_net_remove(net, "softnet_stat");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003953out_dev:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003954 proc_net_remove(net, "dev");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003955 goto out;
3956}
Eric W. Biederman881d9662007-09-17 11:56:21 -07003957
Pavel Emelyanov46650792007-10-08 20:38:39 -07003958static void __net_exit dev_proc_net_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07003959{
3960 wext_proc_exit(net);
3961
3962 proc_net_remove(net, "ptype");
3963 proc_net_remove(net, "softnet_stat");
3964 proc_net_remove(net, "dev");
3965}
3966
Denis V. Lunev022cbae2007-11-13 03:23:50 -08003967static struct pernet_operations __net_initdata dev_proc_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07003968 .init = dev_proc_net_init,
3969 .exit = dev_proc_net_exit,
3970};
3971
3972static int __init dev_proc_init(void)
3973{
3974 return register_pernet_subsys(&dev_proc_ops);
3975}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003976#else
3977#define dev_proc_init() 0
3978#endif /* CONFIG_PROC_FS */
3979
3980
3981/**
3982 * netdev_set_master - set up master/slave pair
3983 * @slave: slave device
3984 * @master: new master device
3985 *
3986 * Changes the master device of the slave. Pass %NULL to break the
3987 * bonding. The caller must hold the RTNL semaphore. On a failure
3988 * a negative errno code is returned. On success the reference counts
3989 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3990 * function returns zero.
3991 */
3992int netdev_set_master(struct net_device *slave, struct net_device *master)
3993{
3994 struct net_device *old = slave->master;
3995
3996 ASSERT_RTNL();
3997
3998 if (master) {
3999 if (old)
4000 return -EBUSY;
4001 dev_hold(master);
4002 }
4003
4004 slave->master = master;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004005
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004006 if (old) {
4007 synchronize_net();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004008 dev_put(old);
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004009 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004010 if (master)
4011 slave->flags |= IFF_SLAVE;
4012 else
4013 slave->flags &= ~IFF_SLAVE;
4014
4015 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4016 return 0;
4017}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004018EXPORT_SYMBOL(netdev_set_master);
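/*
 * Usage sketch (illustrative only, names hypothetical): how a bonding-style
 * driver might drive netdev_set_master() above.  The only assumptions taken
 * from this file are that the RTNL semaphore is held around the call and
 * that the return value is checked.
 */
static int example_enslave(struct net_device *bond_dev,
			   struct net_device *slave_dev)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave_dev, bond_dev);	/* pair slave with master */
	rtnl_unlock();
	return err;
}

static void example_release_slave(struct net_device *slave_dev)
{
	rtnl_lock();
	netdev_set_master(slave_dev, NULL);		/* break the pairing */
	rtnl_unlock();
}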
Linus Torvalds1da177e2005-04-16 15:20:36 -07004019
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004020static void dev_change_rx_flags(struct net_device *dev, int flags)
4021{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004022 const struct net_device_ops *ops = dev->netdev_ops;
4023
4024 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4025 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004026}
4027
Wang Chendad9b332008-06-18 01:48:28 -07004028static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004029{
4030 unsigned short old_flags = dev->flags;
David Howells8192b0c2008-11-14 10:39:10 +11004031 uid_t uid;
4032 gid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004033
Patrick McHardy24023452007-07-14 18:51:31 -07004034 ASSERT_RTNL();
4035
Wang Chendad9b332008-06-18 01:48:28 -07004036 dev->flags |= IFF_PROMISC;
4037 dev->promiscuity += inc;
4038 if (dev->promiscuity == 0) {
4039 /*
4040 * Avoid overflow.
4041 * If inc causes overflow, untouch promisc and return error.
4042 */
4043 if (inc < 0)
4044 dev->flags &= ~IFF_PROMISC;
4045 else {
4046 dev->promiscuity -= inc;
4047 printk(KERN_WARNING "%s: promiscuity touches roof, "
4048 "set promiscuity failed, promiscuity feature "
4049 "of device might be broken.\n", dev->name);
4050 return -EOVERFLOW;
4051 }
4052 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004053 if (dev->flags != old_flags) {
4054 printk(KERN_INFO "device %s %s promiscuous mode\n",
4055 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4056 "left");
David Howells8192b0c2008-11-14 10:39:10 +11004057 if (audit_enabled) {
4058 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004059 audit_log(current->audit_context, GFP_ATOMIC,
4060 AUDIT_ANOM_PROMISCUOUS,
4061 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4062 dev->name, (dev->flags & IFF_PROMISC),
4063 (old_flags & IFF_PROMISC),
4064 audit_get_loginuid(current),
David Howells8192b0c2008-11-14 10:39:10 +11004065 uid, gid,
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004066 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004067 }
Patrick McHardy24023452007-07-14 18:51:31 -07004068
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004069 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004070 }
Wang Chendad9b332008-06-18 01:48:28 -07004071 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004072}
4073
Linus Torvalds1da177e2005-04-16 15:20:36 -07004074/**
4075 * dev_set_promiscuity - update promiscuity count on a device
4076 * @dev: device
4077 * @inc: modifier
4078 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004079 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004080 * remains above zero the interface remains promiscuous. Once it hits zero
4081 * the device reverts back to normal filtering operation. A negative inc
4082 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004083 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004084 */
Wang Chendad9b332008-06-18 01:48:28 -07004085int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004086{
4087 unsigned short old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004088 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004089
Wang Chendad9b332008-06-18 01:48:28 -07004090 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004091 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004092 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004093 if (dev->flags != old_flags)
4094 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004095 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004096}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004097EXPORT_SYMBOL(dev_set_promiscuity);
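/*
 * Usage sketch (illustrative only, helper names hypothetical): a packet
 * capture style caller of dev_set_promiscuity().  Each +1 passed here must
 * eventually be balanced by a -1, per the counter semantics described above;
 * RTNL is held because __dev_set_promiscuity() asserts it.
 */
static int example_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);	/* raise the promiscuity count */
	rtnl_unlock();
	return err;
}

static void example_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* and drop it again */
	rtnl_unlock();
}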
Linus Torvalds1da177e2005-04-16 15:20:36 -07004098
4099/**
4100 * dev_set_allmulti - update allmulti count on a device
4101 * @dev: device
4102 * @inc: modifier
4103 *
4104 * Add or remove reception of all multicast frames to a device. While the
4105 * count in the device remains above zero the interface remains listening
4106 * to all multicast frames. Once it hits zero the device reverts back to normal
4107 * filtering operation. A negative @inc value is used to drop the counter
4108 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004109 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004110 */
4111
Wang Chendad9b332008-06-18 01:48:28 -07004112int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004113{
4114 unsigned short old_flags = dev->flags;
4115
Patrick McHardy24023452007-07-14 18:51:31 -07004116 ASSERT_RTNL();
4117
Linus Torvalds1da177e2005-04-16 15:20:36 -07004118 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004119 dev->allmulti += inc;
4120 if (dev->allmulti == 0) {
4121 /*
4122 * Avoid overflow.
4123 * If inc causes overflow, untouch allmulti and return error.
4124 */
4125 if (inc < 0)
4126 dev->flags &= ~IFF_ALLMULTI;
4127 else {
4128 dev->allmulti -= inc;
4129 printk(KERN_WARNING "%s: allmulti touches roof, "
4130 "set allmulti failed, allmulti feature of "
4131 "device might be broken.\n", dev->name);
4132 return -EOVERFLOW;
4133 }
4134 }
Patrick McHardy24023452007-07-14 18:51:31 -07004135 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004136 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004137 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004138 }
Wang Chendad9b332008-06-18 01:48:28 -07004139 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004140}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004141EXPORT_SYMBOL(dev_set_allmulti);
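/*
 * Usage sketch (illustrative only, helper name hypothetical): a multicast
 * routing style caller of dev_set_allmulti(), mirroring the promiscuity
 * example above; the +1/-1 pairs must balance here as well.
 */
static void example_allmulti_toggle(struct net_device *dev, bool on)
{
	rtnl_lock();
	dev_set_allmulti(dev, on ? 1 : -1);
	rtnl_unlock();
}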
Patrick McHardy4417da62007-06-27 01:28:10 -07004142
4143/*
4144 * Upload unicast and multicast address lists to device and
4145 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004146 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004147 * are present.
4148 */
4149void __dev_set_rx_mode(struct net_device *dev)
4150{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004151 const struct net_device_ops *ops = dev->netdev_ops;
4152
Patrick McHardy4417da62007-06-27 01:28:10 -07004153 /* dev_open will call this function so the list will stay sane. */
4154 if (!(dev->flags&IFF_UP))
4155 return;
4156
4157 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004158 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004159
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004160 if (ops->ndo_set_rx_mode)
4161 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004162 else {
4163 /* Unicast addresses changes may only happen under the rtnl,
4164 * therefore calling __dev_set_promiscuity here is safe.
4165 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004166 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004167 __dev_set_promiscuity(dev, 1);
4168 dev->uc_promisc = 1;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004169 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004170 __dev_set_promiscuity(dev, -1);
4171 dev->uc_promisc = 0;
4172 }
4173
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004174 if (ops->ndo_set_multicast_list)
4175 ops->ndo_set_multicast_list(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004176 }
4177}
4178
4179void dev_set_rx_mode(struct net_device *dev)
4180{
David S. Millerb9e40852008-07-15 00:15:08 -07004181 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004182 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004183 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004184}
4185
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004186/**
4187 * dev_get_flags - get flags reported to userspace
4188 * @dev: device
4189 *
4190 * Get the combination of flag bits exported through APIs to userspace.
4191 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004192unsigned dev_get_flags(const struct net_device *dev)
4193{
4194 unsigned flags;
4195
4196 flags = (dev->flags & ~(IFF_PROMISC |
4197 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004198 IFF_RUNNING |
4199 IFF_LOWER_UP |
4200 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004201 (dev->gflags & (IFF_PROMISC |
4202 IFF_ALLMULTI));
4203
Stefan Rompfb00055a2006-03-20 17:09:11 -08004204 if (netif_running(dev)) {
4205 if (netif_oper_up(dev))
4206 flags |= IFF_RUNNING;
4207 if (netif_carrier_ok(dev))
4208 flags |= IFF_LOWER_UP;
4209 if (netif_dormant(dev))
4210 flags |= IFF_DORMANT;
4211 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004212
4213 return flags;
4214}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004215EXPORT_SYMBOL(dev_get_flags);
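/*
 * Usage sketch (illustrative only, helper name hypothetical): a debug
 * printout of the userspace-visible flag word assembled by dev_get_flags()
 * above, including the volatile IFF_RUNNING/IFF_LOWER_UP bits.
 */
static void example_print_flags(const struct net_device *dev)
{
	unsigned flags = dev_get_flags(dev);

	printk(KERN_DEBUG "%s: flags %#x%s%s\n", dev->name, flags,
	       (flags & IFF_RUNNING) ? " running" : "",
	       (flags & IFF_LOWER_UP) ? " lower-up" : "");
}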
Linus Torvalds1da177e2005-04-16 15:20:36 -07004216
Patrick McHardybd380812010-02-26 06:34:53 +00004217int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004218{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004219 int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004220 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004221
Patrick McHardy24023452007-07-14 18:51:31 -07004222 ASSERT_RTNL();
4223
Linus Torvalds1da177e2005-04-16 15:20:36 -07004224 /*
4225 * Set the flags on our device.
4226 */
4227
4228 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4229 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4230 IFF_AUTOMEDIA)) |
4231 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4232 IFF_ALLMULTI));
4233
4234 /*
4235 * Load in the correct multicast list now the flags have changed.
4236 */
4237
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004238 if ((old_flags ^ flags) & IFF_MULTICAST)
4239 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004240
Patrick McHardy4417da62007-06-27 01:28:10 -07004241 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004242
4243 /*
4244	 * Have we downed the interface? We handle IFF_UP ourselves
4245 * according to user attempts to set it, rather than blindly
4246 * setting it.
4247 */
4248
4249 ret = 0;
4250 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004251 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004252
4253 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004254 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004255 }
4256
Linus Torvalds1da177e2005-04-16 15:20:36 -07004257 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004258 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4259
Linus Torvalds1da177e2005-04-16 15:20:36 -07004260 dev->gflags ^= IFF_PROMISC;
4261 dev_set_promiscuity(dev, inc);
4262 }
4263
4264 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4265	   is important. Some (broken) drivers set IFF_PROMISC when
4266	   IFF_ALLMULTI is requested, without asking us and without reporting it.
4267 */
4268 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004269 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4270
Linus Torvalds1da177e2005-04-16 15:20:36 -07004271 dev->gflags ^= IFF_ALLMULTI;
4272 dev_set_allmulti(dev, inc);
4273 }
4274
Patrick McHardybd380812010-02-26 06:34:53 +00004275 return ret;
4276}
4277
4278void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4279{
4280 unsigned int changes = dev->flags ^ old_flags;
4281
4282 if (changes & IFF_UP) {
4283 if (dev->flags & IFF_UP)
4284 call_netdevice_notifiers(NETDEV_UP, dev);
4285 else
4286 call_netdevice_notifiers(NETDEV_DOWN, dev);
4287 }
4288
4289 if (dev->flags & IFF_UP &&
4290 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4291 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4292}
4293
4294/**
4295 * dev_change_flags - change device settings
4296 * @dev: device
4297 * @flags: device state flags
4298 *
4299 * Change settings on device based state flags. The flags are
4300 * in the userspace exported format.
4301 */
4302int dev_change_flags(struct net_device *dev, unsigned flags)
4303{
4304 int ret, changes;
4305 int old_flags = dev->flags;
4306
4307 ret = __dev_change_flags(dev, flags);
4308 if (ret < 0)
4309 return ret;
4310
4311 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004312 if (changes)
4313 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004314
Patrick McHardybd380812010-02-26 06:34:53 +00004315 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004316 return ret;
4317}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004318EXPORT_SYMBOL(dev_change_flags);
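/*
 * Usage sketch (illustrative only, helper name hypothetical): bringing an
 * interface administratively up through dev_change_flags(), which is the
 * same path the SIOCSIFFLAGS ioctl below takes; RTNL must be held.
 */
static int example_set_iff_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;
}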
Linus Torvalds1da177e2005-04-16 15:20:36 -07004319
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004320/**
4321 * dev_set_mtu - Change maximum transfer unit
4322 * @dev: device
4323 * @new_mtu: new transfer unit
4324 *
4325 * Change the maximum transfer size of the network device.
4326 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004327int dev_set_mtu(struct net_device *dev, int new_mtu)
4328{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004329 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004330 int err;
4331
4332 if (new_mtu == dev->mtu)
4333 return 0;
4334
4335 /* MTU must be positive. */
4336 if (new_mtu < 0)
4337 return -EINVAL;
4338
4339 if (!netif_device_present(dev))
4340 return -ENODEV;
4341
4342 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004343 if (ops->ndo_change_mtu)
4344 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004345 else
4346 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004347
Linus Torvalds1da177e2005-04-16 15:20:36 -07004348 if (!err && dev->flags & IFF_UP)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004349 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004350 return err;
4351}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004352EXPORT_SYMBOL(dev_set_mtu);
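/*
 * Usage sketch (illustrative only, helper name hypothetical): a tunnel-style
 * caller reducing a device MTU by its encapsulation overhead via
 * dev_set_mtu().  RTNL is taken here because ndo_change_mtu implementations
 * and the NETDEV_CHANGEMTU notifier chain expect it to be held.
 */
static int example_shrink_mtu(struct net_device *dev, int overhead)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, dev->mtu - overhead);
	rtnl_unlock();
	return err;
}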
Linus Torvalds1da177e2005-04-16 15:20:36 -07004353
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004354/**
4355 * dev_set_mac_address - Change Media Access Control Address
4356 * @dev: device
4357 * @sa: new address
4358 *
4359 * Change the hardware (MAC) address of the device
4360 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004361int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4362{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004363 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004364 int err;
4365
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004366 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004367 return -EOPNOTSUPP;
4368 if (sa->sa_family != dev->type)
4369 return -EINVAL;
4370 if (!netif_device_present(dev))
4371 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004372 err = ops->ndo_set_mac_address(dev, sa);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004373 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004374 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004375 return err;
4376}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004377EXPORT_SYMBOL(dev_set_mac_address);
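/*
 * Usage sketch (illustrative only, helper name hypothetical): setting a new
 * hardware address with dev_set_mac_address().  The sockaddr is filled in
 * the same layout SIOCSIFHWADDR passes down below: sa_family must match
 * dev->type and sa_data carries addr_len bytes.
 */
static int example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}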
Linus Torvalds1da177e2005-04-16 15:20:36 -07004378
4379/*
Eric Dumazet3710bec2009-11-01 19:42:09 +00004380 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07004381 */
Jeff Garzik14e3e072007-10-08 00:06:32 -07004382static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004383{
4384 int err;
Eric Dumazet3710bec2009-11-01 19:42:09 +00004385 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004386
4387 if (!dev)
4388 return -ENODEV;
4389
4390 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004391 case SIOCGIFFLAGS: /* Get interface flags */
4392 ifr->ifr_flags = (short) dev_get_flags(dev);
4393 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004394
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004395 case SIOCGIFMETRIC: /* Get the metric on the interface
4396 (currently unused) */
4397 ifr->ifr_metric = 0;
4398 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004399
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004400 case SIOCGIFMTU: /* Get the MTU of a device */
4401 ifr->ifr_mtu = dev->mtu;
4402 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004403
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004404 case SIOCGIFHWADDR:
4405 if (!dev->addr_len)
4406 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4407 else
4408 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4409 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4410 ifr->ifr_hwaddr.sa_family = dev->type;
4411 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004412
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004413 case SIOCGIFSLAVE:
4414 err = -EINVAL;
4415 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004416
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004417 case SIOCGIFMAP:
4418 ifr->ifr_map.mem_start = dev->mem_start;
4419 ifr->ifr_map.mem_end = dev->mem_end;
4420 ifr->ifr_map.base_addr = dev->base_addr;
4421 ifr->ifr_map.irq = dev->irq;
4422 ifr->ifr_map.dma = dev->dma;
4423 ifr->ifr_map.port = dev->if_port;
4424 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004425
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004426 case SIOCGIFINDEX:
4427 ifr->ifr_ifindex = dev->ifindex;
4428 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004429
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004430 case SIOCGIFTXQLEN:
4431 ifr->ifr_qlen = dev->tx_queue_len;
4432 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004433
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004434 default:
4435 /* dev_ioctl() should ensure this case
4436 * is never reached
4437 */
4438 WARN_ON(1);
4439 err = -EINVAL;
4440 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004441
4442 }
4443 return err;
4444}
4445
4446/*
4447 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4448 */
4449static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4450{
4451 int err;
4452 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004453 const struct net_device_ops *ops;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004454
4455 if (!dev)
4456 return -ENODEV;
4457
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004458 ops = dev->netdev_ops;
4459
Jeff Garzik14e3e072007-10-08 00:06:32 -07004460 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004461 case SIOCSIFFLAGS: /* Set interface flags */
4462 return dev_change_flags(dev, ifr->ifr_flags);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004463
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004464 case SIOCSIFMETRIC: /* Set the metric on the interface
4465 (currently unused) */
4466 return -EOPNOTSUPP;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004467
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004468 case SIOCSIFMTU: /* Set the MTU of a device */
4469 return dev_set_mtu(dev, ifr->ifr_mtu);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004470
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004471 case SIOCSIFHWADDR:
4472 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004473
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004474 case SIOCSIFHWBROADCAST:
4475 if (ifr->ifr_hwaddr.sa_family != dev->type)
4476 return -EINVAL;
4477 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4478 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4479 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4480 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004481
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004482 case SIOCSIFMAP:
4483 if (ops->ndo_set_config) {
4484 if (!netif_device_present(dev))
4485 return -ENODEV;
4486 return ops->ndo_set_config(dev, &ifr->ifr_map);
4487 }
4488 return -EOPNOTSUPP;
4489
4490 case SIOCADDMULTI:
4491 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4492 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4493 return -EINVAL;
4494 if (!netif_device_present(dev))
4495 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004496 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004497
4498 case SIOCDELMULTI:
4499 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4500 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4501 return -EINVAL;
4502 if (!netif_device_present(dev))
4503 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004504 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004505
4506 case SIOCSIFTXQLEN:
4507 if (ifr->ifr_qlen < 0)
4508 return -EINVAL;
4509 dev->tx_queue_len = ifr->ifr_qlen;
4510 return 0;
4511
4512 case SIOCSIFNAME:
4513 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4514 return dev_change_name(dev, ifr->ifr_newname);
4515
4516 /*
4517 * Unknown or private ioctl
4518 */
4519 default:
4520 if ((cmd >= SIOCDEVPRIVATE &&
4521 cmd <= SIOCDEVPRIVATE + 15) ||
4522 cmd == SIOCBONDENSLAVE ||
4523 cmd == SIOCBONDRELEASE ||
4524 cmd == SIOCBONDSETHWADDR ||
4525 cmd == SIOCBONDSLAVEINFOQUERY ||
4526 cmd == SIOCBONDINFOQUERY ||
4527 cmd == SIOCBONDCHANGEACTIVE ||
4528 cmd == SIOCGMIIPHY ||
4529 cmd == SIOCGMIIREG ||
4530 cmd == SIOCSMIIREG ||
4531 cmd == SIOCBRADDIF ||
4532 cmd == SIOCBRDELIF ||
4533 cmd == SIOCSHWTSTAMP ||
4534 cmd == SIOCWANDEV) {
4535 err = -EOPNOTSUPP;
4536 if (ops->ndo_do_ioctl) {
4537 if (netif_device_present(dev))
4538 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4539 else
4540 err = -ENODEV;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004541 }
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004542 } else
4543 err = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004544
4545 }
4546 return err;
4547}
4548
4549/*
4550 * This function handles all "interface"-type I/O control requests. The actual
4551 * 'doing' part of this is dev_ifsioc above.
4552 */
4553
4554/**
4555 * dev_ioctl - network device ioctl
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004556 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004557 * @cmd: command to issue
4558 * @arg: pointer to a struct ifreq in user space
4559 *
4560 * Issue ioctl functions to devices. This is normally called by the
4561 * user space syscall interfaces but can sometimes be useful for
4562 * other purposes. The return value is the return from the syscall if
4563 * positive or a negative errno code on error.
4564 */
4565
Eric W. Biederman881d9662007-09-17 11:56:21 -07004566int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004567{
4568 struct ifreq ifr;
4569 int ret;
4570 char *colon;
4571
4572 /* One special case: SIOCGIFCONF takes ifconf argument
4573 and requires shared lock, because it sleeps writing
4574 to user space.
4575 */
4576
4577 if (cmd == SIOCGIFCONF) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004578 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004579 ret = dev_ifconf(net, (char __user *) arg);
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004580 rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004581 return ret;
4582 }
4583 if (cmd == SIOCGIFNAME)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004584 return dev_ifname(net, (struct ifreq __user *)arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004585
4586 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4587 return -EFAULT;
4588
4589 ifr.ifr_name[IFNAMSIZ-1] = 0;
4590
4591 colon = strchr(ifr.ifr_name, ':');
4592 if (colon)
4593 *colon = 0;
4594
4595 /*
4596 * See which interface the caller is talking about.
4597 */
4598
4599 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004600 /*
4601 * These ioctl calls:
4602 * - can be done by all.
4603 * - atomic and do not require locking.
4604 * - return a value
4605 */
4606 case SIOCGIFFLAGS:
4607 case SIOCGIFMETRIC:
4608 case SIOCGIFMTU:
4609 case SIOCGIFHWADDR:
4610 case SIOCGIFSLAVE:
4611 case SIOCGIFMAP:
4612 case SIOCGIFINDEX:
4613 case SIOCGIFTXQLEN:
4614 dev_load(net, ifr.ifr_name);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004615 rcu_read_lock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004616 ret = dev_ifsioc_locked(net, &ifr, cmd);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004617 rcu_read_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004618 if (!ret) {
4619 if (colon)
4620 *colon = ':';
4621 if (copy_to_user(arg, &ifr,
4622 sizeof(struct ifreq)))
4623 ret = -EFAULT;
4624 }
4625 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004626
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004627 case SIOCETHTOOL:
4628 dev_load(net, ifr.ifr_name);
4629 rtnl_lock();
4630 ret = dev_ethtool(net, &ifr);
4631 rtnl_unlock();
4632 if (!ret) {
4633 if (colon)
4634 *colon = ':';
4635 if (copy_to_user(arg, &ifr,
4636 sizeof(struct ifreq)))
4637 ret = -EFAULT;
4638 }
4639 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004640
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004641 /*
4642 * These ioctl calls:
4643 * - require superuser power.
4644 * - require strict serialization.
4645 * - return a value
4646 */
4647 case SIOCGMIIPHY:
4648 case SIOCGMIIREG:
4649 case SIOCSIFNAME:
4650 if (!capable(CAP_NET_ADMIN))
4651 return -EPERM;
4652 dev_load(net, ifr.ifr_name);
4653 rtnl_lock();
4654 ret = dev_ifsioc(net, &ifr, cmd);
4655 rtnl_unlock();
4656 if (!ret) {
4657 if (colon)
4658 *colon = ':';
4659 if (copy_to_user(arg, &ifr,
4660 sizeof(struct ifreq)))
4661 ret = -EFAULT;
4662 }
4663 return ret;
4664
4665 /*
4666 * These ioctl calls:
4667 * - require superuser power.
4668 * - require strict serialization.
4669 * - do not return a value
4670 */
4671 case SIOCSIFFLAGS:
4672 case SIOCSIFMETRIC:
4673 case SIOCSIFMTU:
4674 case SIOCSIFMAP:
4675 case SIOCSIFHWADDR:
4676 case SIOCSIFSLAVE:
4677 case SIOCADDMULTI:
4678 case SIOCDELMULTI:
4679 case SIOCSIFHWBROADCAST:
4680 case SIOCSIFTXQLEN:
4681 case SIOCSMIIREG:
4682 case SIOCBONDENSLAVE:
4683 case SIOCBONDRELEASE:
4684 case SIOCBONDSETHWADDR:
4685 case SIOCBONDCHANGEACTIVE:
4686 case SIOCBRADDIF:
4687 case SIOCBRDELIF:
4688 case SIOCSHWTSTAMP:
4689 if (!capable(CAP_NET_ADMIN))
4690 return -EPERM;
4691 /* fall through */
4692 case SIOCBONDSLAVEINFOQUERY:
4693 case SIOCBONDINFOQUERY:
4694 dev_load(net, ifr.ifr_name);
4695 rtnl_lock();
4696 ret = dev_ifsioc(net, &ifr, cmd);
4697 rtnl_unlock();
4698 return ret;
4699
4700 case SIOCGIFMEM:
4701 /* Get the per device memory space. We can add this but
4702 * currently do not support it */
4703 case SIOCSIFMEM:
4704 /* Set the per device memory buffer space.
4705 * Not applicable in our case */
4706 case SIOCSIFLINK:
4707 return -EINVAL;
4708
4709 /*
4710 * Unknown or private ioctl.
4711 */
4712 default:
4713 if (cmd == SIOCWANDEV ||
4714 (cmd >= SIOCDEVPRIVATE &&
4715 cmd <= SIOCDEVPRIVATE + 15)) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004716 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004717 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004718 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004719 rtnl_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004720 if (!ret && copy_to_user(arg, &ifr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004721 sizeof(struct ifreq)))
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004722 ret = -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004723 return ret;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004724 }
4725 /* Take care of Wireless Extensions */
4726 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4727 return wext_handle_ioctl(net, &ifr, cmd, arg);
4728 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004729 }
4730}
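/*
 * Usage sketch (illustrative only): the userspace side of the SIOCGIFMTU
 * path that ends up in dev_ioctl()/dev_ifsioc_locked() above.  Kept inside
 * a comment because it is ordinary userspace C, not kernel code; the
 * interface name is hypothetical.
 *
 *	#include <stdio.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *
 *	int main(void)
 *	{
 *		struct ifreq ifr;
 *		int fd = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *		if (fd < 0)
 *			return 1;
 *		memset(&ifr, 0, sizeof(ifr));
 *		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
 *		if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)
 *			printf("%s mtu %d\n", ifr.ifr_name, ifr.ifr_mtu);
 *		close(fd);
 *		return 0;
 *	}
 */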
4731
4732
4733/**
4734 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004735 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004736 *
4737 * Returns a suitable unique value for a new device interface
4738 * number. The caller must hold the rtnl semaphore or the
4739 * dev_base_lock to be sure it remains unique.
4740 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004741static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004742{
4743 static int ifindex;
4744 for (;;) {
4745 if (++ifindex <= 0)
4746 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004747 if (!__dev_get_by_index(net, ifindex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004748 return ifindex;
4749 }
4750}
4751
Linus Torvalds1da177e2005-04-16 15:20:36 -07004752/* Delayed registration/unregisteration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004753static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004754
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004755static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004756{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004757 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004758}
4759
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004760static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004761{
Krishna Kumare93737b2009-12-08 22:26:02 +00004762 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004763
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004764 BUG_ON(dev_boot_phase);
4765 ASSERT_RTNL();
4766
Krishna Kumare93737b2009-12-08 22:26:02 +00004767 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004768		/* Some devices call this without ever having been
Krishna Kumare93737b2009-12-08 22:26:02 +00004769		 * registered, to unwind a failed initialization.  Remove
4770		 * those devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004771 */
4772 if (dev->reg_state == NETREG_UNINITIALIZED) {
4773 pr_debug("unregister_netdevice: device %s/%p never "
4774 "was registered\n", dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004775
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004776 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004777 list_del(&dev->unreg_list);
4778 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004779 }
4780
4781 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4782
4783 /* If device is running, close it first. */
4784 dev_close(dev);
4785
4786 /* And unlink it from device chain. */
4787 unlist_netdevice(dev);
4788
4789 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004790 }
4791
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004792 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004793
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004794 list_for_each_entry(dev, head, unreg_list) {
4795 /* Shutdown queueing discipline. */
4796 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004797
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004798
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004799 /* Notify protocols, that we are about to destroy
4800 this device. They should clean all the things.
4801 */
4802 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4803
Patrick McHardya2835762010-02-26 06:34:51 +00004804 if (!dev->rtnl_link_ops ||
4805 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4806 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4807
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004808 /*
4809 * Flush the unicast and multicast chains
4810 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00004811 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00004812 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004813
4814 if (dev->netdev_ops->ndo_uninit)
4815 dev->netdev_ops->ndo_uninit(dev);
4816
4817 /* Notifier chain MUST detach us from master device. */
4818 WARN_ON(dev->master);
4819
4820 /* Remove entries from kobject tree */
4821 netdev_unregister_kobject(dev);
4822 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004823
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004824 /* Process any work delayed until the end of the batch */
stephen hemmingere5e26d72010-02-24 14:01:38 +00004825 dev = list_first_entry(head, struct net_device, unreg_list);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004826 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4827
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004828 synchronize_net();
4829
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004830 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004831 dev_put(dev);
4832}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004833
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004834static void rollback_registered(struct net_device *dev)
4835{
4836 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004837
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004838 list_add(&dev->unreg_list, &single);
4839 rollback_registered_many(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004840}
4841
David S. Millere8a04642008-07-17 00:34:19 -07004842static void __netdev_init_queue_locks_one(struct net_device *dev,
4843 struct netdev_queue *dev_queue,
4844 void *_unused)
David S. Millerc773e842008-07-08 23:13:53 -07004845{
4846 spin_lock_init(&dev_queue->_xmit_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004847 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
David S. Millerc773e842008-07-08 23:13:53 -07004848 dev_queue->xmit_lock_owner = -1;
4849}
4850
4851static void netdev_init_queue_locks(struct net_device *dev)
4852{
David S. Millere8a04642008-07-17 00:34:19 -07004853 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4854 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
David S. Millerc773e842008-07-08 23:13:53 -07004855}
4856
Herbert Xub63365a2008-10-23 01:11:29 -07004857unsigned long netdev_fix_features(unsigned long features, const char *name)
4858{
4859 /* Fix illegal SG+CSUM combinations. */
4860 if ((features & NETIF_F_SG) &&
4861 !(features & NETIF_F_ALL_CSUM)) {
4862 if (name)
4863 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4864 "checksum feature.\n", name);
4865 features &= ~NETIF_F_SG;
4866 }
4867
4868 /* TSO requires that SG is present as well. */
4869 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4870 if (name)
4871 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4872 "SG feature.\n", name);
4873 features &= ~NETIF_F_TSO;
4874 }
4875
4876 if (features & NETIF_F_UFO) {
4877 if (!(features & NETIF_F_GEN_CSUM)) {
4878 if (name)
4879 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4880 "since no NETIF_F_HW_CSUM feature.\n",
4881 name);
4882 features &= ~NETIF_F_UFO;
4883 }
4884
4885 if (!(features & NETIF_F_SG)) {
4886 if (name)
4887 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4888 "since no NETIF_F_SG feature.\n", name);
4889 features &= ~NETIF_F_UFO;
4890 }
4891 }
4892
4893 return features;
4894}
4895EXPORT_SYMBOL(netdev_fix_features);
4896
Linus Torvalds1da177e2005-04-16 15:20:36 -07004897/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08004898 * netif_stacked_transfer_operstate - transfer operstate
4899 * @rootdev: the root or lower level device to transfer state from
4900 * @dev: the device to transfer operstate to
4901 *
4902 * Transfer operational state from root to device. This is normally
4903 * called when a stacking relationship exists between the root
4904 * device and the device (a leaf device).
4905 */
4906void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4907 struct net_device *dev)
4908{
4909 if (rootdev->operstate == IF_OPER_DORMANT)
4910 netif_dormant_on(dev);
4911 else
4912 netif_dormant_off(dev);
4913
4914 if (netif_carrier_ok(rootdev)) {
4915 if (!netif_carrier_ok(dev))
4916 netif_carrier_on(dev);
4917 } else {
4918 if (netif_carrier_ok(dev))
4919 netif_carrier_off(dev);
4920 }
4921}
4922EXPORT_SYMBOL(netif_stacked_transfer_operstate);
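/*
 * Usage sketch (illustrative only): a stacked driver (VLAN/macvlan style)
 * mirroring operstate from its lower device with
 * netif_stacked_transfer_operstate() whenever that device changes.  The
 * upper-device pointer and the notifier wiring are hypothetical.
 */
static struct net_device *example_upper;	/* hypothetical upper (leaf) device */

static int example_lower_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;	/* netdev notifiers pass the net_device */

	if (event == NETDEV_CHANGE && example_upper)
		netif_stacked_transfer_operstate(lower, example_upper);
	return NOTIFY_DONE;
}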
4923
4924/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07004925 * register_netdevice - register a network device
4926 * @dev: device to register
4927 *
4928 * Take a completed network device structure and add it to the kernel
4929 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4930 * chain. 0 is returned on success. A negative errno code is returned
4931 * on a failure to set up the device, or if the name is a duplicate.
4932 *
4933 * Callers must hold the rtnl semaphore. You may want
4934 * register_netdev() instead of this.
4935 *
4936 * BUGS:
4937 * The locking appears insufficient to guarantee two parallel registers
4938 * will not get the same name.
4939 */
4940
4941int register_netdevice(struct net_device *dev)
4942{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004943 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004944 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004945
4946 BUG_ON(dev_boot_phase);
4947 ASSERT_RTNL();
4948
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004949 might_sleep();
4950
Linus Torvalds1da177e2005-04-16 15:20:36 -07004951 /* When net_device's are persistent, this will be fatal. */
4952 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004953 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004954
David S. Millerf1f28aa2008-07-15 00:08:33 -07004955 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004956 netdev_set_addr_lockdep_class(dev);
David S. Millerc773e842008-07-08 23:13:53 -07004957 netdev_init_queue_locks(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004958
Linus Torvalds1da177e2005-04-16 15:20:36 -07004959 dev->iflink = -1;
4960
Eric Dumazetdf334542010-03-24 19:13:54 +00004961#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00004962 if (!dev->num_rx_queues) {
4963 /*
4964 * Allocate a single RX queue if driver never called
4965 * alloc_netdev_mq
4966 */
4967
4968 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4969 if (!dev->_rx) {
4970 ret = -ENOMEM;
4971 goto out;
4972 }
4973
4974 dev->_rx->first = dev->_rx;
4975 atomic_set(&dev->_rx->count, 1);
4976 dev->num_rx_queues = 1;
4977 }
Eric Dumazetdf334542010-03-24 19:13:54 +00004978#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004980 if (dev->netdev_ops->ndo_init) {
4981 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004982 if (ret) {
4983 if (ret > 0)
4984 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08004985 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004986 }
4987 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004988
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00004989 ret = dev_get_valid_name(dev, dev->name, 0);
Octavian Purdilad9031022009-11-18 02:36:59 +00004990 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07004991 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004992
Eric W. Biederman881d9662007-09-17 11:56:21 -07004993 dev->ifindex = dev_new_index(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004994 if (dev->iflink == -1)
4995 dev->iflink = dev->ifindex;
4996
Stephen Hemmingerd212f872007-06-27 00:47:37 -07004997 /* Fix illegal checksum combinations */
4998 if ((dev->features & NETIF_F_HW_CSUM) &&
4999 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5000 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5001 dev->name);
5002 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5003 }
5004
5005 if ((dev->features & NETIF_F_NO_CSUM) &&
5006 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5007 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5008 dev->name);
5009 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5010 }
5011
Herbert Xub63365a2008-10-23 01:11:29 -07005012 dev->features = netdev_fix_features(dev->features, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005013
Lennert Buytenheke5a4a722008-08-03 01:23:10 -07005014 /* Enable software GSO if SG is supported. */
5015 if (dev->features & NETIF_F_SG)
5016 dev->features |= NETIF_F_GSO;
5017
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005018 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5019 ret = notifier_to_errno(ret);
5020 if (ret)
5021 goto err_uninit;
5022
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005023 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005024 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005025 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005026 dev->reg_state = NETREG_REGISTERED;
5027
Linus Torvalds1da177e2005-04-16 15:20:36 -07005028 /*
5029 * Default initial state at registry is that the
5030 * device is present.
5031 */
5032
5033 set_bit(__LINK_STATE_PRESENT, &dev->state);
5034
Linus Torvalds1da177e2005-04-16 15:20:36 -07005035 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005036 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005037 list_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005038
5039 /* Notify protocols, that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005040 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005041 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005042 if (ret) {
5043 rollback_registered(dev);
5044 dev->reg_state = NETREG_UNREGISTERED;
5045 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005046 /*
5047 * Prevent userspace races by waiting until the network
5048 * device is fully setup before sending notifications.
5049 */
Patrick McHardya2835762010-02-26 06:34:51 +00005050 if (!dev->rtnl_link_ops ||
5051 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5052 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005053
5054out:
5055 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005056
5057err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005058 if (dev->netdev_ops->ndo_uninit)
5059 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005060 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005061}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005062EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005063
5064/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005065 * init_dummy_netdev - init a dummy network device for NAPI
5066 * @dev: device to init
5067 *
5068 * This takes a network device structure and initializes the minimum
5069 * number of fields so it can be used to schedule NAPI polls without
5070 * registering a full blown interface. This is to be used by drivers
5071 * that need to tie several hardware interfaces to a single NAPI
5072 * poll scheduler due to HW limitations.
5073 */
5074int init_dummy_netdev(struct net_device *dev)
5075{
5076 /* Clear everything. Note we don't initialize spinlocks
5077	 * as they aren't supposed to be taken by any of the
5078 * NAPI code and this dummy netdev is supposed to be
5079 * only ever used for NAPI polls
5080 */
5081 memset(dev, 0, sizeof(struct net_device));
5082
5083 /* make sure we BUG if trying to hit standard
5084 * register/unregister code path
5085 */
5086 dev->reg_state = NETREG_DUMMY;
5087
5088 /* initialize the ref count */
5089 atomic_set(&dev->refcnt, 1);
5090
5091 /* NAPI wants this */
5092 INIT_LIST_HEAD(&dev->napi_list);
5093
5094 /* a dummy interface is started by default */
5095 set_bit(__LINK_STATE_PRESENT, &dev->state);
5096 set_bit(__LINK_STATE_START, &dev->state);
5097
5098 return 0;
5099}
5100EXPORT_SYMBOL_GPL(init_dummy_netdev);
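/*
 * Usage sketch (illustrative only, structure and names hypothetical): a
 * driver that backs several hardware channels with one NAPI context hanging
 * off a dummy netdev, which is exactly what init_dummy_netdev() above is
 * meant for.
 */
struct example_adapter {
	struct net_device	dummy_dev;	/* never registered */
	struct napi_struct	napi;
};

static void example_adapter_setup(struct example_adapter *ad,
				  int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&ad->dummy_dev);
	netif_napi_add(&ad->dummy_dev, &ad->napi, poll, 64);
}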
5101
5102
5103/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005104 * register_netdev - register a network device
5105 * @dev: device to register
5106 *
5107 * Take a completed network device structure and add it to the kernel
5108 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5109 * chain. 0 is returned on success. A negative errno code is returned
5110 * on a failure to set up the device, or if the name is a duplicate.
5111 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005112 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005113 * and expands the device name if you passed a format string to
5114 * alloc_netdev.
5115 */
5116int register_netdev(struct net_device *dev)
5117{
5118 int err;
5119
5120 rtnl_lock();
5121
5122 /*
5123 * If the name is a format string the caller wants us to do a
5124 * name allocation.
5125 */
5126 if (strchr(dev->name, '%')) {
5127 err = dev_alloc_name(dev, dev->name);
5128 if (err < 0)
5129 goto out;
5130 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005131
Linus Torvalds1da177e2005-04-16 15:20:36 -07005132 err = register_netdevice(dev);
5133out:
5134 rtnl_unlock();
5135 return err;
5136}
5137EXPORT_SYMBOL(register_netdev);
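/*
 * Usage sketch (illustrative only): the usual probe-time pattern around
 * register_netdev(), with a "%d" format name so the core picks the unit
 * number.  The ops, xmit handler and helper names are hypothetical, and
 * ether_setup()/random_ether_addr() are assumed to be available via
 * <linux/etherdevice.h>.
 */
static netdev_tx_t example_start_xmit(struct sk_buff *skb,
				      struct net_device *dev)
{
	dev_kfree_skb(skb);		/* toy transmit: just drop the frame */
	return NETDEV_TX_OK;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_start_xmit	= example_start_xmit,
};

static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(0, "example%d", ether_setup);
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &example_netdev_ops;
	random_ether_addr(dev->dev_addr);

	err = register_netdev(dev);	/* takes rtnl and resolves "%d" */
	if (err)
		free_netdev(dev);
	return err;
}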
5138
5139/*
5140 * netdev_wait_allrefs - wait until all references are gone.
5141 *
5142 * This is called when unregistering network devices.
5143 *
5144 * Any protocol or device that holds a reference should register
5145 * for netdevice notification, and clean up and put back the
5146 * reference if they receive an UNREGISTER event.
5147 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005148 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005149 */
5150static void netdev_wait_allrefs(struct net_device *dev)
5151{
5152 unsigned long rebroadcast_time, warning_time;
5153
Eric Dumazete014deb2009-11-17 05:59:21 +00005154 linkwatch_forget_dev(dev);
5155
Linus Torvalds1da177e2005-04-16 15:20:36 -07005156 rebroadcast_time = warning_time = jiffies;
5157 while (atomic_read(&dev->refcnt) != 0) {
5158 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005159 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005160
5161 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005162 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005163 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
Octavian Purdila395264d2009-11-16 13:49:35 +00005164			 * should have already handled it the first time */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005165
5166 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5167 &dev->state)) {
5168 /* We must not have linkwatch events
5169 * pending on unregister. If this
5170 * happens, we simply run the queue
5171 * unscheduled, resulting in a noop
5172 * for this device.
5173 */
5174 linkwatch_run_queue();
5175 }
5176
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005177 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005178
5179 rebroadcast_time = jiffies;
5180 }
5181
5182 msleep(250);
5183
5184 if (time_after(jiffies, warning_time + 10 * HZ)) {
5185 printk(KERN_EMERG "unregister_netdevice: "
5186 "waiting for %s to become free. Usage "
5187 "count = %d\n",
5188 dev->name, atomic_read(&dev->refcnt));
5189 warning_time = jiffies;
5190 }
5191 }
5192}
5193
5194/* The sequence is:
5195 *
5196 * rtnl_lock();
5197 * ...
5198 * register_netdevice(x1);
5199 * register_netdevice(x2);
5200 * ...
5201 * unregister_netdevice(y1);
5202 * unregister_netdevice(y2);
5203 * ...
5204 * rtnl_unlock();
5205 * free_netdev(y1);
5206 * free_netdev(y2);
5207 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005208 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005209 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005210 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005211 * without deadlocking with linkwatch via keventd.
5212 * 2) Since we run with the RTNL semaphore not held, we can sleep
5213 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005214 *
5215 * We must not return until all unregister events added during
5216 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005217 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005218void netdev_run_todo(void)
5219{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005220 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005221
Linus Torvalds1da177e2005-04-16 15:20:36 -07005222 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005223 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005224
5225 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005226
Linus Torvalds1da177e2005-04-16 15:20:36 -07005227 while (!list_empty(&list)) {
5228 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005229 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005230 list_del(&dev->todo_list);
5231
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005232 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005233 printk(KERN_ERR "network todo '%s' but state %d\n",
5234 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005235 dump_stack();
5236 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005237 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005238
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005239 dev->reg_state = NETREG_UNREGISTERED;
5240
Changli Gao152102c2010-03-30 20:16:22 +00005241 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005242
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005243 netdev_wait_allrefs(dev);
5244
5245 /* paranoia */
5246 BUG_ON(atomic_read(&dev->refcnt));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005247 WARN_ON(dev->ip_ptr);
5248 WARN_ON(dev->ip6_ptr);
5249 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005250
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005251 if (dev->destructor)
5252 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005253
5254 /* Free network device */
5255 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005256 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005257}
5258
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005259/**
Eric Dumazetd83345a2009-11-16 03:36:51 +00005260 * dev_txq_stats_fold - fold tx_queues stats
5261 * @dev: device to get statistics from
5262 * @stats: struct net_device_stats to hold results
5263 */
5264void dev_txq_stats_fold(const struct net_device *dev,
5265 struct net_device_stats *stats)
5266{
5267 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5268 unsigned int i;
5269 struct netdev_queue *txq;
5270
5271 for (i = 0; i < dev->num_tx_queues; i++) {
5272 txq = netdev_get_tx_queue(dev, i);
5273 tx_bytes += txq->tx_bytes;
5274 tx_packets += txq->tx_packets;
5275 tx_dropped += txq->tx_dropped;
5276 }
5277 if (tx_bytes || tx_packets || tx_dropped) {
5278 stats->tx_bytes = tx_bytes;
5279 stats->tx_packets = tx_packets;
5280 stats->tx_dropped = tx_dropped;
5281 }
5282}
5283EXPORT_SYMBOL(dev_txq_stats_fold);
5284
5285/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005286 * dev_get_stats - get network device statistics
5287 * @dev: device to get statistics from
5288 *
5289 * Get network statistics from device. The device driver may provide
5290 * its own method by setting dev->netdev_ops->get_stats; otherwise
5291 * the internal statistics structure is used.
5292 */
5293const struct net_device_stats *dev_get_stats(struct net_device *dev)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005294{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005295 const struct net_device_ops *ops = dev->netdev_ops;
5296
5297 if (ops->ndo_get_stats)
5298 return ops->ndo_get_stats(dev);
Eric Dumazet7004bf22009-05-18 00:34:33 +00005299
Eric Dumazetd83345a2009-11-16 03:36:51 +00005300 dev_txq_stats_fold(dev, &dev->stats);
5301 return &dev->stats;
Rusty Russellc45d2862007-03-28 14:29:08 -07005302}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005303EXPORT_SYMBOL(dev_get_stats);
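/*
 * Usage sketch (illustrative only, helper name hypothetical): a periodic
 * reader of dev_get_stats().  The returned pointer refers to storage owned
 * by the device, so the caller is assumed to hold a reference on @dev for
 * the duration.
 */
static void example_log_stats(struct net_device *dev)
{
	const struct net_device_stats *stats = dev_get_stats(dev);

	printk(KERN_DEBUG "%s: rx %lu tx %lu dropped %lu\n", dev->name,
	       stats->rx_packets, stats->tx_packets,
	       stats->rx_dropped + stats->tx_dropped);
}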
Rusty Russellc45d2862007-03-28 14:29:08 -07005304
David S. Millerdc2b4842008-07-08 17:18:23 -07005305static void netdev_init_one_queue(struct net_device *dev,
David S. Millere8a04642008-07-17 00:34:19 -07005306 struct netdev_queue *queue,
5307 void *_unused)
David S. Millerdc2b4842008-07-08 17:18:23 -07005308{
David S. Millerdc2b4842008-07-08 17:18:23 -07005309 queue->dev = dev;
5310}
5311
David S. Millerbb949fb2008-07-08 16:55:56 -07005312static void netdev_init_queues(struct net_device *dev)
5313{
David S. Millere8a04642008-07-17 00:34:19 -07005314 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5315 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
David S. Millerc3f26a22008-07-31 16:58:50 -07005316 spin_lock_init(&dev->tx_global_lock);
David S. Millerbb949fb2008-07-08 16:55:56 -07005317}
5318
Linus Torvalds1da177e2005-04-16 15:20:36 -07005319/**
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005320 * alloc_netdev_mq - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005321 * @sizeof_priv: size of private data to allocate space for
5322 * @name: device name format string
5323 * @setup: callback to initialize device
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005324 * @queue_count: the number of subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005325 *
5326 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005327 * and performs basic initialization. Also allocates subquue structs
5328 * for each queue on the device at the end of the netdevice.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005329 */
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
		void (*setup)(struct net_device *), unsigned int queue_count)
{
	struct netdev_queue *tx;
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;
#ifdef CONFIG_RPS
	struct netdev_rx_queue *rx;
	int i;
#endif

	BUG_ON(strlen(name) >= sizeof(dev->name));

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL);
	if (!p) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
		return NULL;
	}

	tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
	if (!tx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "tx qdiscs.\n");
		goto free_p;
	}

#ifdef CONFIG_RPS
	rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
	if (!rx) {
		printk(KERN_ERR "alloc_netdev: Unable to allocate "
		       "rx queues.\n");
		goto free_tx;
	}

	atomic_set(&rx->count, queue_count);

	/*
	 * Set a pointer to first element in the array which holds the
	 * reference count.
	 */
	for (i = 0; i < queue_count; i++)
		rx[i].first = rx;
#endif

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	if (dev_addr_init(dev))
		goto free_rx;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->_tx = tx;
	dev->num_tx_queues = queue_count;
	dev->real_num_tx_queues = queue_count;

#ifdef CONFIG_RPS
	dev->_rx = rx;
	dev->num_rx_queues = queue_count;
#endif

	dev->gso_max_size = GSO_MAX_SIZE;

	netdev_init_queues(dev);

	INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
	dev->ethtool_ntuple_list.count = 0;
	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	dev->priv_flags = IFF_XMIT_DST_RELEASE;
	setup(dev);
	strcpy(dev->name, name);
	return dev;

free_rx:
#ifdef CONFIG_RPS
	kfree(rx);
free_tx:
#endif
	kfree(tx);
free_p:
	kfree(p);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mq);

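/*
 * Illustrative sketch (not part of the original file): typical driver-side
 * use of alloc_netdev_mq().  The private structure, the queue count and the
 * "myeth%d" name template are assumptions for the example; real Ethernet
 * drivers usually go through wrappers such as alloc_etherdev_mq() instead
 * of calling this directly.
 *
 *	struct my_priv { spinlock_t lock; };
 *
 *	struct net_device *dev;
 *	struct my_priv *priv;
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "myeth%d",
 *			      ether_setup, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	priv = netdev_priv(dev);
 *	spin_lock_init(&priv->lock);
 *
 *	if (register_netdev(dev)) {
 *		free_netdev(dev);
 *		return -ENODEV;
 *	}
 */
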
/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	release_net(dev_net(dev));

	kfree(dev->_tx);

	/* Flush device addresses */
	dev_addr_flush(dev);

	/* Clear ethtool n-tuple list */
	ethtool_ntuple_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		kfree((char *)dev - dev->padded);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);

/**
 *	synchronize_net - Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);

/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If @head is not NULL, the device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);

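/*
 * Illustrative sketch (not part of the original file): batching several
 * unregistrations under a single RTNL hold, the same pattern used by
 * default_device_exit_batch() further below.  The selection helper
 * some_condition() is an assumption for the example.
 *
 *	LIST_HEAD(kill_list);
 *	struct net_device *dev;
 *
 *	rtnl_lock();
 *	for_each_netdev(&init_net, dev)
 *		if (some_condition(dev))
 *			unregister_netdevice_queue(dev, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */
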
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);

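/*
 * Illustrative sketch (not part of the original file): the usual teardown
 * order in a driver's remove or module-exit path.  "my_dev" is an assumed
 * pointer saved at probe time.  unregister_netdev() takes the RTNL lock
 * itself and waits for the unregistration to complete, after which the
 * memory can be returned with free_netdev().
 *
 *	static void __exit my_driver_exit(void)
 *	{
 *		unregister_netdev(my_dev);
 *		free_netdev(my_dev);
 *	}
 */
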
/**
 *	dev_change_net_namespace - move device to a different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	err = -EINVAL;
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		if (dev_get_valid_name(dev, pat, 1))
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice and unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = -ENODEV;
	unlist_netdevice(dev);

	synchronize_net();

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex)) {
		int iflink = (dev->iflink == dev->ifindex);
		dev->ifindex = dev_new_index(net);
		if (iflink)
			dev->iflink = dev->ifindex;
	}

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);

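/*
 * Illustrative sketch (not part of the original file): moving a device back
 * into the initial namespace, essentially what default_device_exit() below
 * does for every namespace that goes away.  The "dev%d" fallback pattern is
 * the same one used there; error handling is reduced to returning the code.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, &init_net, "dev%d");
 *	rtnl_unlock();
 *	return err;
 */
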
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}

/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
unsigned long netdev_increment_features(unsigned long all, unsigned long one,
					unsigned long mask)
{
	/* If device needs checksumming, downgrade to it. */
	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
		all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
	else if (mask & NETIF_F_ALL_CSUM) {
		/* If one device supports v4/v6 checksumming, set for all. */
		if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
		    !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
		}

		/* If one device supports hw checksumming, set for all. */
		if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
			all &= ~NETIF_F_ALL_CSUM;
			all |= NETIF_F_HW_CSUM;
		}
	}

	one |= NETIF_F_ALL_CSUM;

	one |= all & NETIF_F_ONE_FOR_ALL;
	all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
	all |= one & mask & NETIF_F_ONE_FOR_ALL;

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);

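/*
 * Illustrative sketch (not part of the original file): how a master device
 * such as a bridge or bond might fold the features of its slave ports into
 * its own feature set.  "master", "feature_mask", "port_list" and
 * "slave_port" are assumptions for the example; @mask is the set of
 * features the master itself is able to support.
 *
 *	unsigned long features = master->feature_mask & ~NETIF_F_ONE_FOR_ALL;
 *	struct slave_port *p;
 *
 *	list_for_each_entry(p, &master->port_list, list)
 *		features = netdev_increment_features(features,
 *						     p->dev->features,
 *						     master->feature_mask);
 *	master->dev->features = features;
 */
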
static struct hlist_head *netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *	@buffer: buffer for resulting name
 *	@len: size of buffer
 *
 *	Determine network driver for device.
 */
char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
{
	const struct device_driver *driver;
	const struct device *parent;

	if (len <= 0 || !buffer)
		return buffer;
	buffer[0] = 0;

	parent = dev->dev.parent;

	if (!parent)
		return buffer;

	driver = parent->driver;
	if (driver && driver->name)
		strlcpy(buffer, driver->name, len);
	return buffer;
}

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmovable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
				__func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		memset(sd, 0, sizeof(*sd));
		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		sd->completion_queue = NULL;
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->csd.flags = 0;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
		sd->backlog.gro_list = NULL;
		sd->backlog.gro_count = 0;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present too.  Since we now dynamically allocate and free
	 * the loopback device, ensure this invariant is maintained by
	 * keeping the loopback device the first device on the list of
	 * network devices, so that it is the first device that appears
	 * and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_init();
	dev_mcast_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);

static int __init initialize_hashrnd(void)
{
	get_random_bytes(&hashrnd, sizeof(hashrnd));
	return 0;
}

late_initcall_sync(initialize_hashrnd);