/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *	Authors:	Ross Biro
 *			Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *			Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *	--BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */
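
/*
 * For illustration: ptype_head() below indexes ptype_base with
 * ntohs(pt->type) & PTYPE_HASH_MASK, i.e. the low nibble of the
 * host-order protocol value.  So ETH_P_IP (0x0800) lands in bucket 0,
 * while RARP (0x8035), SNAP (0x0005) and X.25 (0x0805) all land in
 * bucket 5 -- the overlap the comment above refers to.
 */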

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

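/*
 * A minimal sketch of the two reader patterns the comment above allows
 * (assuming a struct net *net in scope; error handling elided):
 *
 *	read_lock(&dev_base_lock);
 *	dev = __dev_get_by_name(net, "eth0");
 *	...
 *	read_unlock(&dev_base_lock);
 *
 * or, lockless:
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	...
 *	rcu_read_unlock();
 */
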
static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

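/*
 * Why per-type lock classes (a sketch of the usual rationale, not part
 * of the original comment): stacked devices such as VLANs take the
 * transmit lock of their lower device while already holding their own,
 * and giving each dev->type its own lockdep key keeps that legitimate
 * nesting from being flagged as a recursive-lock deadlock.
 */
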
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that all
 *	CPUs which are in the middle of receiving packets will see the
 *	new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

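/*
 * A minimal usage sketch (illustrative only; my_rcv and my_ptype are
 * hypothetical, error handling elided).  A handler registered for
 * ETH_P_ALL lands on the ptype_all tap chain and sees every packet:
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_ALL),
 *		.func = my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);
 *	...
 *	dev_remove_pack(&my_ptype);   (sleeps; my_ptype reusable afterwards)
 */
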
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine for
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
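
/*
 * For illustration, given the parsing above the boot parameter has the
 * form "netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>", e.g.:
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * which records irq 9 and I/O base 0x300 for the device that will probe
 * as eth0; trailing integers may be omitted.
 */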

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
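
/*
 * A minimal lookup sketch (illustrative; assumes a struct net *net such
 * as &init_net).  The RCU variant returns a pointer that is only valid
 * inside the read-side critical section unless a reference is taken:
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		... use dev, or dev_hold(dev) to keep it past unlock ...
 *	rcu_read_unlock();
 *
 * dev_get_by_name() wraps exactly this pattern and returns with the
 * reference already held, to be dropped later with dev_put().
 */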

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found, or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found, or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found, or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found, or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
800
801/**
Eric Dumazetbb69ae02010-06-07 11:42:13 +0000802 * dev_get_by_flags_rcu - find any device with given flags
Randy Dunlapc4ea43c2007-10-12 21:17:49 -0700803 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -0700804 * @if_flags: IFF_* values
805 * @mask: bitmask of bits in if_flags to check
806 *
807 * Search for any interface with the given flags. Returns NULL if a device
Eric Dumazetbb69ae02010-06-07 11:42:13 +0000808 * is not found or a pointer to the device. Must be called inside
809 * rcu_read_lock(), and result refcount is unchanged.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700810 */
811
Eric Dumazetbb69ae02010-06-07 11:42:13 +0000812struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
Eric Dumazetd1b19df2009-09-03 01:29:39 -0700813 unsigned short mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700814{
Pavel Emelianov7562f872007-05-03 15:13:45 -0700815 struct net_device *dev, *ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700816
Pavel Emelianov7562f872007-05-03 15:13:45 -0700817 ret = NULL;
Eric Dumazetc6d14c82009-11-04 05:43:23 -0800818 for_each_netdev_rcu(net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700819 if (((dev->flags ^ if_flags) & mask) == 0) {
Pavel Emelianov7562f872007-05-03 15:13:45 -0700820 ret = dev;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700821 break;
822 }
823 }
Pavel Emelianov7562f872007-05-03 15:13:45 -0700824 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700825}
Eric Dumazetbb69ae02010-06-07 11:42:13 +0000826EXPORT_SYMBOL(dev_get_by_flags_rcu);
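
/*
 * The xor/mask test above accepts a device whose flag bits equal
 * if_flags on every bit set in mask.  For illustration, to find the
 * first interface that is up and not a loopback:
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_flags_rcu(net, IFF_UP, IFF_UP | IFF_LOOPBACK);
 *	if (dev)
 *		... use dev before rcu_read_unlock(); refcount unchanged ...
 *	rcu_read_unlock();
 */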

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
int dev_valid_name(const char *name)
{
	if (*name == '\0')
		return 0;
	if (strlen(name) >= IFNAMSIZ)
		return 0;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return 0;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return 0;
		name++;
	}
	return 1;
}
EXPORT_SYMBOL(dev_valid_name);
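
/*
 * For illustration, dev_valid_name() returns 1 for "eth0" or "wlan%d",
 * and 0 for "", ".", "..", any name of IFNAMSIZ characters or more, or
 * any name containing '/' or whitespace.
 */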

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
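
/*
 * A minimal sketch (assuming dev has already been placed in a namespace,
 * e.g. with dev_net_set()):
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *
 * This picks the lowest unit not already taken, so with eth0 and eth2
 * present it writes "eth1" into dev->name and returns 1; a negative
 * errno is returned on failure.
 */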

static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		return ret;
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load - load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */

void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open - prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
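
/*
 * A minimal sketch of bringing an interface up from kernel code
 * (illustrative; dev obtained elsewhere, error handling elided).
 * __dev_open() asserts the RTNL lock, so the caller must hold it:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */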

static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for death, while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 *	Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001299
1300
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001301/**
1302 * dev_disable_lro - disable Large Receive Offload on a device
1303 * @dev: device
1304 *
1305 * Disable Large Receive Offload (LRO) on a net device. Must be
1306 * called under RTNL. This is needed if received packets may be
1307 * forwarded to another interface.
1308 */
1309void dev_disable_lro(struct net_device *dev)
1310{
1311 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1312 dev->ethtool_ops->set_flags) {
1313 u32 flags = dev->ethtool_ops->get_flags(dev);
1314 if (flags & ETH_FLAG_LRO) {
1315 flags &= ~ETH_FLAG_LRO;
1316 dev->ethtool_ops->set_flags(dev, flags);
1317 }
1318 }
1319 WARN_ON(dev->features & NETIF_F_LRO);
1320}
1321EXPORT_SYMBOL(dev_disable_lro);
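/*
 * Editor's example (hypothetical sketch): code that turns a device into a
 * forwarding path is expected to disable LRO first, since LRO-merged skbs
 * must not be forwarded. example_prepare_forwarding() is an assumed name.
 */
static void example_prepare_forwarding(struct net_device *dev)
{
        ASSERT_RTNL();          /* dev_disable_lro() requires the RTNL */
        dev_disable_lro(dev);
}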
1322
1323
Eric W. Biederman881d9662007-09-17 11:56:21 -07001324static int dev_boot_phase = 1;
1325
Linus Torvalds1da177e2005-04-16 15:20:36 -07001326/*
1327 * Device change register/unregister. These are not inline or static
1328 * as we export them to the world.
1329 */
1330
1331/**
1332 * register_netdevice_notifier - register a network notifier block
1333 * @nb: notifier
1334 *
1335 * Register a notifier to be called when network device events occur.
1336 * The notifier passed is linked into the kernel structures and must
1337 * not be reused until it has been unregistered. A negative errno code
1338 * is returned on a failure.
1339 *
1340 *	When registered, all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001341 *	to the new notifier to allow the caller to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001342 * view of the network device list.
1343 */
1344
1345int register_netdevice_notifier(struct notifier_block *nb)
1346{
1347 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001348 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001349 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001350 int err;
1351
1352 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001353 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001354 if (err)
1355 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001356 if (dev_boot_phase)
1357 goto unlock;
1358 for_each_net(net) {
1359 for_each_netdev(net, dev) {
1360 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1361 err = notifier_to_errno(err);
1362 if (err)
1363 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001364
Eric W. Biederman881d9662007-09-17 11:56:21 -07001365 if (!(dev->flags & IFF_UP))
1366 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001367
Eric W. Biederman881d9662007-09-17 11:56:21 -07001368 nb->notifier_call(nb, NETDEV_UP, dev);
1369 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001370 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001371
1372unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001373 rtnl_unlock();
1374 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001375
1376rollback:
1377 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001378 for_each_net(net) {
1379 for_each_netdev(net, dev) {
1380 if (dev == last)
1381 break;
Herbert Xufcc5a032007-07-30 17:03:38 -07001382
Eric W. Biederman881d9662007-09-17 11:56:21 -07001383 if (dev->flags & IFF_UP) {
1384 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1385 nb->notifier_call(nb, NETDEV_DOWN, dev);
1386 }
1387 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00001388 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001389 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001390 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001391
1392 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001393 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001394}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001395EXPORT_SYMBOL(register_netdevice_notifier);
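/*
 * Editor's example (illustrative sketch; all example_* names are assumed):
 * a typical notifier user. The callback runs under the RTNL, and because
 * registration replays NETDEV_REGISTER/NETDEV_UP, devices that already
 * exist are reported as well.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;   /* payload is the net_device here */

        switch (event) {
        case NETDEV_UP:
                printk(KERN_INFO "example: %s is up\n", dev->name);
                break;
        case NETDEV_DOWN:
                printk(KERN_INFO "example: %s is down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
        .notifier_call = example_netdev_event,
};

/* Module init/exit would pair register_netdevice_notifier() with
 * unregister_netdevice_notifier() on &example_netdev_notifier.
 */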
Linus Torvalds1da177e2005-04-16 15:20:36 -07001396
1397/**
1398 * unregister_netdevice_notifier - unregister a network notifier block
1399 * @nb: notifier
1400 *
1401 * Unregister a notifier previously registered by
1402 *	register_netdevice_notifier(). The notifier is unlinked from the
1403 * kernel structures and may then be reused. A negative errno code
1404 * is returned on a failure.
1405 */
1406
1407int unregister_netdevice_notifier(struct notifier_block *nb)
1408{
Herbert Xu9f514952006-03-25 01:24:25 -08001409 int err;
1410
1411 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001412 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xu9f514952006-03-25 01:24:25 -08001413 rtnl_unlock();
1414 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001416EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001417
1418/**
1419 * call_netdevice_notifiers - call all network notifier blocks
1420 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001421 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001422 *
1423 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001424 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001425 */
1426
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001427int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428{
Jiri Pirkoab930472010-04-20 01:45:37 -07001429 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001430 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001431}
1432
1433/* When > 0, there are consumers of rx skb timestamps */
1434static atomic_t netstamp_needed = ATOMIC_INIT(0);
1435
1436void net_enable_timestamp(void)
1437{
1438 atomic_inc(&netstamp_needed);
1439}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001440EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441
1442void net_disable_timestamp(void)
1443{
1444 atomic_dec(&netstamp_needed);
1445}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001446EXPORT_SYMBOL(net_disable_timestamp);
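/*
 * Editor's example (sketch, assumed names): consumers of rx timestamps
 * hold a reference on netstamp_needed for their whole lifetime; the
 * enable/disable calls must be strictly balanced.
 */
static int __init example_tap_init(void)
{
        net_enable_timestamp();         /* start stamping received skbs */
        return 0;
}

static void __exit example_tap_exit(void)
{
        net_disable_timestamp();        /* balances the enable above */
}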
Linus Torvalds1da177e2005-04-16 15:20:36 -07001447
Eric Dumazet3b098e22010-05-15 23:57:10 -07001448static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001449{
1450 if (atomic_read(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001451 __net_timestamp(skb);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001452 else
1453 skb->tstamp.tv64 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454}
1455
Eric Dumazet3b098e22010-05-15 23:57:10 -07001456static inline void net_timestamp_check(struct sk_buff *skb)
1457{
1458 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1459 __net_timestamp(skb);
1460}
1461
Arnd Bergmann44540962009-11-26 06:07:08 +00001462/**
1463 * dev_forward_skb - loopback an skb to another netif
1464 *
1465 * @dev: destination network device
1466 * @skb: buffer to forward
1467 *
1468 * return values:
1469 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001470 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001471 *
1472 * dev_forward_skb can be used for injecting an skb from the
1473 * start_xmit function of one device into the receive queue
1474 * of another device.
1475 *
1476 * The receiving device may be in another namespace, so
1477 * we have to clear all information in the skb that could
1478 * impact namespace isolation.
1479 */
1480int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1481{
1482 skb_orphan(skb);
Ben Greearc736eef2010-07-22 09:54:47 +00001483 nf_reset(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001484
Eric Dumazet6ec82562010-05-06 00:53:53 -07001485 if (!(dev->flags & IFF_UP) ||
1486 (skb->len > (dev->mtu + dev->hard_header_len))) {
1487 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001488 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001489 }
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001490 skb_set_dev(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001491 skb->tstamp.tv64 = 0;
1492 skb->pkt_type = PACKET_HOST;
1493 skb->protocol = eth_type_trans(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001494 return netif_rx(skb);
1495}
1496EXPORT_SYMBOL_GPL(dev_forward_skb);
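/*
 * Editor's example (hypothetical veth-style sketch; struct example_priv
 * and its ->peer field are assumptions): a paired device whose transmit
 * routine loops every frame into the peer's receive path.
 */
struct example_priv {
        struct net_device *peer;
};

static netdev_tx_t example_start_xmit(struct sk_buff *skb,
                                      struct net_device *dev)
{
        struct example_priv *priv = netdev_priv(dev);

        /* orphans the skb, scrubs namespace state, then calls netif_rx() */
        dev_forward_skb(priv->peer, skb);
        return NETDEV_TX_OK;
}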
1497
Linus Torvalds1da177e2005-04-16 15:20:36 -07001498/*
1499 * Support routine. Sends outgoing frames to any network
1500 * taps currently in use.
1501 */
1502
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001503static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001504{
1505 struct packet_type *ptype;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001506
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001507#ifdef CONFIG_NET_CLS_ACT
1508 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
Eric Dumazet3b098e22010-05-15 23:57:10 -07001509 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001510#else
Eric Dumazet3b098e22010-05-15 23:57:10 -07001511 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001512#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001513
1514 rcu_read_lock();
1515 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1516 /* Never send packets back to the socket
1517 * they originated from - MvS (miquels@drinkel.ow.org)
1518 */
1519 if ((ptype->dev == dev || !ptype->dev) &&
1520 (ptype->af_packet_priv == NULL ||
1521 (struct sock *)ptype->af_packet_priv != skb->sk)) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001522 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523 if (!skb2)
1524 break;
1525
1526 /* skb->nh should be correctly
1527			   set by the sender, so that the second statement is
1528 just protection against buggy protocols.
1529 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001530 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001531
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001532 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001533 skb2->network_header > skb2->tail) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534 if (net_ratelimit())
1535 printk(KERN_CRIT "protocol %04x is "
1536 "buggy, dev %s\n",
Sebastian Andrzej Siewior70777d02010-06-30 10:39:19 -07001537 ntohs(skb2->protocol),
1538 dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001539 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001540 }
1541
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001542 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 skb2->pkt_type = PACKET_OUTGOING;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07001544 ptype->func(skb2, skb->dev, ptype, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545 }
1546 }
1547 rcu_read_unlock();
1548}
1549
John Fastabendf0796d52010-07-01 13:21:57 +00001550/*
1551 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1552 * greater than real_num_tx_queues, stale skbs on the qdiscs must be flushed.
1553 */
1554void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1555{
1556 unsigned int real_num = dev->real_num_tx_queues;
1557
1558 if (unlikely(txq > dev->num_tx_queues))
1559 ;
1560 else if (txq > real_num)
1561 dev->real_num_tx_queues = txq;
1562 else if (txq < real_num) {
1563 dev->real_num_tx_queues = txq;
1564 qdisc_reset_all_tx_gt(dev, txq);
1565 }
1566}
1567EXPORT_SYMBOL(netif_set_real_num_tx_queues);
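/*
 * Editor's example (assumed driver context): a driver that lets the user
 * reconfigure its channel count shrinks or grows the active TX queue set
 * with this helper; shrinking also flushes stale skbs queued on the
 * now-unused qdiscs.
 */
static void example_set_channels(struct net_device *dev, unsigned int count)
{
        /* count may not exceed the num_tx_queues fixed at alloc time */
        netif_set_real_num_tx_queues(dev, count);
}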
Denis Vlasenko56079432006-03-29 15:57:29 -08001568
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001569static inline void __netif_reschedule(struct Qdisc *q)
1570{
1571 struct softnet_data *sd;
1572 unsigned long flags;
1573
1574 local_irq_save(flags);
1575 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00001576 q->next_sched = NULL;
1577 *sd->output_queue_tailp = q;
1578 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001579 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1580 local_irq_restore(flags);
1581}
1582
David S. Miller37437bb2008-07-16 02:15:04 -07001583void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08001584{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001585 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1586 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08001587}
1588EXPORT_SYMBOL(__netif_schedule);
1589
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001590void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08001591{
David S. Miller3578b0c2010-08-03 00:24:04 -07001592 if (atomic_dec_and_test(&skb->users)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001593 struct softnet_data *sd;
1594 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001595
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001596 local_irq_save(flags);
1597 sd = &__get_cpu_var(softnet_data);
1598 skb->next = sd->completion_queue;
1599 sd->completion_queue = skb;
1600 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1601 local_irq_restore(flags);
1602 }
Denis Vlasenko56079432006-03-29 15:57:29 -08001603}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001604EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08001605
1606void dev_kfree_skb_any(struct sk_buff *skb)
1607{
1608 if (in_irq() || irqs_disabled())
1609 dev_kfree_skb_irq(skb);
1610 else
1611 dev_kfree_skb(skb);
1612}
1613EXPORT_SYMBOL(dev_kfree_skb_any);
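/*
 * Editor's example (sketch): a TX completion routine that may be reached
 * both from a hardirq handler and from a process-context timeout path
 * frees with dev_kfree_skb_any(), which picks the safe variant itself.
 */
static void example_tx_complete(struct sk_buff *skb)
{
        dev_kfree_skb_any(skb); /* defers to softirq when IRQs are off */
}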
1614
1615
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001616/**
1617 * netif_device_detach - mark device as removed
1618 * @dev: network device
1619 *
1620 * Mark device as removed from system and therefore no longer available.
1621 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001622void netif_device_detach(struct net_device *dev)
1623{
1624 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1625 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001626 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001627 }
1628}
1629EXPORT_SYMBOL(netif_device_detach);
1630
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001631/**
1632 * netif_device_attach - mark device as attached
1633 * @dev: network device
1634 *
1635 * Mark device as attached to the system and restart it if needed.
1636 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001637void netif_device_attach(struct net_device *dev)
1638{
1639 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1640 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001641 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001642 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001643 }
1644}
1645EXPORT_SYMBOL(netif_device_attach);
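/*
 * Editor's example (hypothetical PCI driver using the legacy PM callbacks;
 * all example_* names are assumed): detach on suspend so the stack stops
 * using the hardware, attach on resume to restart the queues.
 */
static int example_suspend(struct pci_dev *pdev, pm_message_t state)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        netif_device_detach(dev);       /* stops all TX queues if running */
        return 0;
}

static int example_resume(struct pci_dev *pdev)
{
        struct net_device *dev = pci_get_drvdata(pdev);

        netif_device_attach(dev);       /* wakes queues, rearms watchdog */
        return 0;
}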
1646
Ben Hutchings6de329e2008-06-16 17:02:28 -07001647static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1648{
1649 return ((features & NETIF_F_GEN_CSUM) ||
1650 ((features & NETIF_F_IP_CSUM) &&
1651 protocol == htons(ETH_P_IP)) ||
1652 ((features & NETIF_F_IPV6_CSUM) &&
Yi Zou1c8dbcf2009-02-27 14:06:54 -08001653 protocol == htons(ETH_P_IPV6)) ||
1654 ((features & NETIF_F_FCOE_CRC) &&
1655 protocol == htons(ETH_P_FCOE)));
Ben Hutchings6de329e2008-06-16 17:02:28 -07001656}
1657
1658static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1659{
1660 if (can_checksum_protocol(dev->features, skb->protocol))
1661 return true;
1662
1663 if (skb->protocol == htons(ETH_P_8021Q)) {
1664 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1665 if (can_checksum_protocol(dev->features & dev->vlan_features,
1666 veh->h_vlan_encapsulated_proto))
1667 return true;
1668 }
1669
1670 return false;
1671}
Denis Vlasenko56079432006-03-29 15:57:29 -08001672
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001673/**
1674 *	skb_set_dev - assign a new device to a buffer
1675 * @skb: buffer for the new device
1676 * @dev: network device
1677 *
1678 * If an skb is owned by a device already, we have to reset
1679 * all data private to the namespace a device belongs to
1680 * before assigning it a new device.
1681 */
1682#ifdef CONFIG_NET_NS
1683void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1684{
1685 skb_dst_drop(skb);
1686 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1687 secpath_reset(skb);
1688 nf_reset(skb);
1689 skb_init_secmark(skb);
1690 skb->mark = 0;
1691 skb->priority = 0;
1692 skb->nf_trace = 0;
1693 skb->ipvs_property = 0;
1694#ifdef CONFIG_NET_SCHED
1695 skb->tc_index = 0;
1696#endif
1697 }
1698 skb->dev = dev;
1699}
1700EXPORT_SYMBOL(skb_set_dev);
1701#endif /* CONFIG_NET_NS */
1702
Linus Torvalds1da177e2005-04-16 15:20:36 -07001703/*
1704 * Invalidate hardware checksum when packet is to be mangled, and
1705 * complete checksum manually on outgoing path.
1706 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07001707int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001708{
Al Virod3bc23e2006-11-14 21:24:49 -08001709 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07001710 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001711
Patrick McHardy84fa7932006-08-29 16:44:56 -07001712 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07001713 goto out_set_summed;
1714
1715 if (unlikely(skb_shinfo(skb)->gso_size)) {
Herbert Xua430a432006-07-08 13:34:56 -07001716 /* Let GSO fix up the checksum. */
1717 goto out_set_summed;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001718 }
1719
Herbert Xua0308472007-10-15 01:47:15 -07001720 offset = skb->csum_start - skb_headroom(skb);
1721 BUG_ON(offset >= skb_headlen(skb));
1722 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1723
1724 offset += skb->csum_offset;
1725 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1726
1727 if (skb_cloned(skb) &&
1728 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001729 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1730 if (ret)
1731 goto out;
1732 }
1733
Herbert Xua0308472007-10-15 01:47:15 -07001734 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07001735out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001736 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001737out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001738 return ret;
1739}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001740EXPORT_SYMBOL(skb_checksum_help);
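/*
 * Editor's example (sketch): a driver that discovers, per packet, that
 * its hardware cannot checksum the protocol in question falls back to
 * software checksumming. The hw_can_csum flag stands in for whatever
 * capability check the real driver would do.
 */
static int example_prep_tx_csum(struct sk_buff *skb, bool hw_can_csum)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
                return skb_checksum_help(skb); /* completes csum in place */
        return 0;
}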
Linus Torvalds1da177e2005-04-16 15:20:36 -07001741
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001742/**
1743 * skb_gso_segment - Perform segmentation on skb.
1744 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07001745 * @features: features for the output path (see dev->features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001746 *
1747 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07001748 *
1749 * It may return NULL if the skb requires no segmentation. This is
1750 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001751 */
Herbert Xu576a30e2006-06-27 13:22:38 -07001752struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001753{
1754 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1755 struct packet_type *ptype;
Al Viro252e33462006-11-14 20:48:11 -08001756 __be16 type = skb->protocol;
Herbert Xua430a432006-07-08 13:34:56 -07001757 int err;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001758
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001759 skb_reset_mac_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001760 skb->mac_len = skb->network_header - skb->mac_header;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001761 __skb_pull(skb, skb->mac_len);
1762
Herbert Xu67fd1a72009-01-19 16:26:44 -08001763 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1764 struct net_device *dev = skb->dev;
1765 struct ethtool_drvinfo info = {};
1766
1767 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1768 dev->ethtool_ops->get_drvinfo(dev, &info);
1769
1770 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1771 "ip_summed=%d",
1772 info.driver, dev ? dev->features : 0L,
1773 skb->sk ? skb->sk->sk_route_caps : 0L,
1774 skb->len, skb->data_len, skb->ip_summed);
1775
Herbert Xua430a432006-07-08 13:34:56 -07001776 if (skb_header_cloned(skb) &&
1777 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1778 return ERR_PTR(err);
1779 }
1780
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001781 rcu_read_lock();
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08001782 list_for_each_entry_rcu(ptype,
1783 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001784 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
Patrick McHardy84fa7932006-08-29 16:44:56 -07001785 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
Herbert Xua430a432006-07-08 13:34:56 -07001786 err = ptype->gso_send_check(skb);
1787 segs = ERR_PTR(err);
1788 if (err || skb_gso_ok(skb, features))
1789 break;
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001790 __skb_push(skb, (skb->data -
1791 skb_network_header(skb)));
Herbert Xua430a432006-07-08 13:34:56 -07001792 }
Herbert Xu576a30e2006-06-27 13:22:38 -07001793 segs = ptype->gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001794 break;
1795 }
1796 }
1797 rcu_read_unlock();
1798
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001799 __skb_push(skb, skb->data - skb_mac_header(skb));
Herbert Xu576a30e2006-06-27 13:22:38 -07001800
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001801 return segs;
1802}
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001803EXPORT_SYMBOL(skb_gso_segment);
1804
Herbert Xufb286bb2005-11-10 13:01:24 -08001805/* Take action when hardware reception checksum errors are detected. */
1806#ifdef CONFIG_BUG
1807void netdev_rx_csum_fault(struct net_device *dev)
1808{
1809 if (net_ratelimit()) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001810 printk(KERN_ERR "%s: hw csum failure.\n",
Stephen Hemminger246a4212005-12-08 15:21:39 -08001811 dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08001812 dump_stack();
1813 }
1814}
1815EXPORT_SYMBOL(netdev_rx_csum_fault);
1816#endif
1817
Linus Torvalds1da177e2005-04-16 15:20:36 -07001818/* Actually, we should eliminate this check as soon as we know that:
1819 * 1. IOMMU is present and can map all the memory.
1820 * 2. No high memory really exists on this machine.
1821 */
1822
Eric Dumazet9092c652010-04-02 13:34:49 -07001823static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001824{
Herbert Xu3d3a8532006-06-27 13:33:10 -07001825#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001827 if (!(dev->features & NETIF_F_HIGHDMA)) {
1828 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1829 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1830 return 1;
1831 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001832
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001833 if (PCI_DMA_BUS_IS_PHYS) {
1834 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001835
Eric Dumazet9092c652010-04-02 13:34:49 -07001836 if (!pdev)
1837 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001838 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1839 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1840 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1841 return 1;
1842 }
1843 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07001844#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001845 return 0;
1846}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001847
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001848struct dev_gso_cb {
1849 void (*destructor)(struct sk_buff *skb);
1850};
1851
1852#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1853
1854static void dev_gso_skb_destructor(struct sk_buff *skb)
1855{
1856 struct dev_gso_cb *cb;
1857
1858 do {
1859 struct sk_buff *nskb = skb->next;
1860
1861 skb->next = nskb->next;
1862 nskb->next = NULL;
1863 kfree_skb(nskb);
1864 } while (skb->next);
1865
1866 cb = DEV_GSO_CB(skb);
1867 if (cb->destructor)
1868 cb->destructor(skb);
1869}
1870
1871/**
1872 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1873 * @skb: buffer to segment
1874 *
1875 * This function segments the given skb and stores the list of segments
1876 * in skb->next.
1877 */
1878static int dev_gso_segment(struct sk_buff *skb)
1879{
1880 struct net_device *dev = skb->dev;
1881 struct sk_buff *segs;
Herbert Xu576a30e2006-06-27 13:22:38 -07001882 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1883 NETIF_F_SG : 0);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001884
Herbert Xu576a30e2006-06-27 13:22:38 -07001885 segs = skb_gso_segment(skb, features);
1886
1887 /* Verifying header integrity only. */
1888 if (!segs)
1889 return 0;
1890
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07001891 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001892 return PTR_ERR(segs);
1893
1894 skb->next = segs;
1895 DEV_GSO_CB(skb)->destructor = skb->destructor;
1896 skb->destructor = dev_gso_skb_destructor;
1897
1898 return 0;
1899}
1900
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001901/*
1902 * Try to orphan skb early, right before transmission by the device.
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001903 * We cannot orphan the skb if a tx timestamp is requested or the sk reference
1904 * is needed at driver level for other reasons, e.g. see net/can/raw.c
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001905 */
1906static inline void skb_orphan_try(struct sk_buff *skb)
1907{
Eric Dumazet87fd3082010-07-13 05:24:20 +00001908 struct sock *sk = skb->sk;
1909
Oliver Hartkopp2244d072010-08-17 08:59:14 +00001910 if (sk && !skb_shinfo(skb)->tx_flags) {
Eric Dumazet87fd3082010-07-13 05:24:20 +00001911		/* skb_tx_hash() won't be able to get sk.
1912 * We copy sk_hash into skb->rxhash
1913 */
1914 if (!skb->rxhash)
1915 skb->rxhash = sk->sk_hash;
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001916 skb_orphan(skb);
Eric Dumazet87fd3082010-07-13 05:24:20 +00001917 }
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001918}
1919
John Fastabend6afff0c2010-06-16 14:18:12 +00001920/*
1921 * Returns true if either:
1922 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
1923 * 2. skb is fragmented and the device does not support SG, or if
1924 *	at least one of the fragments is in highmem and the device does not
1925 * support DMA from it.
1926 */
1927static inline int skb_needs_linearize(struct sk_buff *skb,
1928 struct net_device *dev)
1929{
1930 return skb_is_nonlinear(skb) &&
David S. Miller21dc3302010-08-23 00:13:46 -07001931 ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
John Fastabend6afff0c2010-06-16 14:18:12 +00001932 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
1933 illegal_highdma(dev, skb))));
1934}
1935
David S. Millerfd2ea0a2008-07-17 01:56:23 -07001936int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1937 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001938{
Stephen Hemminger00829822008-11-20 20:14:53 -08001939 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00001940 int rc = NETDEV_TX_OK;
Stephen Hemminger00829822008-11-20 20:14:53 -08001941
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001942 if (likely(!skb->next)) {
Stephen Hemminger9be9a6b2007-04-20 17:02:45 -07001943 if (!list_empty(&ptype_all))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001944 dev_queue_xmit_nit(skb, dev);
1945
Eric Dumazet93f154b2009-05-18 22:19:19 -07001946 /*
1947		 * If the device doesn't need skb->dst, release it right now while
1948		 * it's hot in this cpu's cache
1949 */
Eric Dumazetadf30902009-06-02 05:19:30 +00001950 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1951 skb_dst_drop(skb);
1952
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001953 skb_orphan_try(skb);
David S. Miller9ccb8972010-04-22 01:02:07 -07001954
1955 if (netif_needs_gso(dev, skb)) {
1956 if (unlikely(dev_gso_segment(skb)))
1957 goto out_kfree_skb;
1958 if (skb->next)
1959 goto gso;
John Fastabend6afff0c2010-06-16 14:18:12 +00001960 } else {
1961 if (skb_needs_linearize(skb, dev) &&
1962 __skb_linearize(skb))
1963 goto out_kfree_skb;
1964
1965 /* If packet is not checksummed and device does not
1966 * support checksumming for this protocol, complete
1967 * checksumming here.
1968 */
1969 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1970 skb_set_transport_header(skb, skb->csum_start -
1971 skb_headroom(skb));
1972 if (!dev_can_checksum(dev, skb) &&
1973 skb_checksum_help(skb))
1974 goto out_kfree_skb;
1975 }
David S. Miller9ccb8972010-04-22 01:02:07 -07001976 }
1977
Patrick Ohlyac45f602009-02-12 05:03:37 +00001978 rc = ops->ndo_start_xmit(skb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001979 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07001980 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00001981 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001982 }
1983
Herbert Xu576a30e2006-06-27 13:22:38 -07001984gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001985 do {
1986 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001987
1988 skb->next = nskb->next;
1989 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00001990
1991 /*
1992		 * If the device doesn't need nskb->dst, release it right now while
1993		 * it's hot in this cpu's cache
1994 */
1995 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1996 skb_dst_drop(nskb);
1997
Stephen Hemminger00829822008-11-20 20:14:53 -08001998 rc = ops->ndo_start_xmit(nskb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001999 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002000 if (rc & ~NETDEV_TX_MASK)
2001 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07002002 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002003 skb->next = nskb;
2004 return rc;
2005 }
Eric Dumazet08baf562009-05-25 22:58:01 -07002006 txq_trans_update(txq);
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002007 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07002008 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002009 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002010
Patrick McHardy572a9d72009-11-10 06:14:14 +00002011out_kfree_gso_skb:
2012 if (likely(skb->next == NULL))
2013 skb->destructor = DEV_GSO_CB(skb)->destructor;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002014out_kfree_skb:
2015 kfree_skb(skb);
Patrick McHardy572a9d72009-11-10 06:14:14 +00002016 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002017}
2018
Tom Herbert0a9627f2010-03-16 08:03:29 +00002019static u32 hashrnd __read_mostly;
David S. Millerb6b2fed2008-07-21 09:48:06 -07002020
Stephen Hemminger92477442009-03-21 13:39:26 -07002021u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
David S. Miller8f0f2222008-07-15 03:47:03 -07002022{
David S. Miller70192982009-01-27 16:34:47 -08002023 u32 hash;
David S. Millerb6b2fed2008-07-21 09:48:06 -07002024
David S. Miller513de112009-05-03 14:43:10 -07002025 if (skb_rx_queue_recorded(skb)) {
2026 hash = skb_get_rx_queue(skb);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002027 while (unlikely(hash >= dev->real_num_tx_queues))
David S. Miller513de112009-05-03 14:43:10 -07002028 hash -= dev->real_num_tx_queues;
2029 return hash;
2030 }
Eric Dumazetec581f62009-05-01 09:05:06 -07002031
2032 if (skb->sk && skb->sk->sk_hash)
David S. Miller70192982009-01-27 16:34:47 -08002033 hash = skb->sk->sk_hash;
Eric Dumazetec581f62009-05-01 09:05:06 -07002034 else
Eric Dumazet87fd3082010-07-13 05:24:20 +00002035 hash = (__force u16) skb->protocol ^ skb->rxhash;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002036 hash = jhash_1word(hash, hashrnd);
David S. Millerd5a9e242009-01-27 16:22:11 -08002037
David S. Millerb6b2fed2008-07-21 09:48:06 -07002038 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
David S. Miller8f0f2222008-07-15 03:47:03 -07002039}
Stephen Hemminger92477442009-03-21 13:39:26 -07002040EXPORT_SYMBOL(skb_tx_hash);
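/*
 * Editor's example (assumed names): a multiqueue driver with no special
 * steering policy can implement ndo_select_queue() directly on top of
 * skb_tx_hash(), keeping each flow on a single queue while spreading
 * distinct flows across all real TX queues.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
        return skb_tx_hash(dev, skb);
}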
David S. Miller8f0f2222008-07-15 03:47:03 -07002041
Eric Dumazeted046422009-11-13 21:54:04 +00002042static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2043{
2044 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2045 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00002046 pr_warning("%s selects TX queue %d, but "
2047 "real number of TX queues is %d\n",
2048 dev->name, queue_index, dev->real_num_tx_queues);
Eric Dumazeted046422009-11-13 21:54:04 +00002049 }
2050 return 0;
2051 }
2052 return queue_index;
2053}
2054
David S. Millere8a04642008-07-17 00:34:19 -07002055static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2056 struct sk_buff *skb)
2057{
Tom Herbertb0f77d02010-07-14 20:50:29 -07002058 int queue_index;
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002059 struct sock *sk = skb->sk;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002060
Tom Herbertb0f77d02010-07-14 20:50:29 -07002061 queue_index = sk_tx_queue_get(sk);
2062 if (queue_index < 0) {
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002063 const struct net_device_ops *ops = dev->netdev_ops;
2064
2065 if (ops->ndo_select_queue) {
2066 queue_index = ops->ndo_select_queue(dev, skb);
Eric Dumazeted046422009-11-13 21:54:04 +00002067 queue_index = dev_cap_txqueue(dev, queue_index);
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002068 } else {
2069 queue_index = 0;
2070 if (dev->real_num_tx_queues > 1)
2071 queue_index = skb_tx_hash(dev, skb);
2072
Eric Dumazet8728c542010-04-11 21:18:17 +00002073 if (sk) {
David S. Miller87eb3672010-04-21 01:14:25 -07002074 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
Eric Dumazet8728c542010-04-11 21:18:17 +00002075
2076 if (dst && skb_dst(skb) == dst)
2077 sk_tx_queue_set(sk, queue_index);
2078 }
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002079 }
2080 }
David S. Millereae792b2008-07-15 03:03:33 -07002081
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002082 skb_set_queue_mapping(skb, queue_index);
2083 return netdev_get_tx_queue(dev, queue_index);
David S. Millere8a04642008-07-17 00:34:19 -07002084}
2085
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002086static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2087 struct net_device *dev,
2088 struct netdev_queue *txq)
2089{
2090 spinlock_t *root_lock = qdisc_lock(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002091 bool contended = qdisc_is_running(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002092 int rc;
2093
Eric Dumazet79640a42010-06-02 05:09:29 -07002094 /*
2095 * Heuristic to force contended enqueues to serialize on a
2096 * separate lock before trying to get qdisc main lock.
2097	 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2098 * and dequeue packets faster.
2099 */
2100 if (unlikely(contended))
2101 spin_lock(&q->busylock);
2102
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002103 spin_lock(root_lock);
2104 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2105 kfree_skb(skb);
2106 rc = NET_XMIT_DROP;
2107 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
Eric Dumazetbc135b22010-06-02 03:23:51 -07002108 qdisc_run_begin(q)) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002109 /*
2110 * This is a work-conserving queue; there are no old skbs
2111 * waiting to be sent out; and the qdisc is not running -
2112 * xmit the skb directly.
2113 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002114 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2115 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002116 __qdisc_update_bstats(q, skb->len);
Eric Dumazet79640a42010-06-02 05:09:29 -07002117 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2118 if (unlikely(contended)) {
2119 spin_unlock(&q->busylock);
2120 contended = false;
2121 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002122 __qdisc_run(q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002123 } else
Eric Dumazetbc135b22010-06-02 03:23:51 -07002124 qdisc_run_end(q);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002125
2126 rc = NET_XMIT_SUCCESS;
2127 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002128 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002129 rc = qdisc_enqueue_root(skb, q);
Eric Dumazet79640a42010-06-02 05:09:29 -07002130 if (qdisc_run_begin(q)) {
2131 if (unlikely(contended)) {
2132 spin_unlock(&q->busylock);
2133 contended = false;
2134 }
2135 __qdisc_run(q);
2136 }
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002137 }
2138 spin_unlock(root_lock);
Eric Dumazet79640a42010-06-02 05:09:29 -07002139 if (unlikely(contended))
2140 spin_unlock(&q->busylock);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002141 return rc;
2142}
2143
Dave Jonesd29f7492008-07-22 14:09:06 -07002144/**
2145 * dev_queue_xmit - transmit a buffer
2146 * @skb: buffer to transmit
2147 *
2148 * Queue a buffer for transmission to a network device. The caller must
2149 * have set the device and priority and built the buffer before calling
2150 * this function. The function can be called from an interrupt.
2151 *
2152 * A negative errno code is returned on a failure. A success does not
2153 * guarantee the frame will be transmitted as it may be dropped due
2154 * to congestion or traffic shaping.
2155 *
2156 * -----------------------------------------------------------------------------------
2157 * I notice this method can also return errors from the queue disciplines,
2158 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2159 * be positive.
2160 *
2161 * Regardless of the return value, the skb is consumed, so it is currently
2162 * difficult to retry a send to this method. (You can bump the ref count
2163 * before sending to hold a reference for retry if you are careful.)
2164 *
2165 * When calling this method, interrupts MUST be enabled. This is because
2166 * the BH enable code must have IRQs enabled so that it will not deadlock.
2167 * --BLG
2168 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002169int dev_queue_xmit(struct sk_buff *skb)
2170{
2171 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002172 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002173 struct Qdisc *q;
2174 int rc = -ENOMEM;
2175
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002176 /* Disable soft irqs for various locks below. Also
2177 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002178 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002179 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002180
David S. Millereae792b2008-07-15 03:03:33 -07002181 txq = dev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002182 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002183
Linus Torvalds1da177e2005-04-16 15:20:36 -07002184#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002185 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002186#endif
2187 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002188 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002189 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002190 }
2191
2192 /* The device has no queue. Common case for software devices:
2193	   loopback, all sorts of tunnels...
2194
Herbert Xu932ff272006-06-09 12:20:56 -07002195 Really, it is unlikely that netif_tx_lock protection is necessary
2196	   here. (e.g. loopback and IP tunnels are clean, ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002197 counters.)
2198	   However, it is possible that they rely on the protection
2199	   provided by us here.
2200
2201	   Check this and take the lock. It is not prone to deadlocks.
2202	   Alternatively, take the noqueue qdisc, which is even simpler 8)
2203 */
2204 if (dev->flags & IFF_UP) {
2205 int cpu = smp_processor_id(); /* ok because BHs are off */
2206
David S. Millerc773e842008-07-08 23:13:53 -07002207 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208
David S. Millerc773e842008-07-08 23:13:53 -07002209 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002211 if (!netif_tx_queue_stopped(txq)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002212 rc = dev_hard_start_xmit(skb, dev, txq);
2213 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002214 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002215 goto out;
2216 }
2217 }
David S. Millerc773e842008-07-08 23:13:53 -07002218 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002219 if (net_ratelimit())
2220 printk(KERN_CRIT "Virtual device %s asks to "
2221 "queue packet!\n", dev->name);
2222 } else {
2223			/* Recursion detected! It is possible,
2224 * unfortunately */
2225 if (net_ratelimit())
2226 printk(KERN_CRIT "Dead loop on virtual device "
2227 "%s, fix it urgently!\n", dev->name);
2228 }
2229 }
2230
2231 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002232 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002233
Linus Torvalds1da177e2005-04-16 15:20:36 -07002234 kfree_skb(skb);
2235 return rc;
2236out:
Herbert Xud4828d82006-06-22 02:28:18 -07002237 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002238 return rc;
2239}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002240EXPORT_SYMBOL(dev_queue_xmit);
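/*
 * Editor's example (illustrative sketch; assumes data points at a complete
 * link-layer frame, and ETH_P_802_3 is only a stand-in protocol value):
 * handing a self-built frame to the stack. The skb is consumed either way.
 */
static int example_xmit_raw(struct net_device *dev, const void *data,
                            unsigned int len)
{
        struct sk_buff *skb = alloc_skb(len + LL_RESERVED_SPACE(dev),
                                        GFP_ATOMIC);

        if (!skb)
                return -ENOMEM;
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        memcpy(skb_put(skb, len), data, len);
        skb->dev = dev;
        skb->protocol = htons(ETH_P_802_3);
        return dev_queue_xmit(skb);     /* may return positive NET_XMIT_* codes */
}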
Linus Torvalds1da177e2005-04-16 15:20:36 -07002241
2242
2243/*=======================================================================
2244 Receiver routines
2245 =======================================================================*/
2246
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002247int netdev_max_backlog __read_mostly = 1000;
Eric Dumazet3b098e22010-05-15 23:57:10 -07002248int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002249int netdev_budget __read_mostly = 300;
2250int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002251
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002252/* Called with irq disabled */
2253static inline void ____napi_schedule(struct softnet_data *sd,
2254 struct napi_struct *napi)
2255{
2256 list_add_tail(&napi->poll_list, &sd->poll_list);
2257 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2258}
2259
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002260/*
2261 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2262 * and src/dst port numbers. Returns a non-zero hash number on success
2263 * and 0 on failure.
2264 */
2265__u32 __skb_get_rxhash(struct sk_buff *skb)
2266{
Changli Gao12fcdef2010-08-17 19:04:32 +00002267 int nhoff, hash = 0, poff;
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002268 struct ipv6hdr *ip6;
2269 struct iphdr *ip;
2270 u8 ip_proto;
2271 u32 addr1, addr2, ihl;
2272 union {
2273 u32 v32;
2274 u16 v16[2];
2275 } ports;
2276
2277 nhoff = skb_network_offset(skb);
2278
2279 switch (skb->protocol) {
2280 case __constant_htons(ETH_P_IP):
2281 if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
2282 goto done;
2283
Changli Gao10034892010-08-21 06:13:28 +00002284 ip = (struct iphdr *) (skb->data + nhoff);
Changli Gaodbe57752010-08-17 19:01:38 +00002285 if (ip->frag_off & htons(IP_MF | IP_OFFSET))
2286 ip_proto = 0;
2287 else
2288 ip_proto = ip->protocol;
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002289 addr1 = (__force u32) ip->saddr;
2290 addr2 = (__force u32) ip->daddr;
2291 ihl = ip->ihl;
2292 break;
2293 case __constant_htons(ETH_P_IPV6):
2294 if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
2295 goto done;
2296
Changli Gao10034892010-08-21 06:13:28 +00002297 ip6 = (struct ipv6hdr *) (skb->data + nhoff);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002298 ip_proto = ip6->nexthdr;
2299 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2300 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
2301 ihl = (40 >> 2);
2302 break;
2303 default:
2304 goto done;
2305 }
2306
Changli Gao12fcdef2010-08-17 19:04:32 +00002307 ports.v32 = 0;
2308 poff = proto_ports_offset(ip_proto);
2309 if (poff >= 0) {
2310 nhoff += ihl * 4 + poff;
2311 if (pskb_may_pull(skb, nhoff + 4)) {
2312 ports.v32 = * (__force u32 *) (skb->data + nhoff);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002313 if (ports.v16[1] < ports.v16[0])
2314 swap(ports.v16[0], ports.v16[1]);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002315 }
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002316 }
2317
2318 /* get a consistent hash (same value on both flow directions) */
2319 if (addr2 < addr1)
2320 swap(addr1, addr2);
2321
2322 hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
2323 if (!hash)
2324 hash = 1;
2325
2326done:
2327 return hash;
2328}
2329EXPORT_SYMBOL(__skb_get_rxhash);
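/*
 * Editor's example (sketch, assumed names): callers normally go through
 * the skb_get_rxhash() wrapper, which computes the hash once and caches
 * it in skb->rxhash. Scaling by multiply-and-shift avoids a modulo, the
 * same trick skb_tx_hash() uses above.
 */
static u32 example_flow_bucket(struct sk_buff *skb, u32 nr_buckets)
{
        u32 hash = skb_get_rxhash(skb); /* 0 if the headers were unparseable */

        return (u32)(((u64)hash * nr_buckets) >> 32);
}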
2330
Eric Dumazetdf334542010-03-24 19:13:54 +00002331#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002332
2333/* One global table that all flow-based protocols share. */
Eric Dumazet8770acf2010-04-17 00:54:36 -07002334struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002335EXPORT_SYMBOL(rps_sock_flow_table);
2336
Tom Herbert0a9627f2010-03-16 08:03:29 +00002337/*
2338 * get_rps_cpu is called from netif_receive_skb and returns the target
2339 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002340 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002341 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002342static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2343 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002344{
Tom Herbert0a9627f2010-03-16 08:03:29 +00002345 struct netdev_rx_queue *rxqueue;
2346 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002347 struct rps_dev_flow_table *flow_table;
2348 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002349 int cpu = -1;
Tom Herbertfec5e652010-04-16 16:01:27 -07002350 u16 tcpu;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002351
Tom Herbert0a9627f2010-03-16 08:03:29 +00002352 if (skb_rx_queue_recorded(skb)) {
2353 u16 index = skb_get_rx_queue(skb);
2354 if (unlikely(index >= dev->num_rx_queues)) {
Tim Gardner08c801f2010-06-08 17:51:27 -06002355 WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
2356 "on queue %u, but number of RX queues is %u\n",
2357 dev->name, index, dev->num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002358 goto done;
2359 }
2360 rxqueue = dev->_rx + index;
2361 } else
2362 rxqueue = dev->_rx;
2363
Tom Herbertfec5e652010-04-16 16:01:27 -07002364 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002365 goto done;
2366
Changli Gao2d47b452010-08-17 19:00:56 +00002367 skb_reset_network_header(skb);
Krishna Kumarbfb564e2010-08-04 06:15:52 +00002368 if (!skb_get_rxhash(skb))
Tom Herbert0a9627f2010-03-16 08:03:29 +00002369 goto done;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002370
Tom Herbertfec5e652010-04-16 16:01:27 -07002371 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2372 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2373 if (flow_table && sock_flow_table) {
2374 u16 next_cpu;
2375 struct rps_dev_flow *rflow;
2376
2377 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2378 tcpu = rflow->cpu;
2379
2380 next_cpu = sock_flow_table->ents[skb->rxhash &
2381 sock_flow_table->mask];
2382
2383 /*
2384 * If the desired CPU (where last recvmsg was done) is
2385 * different from current CPU (one in the rx-queue flow
2386 * table entry), switch if one of the following holds:
2387 * - Current CPU is unset (equal to RPS_NO_CPU).
2388 * - Current CPU is offline.
2389 * - The current CPU's queue tail has advanced beyond the
2390 * last packet that was enqueued using this table entry.
2391 * This guarantees that all previous packets for the flow
2392 * have been dequeued, thus preserving in order delivery.
2393 */
2394 if (unlikely(tcpu != next_cpu) &&
2395 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2396 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2397 rflow->last_qtail)) >= 0)) {
2398 tcpu = rflow->cpu = next_cpu;
2399 if (tcpu != RPS_NO_CPU)
2400 rflow->last_qtail = per_cpu(softnet_data,
2401 tcpu).input_queue_head;
2402 }
2403 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2404 *rflowp = rflow;
2405 cpu = tcpu;
2406 goto done;
2407 }
2408 }
2409
Tom Herbert0a9627f2010-03-16 08:03:29 +00002410 map = rcu_dereference(rxqueue->rps_map);
2411 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07002412 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002413
2414 if (cpu_online(tcpu)) {
2415 cpu = tcpu;
2416 goto done;
2417 }
2418 }
2419
2420done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00002421 return cpu;
2422}
2423
Tom Herbert0a9627f2010-03-16 08:03:29 +00002424/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002425static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002426{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002427 struct softnet_data *sd = data;
2428
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002429 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00002430 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002431}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002432
Tom Herbertfec5e652010-04-16 16:01:27 -07002433#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00002434
2435/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002436 * Check if this softnet_data structure belongs to another cpu.
2437 * If yes, queue it to our IPI list and return 1;
2438 * if no, return 0.
2439 */
2440static int rps_ipi_queued(struct softnet_data *sd)
2441{
2442#ifdef CONFIG_RPS
2443 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2444
2445 if (sd != mysd) {
2446 sd->rps_ipi_next = mysd->rps_ipi_list;
2447 mysd->rps_ipi_list = sd;
2448
2449 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2450 return 1;
2451 }
2452#endif /* CONFIG_RPS */
2453 return 0;
2454}
2455
2456/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00002457 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2458 * queue (may be a remote CPU queue).
2459 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002460static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2461 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002462{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002463 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002464 unsigned long flags;
2465
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002466 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002467
2468 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002469
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002470 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002471 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2472 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002473enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002474 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002475 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002476 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00002477 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002478 return NET_RX_SUCCESS;
2479 }
2480
Eric Dumazetebda37c22010-05-06 23:51:21 +00002481		/* Schedule NAPI for the backlog device.
2482		 * We can use a non-atomic operation since we own the queue lock.
2483 */
2484 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002485 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002486 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002487 }
2488 goto enqueue;
2489 }
2490
Changli Gaodee42872010-05-02 05:42:16 +00002491 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002492 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002493
Tom Herbert0a9627f2010-03-16 08:03:29 +00002494 local_irq_restore(flags);
2495
2496 kfree_skb(skb);
2497 return NET_RX_DROP;
2498}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002499
Linus Torvalds1da177e2005-04-16 15:20:36 -07002500/**
2501 * netif_rx - post buffer to the network code
2502 * @skb: buffer to post
2503 *
2504 * This function receives a packet from a device driver and queues it for
2505 * the upper (protocol) levels to process. It always succeeds. The buffer
2506 * may be dropped during processing for congestion control or by the
2507 * protocol layers.
2508 *
2509 * return values:
2510 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002511 * NET_RX_DROP (packet was dropped)
2512 *
2513 */
2514
2515int netif_rx(struct sk_buff *skb)
2516{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002517 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002518
2519 /* if netpoll wants it, pretend we never saw it */
2520 if (netpoll_rx(skb))
2521 return NET_RX_DROP;
2522
Eric Dumazet3b098e22010-05-15 23:57:10 -07002523 if (netdev_tstamp_prequeue)
2524 net_timestamp_check(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002525
Eric Dumazetdf334542010-03-24 19:13:54 +00002526#ifdef CONFIG_RPS
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002527 {
Tom Herbertfec5e652010-04-16 16:01:27 -07002528 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002529 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002530
Changli Gaocece1942010-08-07 20:35:43 -07002531 preempt_disable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002532 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07002533
2534 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002535 if (cpu < 0)
2536 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07002537
2538 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2539
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002540 rcu_read_unlock();
Changli Gaocece1942010-08-07 20:35:43 -07002541 preempt_enable();
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002542 }
2543#else
Tom Herbertfec5e652010-04-16 16:01:27 -07002544 {
2545 unsigned int qtail;
2546 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2547 put_cpu();
2548 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002549#endif
2550 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002552EXPORT_SYMBOL(netif_rx);
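/*
 * Editor's example (hypothetical interrupt-driven driver; example_* names
 * are assumed): the RX interrupt path copies the frame into a fresh skb
 * and queues it with netif_rx(). A NAPI driver would instead call
 * netif_receive_skb() from its poll routine.
 */
static void example_rx_frame(struct net_device *dev, const void *data,
                             unsigned int len)
{
        struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }
        memcpy(skb_put(skb, len), data, len);
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev */
        netif_rx(skb);
}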
Linus Torvalds1da177e2005-04-16 15:20:36 -07002553
2554int netif_rx_ni(struct sk_buff *skb)
2555{
2556 int err;
2557
2558 preempt_disable();
2559 err = netif_rx(skb);
2560 if (local_softirq_pending())
2561 do_softirq();
2562 preempt_enable();
2563
2564 return err;
2565}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002566EXPORT_SYMBOL(netif_rx_ni);
2567
Linus Torvalds1da177e2005-04-16 15:20:36 -07002568static void net_tx_action(struct softirq_action *h)
2569{
2570 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2571
2572 if (sd->completion_queue) {
2573 struct sk_buff *clist;
2574
2575 local_irq_disable();
2576 clist = sd->completion_queue;
2577 sd->completion_queue = NULL;
2578 local_irq_enable();
2579
2580 while (clist) {
2581 struct sk_buff *skb = clist;
2582 clist = clist->next;
2583
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002584 WARN_ON(atomic_read(&skb->users));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002585 __kfree_skb(skb);
2586 }
2587 }
2588
2589 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07002590 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002591
2592 local_irq_disable();
2593 head = sd->output_queue;
2594 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00002595 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002596 local_irq_enable();
2597
2598 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07002599 struct Qdisc *q = head;
2600 spinlock_t *root_lock;
2601
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602 head = head->next_sched;
2603
David S. Miller5fb66222008-08-02 20:02:43 -07002604 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07002605 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002606 smp_mb__before_clear_bit();
2607 clear_bit(__QDISC_STATE_SCHED,
2608 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07002609 qdisc_run(q);
2610 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002611 } else {
David S. Miller195648b2008-08-19 04:00:36 -07002612 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002613 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07002614 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002615 } else {
2616 smp_mb__before_clear_bit();
2617 clear_bit(__QDISC_STATE_SCHED,
2618 &q->state);
2619 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002620 }
2621 }
2622 }
2623}
2624
Stephen Hemminger6f05f622007-03-08 20:46:03 -08002625static inline int deliver_skb(struct sk_buff *skb,
2626 struct packet_type *pt_prev,
2627 struct net_device *orig_dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002628{
2629 atomic_inc(&skb->users);
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002630 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002631}
2632
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002633#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2634 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
Michał Mirosławda678292009-06-05 05:35:28 +00002635/* This hook is defined here for ATM LANE */
2636int (*br_fdb_test_addr_hook)(struct net_device *dev,
2637 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002638EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002639#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002640
Linus Torvalds1da177e2005-04-16 15:20:36 -07002641#ifdef CONFIG_NET_CLS_ACT
2642/* TODO: Maybe we should just force sch_ingress to be compiled in
2643 * when CONFIG_NET_CLS_ACT is? Otherwise we pay for some useless
2644 * instructions (a compare and two extra stores) when sch_ingress
2645 * is off but CONFIG_NET_CLS_ACT is on.
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002646 * NOTE: This doesn't stop any functionality; if you don't have
Linus Torvalds1da177e2005-04-16 15:20:36 -07002647 * the ingress scheduler, you just can't add policies on ingress.
2648 *
2649 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002650static int ing_filter(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002651{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002652 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002653 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07002654 struct netdev_queue *rxq;
2655 int result = TC_ACT_OK;
2656 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002657
Stephen Hemmingerde384832010-08-01 00:33:23 -07002658 if (unlikely(MAX_RED_LOOP < ttl++)) {
2659 if (net_ratelimit())
2660			pr_warning("Redir loop detected, dropping packet (%d->%d)\n",
2661 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07002662 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663 }
2664
Herbert Xuf697c3e2007-10-14 00:38:47 -07002665 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2666 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2667
David S. Miller555353c2008-07-08 17:33:13 -07002668 rxq = &dev->rx_queue;
2669
David S. Miller83874002008-07-17 00:53:03 -07002670 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07002671 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07002672 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07002673 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2674 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07002675 spin_unlock(qdisc_lock(q));
2676 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07002677
Linus Torvalds1da177e2005-04-16 15:20:36 -07002678 return result;
2679}
Herbert Xuf697c3e2007-10-14 00:38:47 -07002680
2681static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2682 struct packet_type **pt_prev,
2683 int *ret, struct net_device *orig_dev)
2684{
David S. Miller8d50b532008-07-30 02:37:46 -07002685 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07002686 goto out;
2687
2688 if (*pt_prev) {
2689 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2690 *pt_prev = NULL;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002691 }
2692
2693 switch (ing_filter(skb)) {
2694 case TC_ACT_SHOT:
2695 case TC_ACT_STOLEN:
2696 kfree_skb(skb);
2697 return NULL;
2698 }
2699
2700out:
2701 skb->tc_verd = 0;
2702 return skb;
2703}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002704#endif
2705
Patrick McHardybc1d0412008-07-14 22:49:30 -07002706/**
2707 * netif_nit_deliver - deliver received packets to network taps
2708 * @skb: buffer
2709 *
2710 * This function is used to deliver incoming packets to network
2711 * taps. It should be used when the normal netif_receive_skb path
2712 * is bypassed, for example because of VLAN acceleration.
2713 */
2714void netif_nit_deliver(struct sk_buff *skb)
2715{
2716 struct packet_type *ptype;
2717
2718 if (list_empty(&ptype_all))
2719 return;
2720
2721 skb_reset_network_header(skb);
2722 skb_reset_transport_header(skb);
2723 skb->mac_len = skb->network_header - skb->mac_header;
2724
2725 rcu_read_lock();
2726 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2727 if (!ptype->dev || ptype->dev == skb->dev)
2728 deliver_skb(skb, ptype, skb->dev);
2729 }
2730 rcu_read_unlock();
2731}
2732
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002733/**
2734 * netdev_rx_handler_register - register receive handler
2735 * @dev: device to register a handler for
2736 * @rx_handler: receive handler to register
Jiri Pirko93e2c322010-06-10 03:34:59 +00002737 * @rx_handler_data: data pointer that is used by rx handler
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002738 *
2739 * Register a receive handler for a device. This handler will then be
2740 * called from __netif_receive_skb. A negative errno code is returned
2741 * on a failure.
2742 *
2743 * The caller must hold the rtnl_mutex.
2744 */
2745int netdev_rx_handler_register(struct net_device *dev,
Jiri Pirko93e2c322010-06-10 03:34:59 +00002746 rx_handler_func_t *rx_handler,
2747 void *rx_handler_data)
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002748{
2749 ASSERT_RTNL();
2750
2751 if (dev->rx_handler)
2752 return -EBUSY;
2753
Jiri Pirko93e2c322010-06-10 03:34:59 +00002754 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002755 rcu_assign_pointer(dev->rx_handler, rx_handler);
2756
2757 return 0;
2758}
2759EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
2760
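/*
 * Usage sketch (illustrative only, not part of this file): a stacking
 * driver such as a bridge or macvlan attaches its handler to a lower
 * device under RTNL. The handler either steals the skb and returns NULL,
 * or returns an skb for the normal receive path to continue with. All
 * "myport" names below are hypothetical.
 *
 *	static struct sk_buff *myport_handle_frame(struct sk_buff *skb)
 *	{
 *		struct myport *port = rcu_dereference(skb->dev->rx_handler_data);
 *
 *		if (myport_consume(port, skb))
 *			return NULL;	// stolen: __netif_receive_skb stops here
 *		return skb;		// not ours: let normal processing continue
 *	}
 *
 *	rtnl_lock();
 *	err = netdev_rx_handler_register(lower_dev, myport_handle_frame, port);
 *	rtnl_unlock();
 */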
2761/**
2762 * netdev_rx_handler_unregister - unregister receive handler
2763 * @dev: device to unregister a handler from
2764 *
2765 * Unregister a receive handler from a device.
2766 *
2767 * The caller must hold the rtnl_mutex.
2768 */
2769void netdev_rx_handler_unregister(struct net_device *dev)
2770{
2771
2772 ASSERT_RTNL();
2773 rcu_assign_pointer(dev->rx_handler, NULL);
Jiri Pirko93e2c322010-06-10 03:34:59 +00002774 rcu_assign_pointer(dev->rx_handler_data, NULL);
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002775}
2776EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
2777
Eric Dumazetacbbc072010-04-11 06:56:11 +00002778static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2779 struct net_device *master)
2780{
2781 if (skb->pkt_type == PACKET_HOST) {
2782 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2783
2784 memcpy(dest, master->dev_addr, ETH_ALEN);
2785 }
2786}
2787
2788/* On bonding slaves other than the currently active slave, suppress
2789 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2790 * ARP on active-backup slaves with arp_validate enabled.
2791 */
2792int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2793{
2794 struct net_device *dev = skb->dev;
2795
2796 if (master->priv_flags & IFF_MASTER_ARPMON)
2797 dev->last_rx = jiffies;
2798
Jiri Pirkof350a0a82010-06-15 06:50:45 +00002799 if ((master->priv_flags & IFF_MASTER_ALB) &&
2800 (master->priv_flags & IFF_BRIDGE_PORT)) {
Eric Dumazetacbbc072010-04-11 06:56:11 +00002801 /* Do address unmangle. The local destination address
2802		 * will always be the one the master has. Provides the right
2803 * functionality in a bridge.
2804 */
2805 skb_bond_set_mac_by_master(skb, master);
2806 }
2807
2808 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2809 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2810 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2811 return 0;
2812
2813 if (master->priv_flags & IFF_MASTER_ALB) {
2814 if (skb->pkt_type != PACKET_BROADCAST &&
2815 skb->pkt_type != PACKET_MULTICAST)
2816 return 0;
2817 }
2818 if (master->priv_flags & IFF_MASTER_8023AD &&
2819 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2820 return 0;
2821
2822 return 1;
2823 }
2824 return 0;
2825}
2826EXPORT_SYMBOL(__skb_bond_should_drop);
2827
Eric Dumazet10f744d2010-03-28 23:07:20 -07002828static int __netif_receive_skb(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002829{
2830 struct packet_type *ptype, *pt_prev;
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002831 rx_handler_func_t *rx_handler;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002832 struct net_device *orig_dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002833 struct net_device *master;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002834 struct net_device *null_or_orig;
John Fastabend2df4a0f2010-05-12 21:31:11 +00002835 struct net_device *orig_or_bond;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002836 int ret = NET_RX_DROP;
Al Viro252e33462006-11-14 20:48:11 -08002837 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002838
Eric Dumazet3b098e22010-05-15 23:57:10 -07002839 if (!netdev_tstamp_prequeue)
2840 net_timestamp_check(skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07002841
Changli Gao05532122010-08-22 21:03:33 -07002842 if (vlan_tx_tag_present(skb))
2843 vlan_hwaccel_do_receive(skb);
Patrick McHardy9b22ea52008-11-04 14:49:57 -08002844
Linus Torvalds1da177e2005-04-16 15:20:36 -07002845 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002846 if (netpoll_receive_skb(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002847 return NET_RX_DROP;
2848
Eric Dumazet8964be42009-11-20 15:35:04 -08002849 if (!skb->skb_iif)
2850 skb->skb_iif = skb->dev->ifindex;
David S. Miller86e65da2005-08-09 19:36:29 -07002851
John Fastabend597a2642010-06-03 09:30:11 +00002852 /*
2853 * bonding note: skbs received on inactive slaves should only
2854 * be delivered to pkt handlers that are exact matches. Also
2855 * the deliver_no_wcard flag will be set. If packet handlers
2856	 * are sensitive to duplicate packets, these skbs will need to
2857 * be dropped at the handler. The vlan accel path may have
2858 * already set the deliver_no_wcard flag.
2859 */
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002860 null_or_orig = NULL;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002861 orig_dev = skb->dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002862 master = ACCESS_ONCE(orig_dev->master);
John Fastabend597a2642010-06-03 09:30:11 +00002863 if (skb->deliver_no_wcard)
2864 null_or_orig = orig_dev;
2865 else if (master) {
2866 if (skb_bond_should_drop(skb, master)) {
2867 skb->deliver_no_wcard = 1;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002868 null_or_orig = orig_dev; /* deliver only exact match */
John Fastabend597a2642010-06-03 09:30:11 +00002869 } else
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002870 skb->dev = master;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002871 }
Jay Vosburgh8f903c72006-02-21 16:36:44 -08002872
Eric Dumazet27f39c73e2010-05-19 22:07:23 +00002873 __this_cpu_inc(softnet_data.processed);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002874 skb_reset_network_header(skb);
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -03002875 skb_reset_transport_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07002876 skb->mac_len = skb->network_header - skb->mac_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002877
2878 pt_prev = NULL;
2879
2880 rcu_read_lock();
2881
2882#ifdef CONFIG_NET_CLS_ACT
2883 if (skb->tc_verd & TC_NCLS) {
2884 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2885 goto ncls;
2886 }
2887#endif
2888
2889 list_for_each_entry_rcu(ptype, &ptype_all, list) {
Joe Eykholtf9823072008-07-02 18:22:02 -07002890 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2891 ptype->dev == orig_dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002892 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002893 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002894 pt_prev = ptype;
2895 }
2896 }
2897
2898#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07002899 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2900 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002901 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002902ncls:
2903#endif
2904
Jiri Pirkoab95bfe2010-06-01 21:52:08 +00002905 /* Handle special case of bridge or macvlan */
2906 rx_handler = rcu_dereference(skb->dev->rx_handler);
2907 if (rx_handler) {
2908 if (pt_prev) {
2909 ret = deliver_skb(skb, pt_prev, orig_dev);
2910 pt_prev = NULL;
2911 }
2912 skb = rx_handler(skb);
2913 if (!skb)
2914 goto out;
2915 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002916
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002917 /*
2918 * Make sure frames received on VLAN interfaces stacked on
2919 * bonding interfaces still make their way to any base bonding
2920 * device that may have registered for a specific ptype. The
2921 * handler may have to adjust skb->dev and orig_dev.
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002922 */
John Fastabend2df4a0f2010-05-12 21:31:11 +00002923 orig_or_bond = orig_dev;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002924 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2925 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
John Fastabend2df4a0f2010-05-12 21:31:11 +00002926 orig_or_bond = vlan_dev_real_dev(skb->dev);
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002927 }
2928
Linus Torvalds1da177e2005-04-16 15:20:36 -07002929 type = skb->protocol;
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08002930 list_for_each_entry_rcu(ptype,
2931 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002932 if (ptype->type == type && (ptype->dev == null_or_orig ||
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002933 ptype->dev == skb->dev || ptype->dev == orig_dev ||
John Fastabend2df4a0f2010-05-12 21:31:11 +00002934 ptype->dev == orig_or_bond)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002935 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002936 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002937 pt_prev = ptype;
2938 }
2939 }
2940
2941 if (pt_prev) {
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002942 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002943 } else {
2944 kfree_skb(skb);
2945		/* Jamal, now you will not be able to escape explaining
2946		 * to me how you were going to use this. :-)
2947 */
2948 ret = NET_RX_DROP;
2949 }
2950
2951out:
2952 rcu_read_unlock();
2953 return ret;
2954}
Tom Herbert0a9627f2010-03-16 08:03:29 +00002955
2956/**
2957 * netif_receive_skb - process receive buffer from network
2958 * @skb: buffer to process
2959 *
2960 * netif_receive_skb() is the main receive data processing function.
2961 * It always succeeds. The buffer may be dropped during processing
2962 * for congestion control or by the protocol layers.
2963 *
2964 * This function may only be called from softirq context and interrupts
2965 * should be enabled.
2966 *
2967 * Return values (usually ignored):
2968 * NET_RX_SUCCESS: no congestion
2969 * NET_RX_DROP: packet was dropped
2970 */
2971int netif_receive_skb(struct sk_buff *skb)
2972{
Eric Dumazet3b098e22010-05-15 23:57:10 -07002973 if (netdev_tstamp_prequeue)
2974 net_timestamp_check(skb);
2975
Richard Cochranc1f19b52010-07-17 08:49:36 +00002976 if (skb_defer_rx_timestamp(skb))
2977 return NET_RX_SUCCESS;
2978
Eric Dumazetdf334542010-03-24 19:13:54 +00002979#ifdef CONFIG_RPS
Eric Dumazet3b098e22010-05-15 23:57:10 -07002980 {
2981 struct rps_dev_flow voidflow, *rflow = &voidflow;
2982 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002983
Eric Dumazet3b098e22010-05-15 23:57:10 -07002984 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00002985
Eric Dumazet3b098e22010-05-15 23:57:10 -07002986 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07002987
Eric Dumazet3b098e22010-05-15 23:57:10 -07002988 if (cpu >= 0) {
2989 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2990 rcu_read_unlock();
2991 } else {
2992 rcu_read_unlock();
2993 ret = __netif_receive_skb(skb);
2994 }
2995
2996 return ret;
Tom Herbertfec5e652010-04-16 16:01:27 -07002997 }
Tom Herbert1e94d722010-03-18 17:45:44 -07002998#else
2999 return __netif_receive_skb(skb);
3000#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00003001}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003002EXPORT_SYMBOL(netif_receive_skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003003
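/*
 * Usage sketch (illustrative only, not part of this file): a NAPI driver
 * calls netif_receive_skb() from its ->poll() callback, i.e. in softirq
 * context with interrupts enabled, once per completed RX descriptor. The
 * "mydrv" helper below is hypothetical.
 *
 *	while (work < budget) {
 *		struct sk_buff *skb = mydrv_next_rx_skb(priv);
 *
 *		if (!skb)
 *			break;
 *		skb->protocol = eth_type_trans(skb, priv->netdev);
 *		netif_receive_skb(skb);
 *		work++;
 *	}
 */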
Eric Dumazet88751272010-04-19 05:07:33 +00003004/* Network device is going away, flush any packets still pending
3005 * Called with irqs disabled.
3006 */
Changli Gao152102c2010-03-30 20:16:22 +00003007static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003008{
Changli Gao152102c2010-03-30 20:16:22 +00003009 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003010 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003011 struct sk_buff *skb, *tmp;
3012
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003013 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003014 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003015 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003016 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003017 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003018 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003019 }
Changli Gao6e7676c2010-04-27 15:07:33 -07003020 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00003021 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003022
3023 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3024 if (skb->dev == dev) {
3025 __skb_unlink(skb, &sd->process_queue);
3026 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003027 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07003028 }
3029 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07003030}
3031
Herbert Xud565b0a2008-12-15 23:38:52 -08003032static int napi_gro_complete(struct sk_buff *skb)
3033{
3034 struct packet_type *ptype;
3035 __be16 type = skb->protocol;
3036 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3037 int err = -ENOENT;
3038
Herbert Xufc59f9a2009-04-14 15:11:06 -07003039 if (NAPI_GRO_CB(skb)->count == 1) {
3040 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003041 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07003042 }
Herbert Xud565b0a2008-12-15 23:38:52 -08003043
3044 rcu_read_lock();
3045 list_for_each_entry_rcu(ptype, head, list) {
3046 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3047 continue;
3048
3049 err = ptype->gro_complete(skb);
3050 break;
3051 }
3052 rcu_read_unlock();
3053
3054 if (err) {
3055 WARN_ON(&ptype->list == head);
3056 kfree_skb(skb);
3057 return NET_RX_SUCCESS;
3058 }
3059
3060out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003061 return netif_receive_skb(skb);
3062}
3063
Eric Dumazet86cac582010-08-31 18:25:32 +00003064inline void napi_gro_flush(struct napi_struct *napi)
Herbert Xud565b0a2008-12-15 23:38:52 -08003065{
3066 struct sk_buff *skb, *next;
3067
3068 for (skb = napi->gro_list; skb; skb = next) {
3069 next = skb->next;
3070 skb->next = NULL;
3071 napi_gro_complete(skb);
3072 }
3073
Herbert Xu4ae55442009-02-08 18:00:36 +00003074 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003075 napi->gro_list = NULL;
3076}
Eric Dumazet86cac582010-08-31 18:25:32 +00003077EXPORT_SYMBOL(napi_gro_flush);
Herbert Xud565b0a2008-12-15 23:38:52 -08003078
Ben Hutchings5b252f02009-10-29 07:17:09 +00003079enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003080{
3081 struct sk_buff **pp = NULL;
3082 struct packet_type *ptype;
3083 __be16 type = skb->protocol;
3084 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
Herbert Xu0da2afd52008-12-26 14:57:42 -08003085 int same_flow;
Herbert Xud565b0a2008-12-15 23:38:52 -08003086 int mac_len;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003087 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003088
Jarek Poplawskice9e76c2010-08-05 01:19:11 +00003089 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
Herbert Xud565b0a2008-12-15 23:38:52 -08003090 goto normal;
3091
David S. Miller21dc3302010-08-23 00:13:46 -07003092 if (skb_is_gso(skb) || skb_has_frag_list(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003093 goto normal;
3094
Herbert Xud565b0a2008-12-15 23:38:52 -08003095 rcu_read_lock();
3096 list_for_each_entry_rcu(ptype, head, list) {
Herbert Xud565b0a2008-12-15 23:38:52 -08003097 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3098 continue;
3099
Herbert Xu86911732009-01-29 14:19:50 +00003100 skb_set_network_header(skb, skb_gro_offset(skb));
Herbert Xud565b0a2008-12-15 23:38:52 -08003101 mac_len = skb->network_header - skb->mac_header;
3102 skb->mac_len = mac_len;
3103 NAPI_GRO_CB(skb)->same_flow = 0;
3104 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003105 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003106
Herbert Xud565b0a2008-12-15 23:38:52 -08003107 pp = ptype->gro_receive(&napi->gro_list, skb);
3108 break;
3109 }
3110 rcu_read_unlock();
3111
3112 if (&ptype->list == head)
3113 goto normal;
3114
Herbert Xu0da2afd52008-12-26 14:57:42 -08003115 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003116 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd52008-12-26 14:57:42 -08003117
Herbert Xud565b0a2008-12-15 23:38:52 -08003118 if (pp) {
3119 struct sk_buff *nskb = *pp;
3120
3121 *pp = nskb->next;
3122 nskb->next = NULL;
3123 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003124 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003125 }
3126
Herbert Xu0da2afd52008-12-26 14:57:42 -08003127 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003128 goto ok;
3129
Herbert Xu4ae55442009-02-08 18:00:36 +00003130 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003131 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003132
Herbert Xu4ae55442009-02-08 18:00:36 +00003133 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003134 NAPI_GRO_CB(skb)->count = 1;
Herbert Xu86911732009-01-29 14:19:50 +00003135 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003136 skb->next = napi->gro_list;
3137 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003138 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003139
Herbert Xuad0f9902009-02-01 01:24:55 -08003140pull:
Herbert Xucb189782009-05-26 18:50:31 +00003141 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3142 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3143
3144 BUG_ON(skb->end - skb->tail < grow);
3145
3146 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3147
3148 skb->tail += grow;
3149 skb->data_len -= grow;
3150
3151 skb_shinfo(skb)->frags[0].page_offset += grow;
3152 skb_shinfo(skb)->frags[0].size -= grow;
3153
3154 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3155 put_page(skb_shinfo(skb)->frags[0].page);
3156 memmove(skb_shinfo(skb)->frags,
3157 skb_shinfo(skb)->frags + 1,
Jarek Poplawskie5093ae2010-08-11 02:02:10 +00003158 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
Herbert Xucb189782009-05-26 18:50:31 +00003159 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003160 }
3161
Herbert Xud565b0a2008-12-15 23:38:52 -08003162ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003163 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003164
3165normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003166 ret = GRO_NORMAL;
3167 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003168}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003169EXPORT_SYMBOL(dev_gro_receive);
3170
Eric Dumazet40d08022010-08-26 22:03:08 -07003171static inline gro_result_t
Ben Hutchings5b252f02009-10-29 07:17:09 +00003172__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003173{
3174 struct sk_buff *p;
3175
3176 for (p = napi->gro_list; p; p = p->next) {
Eric Dumazet40d08022010-08-26 22:03:08 -07003177 unsigned long diffs;
3178
3179 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3180 diffs |= compare_ether_header(skb_mac_header(p),
Joe Perchesf64f9e72009-11-29 16:55:45 -08003181 skb_gro_mac_header(skb));
Eric Dumazet40d08022010-08-26 22:03:08 -07003182 NAPI_GRO_CB(p)->same_flow = !diffs;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003183 NAPI_GRO_CB(p)->flush = 0;
3184 }
3185
3186 return dev_gro_receive(napi, skb);
3187}
Herbert Xu5d38a072009-01-04 16:13:40 -08003188
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003189gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003190{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003191 switch (ret) {
3192 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003193 if (netif_receive_skb(skb))
3194 ret = GRO_DROP;
3195 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003196
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003197 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003198 case GRO_MERGED_FREE:
Herbert Xu5d38a072009-01-04 16:13:40 -08003199 kfree_skb(skb);
3200 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003201
3202 case GRO_HELD:
3203 case GRO_MERGED:
3204 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003205 }
3206
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003207 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003208}
3209EXPORT_SYMBOL(napi_skb_finish);
3210
Herbert Xu78a478d2009-05-26 18:50:21 +00003211void skb_gro_reset_offset(struct sk_buff *skb)
3212{
3213 NAPI_GRO_CB(skb)->data_offset = 0;
3214 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003215 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003216
Herbert Xu78d3fd02009-05-26 18:50:23 +00003217 if (skb->mac_header == skb->tail &&
Herbert Xu74895942009-05-26 18:50:27 +00003218 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
Herbert Xu78a478d2009-05-26 18:50:21 +00003219 NAPI_GRO_CB(skb)->frag0 =
3220 page_address(skb_shinfo(skb)->frags[0].page) +
3221 skb_shinfo(skb)->frags[0].page_offset;
Herbert Xu74895942009-05-26 18:50:27 +00003222 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3223 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003224}
3225EXPORT_SYMBOL(skb_gro_reset_offset);
3226
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003227gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003228{
Herbert Xu86911732009-01-29 14:19:50 +00003229 skb_gro_reset_offset(skb);
3230
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003231 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003232}
3233EXPORT_SYMBOL(napi_gro_receive);
3234
Herbert Xu96e93ea2009-01-06 10:49:34 -08003235void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3236{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003237 __skb_pull(skb, skb_headlen(skb));
3238 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3239
3240 napi->skb = skb;
3241}
3242EXPORT_SYMBOL(napi_reuse_skb);
3243
Herbert Xu76620aa2009-04-16 02:02:07 -07003244struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003245{
Herbert Xu5d38a072009-01-04 16:13:40 -08003246 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003247
3248 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003249 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3250 if (skb)
3251 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003252 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003253 return skb;
3254}
Herbert Xu76620aa2009-04-16 02:02:07 -07003255EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003256
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003257gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3258 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003259{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003260 switch (ret) {
3261 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003262 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003263 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003264
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003265 if (ret == GRO_HELD)
3266 skb_gro_pull(skb, -ETH_HLEN);
3267 else if (netif_receive_skb(skb))
3268 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003269 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003270
3271 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003272 case GRO_MERGED_FREE:
3273 napi_reuse_skb(napi, skb);
3274 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003275
3276 case GRO_MERGED:
3277 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003278 }
3279
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003280 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003281}
3282EXPORT_SYMBOL(napi_frags_finish);
3283
Herbert Xu76620aa2009-04-16 02:02:07 -07003284struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003285{
Herbert Xu76620aa2009-04-16 02:02:07 -07003286 struct sk_buff *skb = napi->skb;
3287 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003288 unsigned int hlen;
3289 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003290
3291 napi->skb = NULL;
3292
3293 skb_reset_mac_header(skb);
3294 skb_gro_reset_offset(skb);
3295
Herbert Xua5b1cf22009-05-26 18:50:28 +00003296 off = skb_gro_offset(skb);
3297 hlen = off + sizeof(*eth);
3298 eth = skb_gro_header_fast(skb, off);
3299 if (skb_gro_header_hard(skb, hlen)) {
3300 eth = skb_gro_header_slow(skb, hlen, off);
3301 if (unlikely(!eth)) {
3302 napi_reuse_skb(napi, skb);
3303 skb = NULL;
3304 goto out;
3305 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003306 }
3307
3308 skb_gro_pull(skb, sizeof(*eth));
3309
3310 /*
3311 * This works because the only protocols we care about don't require
3312 * special handling. We'll fix it up properly at the end.
3313 */
3314 skb->protocol = eth->h_proto;
3315
3316out:
3317 return skb;
3318}
3319EXPORT_SYMBOL(napi_frags_skb);
3320
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003321gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003322{
3323 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003324
3325 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003326 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003327
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003328 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003329}
3330EXPORT_SYMBOL(napi_gro_frags);
3331
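/*
 * Usage sketch (illustrative only, not part of this file): drivers that
 * receive directly into pages use the frags interface: take the per-NAPI
 * skb, attach the page, account the length, and let GRO take over. The
 * page/offset/len bookkeeping below is a rough, hypothetical outline.
 *
 *	struct sk_buff *skb = napi_get_frags(&priv->napi);
 *
 *	if (!skb)
 *		return;		// allocation failure: drop the frame
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += len;
 *	napi_gro_frags(&priv->napi);
 */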
Eric Dumazete326bed2010-04-22 00:22:45 -07003332/*
3333 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3334 * Note: called with local irq disabled, but exits with local irq enabled.
3335 */
3336static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3337{
3338#ifdef CONFIG_RPS
3339 struct softnet_data *remsd = sd->rps_ipi_list;
3340
3341 if (remsd) {
3342 sd->rps_ipi_list = NULL;
3343
3344 local_irq_enable();
3345
3346 /* Send pending IPI's to kick RPS processing on remote cpus. */
3347 while (remsd) {
3348 struct softnet_data *next = remsd->rps_ipi_next;
3349
3350 if (cpu_online(remsd->cpu))
3351 __smp_call_function_single(remsd->cpu,
3352 &remsd->csd, 0);
3353 remsd = next;
3354 }
3355 } else
3356#endif
3357 local_irq_enable();
3358}
3359
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003360static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003361{
3362 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003363 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003364
Eric Dumazete326bed2010-04-22 00:22:45 -07003365#ifdef CONFIG_RPS
3366	/* Check if we have pending IPIs; it's better to send them now
3367	 * rather than waiting for net_rx_action() to end.
3368 */
3369 if (sd->rps_ipi_list) {
3370 local_irq_disable();
3371 net_rps_action_and_irq_enable(sd);
3372 }
3373#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003374 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07003375 local_irq_disable();
3376 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003377 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07003378 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003379
Changli Gao6e7676c2010-04-27 15:07:33 -07003380 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07003381 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07003382 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07003383 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00003384 input_queue_head_incr(sd);
3385 if (++work >= quota) {
3386 local_irq_enable();
3387 return work;
3388 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003389 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003390
Changli Gao6e7676c2010-04-27 15:07:33 -07003391 rps_lock(sd);
3392 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003393 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07003394 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3395 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003396
Changli Gao6e7676c2010-04-27 15:07:33 -07003397 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003398 /*
3399 * Inline a custom version of __napi_complete().
3400			 * Only the current cpu owns and manipulates this napi,
3401			 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
3402			 * so we can use a plain write instead of clear_bit(),
3403			 * and we don't need an smp_mb() memory barrier.
3404 */
3405 list_del(&napi->poll_list);
3406 napi->state = 0;
3407
Changli Gao6e7676c2010-04-27 15:07:33 -07003408 quota = work + qlen;
3409 }
3410 rps_unlock(sd);
3411 }
3412 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003413
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003414 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003415}
3416
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003417/**
3418 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003419 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003420 *
3421 * The entry's receive function will be scheduled to run
3422 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08003423void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003424{
3425 unsigned long flags;
3426
3427 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003428 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003429 local_irq_restore(flags);
3430}
3431EXPORT_SYMBOL(__napi_schedule);
3432
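/*
 * Usage sketch (illustrative only, not part of this file): an interrupt
 * handler usually masks its RX interrupt and schedules NAPI; the
 * napi_schedule() wrapper performs the napi_schedule_prep()/
 * __napi_schedule() pair shown here. The "mydrv" helpers are hypothetical.
 *
 *	static irqreturn_t mydrv_interrupt(int irq, void *dev_id)
 *	{
 *		struct mydrv_priv *priv = dev_id;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			mydrv_mask_rx_irq(priv);	// stop further RX irqs
 *			__napi_schedule(&priv->napi);	// run ->poll() from softirq
 *		}
 *		return IRQ_HANDLED;
 *	}
 */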
Herbert Xud565b0a2008-12-15 23:38:52 -08003433void __napi_complete(struct napi_struct *n)
3434{
3435 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3436 BUG_ON(n->gro_list);
3437
3438 list_del(&n->poll_list);
3439 smp_mb__before_clear_bit();
3440 clear_bit(NAPI_STATE_SCHED, &n->state);
3441}
3442EXPORT_SYMBOL(__napi_complete);
3443
3444void napi_complete(struct napi_struct *n)
3445{
3446 unsigned long flags;
3447
3448 /*
3449 * don't let napi dequeue from the cpu poll list
3450 * just in case its running on a different cpu
3451 */
3452 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3453 return;
3454
3455 napi_gro_flush(n);
3456 local_irq_save(flags);
3457 __napi_complete(n);
3458 local_irq_restore(flags);
3459}
3460EXPORT_SYMBOL(napi_complete);
3461
3462void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3463 int (*poll)(struct napi_struct *, int), int weight)
3464{
3465 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00003466 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003467 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08003468 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003469 napi->poll = poll;
3470 napi->weight = weight;
3471 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08003472 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08003473#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08003474 spin_lock_init(&napi->poll_lock);
3475 napi->poll_owner = -1;
3476#endif
3477 set_bit(NAPI_STATE_SCHED, &napi->state);
3478}
3479EXPORT_SYMBOL(netif_napi_add);
3480
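/*
 * Usage sketch (illustrative only, not part of this file): a driver
 * registers its poll callback once at probe time and enables it at open.
 * When the budget is not exhausted the callback completes NAPI and
 * re-enables its interrupt. The "mydrv" names are hypothetical.
 *
 *	static int mydrv_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct mydrv_priv *priv =
 *			container_of(napi, struct mydrv_priv, napi);
 *		int work = mydrv_clean_rx(priv, budget);
 *
 *		if (work < budget) {		// ring drained
 *			napi_complete(napi);
 *			mydrv_unmask_rx_irq(priv);
 *		}
 *		return work;
 *	}
 *
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, 64);	// at probe
 *	napi_enable(&priv->napi);				// at open
 */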
3481void netif_napi_del(struct napi_struct *napi)
3482{
3483 struct sk_buff *skb, *next;
3484
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08003485 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07003486 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08003487
3488 for (skb = napi->gro_list; skb; skb = next) {
3489 next = skb->next;
3490 skb->next = NULL;
3491 kfree_skb(skb);
3492 }
3493
3494 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00003495 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003496}
3497EXPORT_SYMBOL(netif_napi_del);
3498
Linus Torvalds1da177e2005-04-16 15:20:36 -07003499static void net_rx_action(struct softirq_action *h)
3500{
Eric Dumazete326bed2010-04-22 00:22:45 -07003501 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003502 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07003503 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07003504 void *have;
3505
Linus Torvalds1da177e2005-04-16 15:20:36 -07003506 local_irq_disable();
3507
Eric Dumazete326bed2010-04-22 00:22:45 -07003508 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003509 struct napi_struct *n;
3510 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003511
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003512		/* If the softirq window is exhausted then punt.
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003513		 * Allow this to run for 2 jiffies, which allows
3514		 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003515 */
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003516 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003517 goto softnet_break;
3518
3519 local_irq_enable();
3520
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003521 /* Even though interrupts have been re-enabled, this
3522 * access is safe because interrupts can only add new
3523 * entries to the tail of this list, and only ->poll()
3524 * calls can remove this head entry from the list.
3525 */
Eric Dumazete326bed2010-04-22 00:22:45 -07003526 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003527
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003528 have = netpoll_poll_lock(n);
3529
3530 weight = n->weight;
3531
David S. Miller0a7606c2007-10-29 21:28:47 -07003532 /* This NAPI_STATE_SCHED test is for avoiding a race
3533 * with netpoll's poll_napi(). Only the entity which
3534 * obtains the lock and sees NAPI_STATE_SCHED set will
3535 * actually make the ->poll() call. Therefore we avoid
3536		 * accidentally calling ->poll() when NAPI is not scheduled.
3537 */
3538 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00003539 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07003540 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00003541 trace_napi_poll(n);
3542 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003543
3544 WARN_ON_ONCE(work > weight);
3545
3546 budget -= work;
3547
3548 local_irq_disable();
3549
3550 /* Drivers must not modify the NAPI state if they
3551 * consume the entire weight. In such cases this code
3552 * still "owns" the NAPI instance and therefore can
3553 * move the instance around on the list at-will.
3554 */
David S. Millerfed17f32008-01-07 21:00:40 -08003555 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07003556 if (unlikely(napi_disable_pending(n))) {
3557 local_irq_enable();
3558 napi_complete(n);
3559 local_irq_disable();
3560 } else
Eric Dumazete326bed2010-04-22 00:22:45 -07003561 list_move_tail(&n->poll_list, &sd->poll_list);
David S. Millerfed17f32008-01-07 21:00:40 -08003562 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003563
3564 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003565 }
3566out:
Eric Dumazete326bed2010-04-22 00:22:45 -07003567 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003568
Chris Leechdb217332006-06-17 21:24:58 -07003569#ifdef CONFIG_NET_DMA
3570 /*
3571 * There may not be any more sk_buffs coming right now, so push
3572 * any pending DMA copies to hardware
3573 */
Dan Williams2ba05622009-01-06 11:38:14 -07003574 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07003575#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003576
Linus Torvalds1da177e2005-04-16 15:20:36 -07003577 return;
3578
3579softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00003580 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003581 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3582 goto out;
3583}
3584
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003585static gifconf_func_t *gifconf_list[NPROTO];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003586
3587/**
3588 * register_gifconf - register a SIOCGIF handler
3589 * @family: Address family
3590 * @gifconf: Function handler
3591 *
3592 * Register protocol-dependent address dumping routines. The handler
3593 * that is passed must not be freed or reused until it has been replaced
3594 * by another handler.
3595 */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003596int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003597{
3598 if (family >= NPROTO)
3599 return -EINVAL;
3600 gifconf_list[family] = gifconf;
3601 return 0;
3602}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003603EXPORT_SYMBOL(register_gifconf);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003604
3605
3606/*
3607 * Map an interface index to its name (SIOCGIFNAME)
3608 */
3609
3610/*
3611 * We need this ioctl for efficient implementation of the
3612 * if_indextoname() function required by the IPv6 API. Without
3613 * it, we would have to search all the interfaces to find a
3614 * match. --pb
3615 */
3616
Eric W. Biederman881d9662007-09-17 11:56:21 -07003617static int dev_ifname(struct net *net, struct ifreq __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003618{
3619 struct net_device *dev;
3620 struct ifreq ifr;
3621
3622 /*
3623 * Fetch the caller's info block.
3624 */
3625
3626 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3627 return -EFAULT;
3628
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003629 rcu_read_lock();
3630 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003631 if (!dev) {
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003632 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003633 return -ENODEV;
3634 }
3635
3636 strcpy(ifr.ifr_name, dev->name);
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003637 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003638
3639 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3640 return -EFAULT;
3641 return 0;
3642}
3643
3644/*
3645 * Perform a SIOCGIFCONF call. This structure will change
3646 * size eventually, and there is nothing I can do about it.
3647 * Thus we will need a 'compatibility mode'.
3648 */
3649
Eric W. Biederman881d9662007-09-17 11:56:21 -07003650static int dev_ifconf(struct net *net, char __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003651{
3652 struct ifconf ifc;
3653 struct net_device *dev;
3654 char __user *pos;
3655 int len;
3656 int total;
3657 int i;
3658
3659 /*
3660 * Fetch the caller's info block.
3661 */
3662
3663 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3664 return -EFAULT;
3665
3666 pos = ifc.ifc_buf;
3667 len = ifc.ifc_len;
3668
3669 /*
3670 * Loop over the interfaces, and write an info block for each.
3671 */
3672
3673 total = 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003674 for_each_netdev(net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003675 for (i = 0; i < NPROTO; i++) {
3676 if (gifconf_list[i]) {
3677 int done;
3678 if (!pos)
3679 done = gifconf_list[i](dev, NULL, 0);
3680 else
3681 done = gifconf_list[i](dev, pos + total,
3682 len - total);
3683 if (done < 0)
3684 return -EFAULT;
3685 total += done;
3686 }
3687 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003688 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003689
3690 /*
3691 * All done. Write the updated control block back to the caller.
3692 */
3693 ifc.ifc_len = total;
3694
3695 /*
3696 * Both BSD and Solaris return 0 here, so we do too.
3697 */
3698 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3699}
3700
3701#ifdef CONFIG_PROC_FS
3702/*
3703 * This is invoked by the /proc filesystem handler to display a device
3704 * in detail.
3705 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003706void *dev_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003707 __acquires(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003708{
Denis V. Luneve372c412007-11-19 22:31:54 -08003709 struct net *net = seq_file_net(seq);
Pavel Emelianov7562f872007-05-03 15:13:45 -07003710 loff_t off;
3711 struct net_device *dev;
3712
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003713 rcu_read_lock();
Pavel Emelianov7562f872007-05-03 15:13:45 -07003714 if (!*pos)
3715 return SEQ_START_TOKEN;
3716
3717 off = 1;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003718 for_each_netdev_rcu(net, dev)
Pavel Emelianov7562f872007-05-03 15:13:45 -07003719 if (off++ == *pos)
3720 return dev;
3721
3722 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003723}
3724
3725void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3726{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003727 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3728 first_net_device(seq_file_net(seq)) :
3729 next_net_device((struct net_device *)v);
3730
Linus Torvalds1da177e2005-04-16 15:20:36 -07003731 ++*pos;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003732 return rcu_dereference(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003733}
3734
3735void dev_seq_stop(struct seq_file *seq, void *v)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003736 __releases(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003737{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003738 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003739}
3740
3741static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3742{
Eric Dumazet28172732010-07-07 14:58:56 -07003743 struct rtnl_link_stats64 temp;
3744 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003745
Ben Hutchingsbe1f3c22010-06-08 07:19:54 +00003746 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3747 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
Rusty Russell5a1b5892007-04-28 21:04:03 -07003748 dev->name, stats->rx_bytes, stats->rx_packets,
3749 stats->rx_errors,
3750 stats->rx_dropped + stats->rx_missed_errors,
3751 stats->rx_fifo_errors,
3752 stats->rx_length_errors + stats->rx_over_errors +
3753 stats->rx_crc_errors + stats->rx_frame_errors,
3754 stats->rx_compressed, stats->multicast,
3755 stats->tx_bytes, stats->tx_packets,
3756 stats->tx_errors, stats->tx_dropped,
3757 stats->tx_fifo_errors, stats->collisions,
3758 stats->tx_carrier_errors +
3759 stats->tx_aborted_errors +
3760 stats->tx_window_errors +
3761 stats->tx_heartbeat_errors,
3762 stats->tx_compressed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003763}
3764
3765/*
3766 * Called from the PROCfs module. This now uses the new arbitrary sized
3767 * /proc/net interface to create /proc/net/dev
3768 */
3769static int dev_seq_show(struct seq_file *seq, void *v)
3770{
3771 if (v == SEQ_START_TOKEN)
3772 seq_puts(seq, "Inter-| Receive "
3773 " | Transmit\n"
3774 " face |bytes packets errs drop fifo frame "
3775 "compressed multicast|bytes packets errs "
3776 "drop fifo colls carrier compressed\n");
3777 else
3778 dev_seq_printf_stats(seq, v);
3779 return 0;
3780}
3781
Changli Gaodee42872010-05-02 05:42:16 +00003782static struct softnet_data *softnet_get_online(loff_t *pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003783{
Changli Gaodee42872010-05-02 05:42:16 +00003784 struct softnet_data *sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003785
Mike Travis0c0b0ac2008-05-02 16:43:08 -07003786 while (*pos < nr_cpu_ids)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003787 if (cpu_online(*pos)) {
Changli Gaodee42872010-05-02 05:42:16 +00003788 sd = &per_cpu(softnet_data, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003789 break;
3790 } else
3791 ++*pos;
Changli Gaodee42872010-05-02 05:42:16 +00003792 return sd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003793}
3794
3795static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3796{
3797 return softnet_get_online(pos);
3798}
3799
3800static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3801{
3802 ++*pos;
3803 return softnet_get_online(pos);
3804}
3805
3806static void softnet_seq_stop(struct seq_file *seq, void *v)
3807{
3808}
3809
3810static int softnet_seq_show(struct seq_file *seq, void *v)
3811{
Changli Gaodee42872010-05-02 05:42:16 +00003812 struct softnet_data *sd = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003813
Tom Herbert0a9627f2010-03-16 08:03:29 +00003814 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
Changli Gaodee42872010-05-02 05:42:16 +00003815 sd->processed, sd->dropped, sd->time_squeeze, 0,
Stephen Hemmingerc1ebcdb2005-06-23 20:08:59 -07003816 0, 0, 0, 0, /* was fastroute */
Changli Gaodee42872010-05-02 05:42:16 +00003817 sd->cpu_collision, sd->received_rps);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003818 return 0;
3819}
3820
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003821static const struct seq_operations dev_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003822 .start = dev_seq_start,
3823 .next = dev_seq_next,
3824 .stop = dev_seq_stop,
3825 .show = dev_seq_show,
3826};
3827
3828static int dev_seq_open(struct inode *inode, struct file *file)
3829{
Denis V. Luneve372c412007-11-19 22:31:54 -08003830 return seq_open_net(inode, file, &dev_seq_ops,
3831 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003832}
3833
Arjan van de Ven9a321442007-02-12 00:55:35 -08003834static const struct file_operations dev_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003835 .owner = THIS_MODULE,
3836 .open = dev_seq_open,
3837 .read = seq_read,
3838 .llseek = seq_lseek,
Denis V. Luneve372c412007-11-19 22:31:54 -08003839 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003840};
3841
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003842static const struct seq_operations softnet_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003843 .start = softnet_seq_start,
3844 .next = softnet_seq_next,
3845 .stop = softnet_seq_stop,
3846 .show = softnet_seq_show,
3847};
3848
3849static int softnet_seq_open(struct inode *inode, struct file *file)
3850{
3851 return seq_open(file, &softnet_seq_ops);
3852}
3853
Arjan van de Ven9a321442007-02-12 00:55:35 -08003854static const struct file_operations softnet_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003855 .owner = THIS_MODULE,
3856 .open = softnet_seq_open,
3857 .read = seq_read,
3858 .llseek = seq_lseek,
3859 .release = seq_release,
3860};
3861
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003862static void *ptype_get_idx(loff_t pos)
3863{
3864 struct packet_type *pt = NULL;
3865 loff_t i = 0;
3866 int t;
3867
3868 list_for_each_entry_rcu(pt, &ptype_all, list) {
3869 if (i == pos)
3870 return pt;
3871 ++i;
3872 }
3873
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003874 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003875 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3876 if (i == pos)
3877 return pt;
3878 ++i;
3879 }
3880 }
3881 return NULL;
3882}
3883
3884static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003885 __acquires(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003886{
3887 rcu_read_lock();
3888 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3889}
3890
3891static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3892{
3893 struct packet_type *pt;
3894 struct list_head *nxt;
3895 int hash;
3896
3897 ++*pos;
3898 if (v == SEQ_START_TOKEN)
3899 return ptype_get_idx(0);
3900
3901 pt = v;
3902 nxt = pt->list.next;
3903 if (pt->type == htons(ETH_P_ALL)) {
3904 if (nxt != &ptype_all)
3905 goto found;
3906 hash = 0;
3907 nxt = ptype_base[0].next;
3908 } else
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003909 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003910
3911 while (nxt == &ptype_base[hash]) {
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08003912 if (++hash >= PTYPE_HASH_SIZE)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003913 return NULL;
3914 nxt = ptype_base[hash].next;
3915 }
3916found:
3917 return list_entry(nxt, struct packet_type, list);
3918}
3919
3920static void ptype_seq_stop(struct seq_file *seq, void *v)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003921 __releases(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003922{
3923 rcu_read_unlock();
3924}
3925
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003926static int ptype_seq_show(struct seq_file *seq, void *v)
3927{
3928 struct packet_type *pt = v;
3929
3930 if (v == SEQ_START_TOKEN)
3931 seq_puts(seq, "Type Device Function\n");
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09003932 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003933 if (pt->type == htons(ETH_P_ALL))
3934 seq_puts(seq, "ALL ");
3935 else
3936 seq_printf(seq, "%04x", ntohs(pt->type));
3937
Alexey Dobriyan908cd2d2008-11-16 19:50:35 -08003938 seq_printf(seq, " %-8s %pF\n",
3939 pt->dev ? pt->dev->name : "", pt->func);
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003940 }
3941
3942 return 0;
3943}
3944
3945static const struct seq_operations ptype_seq_ops = {
3946 .start = ptype_seq_start,
3947 .next = ptype_seq_next,
3948 .stop = ptype_seq_stop,
3949 .show = ptype_seq_show,
3950};
3951
3952static int ptype_seq_open(struct inode *inode, struct file *file)
3953{
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003954 return seq_open_net(inode, file, &ptype_seq_ops,
3955 sizeof(struct seq_net_private));
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003956}
3957
3958static const struct file_operations ptype_seq_fops = {
3959 .owner = THIS_MODULE,
3960 .open = ptype_seq_open,
3961 .read = seq_read,
3962 .llseek = seq_lseek,
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003963 .release = seq_release_net,
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003964};
3965
3966
Pavel Emelyanov46650792007-10-08 20:38:39 -07003967static int __net_init dev_proc_net_init(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003968{
3969 int rc = -ENOMEM;
3970
Eric W. Biederman881d9662007-09-17 11:56:21 -07003971 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003972 goto out;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003973 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003974 goto out_dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003975 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003976 goto out_softnet;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003977
Eric W. Biederman881d9662007-09-17 11:56:21 -07003978 if (wext_proc_init(net))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003979 goto out_ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003980 rc = 0;
3981out:
3982 return rc;
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003983out_ptype:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003984 proc_net_remove(net, "ptype");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003985out_softnet:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003986 proc_net_remove(net, "softnet_stat");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003987out_dev:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003988 proc_net_remove(net, "dev");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003989 goto out;
3990}
Eric W. Biederman881d9662007-09-17 11:56:21 -07003991
Pavel Emelyanov46650792007-10-08 20:38:39 -07003992static void __net_exit dev_proc_net_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07003993{
3994 wext_proc_exit(net);
3995
3996 proc_net_remove(net, "ptype");
3997 proc_net_remove(net, "softnet_stat");
3998 proc_net_remove(net, "dev");
3999}
4000
Denis V. Lunev022cbae2007-11-13 03:23:50 -08004001static struct pernet_operations __net_initdata dev_proc_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004002 .init = dev_proc_net_init,
4003 .exit = dev_proc_net_exit,
4004};
4005
4006static int __init dev_proc_init(void)
4007{
4008 return register_pernet_subsys(&dev_proc_ops);
4009}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004010#else
4011#define dev_proc_init() 0
4012#endif /* CONFIG_PROC_FS */
4013
4014
4015/**
4016 * netdev_set_master - set up master/slave pair
4017 * @slave: slave device
4018 * @master: new master device
4019 *
4020 * Changes the master device of the slave. Pass %NULL to break the
4021 * bonding. The caller must hold the RTNL semaphore. On a failure
4022 * a negative errno code is returned. On success the reference counts
4023 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
4024 * function returns zero.
4025 */
4026int netdev_set_master(struct net_device *slave, struct net_device *master)
4027{
4028 struct net_device *old = slave->master;
4029
4030 ASSERT_RTNL();
4031
4032 if (master) {
4033 if (old)
4034 return -EBUSY;
4035 dev_hold(master);
4036 }
4037
4038 slave->master = master;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004039
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004040 if (old) {
4041 synchronize_net();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004042 dev_put(old);
Eric Dumazet283f2fe2010-03-18 13:37:40 +00004043 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004044 if (master)
4045 slave->flags |= IFF_SLAVE;
4046 else
4047 slave->flags &= ~IFF_SLAVE;
4048
4049 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4050 return 0;
4051}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004052EXPORT_SYMBOL(netdev_set_master);
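/*
 * Editor's sketch (not part of dev.c): how a bonding-style driver
 * might use netdev_set_master() to enslave a device. Both devices are
 * assumed valid and held by the caller; only the RTNL handling is
 * shown, since ASSERT_RTNL() above requires the lock.
 */
static int example_enslave(struct net_device *bond_dev,
			   struct net_device *slave_dev)
{
	int err;

	rtnl_lock();
	err = netdev_set_master(slave_dev, bond_dev);
	rtnl_unlock();

	return err;
}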
Linus Torvalds1da177e2005-04-16 15:20:36 -07004053
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004054static void dev_change_rx_flags(struct net_device *dev, int flags)
4055{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004056 const struct net_device_ops *ops = dev->netdev_ops;
4057
4058 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4059 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004060}
4061
Wang Chendad9b332008-06-18 01:48:28 -07004062static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004063{
4064 unsigned short old_flags = dev->flags;
David Howells8192b0c2008-11-14 10:39:10 +11004065 uid_t uid;
4066 gid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004067
Patrick McHardy24023452007-07-14 18:51:31 -07004068 ASSERT_RTNL();
4069
Wang Chendad9b332008-06-18 01:48:28 -07004070 dev->flags |= IFF_PROMISC;
4071 dev->promiscuity += inc;
4072 if (dev->promiscuity == 0) {
4073 /*
4074 * Avoid overflow.
4075 * If inc causes an overflow, leave promiscuity untouched and return an error.
4076 */
4077 if (inc < 0)
4078 dev->flags &= ~IFF_PROMISC;
4079 else {
4080 dev->promiscuity -= inc;
4081 printk(KERN_WARNING "%s: promiscuity touches roof, "
4082 "set promiscuity failed, promiscuity feature "
4083 "of device might be broken.\n", dev->name);
4084 return -EOVERFLOW;
4085 }
4086 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004087 if (dev->flags != old_flags) {
4088 printk(KERN_INFO "device %s %s promiscuous mode\n",
4089 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4090 "left");
David Howells8192b0c2008-11-14 10:39:10 +11004091 if (audit_enabled) {
4092 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004093 audit_log(current->audit_context, GFP_ATOMIC,
4094 AUDIT_ANOM_PROMISCUOUS,
4095 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4096 dev->name, (dev->flags & IFF_PROMISC),
4097 (old_flags & IFF_PROMISC),
4098 audit_get_loginuid(current),
David Howells8192b0c2008-11-14 10:39:10 +11004099 uid, gid,
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004100 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004101 }
Patrick McHardy24023452007-07-14 18:51:31 -07004102
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004103 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004104 }
Wang Chendad9b332008-06-18 01:48:28 -07004105 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004106}
4107
Linus Torvalds1da177e2005-04-16 15:20:36 -07004108/**
4109 * dev_set_promiscuity - update promiscuity count on a device
4110 * @dev: device
4111 * @inc: modifier
4112 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004113 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004114 * remains above zero the interface remains promiscuous. Once it hits zero
4115 * the device reverts back to normal filtering operation. A negative inc
4116 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004117 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004118 */
Wang Chendad9b332008-06-18 01:48:28 -07004119int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004120{
4121 unsigned short old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004122 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004123
Wang Chendad9b332008-06-18 01:48:28 -07004124 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004125 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004126 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004127 if (dev->flags != old_flags)
4128 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004129 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004130}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004131EXPORT_SYMBOL(dev_set_promiscuity);
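/*
 * Editor's sketch: a capture-style module raising the promiscuity
 * count on an interface. "eth0" and the example_* name are
 * assumptions, not kernel APIs; the count is balanced later with
 * dev_set_promiscuity(dev, -1).
 */
static int example_promisc_on(struct net *net)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();	/* dev_set_promiscuity() asserts RTNL */
	dev = __dev_get_by_name(net, "eth0");
	if (dev)
		err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();

	return err;
}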
Linus Torvalds1da177e2005-04-16 15:20:36 -07004132
4133/**
4134 * dev_set_allmulti - update allmulti count on a device
4135 * @dev: device
4136 * @inc: modifier
4137 *
4138 * Add or remove reception of all multicast frames to a device. While the
4139 * count in the device remains above zero the interface remains listening
4140 * to all multicast frames. Once it hits zero the device reverts back to normal
4141 * filtering operation. A negative @inc value is used to drop the counter
4142 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004143 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004144 */
4145
Wang Chendad9b332008-06-18 01:48:28 -07004146int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004147{
4148 unsigned short old_flags = dev->flags;
4149
Patrick McHardy24023452007-07-14 18:51:31 -07004150 ASSERT_RTNL();
4151
Linus Torvalds1da177e2005-04-16 15:20:36 -07004152 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004153 dev->allmulti += inc;
4154 if (dev->allmulti == 0) {
4155 /*
4156 * Avoid overflow.
4157 * If inc causes an overflow, leave allmulti untouched and return an error.
4158 */
4159 if (inc < 0)
4160 dev->flags &= ~IFF_ALLMULTI;
4161 else {
4162 dev->allmulti -= inc;
4163 printk(KERN_WARNING "%s: allmulti touches roof, "
4164 "set allmulti failed, allmulti feature of "
4165 "device might be broken.\n", dev->name);
4166 return -EOVERFLOW;
4167 }
4168 }
Patrick McHardy24023452007-07-14 18:51:31 -07004169 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004170 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004171 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004172 }
Wang Chendad9b332008-06-18 01:48:28 -07004173 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004174}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004175EXPORT_SYMBOL(dev_set_allmulti);
Patrick McHardy4417da62007-06-27 01:28:10 -07004176
4177/*
4178 * Upload unicast and multicast address lists to device and
4179 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004180 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004181 * are present.
4182 */
4183void __dev_set_rx_mode(struct net_device *dev)
4184{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004185 const struct net_device_ops *ops = dev->netdev_ops;
4186
Patrick McHardy4417da62007-06-27 01:28:10 -07004187 /* dev_open will call this function so the list will stay sane. */
4188 if (!(dev->flags&IFF_UP))
4189 return;
4190
4191 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004192 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004193
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004194 if (ops->ndo_set_rx_mode)
4195 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004196 else {
4197 /* Unicast addresses changes may only happen under the rtnl,
4198 * therefore calling __dev_set_promiscuity here is safe.
4199 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004200 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004201 __dev_set_promiscuity(dev, 1);
4202 dev->uc_promisc = 1;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004203 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004204 __dev_set_promiscuity(dev, -1);
4205 dev->uc_promisc = 0;
4206 }
4207
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004208 if (ops->ndo_set_multicast_list)
4209 ops->ndo_set_multicast_list(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004210 }
4211}
4212
4213void dev_set_rx_mode(struct net_device *dev)
4214{
David S. Millerb9e40852008-07-15 00:15:08 -07004215 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004216 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004217 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004218}
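/*
 * Editor's sketch: the driver half of the contract above. A
 * hypothetical ndo_set_rx_mode callback; a real driver would program
 * its hardware filter where this sketch only logs the addresses.
 */
static void example_set_rx_mode(struct net_device *dev)
{
	struct netdev_hw_addr *ha;

	if (dev->flags & IFF_PROMISC)
		return;		/* hardware should accept everything */

	/* called under netif_addr_lock_bh(), so the list is stable */
	netdev_for_each_mc_addr(ha, dev)
		pr_debug("%s: would whitelist %pM\n", dev->name, ha->addr);
}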
4219
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004220/**
4221 * dev_get_flags - get flags reported to userspace
4222 * @dev: device
4223 *
4224 * Get the combination of flag bits exported through APIs to userspace.
4225 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004226unsigned dev_get_flags(const struct net_device *dev)
4227{
4228 unsigned flags;
4229
4230 flags = (dev->flags & ~(IFF_PROMISC |
4231 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004232 IFF_RUNNING |
4233 IFF_LOWER_UP |
4234 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004235 (dev->gflags & (IFF_PROMISC |
4236 IFF_ALLMULTI));
4237
Stefan Rompfb00055a2006-03-20 17:09:11 -08004238 if (netif_running(dev)) {
4239 if (netif_oper_up(dev))
4240 flags |= IFF_RUNNING;
4241 if (netif_carrier_ok(dev))
4242 flags |= IFF_LOWER_UP;
4243 if (netif_dormant(dev))
4244 flags |= IFF_DORMANT;
4245 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004246
4247 return flags;
4248}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004249EXPORT_SYMBOL(dev_get_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004250
Patrick McHardybd380812010-02-26 06:34:53 +00004251int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004252{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004253 int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004254 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004255
Patrick McHardy24023452007-07-14 18:51:31 -07004256 ASSERT_RTNL();
4257
Linus Torvalds1da177e2005-04-16 15:20:36 -07004258 /*
4259 * Set the flags on our device.
4260 */
4261
4262 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4263 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4264 IFF_AUTOMEDIA)) |
4265 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4266 IFF_ALLMULTI));
4267
4268 /*
4269 * Load in the correct multicast list now the flags have changed.
4270 */
4271
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004272 if ((old_flags ^ flags) & IFF_MULTICAST)
4273 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004274
Patrick McHardy4417da62007-06-27 01:28:10 -07004275 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004276
4277 /*
4278 * Have we downed the interface? We handle IFF_UP ourselves
4279 * according to user attempts to set it, rather than blindly
4280 * setting it.
4281 */
4282
4283 ret = 0;
4284 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004285 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004286
4287 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004288 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004289 }
4290
Linus Torvalds1da177e2005-04-16 15:20:36 -07004291 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004292 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4293
Linus Torvalds1da177e2005-04-16 15:20:36 -07004294 dev->gflags ^= IFF_PROMISC;
4295 dev_set_promiscuity(dev, inc);
4296 }
4297
4298 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4299 is important. Some (broken) drivers set IFF_PROMISC when
4300 IFF_ALLMULTI is requested, without asking us and without reporting it.
4301 */
4302 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004303 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4304
Linus Torvalds1da177e2005-04-16 15:20:36 -07004305 dev->gflags ^= IFF_ALLMULTI;
4306 dev_set_allmulti(dev, inc);
4307 }
4308
Patrick McHardybd380812010-02-26 06:34:53 +00004309 return ret;
4310}
4311
4312void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4313{
4314 unsigned int changes = dev->flags ^ old_flags;
4315
4316 if (changes & IFF_UP) {
4317 if (dev->flags & IFF_UP)
4318 call_netdevice_notifiers(NETDEV_UP, dev);
4319 else
4320 call_netdevice_notifiers(NETDEV_DOWN, dev);
4321 }
4322
4323 if (dev->flags & IFF_UP &&
4324 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4325 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4326}
4327
4328/**
4329 * dev_change_flags - change device settings
4330 * @dev: device
4331 * @flags: device state flags
4332 *
4333 * Change settings on a device based on the given state flags. The flags are
4334 * in the userspace exported format.
4335 */
4336int dev_change_flags(struct net_device *dev, unsigned flags)
4337{
4338 int ret, changes;
4339 int old_flags = dev->flags;
4340
4341 ret = __dev_change_flags(dev, flags);
4342 if (ret < 0)
4343 return ret;
4344
4345 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004346 if (changes)
4347 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004348
Patrick McHardybd380812010-02-26 06:34:53 +00004349 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004350 return ret;
4351}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004352EXPORT_SYMBOL(dev_change_flags);
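/*
 * Editor's sketch: bringing an interface administratively up, the
 * in-kernel equivalent of "ifconfig ethX up". The flags passed to
 * dev_change_flags() are in the userspace-exported format, hence the
 * read-modify-write via dev_get_flags().
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();	/* __dev_change_flags() asserts RTNL */
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();

	return err;
}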
Linus Torvalds1da177e2005-04-16 15:20:36 -07004353
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004354/**
4355 * dev_set_mtu - Change maximum transfer unit
4356 * @dev: device
4357 * @new_mtu: new transfer unit
4358 *
4359 * Change the maximum transfer size of the network device.
4360 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004361int dev_set_mtu(struct net_device *dev, int new_mtu)
4362{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004363 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004364 int err;
4365
4366 if (new_mtu == dev->mtu)
4367 return 0;
4368
4369 /* MTU must not be negative. */
4370 if (new_mtu < 0)
4371 return -EINVAL;
4372
4373 if (!netif_device_present(dev))
4374 return -ENODEV;
4375
4376 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004377 if (ops->ndo_change_mtu)
4378 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004379 else
4380 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004381
Linus Torvalds1da177e2005-04-16 15:20:36 -07004382 if (!err && dev->flags & IFF_UP)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004383 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004384 return err;
4385}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004386EXPORT_SYMBOL(dev_set_mtu);
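/*
 * Editor's sketch: switching a device to a jumbo MTU. The value 9000
 * is an assumption; the driver's ndo_change_mtu may still reject it.
 */
static int example_enable_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();	/* NETDEV_CHANGEMTU notifiers expect RTNL */
	err = dev_set_mtu(dev, 9000);
	rtnl_unlock();

	return err;
}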
Linus Torvalds1da177e2005-04-16 15:20:36 -07004387
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004388/**
4389 * dev_set_mac_address - Change Media Access Control Address
4390 * @dev: device
4391 * @sa: new address
4392 *
4393 * Change the hardware (MAC) address of the device
4394 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004395int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4396{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004397 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004398 int err;
4399
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004400 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004401 return -EOPNOTSUPP;
4402 if (sa->sa_family != dev->type)
4403 return -EINVAL;
4404 if (!netif_device_present(dev))
4405 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004406 err = ops->ndo_set_mac_address(dev, sa);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004407 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004408 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004409 return err;
4410}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004411EXPORT_SYMBOL(dev_set_mac_address);
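/*
 * Editor's sketch: programming a new hardware address. The sockaddr
 * family must match dev->type, mirroring the -EINVAL check above;
 * "mac" is assumed to hold dev->addr_len bytes, which must fit in
 * sa_data (true for Ethernet's 6-byte addresses).
 */
static int example_set_mac(struct net_device *dev, const u8 *mac)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, mac, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();

	return err;
}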
Linus Torvalds1da177e2005-04-16 15:20:36 -07004412
4413/*
Eric Dumazet3710bec2009-11-01 19:42:09 +00004414 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07004415 */
Jeff Garzik14e3e072007-10-08 00:06:32 -07004416static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004417{
4418 int err;
Eric Dumazet3710bec2009-11-01 19:42:09 +00004419 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004420
4421 if (!dev)
4422 return -ENODEV;
4423
4424 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004425 case SIOCGIFFLAGS: /* Get interface flags */
4426 ifr->ifr_flags = (short) dev_get_flags(dev);
4427 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004428
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004429 case SIOCGIFMETRIC: /* Get the metric on the interface
4430 (currently unused) */
4431 ifr->ifr_metric = 0;
4432 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004433
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004434 case SIOCGIFMTU: /* Get the MTU of a device */
4435 ifr->ifr_mtu = dev->mtu;
4436 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004437
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004438 case SIOCGIFHWADDR:
4439 if (!dev->addr_len)
4440 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4441 else
4442 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4443 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4444 ifr->ifr_hwaddr.sa_family = dev->type;
4445 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004446
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004447 case SIOCGIFSLAVE:
4448 err = -EINVAL;
4449 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004450
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004451 case SIOCGIFMAP:
4452 ifr->ifr_map.mem_start = dev->mem_start;
4453 ifr->ifr_map.mem_end = dev->mem_end;
4454 ifr->ifr_map.base_addr = dev->base_addr;
4455 ifr->ifr_map.irq = dev->irq;
4456 ifr->ifr_map.dma = dev->dma;
4457 ifr->ifr_map.port = dev->if_port;
4458 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004459
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004460 case SIOCGIFINDEX:
4461 ifr->ifr_ifindex = dev->ifindex;
4462 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004463
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004464 case SIOCGIFTXQLEN:
4465 ifr->ifr_qlen = dev->tx_queue_len;
4466 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004467
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004468 default:
4469 /* dev_ioctl() should ensure this case
4470 * is never reached
4471 */
4472 WARN_ON(1);
4473 err = -EINVAL;
4474 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004475
4476 }
4477 return err;
4478}
4479
4480/*
4481 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4482 */
4483static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4484{
4485 int err;
4486 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004487 const struct net_device_ops *ops;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004488
4489 if (!dev)
4490 return -ENODEV;
4491
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004492 ops = dev->netdev_ops;
4493
Jeff Garzik14e3e072007-10-08 00:06:32 -07004494 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004495 case SIOCSIFFLAGS: /* Set interface flags */
4496 return dev_change_flags(dev, ifr->ifr_flags);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004497
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004498 case SIOCSIFMETRIC: /* Set the metric on the interface
4499 (currently unused) */
4500 return -EOPNOTSUPP;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004501
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004502 case SIOCSIFMTU: /* Set the MTU of a device */
4503 return dev_set_mtu(dev, ifr->ifr_mtu);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004504
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004505 case SIOCSIFHWADDR:
4506 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004507
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004508 case SIOCSIFHWBROADCAST:
4509 if (ifr->ifr_hwaddr.sa_family != dev->type)
4510 return -EINVAL;
4511 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4512 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4513 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4514 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004515
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004516 case SIOCSIFMAP:
4517 if (ops->ndo_set_config) {
4518 if (!netif_device_present(dev))
4519 return -ENODEV;
4520 return ops->ndo_set_config(dev, &ifr->ifr_map);
4521 }
4522 return -EOPNOTSUPP;
4523
4524 case SIOCADDMULTI:
4525 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4526 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4527 return -EINVAL;
4528 if (!netif_device_present(dev))
4529 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004530 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004531
4532 case SIOCDELMULTI:
4533 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4534 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4535 return -EINVAL;
4536 if (!netif_device_present(dev))
4537 return -ENODEV;
Jiri Pirko22bedad32010-04-01 21:22:57 +00004538 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004539
4540 case SIOCSIFTXQLEN:
4541 if (ifr->ifr_qlen < 0)
4542 return -EINVAL;
4543 dev->tx_queue_len = ifr->ifr_qlen;
4544 return 0;
4545
4546 case SIOCSIFNAME:
4547 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4548 return dev_change_name(dev, ifr->ifr_newname);
4549
4550 /*
4551 * Unknown or private ioctl
4552 */
4553 default:
4554 if ((cmd >= SIOCDEVPRIVATE &&
4555 cmd <= SIOCDEVPRIVATE + 15) ||
4556 cmd == SIOCBONDENSLAVE ||
4557 cmd == SIOCBONDRELEASE ||
4558 cmd == SIOCBONDSETHWADDR ||
4559 cmd == SIOCBONDSLAVEINFOQUERY ||
4560 cmd == SIOCBONDINFOQUERY ||
4561 cmd == SIOCBONDCHANGEACTIVE ||
4562 cmd == SIOCGMIIPHY ||
4563 cmd == SIOCGMIIREG ||
4564 cmd == SIOCSMIIREG ||
4565 cmd == SIOCBRADDIF ||
4566 cmd == SIOCBRDELIF ||
4567 cmd == SIOCSHWTSTAMP ||
4568 cmd == SIOCWANDEV) {
4569 err = -EOPNOTSUPP;
4570 if (ops->ndo_do_ioctl) {
4571 if (netif_device_present(dev))
4572 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4573 else
4574 err = -ENODEV;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004575 }
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004576 } else
4577 err = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004578
4579 }
4580 return err;
4581}
4582
4583/*
4584 * This function handles all "interface"-type I/O control requests. The actual
4585 * 'doing' part of this is dev_ifsioc above.
4586 */
4587
4588/**
4589 * dev_ioctl - network device ioctl
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004590 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004591 * @cmd: command to issue
4592 * @arg: pointer to a struct ifreq in user space
4593 *
4594 * Issue ioctl functions to devices. This is normally called by the
4595 * user space syscall interfaces but can sometimes be useful for
4596 * other purposes. The return value is the return from the syscall if
4597 * positive or a negative errno code on error.
4598 */
4599
Eric W. Biederman881d9662007-09-17 11:56:21 -07004600int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004601{
4602 struct ifreq ifr;
4603 int ret;
4604 char *colon;
4605
4606 /* One special case: SIOCGIFCONF takes an ifconf argument
4607 and requires a shared lock, because it sleeps writing
4608 to user space.
4609 */
4610
4611 if (cmd == SIOCGIFCONF) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004612 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004613 ret = dev_ifconf(net, (char __user *) arg);
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004614 rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004615 return ret;
4616 }
4617 if (cmd == SIOCGIFNAME)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004618 return dev_ifname(net, (struct ifreq __user *)arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004619
4620 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4621 return -EFAULT;
4622
4623 ifr.ifr_name[IFNAMSIZ-1] = 0;
4624
4625 colon = strchr(ifr.ifr_name, ':');
4626 if (colon)
4627 *colon = 0;
4628
4629 /*
4630 * See which interface the caller is talking about.
4631 */
4632
4633 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004634 /*
4635 * These ioctl calls:
4636 * - can be done by all.
4637 * - atomic and do not require locking.
4638 * - return a value
4639 */
4640 case SIOCGIFFLAGS:
4641 case SIOCGIFMETRIC:
4642 case SIOCGIFMTU:
4643 case SIOCGIFHWADDR:
4644 case SIOCGIFSLAVE:
4645 case SIOCGIFMAP:
4646 case SIOCGIFINDEX:
4647 case SIOCGIFTXQLEN:
4648 dev_load(net, ifr.ifr_name);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004649 rcu_read_lock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004650 ret = dev_ifsioc_locked(net, &ifr, cmd);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004651 rcu_read_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004652 if (!ret) {
4653 if (colon)
4654 *colon = ':';
4655 if (copy_to_user(arg, &ifr,
4656 sizeof(struct ifreq)))
4657 ret = -EFAULT;
4658 }
4659 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004660
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004661 case SIOCETHTOOL:
4662 dev_load(net, ifr.ifr_name);
4663 rtnl_lock();
4664 ret = dev_ethtool(net, &ifr);
4665 rtnl_unlock();
4666 if (!ret) {
4667 if (colon)
4668 *colon = ':';
4669 if (copy_to_user(arg, &ifr,
4670 sizeof(struct ifreq)))
4671 ret = -EFAULT;
4672 }
4673 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004674
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004675 /*
4676 * These ioctl calls:
4677 * - require superuser power.
4678 * - require strict serialization.
4679 * - return a value
4680 */
4681 case SIOCGMIIPHY:
4682 case SIOCGMIIREG:
4683 case SIOCSIFNAME:
4684 if (!capable(CAP_NET_ADMIN))
4685 return -EPERM;
4686 dev_load(net, ifr.ifr_name);
4687 rtnl_lock();
4688 ret = dev_ifsioc(net, &ifr, cmd);
4689 rtnl_unlock();
4690 if (!ret) {
4691 if (colon)
4692 *colon = ':';
4693 if (copy_to_user(arg, &ifr,
4694 sizeof(struct ifreq)))
4695 ret = -EFAULT;
4696 }
4697 return ret;
4698
4699 /*
4700 * These ioctl calls:
4701 * - require superuser power.
4702 * - require strict serialization.
4703 * - do not return a value
4704 */
4705 case SIOCSIFFLAGS:
4706 case SIOCSIFMETRIC:
4707 case SIOCSIFMTU:
4708 case SIOCSIFMAP:
4709 case SIOCSIFHWADDR:
4710 case SIOCSIFSLAVE:
4711 case SIOCADDMULTI:
4712 case SIOCDELMULTI:
4713 case SIOCSIFHWBROADCAST:
4714 case SIOCSIFTXQLEN:
4715 case SIOCSMIIREG:
4716 case SIOCBONDENSLAVE:
4717 case SIOCBONDRELEASE:
4718 case SIOCBONDSETHWADDR:
4719 case SIOCBONDCHANGEACTIVE:
4720 case SIOCBRADDIF:
4721 case SIOCBRDELIF:
4722 case SIOCSHWTSTAMP:
4723 if (!capable(CAP_NET_ADMIN))
4724 return -EPERM;
4725 /* fall through */
4726 case SIOCBONDSLAVEINFOQUERY:
4727 case SIOCBONDINFOQUERY:
4728 dev_load(net, ifr.ifr_name);
4729 rtnl_lock();
4730 ret = dev_ifsioc(net, &ifr, cmd);
4731 rtnl_unlock();
4732 return ret;
4733
4734 case SIOCGIFMEM:
4735 /* Get the per device memory space. We can add this but
4736 * currently do not support it */
4737 case SIOCSIFMEM:
4738 /* Set the per device memory buffer space.
4739 * Not applicable in our case */
4740 case SIOCSIFLINK:
4741 return -EINVAL;
4742
4743 /*
4744 * Unknown or private ioctl.
4745 */
4746 default:
4747 if (cmd == SIOCWANDEV ||
4748 (cmd >= SIOCDEVPRIVATE &&
4749 cmd <= SIOCDEVPRIVATE + 15)) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004750 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004751 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004752 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004753 rtnl_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004754 if (!ret && copy_to_user(arg, &ifr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004755 sizeof(struct ifreq)))
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004756 ret = -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004757 return ret;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004758 }
4759 /* Take care of Wireless Extensions */
4760 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4761 return wext_handle_ioctl(net, &ifr, cmd, arg);
4762 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004763 }
4764}
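/*
 * Editor's sketch: the userspace side of the SIOCGIFFLAGS path above,
 * shown as an ordinary C program and guarded out of kernel
 * compilation. Any AF_INET datagram socket serves as the ioctl
 * handle; "eth0" is an assumption.
 */
#if 0	/* userspace illustration only */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

	if (fd >= 0 && ioctl(fd, SIOCGIFFLAGS, &ifr) == 0)
		printf("eth0 flags: %#x\n", ifr.ifr_flags);

	if (fd >= 0)
		close(fd);
	return 0;
}
#endif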
4765
4766
4767/**
4768 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004769 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004770 *
4771 * Returns a suitable unique value for a new device interface
4772 * number. The caller must hold the rtnl semaphore or the
4773 * dev_base_lock to be sure it remains unique.
4774 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004775static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004776{
4777 static int ifindex;
4778 for (;;) {
4779 if (++ifindex <= 0)
4780 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004781 if (!__dev_get_by_index(net, ifindex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004782 return ifindex;
4783 }
4784}
4785
Linus Torvalds1da177e2005-04-16 15:20:36 -07004786/* Delayed registration/unregistration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004787static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004788
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004789static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004790{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004791 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004792}
4793
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004794static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004795{
Krishna Kumare93737b2009-12-08 22:26:02 +00004796 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004797
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004798 BUG_ON(dev_boot_phase);
4799 ASSERT_RTNL();
4800
Krishna Kumare93737b2009-12-08 22:26:02 +00004801 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004802		/* Some devices call unregister without ever having
Krishna Kumare93737b2009-12-08 22:26:02 +00004803		 * registered, to unwind initialization. Remove those
4804		 * devices and proceed with the remaining ones.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004805 */
4806 if (dev->reg_state == NETREG_UNINITIALIZED) {
4807 pr_debug("unregister_netdevice: device %s/%p never "
4808 "was registered\n", dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004809
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004810 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004811 list_del(&dev->unreg_list);
4812 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004813 }
4814
4815 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4816
4817 /* If device is running, close it first. */
4818 dev_close(dev);
4819
4820 /* And unlink it from device chain. */
4821 unlist_netdevice(dev);
4822
4823 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004824 }
4825
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004826 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004827
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004828 list_for_each_entry(dev, head, unreg_list) {
4829 /* Shutdown queueing discipline. */
4830 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004831
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004832
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004833 /* Notify protocols, that we are about to destroy
4834 this device. They should clean all the things.
4835 */
4836 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4837
Patrick McHardya2835762010-02-26 06:34:51 +00004838 if (!dev->rtnl_link_ops ||
4839 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4840 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4841
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004842 /*
4843 * Flush the unicast and multicast chains
4844 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00004845 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00004846 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004847
4848 if (dev->netdev_ops->ndo_uninit)
4849 dev->netdev_ops->ndo_uninit(dev);
4850
4851 /* Notifier chain MUST detach us from master device. */
4852 WARN_ON(dev->master);
4853
4854 /* Remove entries from kobject tree */
4855 netdev_unregister_kobject(dev);
4856 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004857
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004858 /* Process any work delayed until the end of the batch */
stephen hemmingere5e26d72010-02-24 14:01:38 +00004859 dev = list_first_entry(head, struct net_device, unreg_list);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004860 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4861
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004862 synchronize_net();
4863
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004864 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004865 dev_put(dev);
4866}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004867
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004868static void rollback_registered(struct net_device *dev)
4869{
4870 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004871
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004872 list_add(&dev->unreg_list, &single);
4873 rollback_registered_many(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004874}
4875
David S. Millere8a04642008-07-17 00:34:19 -07004876static void __netdev_init_queue_locks_one(struct net_device *dev,
4877 struct netdev_queue *dev_queue,
4878 void *_unused)
David S. Millerc773e842008-07-08 23:13:53 -07004879{
4880 spin_lock_init(&dev_queue->_xmit_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004881 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
David S. Millerc773e842008-07-08 23:13:53 -07004882 dev_queue->xmit_lock_owner = -1;
4883}
4884
4885static void netdev_init_queue_locks(struct net_device *dev)
4886{
David S. Millere8a04642008-07-17 00:34:19 -07004887 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4888 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
David S. Millerc773e842008-07-08 23:13:53 -07004889}
4890
Herbert Xub63365a2008-10-23 01:11:29 -07004891unsigned long netdev_fix_features(unsigned long features, const char *name)
4892{
4893 /* Fix illegal SG+CSUM combinations. */
4894 if ((features & NETIF_F_SG) &&
4895 !(features & NETIF_F_ALL_CSUM)) {
4896 if (name)
4897 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4898 "checksum feature.\n", name);
4899 features &= ~NETIF_F_SG;
4900 }
4901
4902 /* TSO requires that SG is present as well. */
4903 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4904 if (name)
4905 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4906 "SG feature.\n", name);
4907 features &= ~NETIF_F_TSO;
4908 }
4909
4910 if (features & NETIF_F_UFO) {
4911 if (!(features & NETIF_F_GEN_CSUM)) {
4912 if (name)
4913 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4914 "since no NETIF_F_HW_CSUM feature.\n",
4915 name);
4916 features &= ~NETIF_F_UFO;
4917 }
4918
4919 if (!(features & NETIF_F_SG)) {
4920 if (name)
4921 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4922 "since no NETIF_F_SG feature.\n", name);
4923 features &= ~NETIF_F_UFO;
4924 }
4925 }
4926
4927 return features;
4928}
4929EXPORT_SYMBOL(netdev_fix_features);
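/*
 * Editor's sketch: a driver sanitizing its advertised features before
 * registration, so the SG/CSUM/TSO/UFO dependencies enforced above
 * hold. The particular feature set chosen here is an assumption.
 */
static void example_setup_features(struct net_device *dev)
{
	dev->features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_HW_CSUM;
	dev->features = netdev_fix_features(dev->features, dev->name);
}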
4930
Linus Torvalds1da177e2005-04-16 15:20:36 -07004931/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08004932 * netif_stacked_transfer_operstate - transfer operstate
4933 * @rootdev: the root or lower level device to transfer state from
4934 * @dev: the device to transfer operstate to
4935 *
4936 * Transfer operational state from root to device. This is normally
4937 * called when a stacking relationship exists between the root
4938 * device and the device (a leaf device).
4939 */
4940void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4941 struct net_device *dev)
4942{
4943 if (rootdev->operstate == IF_OPER_DORMANT)
4944 netif_dormant_on(dev);
4945 else
4946 netif_dormant_off(dev);
4947
4948 if (netif_carrier_ok(rootdev)) {
4949 if (!netif_carrier_ok(dev))
4950 netif_carrier_on(dev);
4951 } else {
4952 if (netif_carrier_ok(dev))
4953 netif_carrier_off(dev);
4954 }
4955}
4956EXPORT_SYMBOL(netif_stacked_transfer_operstate);
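/*
 * Editor's sketch: a stacked driver (vlan/macvlan style) propagating
 * operstate from its lower device on NETDEV_CHANGE events.
 * example_find_upper() is a hypothetical lookup into the driver's own
 * mapping, shown here as a stand-in that finds nothing.
 */
static struct net_device *example_find_upper(struct net_device *lower)
{
	return NULL;	/* a real driver would return its upper device */
}

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;
	struct net_device *upper = example_find_upper(lower);

	if (upper && event == NETDEV_CHANGE)
		netif_stacked_transfer_operstate(lower, upper);

	return NOTIFY_DONE;
}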
4957
4958/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07004959 * register_netdevice - register a network device
4960 * @dev: device to register
4961 *
4962 * Take a completed network device structure and add it to the kernel
4963 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4964 * chain. 0 is returned on success. A negative errno code is returned
4965 * on a failure to set up the device, or if the name is a duplicate.
4966 *
4967 * Callers must hold the rtnl semaphore. You may want
4968 * register_netdev() instead of this.
4969 *
4970 * BUGS:
4971 * The locking appears insufficient to guarantee two parallel registers
4972 * will not get the same name.
4973 */
4974
4975int register_netdevice(struct net_device *dev)
4976{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004977 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004978 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979
4980 BUG_ON(dev_boot_phase);
4981 ASSERT_RTNL();
4982
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004983 might_sleep();
4984
Linus Torvalds1da177e2005-04-16 15:20:36 -07004985 /* When net_device's are persistent, this will be fatal. */
4986 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004987 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004988
David S. Millerf1f28aa2008-07-15 00:08:33 -07004989 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004990 netdev_set_addr_lockdep_class(dev);
David S. Millerc773e842008-07-08 23:13:53 -07004991 netdev_init_queue_locks(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004992
Linus Torvalds1da177e2005-04-16 15:20:36 -07004993 dev->iflink = -1;
4994
Eric Dumazetdf334542010-03-24 19:13:54 +00004995#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00004996 if (!dev->num_rx_queues) {
4997 /*
4998 * Allocate a single RX queue if driver never called
4999 * alloc_netdev_mq
5000 */
5001
5002 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
5003 if (!dev->_rx) {
5004 ret = -ENOMEM;
5005 goto out;
5006 }
5007
5008 dev->_rx->first = dev->_rx;
5009 atomic_set(&dev->_rx->count, 1);
5010 dev->num_rx_queues = 1;
5011 }
Eric Dumazetdf334542010-03-24 19:13:54 +00005012#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005013 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005014 if (dev->netdev_ops->ndo_init) {
5015 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005016 if (ret) {
5017 if (ret > 0)
5018 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08005019 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005020 }
5021 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005022
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00005023 ret = dev_get_valid_name(dev, dev->name, 0);
Octavian Purdilad9031022009-11-18 02:36:59 +00005024 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005025 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005026
Eric W. Biederman881d9662007-09-17 11:56:21 -07005027 dev->ifindex = dev_new_index(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005028 if (dev->iflink == -1)
5029 dev->iflink = dev->ifindex;
5030
Stephen Hemmingerd212f872007-06-27 00:47:37 -07005031 /* Fix illegal checksum combinations */
5032 if ((dev->features & NETIF_F_HW_CSUM) &&
5033 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5034 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5035 dev->name);
5036 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5037 }
5038
5039 if ((dev->features & NETIF_F_NO_CSUM) &&
5040 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5041 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5042 dev->name);
5043 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5044 }
5045
Herbert Xub63365a2008-10-23 01:11:29 -07005046 dev->features = netdev_fix_features(dev->features, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005047
Lennert Buytenheke5a4a722008-08-03 01:23:10 -07005048 /* Enable software GSO if SG is supported. */
5049 if (dev->features & NETIF_F_SG)
5050 dev->features |= NETIF_F_GSO;
5051
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005052 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5053 ret = notifier_to_errno(ret);
5054 if (ret)
5055 goto err_uninit;
5056
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005057 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005058 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005059 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005060 dev->reg_state = NETREG_REGISTERED;
5061
Linus Torvalds1da177e2005-04-16 15:20:36 -07005062 /*
5063 * Default initial state at registry is that the
5064 * device is present.
5065 */
5066
5067 set_bit(__LINK_STATE_PRESENT, &dev->state);
5068
Linus Torvalds1da177e2005-04-16 15:20:36 -07005069 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005070 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005071 list_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005072
5073 /* Notify protocols, that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005074 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005075 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005076 if (ret) {
5077 rollback_registered(dev);
5078 dev->reg_state = NETREG_UNREGISTERED;
5079 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005080 /*
5081 * Prevent userspace races by waiting until the network
5082 * device is fully set up before sending notifications.
5083 */
Patrick McHardya2835762010-02-26 06:34:51 +00005084 if (!dev->rtnl_link_ops ||
5085 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5086 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005087
5088out:
5089 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005090
5091err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005092 if (dev->netdev_ops->ndo_uninit)
5093 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005094 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005095}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005096EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005097
5098/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005099 * init_dummy_netdev - init a dummy network device for NAPI
5100 * @dev: device to init
5101 *
5102 * This takes a network device structure and initializes the minimum
5103 * set of fields so it can be used to schedule NAPI polls without
5104 * registering a full blown interface. This is to be used by drivers
5105 * that need to tie several hardware interfaces to a single NAPI
5106 * poll scheduler due to HW limitations.
5107 */
5108int init_dummy_netdev(struct net_device *dev)
5109{
5110 /* Clear everything. Note we don't initialize spinlocks
5111 * as they aren't supposed to be taken by any of the
5112 * NAPI code and this dummy netdev is supposed to be
5113 * only ever used for NAPI polls
5114 */
5115 memset(dev, 0, sizeof(struct net_device));
5116
5117 /* make sure we BUG if trying to hit standard
5118 * register/unregister code path
5119 */
5120 dev->reg_state = NETREG_DUMMY;
5121
5122 /* initialize the ref count */
5123 atomic_set(&dev->refcnt, 1);
5124
5125 /* NAPI wants this */
5126 INIT_LIST_HEAD(&dev->napi_list);
5127
5128 /* a dummy interface is started by default */
5129 set_bit(__LINK_STATE_PRESENT, &dev->state);
5130 set_bit(__LINK_STATE_START, &dev->state);
5131
5132 return 0;
5133}
5134EXPORT_SYMBOL_GPL(init_dummy_netdev);
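/*
 * Editor's sketch: the intended use of init_dummy_netdev() -- one NAPI
 * context shared by several hardware interfaces, anchored to a dummy
 * netdev that is never registered. The names and the weight of 64 are
 * assumptions.
 */
struct example_engine {
	struct net_device dummy;	/* backs the NAPI context only */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	int done = 0;

	/* ... process up to "budget" packets from the shared ring ... */
	if (done < budget)
		napi_complete(napi);
	return done;
}

static void example_engine_init(struct example_engine *e)
{
	init_dummy_netdev(&e->dummy);
	netif_napi_add(&e->dummy, &e->napi, example_poll, 64);
	napi_enable(&e->napi);
}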
5135
5136
5137/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005138 * register_netdev - register a network device
5139 * @dev: device to register
5140 *
5141 * Take a completed network device structure and add it to the kernel
5142 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5143 * chain. 0 is returned on success. A negative errno code is returned
5144 * on a failure to set up the device, or if the name is a duplicate.
5145 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005146 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005147 * and expands the device name if you passed a format string to
5148 * alloc_netdev.
5149 */
5150int register_netdev(struct net_device *dev)
5151{
5152 int err;
5153
5154 rtnl_lock();
5155
5156 /*
5157 * If the name is a format string the caller wants us to do a
5158 * name allocation.
5159 */
5160 if (strchr(dev->name, '%')) {
5161 err = dev_alloc_name(dev, dev->name);
5162 if (err < 0)
5163 goto out;
5164 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005165
Linus Torvalds1da177e2005-04-16 15:20:36 -07005166 err = register_netdevice(dev);
5167out:
5168 rtnl_unlock();
5169 return err;
5170}
5171EXPORT_SYMBOL(register_netdev);
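/*
 * Editor's sketch: the minimal register/unregister life cycle from a
 * module. The "example%d" format string lets register_netdev() pick
 * the first free name via dev_alloc_name(), as described above;
 * ether_setup() provides sane Ethernet defaults.
 */
static struct net_device *example_dev;

static int __init example_init(void)
{
	int err;

	example_dev = alloc_netdev(0, "example%d", ether_setup);
	if (!example_dev)
		return -ENOMEM;

	err = register_netdev(example_dev);
	if (err)
		free_netdev(example_dev);
	return err;
}

static void __exit example_exit(void)
{
	unregister_netdev(example_dev);	/* takes RTNL, runs the todo list */
	free_netdev(example_dev);
}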
5172
5173/*
5174 * netdev_wait_allrefs - wait until all references are gone.
5175 *
5176 * This is called when unregistering network devices.
5177 *
5178 * Any protocol or device that holds a reference should register
5179 * for netdevice notification, and cleanup and put back the
5180 * reference if they receive an UNREGISTER event.
5181 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005182 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005183 */
5184static void netdev_wait_allrefs(struct net_device *dev)
5185{
5186 unsigned long rebroadcast_time, warning_time;
5187
Eric Dumazete014deb2009-11-17 05:59:21 +00005188 linkwatch_forget_dev(dev);
5189
Linus Torvalds1da177e2005-04-16 15:20:36 -07005190 rebroadcast_time = warning_time = jiffies;
5191 while (atomic_read(&dev->refcnt) != 0) {
5192 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005193 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005194
5195 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005196 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005197 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
Octavian Purdila395264d2009-11-16 13:49:35 +00005198			 * should have already handled it the first time */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005199
5200 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5201 &dev->state)) {
5202 /* We must not have linkwatch events
5203 * pending on unregister. If this
5204 * happens, we simply run the queue
5205 * unscheduled, resulting in a noop
5206 * for this device.
5207 */
5208 linkwatch_run_queue();
5209 }
5210
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005211 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005212
5213 rebroadcast_time = jiffies;
5214 }
5215
5216 msleep(250);
5217
5218 if (time_after(jiffies, warning_time + 10 * HZ)) {
5219 printk(KERN_EMERG "unregister_netdevice: "
5220 "waiting for %s to become free. Usage "
5221 "count = %d\n",
5222 dev->name, atomic_read(&dev->refcnt));
5223 warning_time = jiffies;
5224 }
5225 }
5226}
5227
5228/* The sequence is:
5229 *
5230 * rtnl_lock();
5231 * ...
5232 * register_netdevice(x1);
5233 * register_netdevice(x2);
5234 * ...
5235 * unregister_netdevice(y1);
5236 * unregister_netdevice(y2);
5237 * ...
5238 * rtnl_unlock();
5239 * free_netdev(y1);
5240 * free_netdev(y2);
5241 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005242 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005243 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005244 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005245 * without deadlocking with linkwatch via keventd.
5246 * 2) Since we run with the RTNL semaphore not held, we can sleep
5247 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005248 *
5249 * We must not return until all unregister events added during
5250 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005251 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005252void netdev_run_todo(void)
5253{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005254 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005255
Linus Torvalds1da177e2005-04-16 15:20:36 -07005256 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005257 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005258
5259 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005260
Linus Torvalds1da177e2005-04-16 15:20:36 -07005261 while (!list_empty(&list)) {
5262 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005263 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005264 list_del(&dev->todo_list);
5265
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005266 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005267 printk(KERN_ERR "network todo '%s' but state %d\n",
5268 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005269 dump_stack();
5270 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005271 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005272
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005273 dev->reg_state = NETREG_UNREGISTERED;
5274
Changli Gao152102c2010-03-30 20:16:22 +00005275 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005276
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005277 netdev_wait_allrefs(dev);
5278
5279 /* paranoia */
5280 BUG_ON(atomic_read(&dev->refcnt));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005281 WARN_ON(dev->ip_ptr);
5282 WARN_ON(dev->ip6_ptr);
5283 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005284
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005285 if (dev->destructor)
5286 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005287
5288 /* Free network device */
5289 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005290 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005291}
5292
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005293/**
Eric Dumazetd83345a2009-11-16 03:36:51 +00005294 * dev_txq_stats_fold - fold tx_queues stats
5295 * @dev: device to get statistics from
Ben Hutchings3cfde792010-07-09 09:11:52 +00005296 * @stats: struct rtnl_link_stats64 to hold results
Eric Dumazetd83345a2009-11-16 03:36:51 +00005297 */
5298void dev_txq_stats_fold(const struct net_device *dev,
Ben Hutchings3cfde792010-07-09 09:11:52 +00005299 struct rtnl_link_stats64 *stats)
Eric Dumazetd83345a2009-11-16 03:36:51 +00005300{
Eric Dumazetbd272902010-07-19 09:35:40 -07005301 u64 tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
Eric Dumazetd83345a2009-11-16 03:36:51 +00005302 unsigned int i;
5303 struct netdev_queue *txq;
5304
5305 for (i = 0; i < dev->num_tx_queues; i++) {
5306 txq = netdev_get_tx_queue(dev, i);
Eric Dumazetbd272902010-07-19 09:35:40 -07005307 spin_lock_bh(&txq->_xmit_lock);
Eric Dumazetd83345a2009-11-16 03:36:51 +00005308 tx_bytes += txq->tx_bytes;
5309 tx_packets += txq->tx_packets;
5310 tx_dropped += txq->tx_dropped;
Eric Dumazetbd272902010-07-19 09:35:40 -07005311 spin_unlock_bh(&txq->_xmit_lock);
Eric Dumazetd83345a2009-11-16 03:36:51 +00005312 }
5313 if (tx_bytes || tx_packets || tx_dropped) {
5314 stats->tx_bytes = tx_bytes;
5315 stats->tx_packets = tx_packets;
5316 stats->tx_dropped = tx_dropped;
5317 }
5318}
5319EXPORT_SYMBOL(dev_txq_stats_fold);
5320
Ben Hutchings3cfde792010-07-09 09:11:52 +00005321/* Convert net_device_stats to rtnl_link_stats64. They have the same
5322 * fields in the same order, with only the type differing.
5323 */
5324static void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5325 const struct net_device_stats *netdev_stats)
5326{
5327#if BITS_PER_LONG == 64
5328 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5329 memcpy(stats64, netdev_stats, sizeof(*stats64));
5330#else
5331 size_t i, n = sizeof(*stats64) / sizeof(u64);
5332 const unsigned long *src = (const unsigned long *)netdev_stats;
5333 u64 *dst = (u64 *)stats64;
5334
5335 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5336 sizeof(*stats64) / sizeof(u64));
5337 for (i = 0; i < n; i++)
5338 dst[i] = src[i];
5339#endif
5340}
5341
Eric Dumazetd83345a2009-11-16 03:36:51 +00005342/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005343 * dev_get_stats - get network device statistics
5344 * @dev: device to get statistics from
Eric Dumazet28172732010-07-07 14:58:56 -07005345 * @storage: place to store stats
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005346 *
Ben Hutchingsd7753512010-07-09 09:12:41 +00005347 * Get network statistics from device. Return @storage.
5348 * The device driver may provide its own method by setting
5349 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5350 * otherwise the internal statistics structure is used.
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005351 */
Ben Hutchingsd7753512010-07-09 09:12:41 +00005352struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5353 struct rtnl_link_stats64 *storage)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005354{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005355 const struct net_device_ops *ops = dev->netdev_ops;
5356
Eric Dumazet28172732010-07-07 14:58:56 -07005357 if (ops->ndo_get_stats64) {
5358 memset(storage, 0, sizeof(*storage));
5359 return ops->ndo_get_stats64(dev, storage);
5360 }
5361 if (ops->ndo_get_stats) {
Ben Hutchings3cfde792010-07-09 09:11:52 +00005362 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
Eric Dumazet28172732010-07-07 14:58:56 -07005363 return storage;
5364 }
Ben Hutchings3cfde792010-07-09 09:11:52 +00005365 netdev_stats_to_stats64(storage, &dev->stats);
5366 dev_txq_stats_fold(dev, storage);
Eric Dumazet28172732010-07-07 14:58:56 -07005367 return storage;
Rusty Russellc45d2862007-03-28 14:29:08 -07005368}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005369EXPORT_SYMBOL(dev_get_stats);
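/*
 * Typical call pattern (sketch): callers hand in stack storage and use
 * the returned pointer, so they need not care which of the three paths
 * above filled it in.
 *
 *	struct rtnl_link_stats64 temp;
 *	const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
 */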
Rusty Russellc45d2862007-03-28 14:29:08 -07005370
David S. Millerdc2b4842008-07-08 17:18:23 -07005371static void netdev_init_one_queue(struct net_device *dev,
David S. Millere8a04642008-07-17 00:34:19 -07005372 struct netdev_queue *queue,
5373 void *_unused)
David S. Millerdc2b4842008-07-08 17:18:23 -07005374{
David S. Millerdc2b4842008-07-08 17:18:23 -07005375 queue->dev = dev;
5376}
5377
David S. Millerbb949fb2008-07-08 16:55:56 -07005378static void netdev_init_queues(struct net_device *dev)
5379{
David S. Millere8a04642008-07-17 00:34:19 -07005380 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5381 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
David S. Millerc3f26a22008-07-31 16:58:50 -07005382 spin_lock_init(&dev->tx_global_lock);
David S. Millerbb949fb2008-07-08 16:55:56 -07005383}
5384
Linus Torvalds1da177e2005-04-16 15:20:36 -07005385/**
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005386 * alloc_netdev_mq - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005387 * @sizeof_priv: size of private data to allocate space for
5388 * @name: device name format string
5389 * @setup: callback to initialize device
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005390 * @queue_count: the number of subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005391 *
5392 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005393 * and performs basic initialization. Also allocates subqueue structs
5394 * for each queue on the device at the end of the netdevice.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005395 */
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005396struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5397 void (*setup)(struct net_device *), unsigned int queue_count)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005398{
David S. Millere8a04642008-07-17 00:34:19 -07005399 struct netdev_queue *tx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005400 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005401 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005402 struct net_device *p;
Eric Dumazetdf334542010-03-24 19:13:54 +00005403#ifdef CONFIG_RPS
5404 struct netdev_rx_queue *rx;
Tom Herbert0a9627f2010-03-16 08:03:29 +00005405 int i;
Eric Dumazetdf334542010-03-24 19:13:54 +00005406#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005407
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005408 BUG_ON(strlen(name) >= sizeof(dev->name));
5409
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005410 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005411 if (sizeof_priv) {
5412 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005413 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005414 alloc_size += sizeof_priv;
5415 }
5416 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005417 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005418
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005419 p = kzalloc(alloc_size, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005420 if (!p) {
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005421 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005422 return NULL;
5423 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005424
Stephen Hemminger79439862008-07-21 13:28:44 -07005425 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
David S. Millere8a04642008-07-17 00:34:19 -07005426 if (!tx) {
5427 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5428 "tx qdiscs.\n");
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005429 goto free_p;
David S. Millere8a04642008-07-17 00:34:19 -07005430 }
5431
Eric Dumazetdf334542010-03-24 19:13:54 +00005432#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005433 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5434 if (!rx) {
5435 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5436 "rx queues.\n");
5437 goto free_tx;
5438 }
5439
5440 atomic_set(&rx->count, queue_count);
5441
5442 /*
5443 * Point each queue at the first element of the array, which
5444 * holds the reference count.
5445 */
5446 for (i = 0; i < queue_count; i++)
5447 rx[i].first = rx;
Eric Dumazetdf334542010-03-24 19:13:54 +00005448#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005449
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005450 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005451 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005452
5453 if (dev_addr_init(dev))
Tom Herbert0a9627f2010-03-16 08:03:29 +00005454 goto free_rx;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005455
Jiri Pirko22bedad32010-04-01 21:22:57 +00005456 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005457 dev_uc_init(dev);
Jiri Pirkoccffad252009-05-22 23:22:17 +00005458
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005459 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005460
David S. Millere8a04642008-07-17 00:34:19 -07005461 dev->_tx = tx;
5462 dev->num_tx_queues = queue_count;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005463 dev->real_num_tx_queues = queue_count;
David S. Millere8a04642008-07-17 00:34:19 -07005464
Eric Dumazetdf334542010-03-24 19:13:54 +00005465#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005466 dev->_rx = rx;
5467 dev->num_rx_queues = queue_count;
Eric Dumazetdf334542010-03-24 19:13:54 +00005468#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005469
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005470 dev->gso_max_size = GSO_MAX_SIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005471
David S. Millerbb949fb2008-07-08 16:55:56 -07005472 netdev_init_queues(dev);
5473
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005474 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5475 dev->ethtool_ntuple_list.count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08005476 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005477 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005478 INIT_LIST_HEAD(&dev->link_watch_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005479 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005480 setup(dev);
5481 strcpy(dev->name, name);
5482 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005483
Tom Herbert0a9627f2010-03-16 08:03:29 +00005484free_rx:
Eric Dumazetdf334542010-03-24 19:13:54 +00005485#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005486 kfree(rx);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005487free_tx:
Eric Dumazetdf334542010-03-24 19:13:54 +00005488#endif
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005489 kfree(tx);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005490free_p:
5491 kfree(p);
5492 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005493}
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005494EXPORT_SYMBOL(alloc_netdev_mq);
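/*
 * Allocation sketch for an Ethernet-style driver with four tx queues;
 * "my_priv", "my_setup" and "myeth%d" are hypothetical names.  Note
 * that a device which fails to register must still be released with
 * free_netdev().
 *
 *	static void my_setup(struct net_device *dev)
 *	{
 *		ether_setup(dev);
 *	}
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "myeth%d",
 *			      my_setup, 4);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */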
Linus Torvalds1da177e2005-04-16 15:20:36 -07005495
5496/**
5497 * free_netdev - free network device
5498 * @dev: device
5499 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005500 * This function does the last stage of destroying an allocated device
5501 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005502 * If this is the last reference then it will be freed.
5503 */
5504void free_netdev(struct net_device *dev)
5505{
Herbert Xud565b0a2008-12-15 23:38:52 -08005506 struct napi_struct *p, *n;
5507
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005508 release_net(dev_net(dev));
5509
David S. Millere8a04642008-07-17 00:34:19 -07005510 kfree(dev->_tx);
5511
Jiri Pirkof001fde2009-05-05 02:48:28 +00005512 /* Flush device addresses */
5513 dev_addr_flush(dev);
5514
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005515 /* Clear ethtool n-tuple list */
5516 ethtool_ntuple_flush(dev);
5517
Herbert Xud565b0a2008-12-15 23:38:52 -08005518 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5519 netif_napi_del(p);
5520
Stephen Hemminger3041a062006-05-26 13:25:24 -07005521 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005522 if (dev->reg_state == NETREG_UNINITIALIZED) {
5523 kfree((char *)dev - dev->padded);
5524 return;
5525 }
5526
5527 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5528 dev->reg_state = NETREG_RELEASED;
5529
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005530 /* will free via device release */
5531 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005532}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005533EXPORT_SYMBOL(free_netdev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005534
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005535/**
5536 * synchronize_net - Synchronize with packet receive processing
5537 *
5538 * Wait for packets currently being received to be done.
5539 * Does not block later packets from starting.
5540 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005541void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005542{
5543 might_sleep();
Paul E. McKenneyfbd568a3e2005-05-01 08:59:04 -07005544 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005545}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005546EXPORT_SYMBOL(synchronize_net);
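/*
 * Sketch of the usual pattern: unpublish an RCU-protected pointer that
 * the receive path may still dereference, wait out in-flight readers,
 * then free the old data.  "priv" and its fields are hypothetical.
 *
 *	old = priv->rules;
 *	rcu_assign_pointer(priv->rules, new);
 *	synchronize_net();
 *	kfree(old);
 */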
Linus Torvalds1da177e2005-04-16 15:20:36 -07005547
5548/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005549 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005550 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005551 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005552 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005553 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005554 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005555 * If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005556 *
5557 * Callers must hold the rtnl semaphore. You may want
5558 * unregister_netdev() instead of this.
5559 */
5560
Eric Dumazet44a08732009-10-27 07:03:04 +00005561void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005562{
Herbert Xua6620712007-12-12 19:21:56 -08005563 ASSERT_RTNL();
5564
Eric Dumazet44a08732009-10-27 07:03:04 +00005565 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005566 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005567 } else {
5568 rollback_registered(dev);
5569 /* Finish processing unregister after unlock */
5570 net_set_todo(dev);
5571 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005572}
Eric Dumazet44a08732009-10-27 07:03:04 +00005573EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005574
5575/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005576 * unregister_netdevice_many - unregister many devices
5577 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005578 */
5579void unregister_netdevice_many(struct list_head *head)
5580{
5581 struct net_device *dev;
5582
5583 if (!list_empty(head)) {
5584 rollback_registered_many(head);
5585 list_for_each_entry(dev, head, unreg_list)
5586 net_set_todo(dev);
5587 }
5588}
Eric Dumazet63c80992009-10-27 07:06:49 +00005589EXPORT_SYMBOL(unregister_netdevice_many);
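/*
 * Batching sketch: queue several devices under a single rtnl_lock()
 * and tear them down with one round of notifications, much as
 * default_device_exit_batch() below does for whole namespaces.
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */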
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005590
5591/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005592 * unregister_netdev - remove device from the kernel
5593 * @dev: device
5594 *
5595 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005596 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005597 *
5598 * This is just a wrapper for unregister_netdevice that takes
5599 * the rtnl semaphore. In general you want to use this and not
5600 * unregister_netdevice.
5601 */
5602void unregister_netdev(struct net_device *dev)
5603{
5604 rtnl_lock();
5605 unregister_netdevice(dev);
5606 rtnl_unlock();
5607}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005608EXPORT_SYMBOL(unregister_netdev);
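/*
 * Classic module-exit sketch ("my_dev" is a hypothetical driver
 * global): unregister first, then release the memory once the last
 * reference is gone.
 *
 *	static void __exit my_exit(void)
 *	{
 *		unregister_netdev(my_dev);
 *		free_netdev(my_dev);
 *	}
 */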
5609
Eric W. Biedermance286d32007-09-12 13:53:49 +02005610/**
5611 * dev_change_net_namespace - move device to different nethost namespace
5612 * @dev: device
5613 * @net: network namespace
5614 * @pat: If not NULL name pattern to try if the current device name
5615 * is already taken in the destination network namespace.
5616 *
5617 * This function shuts down a device interface and moves it
5618 * to a new network namespace. On success 0 is returned, on
5619 * a failure a negative errno code is returned.
5620 *
5621 * Callers must hold the rtnl semaphore.
5622 */
5623
5624int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5625{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005626 int err;
5627
5628 ASSERT_RTNL();
5629
5630 /* Don't allow namespace local devices to be moved. */
5631 err = -EINVAL;
5632 if (dev->features & NETIF_F_NETNS_LOCAL)
5633 goto out;
5634
5635 /* Ensure the device has been registered */
5636 err = -EINVAL;
5637 if (dev->reg_state != NETREG_REGISTERED)
5638 goto out;
5639
5640 /* Get out if there is nothing to do */
5641 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005642 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005643 goto out;
5644
5645 /* Pick the destination device name, and ensure
5646 * we can use it in the destination network namespace.
5647 */
5648 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005649 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005650 /* We get here if we can't use the current device name */
5651 if (!pat)
5652 goto out;
Daniel Lezcano8ce6cebc2010-05-19 10:12:19 +00005653 if (dev_get_valid_name(dev, pat, 1))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005654 goto out;
5655 }
5656
5657 /*
5658 * And now a mini version of register_netdevice and unregister_netdevice.
5659 */
5660
5661 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07005662 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005663
5664 /* And unlink it from device chain */
5665 err = -ENODEV;
5666 unlist_netdevice(dev);
5667
5668 synchronize_net();
5669
5670 /* Shutdown queueing discipline. */
5671 dev_shutdown(dev);
5672
5673 /* Notify protocols that we are about to destroy
5674 this device. They should clean up all of their state.
5675 */
5676 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005677 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005678
5679 /*
5680 * Flush the unicast and multicast chains
5681 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005682 dev_uc_flush(dev);
Jiri Pirko22bedad32010-04-01 21:22:57 +00005683 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005684
5685 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005686 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005687
Eric W. Biedermance286d32007-09-12 13:53:49 +02005688 /* If there is an ifindex conflict assign a new one */
5689 if (__dev_get_by_index(net, dev->ifindex)) {
5690 int iflink = (dev->iflink == dev->ifindex);
5691 dev->ifindex = dev_new_index(net);
5692 if (iflink)
5693 dev->iflink = dev->ifindex;
5694 }
5695
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005696 /* Fixup kobjects */
Eric W. Biedermana1b3f592010-05-04 17:36:49 -07005697 err = device_rename(&dev->dev, dev->name);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005698 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005699
5700 /* Add the device back in the hashes */
5701 list_netdevice(dev);
5702
5703 /* Notify protocols that a new device appeared. */
5704 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5705
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005706 /*
5707 * Prevent userspace races by waiting until the network
5708 * device is fully setup before sending notifications.
5709 */
5710 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5711
Eric W. Biedermance286d32007-09-12 13:53:49 +02005712 synchronize_net();
5713 err = 0;
5714out:
5715 return err;
5716}
Johannes Berg463d0182009-07-14 00:33:35 +02005717EXPORT_SYMBOL_GPL(dev_change_net_namespace);
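/*
 * Caller sketch: move a device into another namespace under the rtnl
 * semaphore, falling back to an "eth%d" style pattern if its current
 * name is already taken there.  "net" is a namespace reference the
 * caller already holds.
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, net, "eth%d");
 *	rtnl_unlock();
 */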
Eric W. Biedermance286d32007-09-12 13:53:49 +02005718
Linus Torvalds1da177e2005-04-16 15:20:36 -07005719static int dev_cpu_callback(struct notifier_block *nfb,
5720 unsigned long action,
5721 void *ocpu)
5722{
5723 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005724 struct sk_buff *skb;
5725 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5726 struct softnet_data *sd, *oldsd;
5727
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005728 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005729 return NOTIFY_OK;
5730
5731 local_irq_disable();
5732 cpu = smp_processor_id();
5733 sd = &per_cpu(softnet_data, cpu);
5734 oldsd = &per_cpu(softnet_data, oldcpu);
5735
5736 /* Find end of our completion_queue. */
5737 list_skb = &sd->completion_queue;
5738 while (*list_skb)
5739 list_skb = &(*list_skb)->next;
5740 /* Append completion queue from offline CPU. */
5741 *list_skb = oldsd->completion_queue;
5742 oldsd->completion_queue = NULL;
5743
Linus Torvalds1da177e2005-04-16 15:20:36 -07005744 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00005745 if (oldsd->output_queue) {
5746 *sd->output_queue_tailp = oldsd->output_queue;
5747 sd->output_queue_tailp = oldsd->output_queue_tailp;
5748 oldsd->output_queue = NULL;
5749 oldsd->output_queue_tailp = &oldsd->output_queue;
5750 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005751
5752 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5753 local_irq_enable();
5754
5755 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00005756 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5757 netif_rx(skb);
5758 input_queue_head_incr(oldsd);
5759 }
Tom Herbertfec5e652010-04-16 16:01:27 -07005760 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005761 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00005762 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07005763 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005764
5765 return NOTIFY_OK;
5766}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005767
5768
Herbert Xu7f353bf2007-08-10 15:47:58 -07005769/**
Herbert Xub63365a2008-10-23 01:11:29 -07005770 * netdev_increment_features - increment feature set by one
5771 * @all: current feature set
5772 * @one: new feature set
5773 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07005774 *
5775 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07005776 * @one to the master device with current feature set @all. Will not
5777 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07005778 */
Herbert Xub63365a2008-10-23 01:11:29 -07005779unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5780 unsigned long mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07005781{
Herbert Xub63365a2008-10-23 01:11:29 -07005782 /* If device needs checksumming, downgrade to it. */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005783 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
Herbert Xub63365a2008-10-23 01:11:29 -07005784 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5785 else if (mask & NETIF_F_ALL_CSUM) {
5786 /* If one device supports v4/v6 checksumming, set for all. */
5787 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5788 !(all & NETIF_F_GEN_CSUM)) {
5789 all &= ~NETIF_F_ALL_CSUM;
5790 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5791 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07005792
Herbert Xub63365a2008-10-23 01:11:29 -07005793 /* If one device supports hw checksumming, set for all. */
5794 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5795 all &= ~NETIF_F_ALL_CSUM;
5796 all |= NETIF_F_HW_CSUM;
5797 }
5798 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07005799
Herbert Xub63365a2008-10-23 01:11:29 -07005800 one |= NETIF_F_ALL_CSUM;
Herbert Xu7f353bf2007-08-10 15:47:58 -07005801
Herbert Xub63365a2008-10-23 01:11:29 -07005802 one |= all & NETIF_F_ONE_FOR_ALL;
Sridhar Samudralad9f59502009-10-07 12:24:25 +00005803 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
Herbert Xub63365a2008-10-23 01:11:29 -07005804 all |= one & mask & NETIF_F_ONE_FOR_ALL;
Herbert Xu7f353bf2007-08-10 15:47:58 -07005805
5806 return all;
5807}
Herbert Xub63365a2008-10-23 01:11:29 -07005808EXPORT_SYMBOL(netdev_increment_features);
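/*
 * Aggregation sketch in the style of the bridge's feature
 * recomputation: strip the one-for-all bits from the starting set,
 * then fold in each port so the result reflects what every port can
 * support.  "port_list", "p" and "mask" stand in for the master
 * driver's own state.
 *
 *	features = mask & ~NETIF_F_ONE_FOR_ALL;
 *	list_for_each_entry(p, &port_list, list)
 *		features = netdev_increment_features(features,
 *						     p->dev->features,
 *						     mask);
 */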
Herbert Xu7f353bf2007-08-10 15:47:58 -07005809
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005810static struct hlist_head *netdev_create_hash(void)
5811{
5812 int i;
5813 struct hlist_head *hash;
5814
5815 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5816 if (hash != NULL)
5817 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5818 INIT_HLIST_HEAD(&hash[i]);
5819
5820 return hash;
5821}
5822
Eric W. Biederman881d9662007-09-17 11:56:21 -07005823/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07005824static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07005825{
Eric W. Biederman881d9662007-09-17 11:56:21 -07005826 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07005827
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005828 net->dev_name_head = netdev_create_hash();
5829 if (net->dev_name_head == NULL)
5830 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005831
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005832 net->dev_index_head = netdev_create_hash();
5833 if (net->dev_index_head == NULL)
5834 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005835
5836 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005837
5838err_idx:
5839 kfree(net->dev_name_head);
5840err_name:
5841 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005842}
5843
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005844/**
5845 * netdev_drivername - network driver for the device
5846 * @dev: network device
5847 * @buffer: buffer for resulting name
5848 * @len: size of buffer
5849 *
5850 * Determine network driver for device.
5851 */
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07005852char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
Arjan van de Ven6579e572008-07-21 13:31:48 -07005853{
Stephen Hemmingercf04a4c72008-09-30 02:22:14 -07005854 const struct device_driver *driver;
5855 const struct device *parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07005856
5857 if (len <= 0 || !buffer)
5858 return buffer;
5859 buffer[0] = 0;
5860
5861 parent = dev->dev.parent;
5862
5863 if (!parent)
5864 return buffer;
5865
5866 driver = parent->driver;
5867 if (driver && driver->name)
5868 strlcpy(buffer, driver->name, len);
5869 return buffer;
5870}
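/*
 * Usage sketch, mirroring the watchdog timeout message in
 * sch_generic.c: the name lands in a caller-provided buffer, so the
 * helper is safe to use from atomic context.
 *
 *	char drivername[64];
 *
 *	printk(KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue timed out\n",
 *	       dev->name, netdev_drivername(dev, drivername, 64));
 */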
5871
Joe Perches256df2f2010-06-27 01:02:35 +00005872static int __netdev_printk(const char *level, const struct net_device *dev,
5873 struct va_format *vaf)
5874{
5875 int r;
5876
5877 if (dev && dev->dev.parent)
5878 r = dev_printk(level, dev->dev.parent, "%s: %pV",
5879 netdev_name(dev), vaf);
5880 else if (dev)
5881 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
5882 else
5883 r = printk("%s(NULL net_device): %pV", level, vaf);
5884
5885 return r;
5886}
5887
5888int netdev_printk(const char *level, const struct net_device *dev,
5889 const char *format, ...)
5890{
5891 struct va_format vaf;
5892 va_list args;
5893 int r;
5894
5895 va_start(args, format);
5896
5897 vaf.fmt = format;
5898 vaf.va = &args;
5899
5900 r = __netdev_printk(level, dev, &vaf);
5901 va_end(args);
5902
5903 return r;
5904}
5905EXPORT_SYMBOL(netdev_printk);
5906
5907#define define_netdev_printk_level(func, level) \
5908int func(const struct net_device *dev, const char *fmt, ...) \
5909{ \
5910 int r; \
5911 struct va_format vaf; \
5912 va_list args; \
5913 \
5914 va_start(args, fmt); \
5915 \
5916 vaf.fmt = fmt; \
5917 vaf.va = &args; \
5918 \
5919 r = __netdev_printk(level, dev, &vaf); \
5920 va_end(args); \
5921 \
5922 return r; \
5923} \
5924EXPORT_SYMBOL(func);
5925
5926define_netdev_printk_level(netdev_emerg, KERN_EMERG);
5927define_netdev_printk_level(netdev_alert, KERN_ALERT);
5928define_netdev_printk_level(netdev_crit, KERN_CRIT);
5929define_netdev_printk_level(netdev_err, KERN_ERR);
5930define_netdev_printk_level(netdev_warn, KERN_WARNING);
5931define_netdev_printk_level(netdev_notice, KERN_NOTICE);
5932define_netdev_printk_level(netdev_info, KERN_INFO);
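/*
 * Driver-side sketch: each generated level behaves like dev_printk()
 * with the interface name prefixed, e.g.
 *
 *	netdev_err(dev, "failed to allocate rx buffer %d\n", i);
 *
 * prints via the parent device when one is registered, and falls back
 * to plain printk() otherwise.
 */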
5933
Pavel Emelyanov46650792007-10-08 20:38:39 -07005934static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07005935{
5936 kfree(net->dev_name_head);
5937 kfree(net->dev_index_head);
5938}
5939
Denis V. Lunev022cbae2007-11-13 03:23:50 -08005940static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07005941 .init = netdev_init,
5942 .exit = netdev_exit,
5943};
5944
Pavel Emelyanov46650792007-10-08 20:38:39 -07005945static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02005946{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005947 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02005948 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005949 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02005950 * initial network namespace
5951 */
5952 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005953 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005954 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005955 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02005956
5957 /* Ignore unmoveable devices (i.e. loopback) */
5958 if (dev->features & NETIF_F_NETNS_LOCAL)
5959 continue;
5960
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005961 /* Leave virtual devices for the generic cleanup */
5962 if (dev->rtnl_link_ops)
5963 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08005964
Eric W. Biedermance286d32007-09-12 13:53:49 +02005965 /* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005966 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5967 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005968 if (err) {
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005969 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
Eric W. Biedermance286d32007-09-12 13:53:49 +02005970 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005971 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02005972 }
5973 }
5974 rtnl_unlock();
5975}
5976
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00005977static void __net_exit default_device_exit_batch(struct list_head *net_list)
5978{
5979 /* At exit all network devices must be removed from a network
5980 * namespace. Do this in the reverse order of registration.
5981 * Do this across as many network namespaces as possible to
5982 * improve batching efficiency.
5983 */
5984 struct net_device *dev;
5985 struct net *net;
5986 LIST_HEAD(dev_kill_list);
5987
5988 rtnl_lock();
5989 list_for_each_entry(net, net_list, exit_list) {
5990 for_each_netdev_reverse(net, dev) {
5991 if (dev->rtnl_link_ops)
5992 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5993 else
5994 unregister_netdevice_queue(dev, &dev_kill_list);
5995 }
5996 }
5997 unregister_netdevice_many(&dev_kill_list);
5998 rtnl_unlock();
5999}
6000
Denis V. Lunev022cbae2007-11-13 03:23:50 -08006001static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02006002 .exit = default_device_exit,
Eric W. Biederman04dc7f6b2009-12-03 02:29:04 +00006003 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02006004};
6005
Linus Torvalds1da177e2005-04-16 15:20:36 -07006006/*
6007 * Initialize the DEV module. At boot time this walks the device list and
6008 * unhooks any devices that fail to initialise (normally hardware not
6009 * present) and leaves us with a valid list of present and active devices.
6010 *
6011 */
6012
6013/*
6014 * This is called single threaded during boot, so no need
6015 * to take the rtnl semaphore.
6016 */
6017static int __init net_dev_init(void)
6018{
6019 int i, rc = -ENOMEM;
6020
6021 BUG_ON(!dev_boot_phase);
6022
Linus Torvalds1da177e2005-04-16 15:20:36 -07006023 if (dev_proc_init())
6024 goto out;
6025
Eric W. Biederman8b41d182007-09-26 22:02:53 -07006026 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07006027 goto out;
6028
6029 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a8672007-11-26 20:12:58 +08006030 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006031 INIT_LIST_HEAD(&ptype_base[i]);
6032
Eric W. Biederman881d9662007-09-17 11:56:21 -07006033 if (register_pernet_subsys(&netdev_net_ops))
6034 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006035
6036 /*
6037 * Initialise the packet receive queues.
6038 */
6039
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07006040 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006041 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006042
Changli Gaodee42872010-05-02 05:42:16 +00006043 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006044 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07006045 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006046 sd->completion_queue = NULL;
6047 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00006048 sd->output_queue = NULL;
6049 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00006050#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006051 sd->csd.func = rps_trigger_softirq;
6052 sd->csd.info = sd;
6053 sd->csd.flags = 0;
6054 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07006055#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00006056
Eric Dumazete36fa2f2010-04-19 21:17:14 +00006057 sd->backlog.poll = process_backlog;
6058 sd->backlog.weight = weight_p;
6059 sd->backlog.gro_list = NULL;
6060 sd->backlog.gro_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006061 }
6062
Linus Torvalds1da177e2005-04-16 15:20:36 -07006063 dev_boot_phase = 0;
6064
Eric W. Biederman505d4f72008-11-07 22:54:20 -08006065 /* The loopback device is special: if any other network device
6066 * is present in a network namespace, the loopback device must be
6067 * present too. Since we now dynamically allocate and free the
6068 * loopback device, ensure this invariant is maintained by
6069 * keeping the loopback device the first device on the list of
6070 * network devices, so that it is the first device that appears
6071 * and the last network device that disappears.
6073 */
6074 if (register_pernet_device(&loopback_net_ops))
6075 goto out;
6076
6077 if (register_pernet_device(&default_device_ops))
6078 goto out;
6079
Carlos R. Mafra962cf362008-05-15 11:15:37 -03006080 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6081 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006082
6083 hotcpu_notifier(dev_cpu_callback, 0);
6084 dst_init();
6085 dev_mcast_init();
6086 rc = 0;
6087out:
6088 return rc;
6089}
6090
6091subsys_initcall(net_dev_init);
6092
Krishna Kumare88721f2009-02-18 17:55:02 -08006093static int __init initialize_hashrnd(void)
6094{
Tom Herbert0a9627f2010-03-16 08:03:29 +00006095 get_random_bytes(&hashrnd, sizeof(hashrnd));
Krishna Kumare88721f2009-02-18 17:55:02 -08006096 return 0;
6097}
6098
6099late_initcall_sync(initialize_hashrnd);
6100