/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix: Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <linux/if_bridge.h>
#include <linux/if_macvlan.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <linux/pci.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *	       --BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);
	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
	 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
	 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
	 ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
	 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
	 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
	 "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if a protocol handler that mangles packets
 *	is first on the list, it cannot sense that the packet is cloned
 *	and should be copied-on-write, so it will change it and
 *	subsequent readers will get a broken packet.
 *							--ANK (980803)
 */

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that
 *	CPUs in the middle of receiving packets will see the new
 *	packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	int hash;

	spin_lock_bh(&ptype_lock);
	if (pt->type == htons(ETH_P_ALL))
		list_add_rcu(&pt->list, &ptype_all);
	else {
		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
		list_add_rcu(&pt->list, &ptype_base[hash]);
	}
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
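
/*
 * Example (editorial sketch, not part of the original file): a module
 * that taps every packet might pair dev_add_pack()/dev_remove_pack()
 * like this. my_tap_rcv and my_tap are hypothetical names; the handler
 * signature matches struct packet_type in this kernel.
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);		// release the skb handed to this tap
 *		return 0;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type = htons(ETH_P_ALL),	// goes on the ptype_all chain
 *		.func = my_tap_rcv,
 *	};
 *
 *	dev_add_pack(&my_tap);		// in module init
 *	dev_remove_pack(&my_tap);	// in module exit
 */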

/**
 *	__dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head;
	struct packet_type *pt1;

	spin_lock_bh(&ptype_lock);

	if (pt->type == htons(ETH_P_ALL))
		head = &ptype_all;
	else
		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
out:
	spin_unlock_bh(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add - add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine
 *	for all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check - check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings are found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base - get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name - find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
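
/*
 * Example (editorial sketch, not part of the original file): a typical
 * refcounted lookup from process context. The name "eth0" is
 * illustrative only.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);	// drop the reference the lookup took
 *	}
 */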

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
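
/*
 * Example (editorial sketch): under rcu_read_lock() the _rcu variant
 * avoids the refcount round-trip, provided the pointer is not used
 * once the read-side critical section ends.
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		... use dev; no dev_hold() needed while under RCU ...
 *	rcu_read_unlock();
 */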

/**
 *	dev_getbyhwaddr - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device. The caller must hold the
 *	rtnl semaphore. The returned device has not had its ref count
 *	increased and the caller must therefore be careful about locking.
 *
 *	BUGS:
 *	If the API was consistent this would be __dev_get_by_hwaddr
 */

struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
{
	struct net_device *dev;

	ASSERT_RTNL();

	for_each_netdev(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
				    unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	rcu_read_lock();
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags);
836/**
837 * dev_valid_name - check if name is okay for network device
838 * @name: name string
839 *
840 * Network device names need to be valid file names to
David S. Millerc7fa9d12006-08-15 16:34:13 -0700841 * to allow sysfs to work. We also disallow any kind of
842 * whitespace.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700843 */
Mitch Williamsc2373ee2005-11-09 10:34:45 -0800844int dev_valid_name(const char *name)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700845{
David S. Millerc7fa9d12006-08-15 16:34:13 -0700846 if (*name == '\0')
847 return 0;
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -0700848 if (strlen(name) >= IFNAMSIZ)
849 return 0;
David S. Millerc7fa9d12006-08-15 16:34:13 -0700850 if (!strcmp(name, ".") || !strcmp(name, ".."))
851 return 0;
852
853 while (*name) {
854 if (*name == '/' || isspace(*name))
855 return 0;
856 name++;
857 }
858 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700859}
Eric Dumazetd1b19df2009-09-03 01:29:39 -0700860EXPORT_SYMBOL(dev_valid_name);
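
/*
 * Example (editorial note): per the checks above, "eth0" and "wan%d"
 * pass, while "", ".", "..", "a/b" and "my if" are rejected, as is any
 * name of IFNAMSIZ or more characters.
 */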

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" - it will try and find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be exactly one "%d" and no other
		 * "%" characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" - it will try and find a suitable
 *	id. It scans the list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
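
/*
 * Example (editorial sketch): a driver that does not care which unit
 * number it gets hands in a format string and lets the core choose the
 * lowest free slot:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *	... dev->name is now e.g. "eth2", err is the unit number ...
 */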

static int dev_get_valid_name(struct net_device *dev, const char *name, bool fmt)
{
	struct net *net;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (fmt && strchr(name, '%'))
		return dev_alloc_name(dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d"
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
		return 0;

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(dev, newname, 1);
	if (err < 0)
		return err;

rollback:
	/* For now only devices in the initial network namespace
	 * are in sysfs.
	 */
	if (net_eq(net, &init_net)) {
		ret = device_rename(&dev->dev, dev->name);
		if (ret) {
			memcpy(dev->name, oldname, IFNAMSIZ);
			return ret;
		}
	}

	write_lock_bh(&dev_base_lock);
	hlist_del(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			printk(KERN_ERR
			       "%s: name change rollback failed: %d.\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		if (dev->ifalias) {
			kfree(dev->ifalias);
			dev->ifalias = NULL;
		}
		return 0;
	}

	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!dev->ifalias)
		return -ENOMEM;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}

/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	return call_netdevice_notifiers(event, dev);
}
EXPORT_SYMBOL(netdev_bonding_change);

/**
 *	dev_load 	- load a network module
 *	@net: the applicable net namespace
 *	@name: name of interface
 *
 *	If a network interface is not present and the process has suitable
 *	privileges this function loads the module. If module loading is not
 *	available in this kernel then it becomes a nop.
 */
void dev_load(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	rcu_read_unlock();

	if (!dev && capable(CAP_NET_ADMIN))
		request_module("%s", name);
}
EXPORT_SYMBOL(dev_load);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	/*
	 *	Is it even present?
	 */
	if (!netif_device_present(dev))
		return -ENODEV;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	/*
	 *	Call device private open method
	 */
	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	/*
	 *	If it went open OK then:
	 */

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		/*
		 *	Set the flags.
		 */
		dev->flags |= IFF_UP;

		/*
		 *	Enable NET_DMA
		 */
		net_dmaengine_get();

		/*
		 *	Initialize multicasting status
		 */
		dev_set_rx_mode(dev);

		/*
		 *	Wakeup transmit queue engine
		 */
		dev_activate(dev);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	/*
	 *	Is it already up?
	 */
	if (dev->flags & IFF_UP)
		return 0;

	/*
	 *	Open device
	 */
	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	/*
	 *	... and announce new interface.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
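
/*
 * Example (editorial sketch): __dev_open() asserts the RTNL lock, so
 * in-kernel callers bring an interface up like this:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */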

static int __dev_close(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	ASSERT_RTNL();
	might_sleep();

	/*
	 *	Tell people we are going down, so that they can
	 *	prepare for death while the device is still operating.
	 */
	call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

	clear_bit(__LINK_STATE_START, &dev->state);

	/* Synchronize to scheduled poll. We cannot touch poll list,
	 * it can be even on different cpu. So just clear netif_running().
	 *
	 * dev->stop() will invoke napi_disable() on all of its
	 * napi_struct instances on this device.
	 */
	smp_mb__after_clear_bit(); /* Commit netif_running(). */

	dev_deactivate(dev);

	/*
	 *	Call the device specific close. This cannot fail.
	 *	Only if device is UP
	 *
	 *	We allow it to be called even after a DETACH hot-plug
	 *	event.
	 */
	if (ops->ndo_stop)
		ops->ndo_stop(dev);

	/*
	 *	Device is now down.
	 */

	dev->flags &= ~IFF_UP;

	/*
	 *	Shutdown NET_DMA
	 */
	net_dmaengine_put();

	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (!(dev->flags & IFF_UP))
		return 0;

	__dev_close(dev);

	/*
	 *	Tell people we are down
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_DOWN, dev);

	return 0;
}
EXPORT_SYMBOL(dev_close);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001312
1313
Ben Hutchings0187bdf2008-06-19 16:15:47 -07001314/**
1315 * dev_disable_lro - disable Large Receive Offload on a device
1316 * @dev: device
1317 *
1318 * Disable Large Receive Offload (LRO) on a net device. Must be
1319 * called under RTNL. This is needed if received packets may be
1320 * forwarded to another interface.
1321 */
1322void dev_disable_lro(struct net_device *dev)
1323{
1324 if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1325 dev->ethtool_ops->set_flags) {
1326 u32 flags = dev->ethtool_ops->get_flags(dev);
1327 if (flags & ETH_FLAG_LRO) {
1328 flags &= ~ETH_FLAG_LRO;
1329 dev->ethtool_ops->set_flags(dev, flags);
1330 }
1331 }
1332 WARN_ON(dev->features & NETIF_F_LRO);
1333}
1334EXPORT_SYMBOL(dev_disable_lro);
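
/*
 * Example (sketch): a hypothetical forwarding/bonding setup path that
 * turns off LRO before packets from this device may be forwarded.
 * example_enslave() is not an existing kernel function.
 */
static void example_enslave(struct net_device *dev)
{
        ASSERT_RTNL();
        dev_disable_lro(dev);
        /* ... continue with forwarding specific setup ... */
}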
1335
1336
Eric W. Biederman881d9662007-09-17 11:56:21 -07001337static int dev_boot_phase = 1;
1338
Linus Torvalds1da177e2005-04-16 15:20:36 -07001339/*
1340 * Device change register/unregister. These are not inline or static
1341 * as we export them to the world.
1342 */
1343
1344/**
1345 * register_netdevice_notifier - register a network notifier block
1346 * @nb: notifier
1347 *
1348 * Register a notifier to be called when network device events occur.
1349 * The notifier passed is linked into the kernel structures and must
1350 * not be reused until it has been unregistered. A negative errno code
1351 * is returned on a failure.
1352 *
1353 * When registered all registration and up events are replayed
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001354 * to the new notifier to allow the device to have a race-free
Linus Torvalds1da177e2005-04-16 15:20:36 -07001355 * view of the network device list.
1356 */
1357
1358int register_netdevice_notifier(struct notifier_block *nb)
1359{
1360 struct net_device *dev;
Herbert Xufcc5a032007-07-30 17:03:38 -07001361 struct net_device *last;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001362 struct net *net;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001363 int err;
1364
1365 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001366 err = raw_notifier_chain_register(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001367 if (err)
1368 goto unlock;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001369 if (dev_boot_phase)
1370 goto unlock;
1371 for_each_net(net) {
1372 for_each_netdev(net, dev) {
1373 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1374 err = notifier_to_errno(err);
1375 if (err)
1376 goto rollback;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001377
Eric W. Biederman881d9662007-09-17 11:56:21 -07001378 if (!(dev->flags & IFF_UP))
1379 continue;
Herbert Xufcc5a032007-07-30 17:03:38 -07001380
Eric W. Biederman881d9662007-09-17 11:56:21 -07001381 nb->notifier_call(nb, NETDEV_UP, dev);
1382 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001383 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001384
1385unlock:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001386 rtnl_unlock();
1387 return err;
Herbert Xufcc5a032007-07-30 17:03:38 -07001388
1389rollback:
1390 last = dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07001391 for_each_net(net) {
1392 for_each_netdev(net, dev) {
1393 if (dev == last)
1394 break;
Herbert Xufcc5a032007-07-30 17:03:38 -07001395
Eric W. Biederman881d9662007-09-17 11:56:21 -07001396 if (dev->flags & IFF_UP) {
1397 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1398 nb->notifier_call(nb, NETDEV_DOWN, dev);
1399 }
1400 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00001401 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07001402 }
Herbert Xufcc5a032007-07-30 17:03:38 -07001403 }
Pavel Emelyanovc67625a2007-11-14 15:53:16 -08001404
1405 raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xufcc5a032007-07-30 17:03:38 -07001406 goto unlock;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001407}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001408EXPORT_SYMBOL(register_netdevice_notifier);
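
/*
 * Example (sketch): a minimal netdevice notifier.  Because registration
 * replays NETDEV_REGISTER and NETDEV_UP for devices that already exist,
 * the callback below also runs for them.  The example_* identifiers are
 * hypothetical.
 */
static int example_netdev_event(struct notifier_block *nb,
                                unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;

        switch (event) {
        case NETDEV_UP:
                printk(KERN_INFO "example: %s is up\n", dev->name);
                break;
        case NETDEV_GOING_DOWN:
                printk(KERN_INFO "example: %s is going down\n", dev->name);
                break;
        }
        return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
        .notifier_call = example_netdev_event,
};

static int __init example_notifier_init(void)
{
        return register_netdevice_notifier(&example_netdev_notifier);
}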
Linus Torvalds1da177e2005-04-16 15:20:36 -07001409
1410/**
1411 * unregister_netdevice_notifier - unregister a network notifier block
1412 * @nb: notifier
1413 *
1414 * Unregister a notifier previously registered by
 1415 * register_netdevice_notifier(). The notifier is unlinked from the
1416 * kernel structures and may then be reused. A negative errno code
1417 * is returned on a failure.
1418 */
1419
1420int unregister_netdevice_notifier(struct notifier_block *nb)
1421{
Herbert Xu9f514952006-03-25 01:24:25 -08001422 int err;
1423
1424 rtnl_lock();
Alan Sternf07d5b92006-05-09 15:23:03 -07001425 err = raw_notifier_chain_unregister(&netdev_chain, nb);
Herbert Xu9f514952006-03-25 01:24:25 -08001426 rtnl_unlock();
1427 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001429EXPORT_SYMBOL(unregister_netdevice_notifier);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430
1431/**
1432 * call_netdevice_notifiers - call all network notifier blocks
1433 * @val: value passed unmodified to notifier function
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07001434 * @dev: net_device pointer passed unmodified to notifier function
Linus Torvalds1da177e2005-04-16 15:20:36 -07001435 *
1436 * Call all network notifier blocks. Parameters and return value
Alan Sternf07d5b92006-05-09 15:23:03 -07001437 * are as for raw_notifier_call_chain().
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438 */
1439
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001440int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001441{
Jiri Pirkoab930472010-04-20 01:45:37 -07001442 ASSERT_RTNL();
Eric W. Biedermanad7379d2007-09-16 15:33:32 -07001443 return raw_notifier_call_chain(&netdev_chain, val, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444}
1445
1446/* When > 0 there are consumers of rx skb time stamps */
1447static atomic_t netstamp_needed = ATOMIC_INIT(0);
1448
1449void net_enable_timestamp(void)
1450{
1451 atomic_inc(&netstamp_needed);
1452}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001453EXPORT_SYMBOL(net_enable_timestamp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001454
1455void net_disable_timestamp(void)
1456{
1457 atomic_dec(&netstamp_needed);
1458}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001459EXPORT_SYMBOL(net_disable_timestamp);
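
/*
 * Example (sketch): a protocol that wants rx timestamps holds a
 * reference on netstamp_needed while the option is set, mirroring what
 * the socket timestamping code does.  example_set_rx_tstamp() is
 * hypothetical.
 */
static void example_set_rx_tstamp(struct sock *sk, int on)
{
        if (on && !sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_set_flag(sk, SOCK_TIMESTAMP);
                net_enable_timestamp();
        } else if (!on && sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
}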
Linus Torvalds1da177e2005-04-16 15:20:36 -07001460
Eric Dumazet3b098e22010-05-15 23:57:10 -07001461static inline void net_timestamp_set(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001462{
1463 if (atomic_read(&netstamp_needed))
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001464 __net_timestamp(skb);
Eric Dumazetb7aa0bf2007-04-19 16:16:32 -07001465 else
1466 skb->tstamp.tv64 = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467}
1468
Eric Dumazet3b098e22010-05-15 23:57:10 -07001469static inline void net_timestamp_check(struct sk_buff *skb)
1470{
1471 if (!skb->tstamp.tv64 && atomic_read(&netstamp_needed))
1472 __net_timestamp(skb);
1473}
1474
Arnd Bergmann44540962009-11-26 06:07:08 +00001475/**
1476 * dev_forward_skb - loopback an skb to another netif
1477 *
1478 * @dev: destination network device
1479 * @skb: buffer to forward
1480 *
1481 * return values:
1482 * NET_RX_SUCCESS (no congestion)
Eric Dumazet6ec82562010-05-06 00:53:53 -07001483 * NET_RX_DROP (packet was dropped, but freed)
Arnd Bergmann44540962009-11-26 06:07:08 +00001484 *
1485 * dev_forward_skb can be used for injecting an skb from the
1486 * start_xmit function of one device into the receive queue
1487 * of another device.
1488 *
1489 * The receiving device may be in another namespace, so
1490 * we have to clear all information in the skb that could
1491 * impact namespace isolation.
1492 */
1493int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1494{
1495 skb_orphan(skb);
1496
Eric Dumazet6ec82562010-05-06 00:53:53 -07001497 if (!(dev->flags & IFF_UP) ||
1498 (skb->len > (dev->mtu + dev->hard_header_len))) {
1499 kfree_skb(skb);
Arnd Bergmann44540962009-11-26 06:07:08 +00001500 return NET_RX_DROP;
Eric Dumazet6ec82562010-05-06 00:53:53 -07001501 }
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001502 skb_set_dev(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001503 skb->tstamp.tv64 = 0;
1504 skb->pkt_type = PACKET_HOST;
1505 skb->protocol = eth_type_trans(skb, dev);
Arnd Bergmann44540962009-11-26 06:07:08 +00001506 return netif_rx(skb);
1507}
1508EXPORT_SYMBOL_GPL(dev_forward_skb);
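
/*
 * Example (sketch): how a veth-style pair device could implement its
 * transmit routine by injecting each packet into the peer's receive
 * path.  struct example_priv and the example_* names are hypothetical.
 */
struct example_priv {
        struct net_device *peer;
};

static netdev_tx_t example_pair_xmit(struct sk_buff *skb,
                                     struct net_device *dev)
{
        struct example_priv *priv = netdev_priv(dev);

        /* dev_forward_skb() consumes the skb on success and on drop. */
        dev_forward_skb(priv->peer, skb);
        return NETDEV_TX_OK;
}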
1509
Linus Torvalds1da177e2005-04-16 15:20:36 -07001510/*
1511 * Support routine. Sends outgoing frames to any network
1512 * taps currently in use.
1513 */
1514
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001515static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516{
1517 struct packet_type *ptype;
Patrick McHardya61bbcf2005-08-14 17:24:31 -07001518
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001519#ifdef CONFIG_NET_CLS_ACT
1520 if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
Eric Dumazet3b098e22010-05-15 23:57:10 -07001521 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001522#else
Eric Dumazet3b098e22010-05-15 23:57:10 -07001523 net_timestamp_set(skb);
Jarek Poplawski8caf1532009-04-17 10:08:49 +00001524#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001525
1526 rcu_read_lock();
1527 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1528 /* Never send packets back to the socket
1529 * they originated from - MvS (miquels@drinkel.ow.org)
1530 */
1531 if ((ptype->dev == dev || !ptype->dev) &&
1532 (ptype->af_packet_priv == NULL ||
1533 (struct sock *)ptype->af_packet_priv != skb->sk)) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001534 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001535 if (!skb2)
1536 break;
1537
1538 /* skb->nh should be correctly
 1539 set by the sender, so the check below is
1540 just protection against buggy protocols.
1541 */
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001542 skb_reset_mac_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001544 if (skb_network_header(skb2) < skb2->data ||
Arnaldo Carvalho de Melo27a884d2007-04-19 20:29:13 -07001545 skb2->network_header > skb2->tail) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001546 if (net_ratelimit())
1547 printk(KERN_CRIT "protocol %04x is "
1548 "buggy, dev %s\n",
1549 skb2->protocol, dev->name);
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07001550 skb_reset_network_header(skb2);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001551 }
1552
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001553 skb2->transport_header = skb2->network_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001554 skb2->pkt_type = PACKET_OUTGOING;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07001555 ptype->func(skb2, skb->dev, ptype, skb->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001556 }
1557 }
1558 rcu_read_unlock();
1559}
1560
Denis Vlasenko56079432006-03-29 15:57:29 -08001561
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001562static inline void __netif_reschedule(struct Qdisc *q)
1563{
1564 struct softnet_data *sd;
1565 unsigned long flags;
1566
1567 local_irq_save(flags);
1568 sd = &__get_cpu_var(softnet_data);
Changli Gaoa9cbd582010-04-26 23:06:24 +00001569 q->next_sched = NULL;
1570 *sd->output_queue_tailp = q;
1571 sd->output_queue_tailp = &q->next_sched;
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001572 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1573 local_irq_restore(flags);
1574}
1575
David S. Miller37437bb2008-07-16 02:15:04 -07001576void __netif_schedule(struct Qdisc *q)
Denis Vlasenko56079432006-03-29 15:57:29 -08001577{
Jarek Poplawskidef82a12008-08-17 21:54:43 -07001578 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1579 __netif_reschedule(q);
Denis Vlasenko56079432006-03-29 15:57:29 -08001580}
1581EXPORT_SYMBOL(__netif_schedule);
1582
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001583void dev_kfree_skb_irq(struct sk_buff *skb)
Denis Vlasenko56079432006-03-29 15:57:29 -08001584{
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001585 if (atomic_dec_and_test(&skb->users)) {
1586 struct softnet_data *sd;
1587 unsigned long flags;
Denis Vlasenko56079432006-03-29 15:57:29 -08001588
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001589 local_irq_save(flags);
1590 sd = &__get_cpu_var(softnet_data);
1591 skb->next = sd->completion_queue;
1592 sd->completion_queue = skb;
1593 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1594 local_irq_restore(flags);
1595 }
Denis Vlasenko56079432006-03-29 15:57:29 -08001596}
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001597EXPORT_SYMBOL(dev_kfree_skb_irq);
Denis Vlasenko56079432006-03-29 15:57:29 -08001598
1599void dev_kfree_skb_any(struct sk_buff *skb)
1600{
1601 if (in_irq() || irqs_disabled())
1602 dev_kfree_skb_irq(skb);
1603 else
1604 dev_kfree_skb(skb);
1605}
1606EXPORT_SYMBOL(dev_kfree_skb_any);
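
/*
 * Example (sketch): a tx-completion helper that may be invoked from a
 * hardirq handler or from process context.  dev_kfree_skb_any() picks
 * the safe variant automatically, deferring the free to the NET_TX
 * softirq when interrupts are disabled.
 */
static void example_tx_complete(struct sk_buff *skb)
{
        dev_kfree_skb_any(skb);
}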
1607
1608
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001609/**
1610 * netif_device_detach - mark device as removed
1611 * @dev: network device
1612 *
1613 * Mark device as removed from system and therefore no longer available.
1614 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001615void netif_device_detach(struct net_device *dev)
1616{
1617 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1618 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001619 netif_tx_stop_all_queues(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001620 }
1621}
1622EXPORT_SYMBOL(netif_device_detach);
1623
Stephen Hemmingerbea33482007-10-03 16:41:36 -07001624/**
1625 * netif_device_attach - mark device as attached
1626 * @dev: network device
1627 *
1628 * Mark device as attached from system and restart if needed.
1629 */
Denis Vlasenko56079432006-03-29 15:57:29 -08001630void netif_device_attach(struct net_device *dev)
1631{
1632 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1633 netif_running(dev)) {
Alexander Duyckd5431032009-04-08 13:15:22 +00001634 netif_tx_wake_all_queues(dev);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001635 __netdev_watchdog_up(dev);
Denis Vlasenko56079432006-03-29 15:57:29 -08001636 }
1637}
1638EXPORT_SYMBOL(netif_device_attach);
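
/*
 * Example (sketch): typical suspend/resume hooks of a network driver.
 * Detaching stops all tx queues so the stack will not hand us packets
 * while the hardware is powered down; attaching restarts the queues and
 * the watchdog.  The example_* names are hypothetical.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);
        /* ... put the hardware into a low-power state ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... reinitialize the hardware ... */
        netif_device_attach(dev);
        return 0;
}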
1639
Ben Hutchings6de329e2008-06-16 17:02:28 -07001640static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1641{
1642 return ((features & NETIF_F_GEN_CSUM) ||
1643 ((features & NETIF_F_IP_CSUM) &&
1644 protocol == htons(ETH_P_IP)) ||
1645 ((features & NETIF_F_IPV6_CSUM) &&
Yi Zou1c8dbcf2009-02-27 14:06:54 -08001646 protocol == htons(ETH_P_IPV6)) ||
1647 ((features & NETIF_F_FCOE_CRC) &&
1648 protocol == htons(ETH_P_FCOE)));
Ben Hutchings6de329e2008-06-16 17:02:28 -07001649}
1650
1651static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1652{
1653 if (can_checksum_protocol(dev->features, skb->protocol))
1654 return true;
1655
1656 if (skb->protocol == htons(ETH_P_8021Q)) {
1657 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1658 if (can_checksum_protocol(dev->features & dev->vlan_features,
1659 veh->h_vlan_encapsulated_proto))
1660 return true;
1661 }
1662
1663 return false;
1664}
Denis Vlasenko56079432006-03-29 15:57:29 -08001665
Arnd Bergmann8a83a002010-01-30 12:23:03 +00001666/**
1667 * skb_dev_set -- assign a new device to a buffer
1668 * @skb: buffer for the new device
1669 * @dev: network device
1670 *
1671 * If an skb is owned by a device already, we have to reset
1672 * all data private to the namespace a device belongs to
1673 * before assigning it a new device.
1674 */
1675#ifdef CONFIG_NET_NS
1676void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1677{
1678 skb_dst_drop(skb);
1679 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1680 secpath_reset(skb);
1681 nf_reset(skb);
1682 skb_init_secmark(skb);
1683 skb->mark = 0;
1684 skb->priority = 0;
1685 skb->nf_trace = 0;
1686 skb->ipvs_property = 0;
1687#ifdef CONFIG_NET_SCHED
1688 skb->tc_index = 0;
1689#endif
1690 }
1691 skb->dev = dev;
1692}
1693EXPORT_SYMBOL(skb_set_dev);
1694#endif /* CONFIG_NET_NS */
1695
Linus Torvalds1da177e2005-04-16 15:20:36 -07001696/*
1697 * Invalidate hardware checksum when packet is to be mangled, and
1698 * complete checksum manually on outgoing path.
1699 */
Patrick McHardy84fa7932006-08-29 16:44:56 -07001700int skb_checksum_help(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001701{
Al Virod3bc23e2006-11-14 21:24:49 -08001702 __wsum csum;
Herbert Xu663ead32007-04-09 11:59:07 -07001703 int ret = 0, offset;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001704
Patrick McHardy84fa7932006-08-29 16:44:56 -07001705 if (skb->ip_summed == CHECKSUM_COMPLETE)
Herbert Xua430a432006-07-08 13:34:56 -07001706 goto out_set_summed;
1707
1708 if (unlikely(skb_shinfo(skb)->gso_size)) {
Herbert Xua430a432006-07-08 13:34:56 -07001709 /* Let GSO fix up the checksum. */
1710 goto out_set_summed;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001711 }
1712
Herbert Xua0308472007-10-15 01:47:15 -07001713 offset = skb->csum_start - skb_headroom(skb);
1714 BUG_ON(offset >= skb_headlen(skb));
1715 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1716
1717 offset += skb->csum_offset;
1718 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1719
1720 if (skb_cloned(skb) &&
1721 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1723 if (ret)
1724 goto out;
1725 }
1726
Herbert Xua0308472007-10-15 01:47:15 -07001727 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
Herbert Xua430a432006-07-08 13:34:56 -07001728out_set_summed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001729 skb->ip_summed = CHECKSUM_NONE;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001730out:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731 return ret;
1732}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001733EXPORT_SYMBOL(skb_checksum_help);
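
/*
 * Example (sketch): a driver whose hardware can only checksum IPv4
 * packets falls back to software checksumming for everything else.
 * The protocol test stands in for a real hardware capability check.
 */
static int example_tx_fixup_csum(struct sk_buff *skb)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            skb->protocol != htons(ETH_P_IP))
                return skb_checksum_help(skb);

        return 0;
}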
Linus Torvalds1da177e2005-04-16 15:20:36 -07001734
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001735/**
1736 * skb_gso_segment - Perform segmentation on skb.
1737 * @skb: buffer to segment
Herbert Xu576a30e2006-06-27 13:22:38 -07001738 * @features: features for the output path (see dev->features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001739 *
1740 * This function segments the given skb and returns a list of segments.
Herbert Xu576a30e2006-06-27 13:22:38 -07001741 *
1742 * It may return NULL if the skb requires no segmentation. This is
1743 * only possible when GSO is used for verifying header integrity.
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001744 */
Herbert Xu576a30e2006-06-27 13:22:38 -07001745struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001746{
1747 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1748 struct packet_type *ptype;
Al Viro252e3342006-11-14 20:48:11 -08001749 __be16 type = skb->protocol;
Herbert Xua430a432006-07-08 13:34:56 -07001750 int err;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001751
Arnaldo Carvalho de Melo459a98e2007-03-19 15:30:44 -07001752 skb_reset_mac_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07001753 skb->mac_len = skb->network_header - skb->mac_header;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001754 __skb_pull(skb, skb->mac_len);
1755
Herbert Xu67fd1a72009-01-19 16:26:44 -08001756 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1757 struct net_device *dev = skb->dev;
1758 struct ethtool_drvinfo info = {};
1759
1760 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1761 dev->ethtool_ops->get_drvinfo(dev, &info);
1762
1763 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1764 "ip_summed=%d",
1765 info.driver, dev ? dev->features : 0L,
1766 skb->sk ? skb->sk->sk_route_caps : 0L,
1767 skb->len, skb->data_len, skb->ip_summed);
1768
Herbert Xua430a432006-07-08 13:34:56 -07001769 if (skb_header_cloned(skb) &&
1770 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1771 return ERR_PTR(err);
1772 }
1773
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001774 rcu_read_lock();
Pavel Emelyanov82d8a862007-11-26 20:12:58 +08001775 list_for_each_entry_rcu(ptype,
1776 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001777 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
Patrick McHardy84fa7932006-08-29 16:44:56 -07001778 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
Herbert Xua430a432006-07-08 13:34:56 -07001779 err = ptype->gso_send_check(skb);
1780 segs = ERR_PTR(err);
1781 if (err || skb_gso_ok(skb, features))
1782 break;
Arnaldo Carvalho de Melod56f90a2007-04-10 20:50:43 -07001783 __skb_push(skb, (skb->data -
1784 skb_network_header(skb)));
Herbert Xua430a432006-07-08 13:34:56 -07001785 }
Herbert Xu576a30e2006-06-27 13:22:38 -07001786 segs = ptype->gso_segment(skb, features);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001787 break;
1788 }
1789 }
1790 rcu_read_unlock();
1791
Arnaldo Carvalho de Melo98e399f2007-03-19 15:33:04 -07001792 __skb_push(skb, skb->data - skb_mac_header(skb));
Herbert Xu576a30e2006-06-27 13:22:38 -07001793
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001794 return segs;
1795}
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001796EXPORT_SYMBOL(skb_gso_segment);
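
/*
 * Example (sketch): segmenting an oversized skb by hand and feeding the
 * segments to a transmit helper, the same pattern dev_gso_segment()
 * below relies on.  example_hw_xmit() is hypothetical and stands in for
 * a driver's real transmit routine.
 */
static int example_hw_xmit(struct sk_buff *skb); /* hypothetical */

static int example_gso_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct sk_buff *segs, *nskb;

        segs = skb_gso_segment(skb, dev->features);
        if (IS_ERR(segs))
                return PTR_ERR(segs);
        if (!segs)              /* header verification only */
                return example_hw_xmit(skb);

        /* Once segmented, the original skb is only a container. */
        kfree_skb(skb);

        while (segs) {
                nskb = segs;
                segs = segs->next;
                nskb->next = NULL;
                example_hw_xmit(nskb);
        }
        return 0;
}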
1797
Herbert Xufb286bb2005-11-10 13:01:24 -08001798/* Take action when hardware reception checksum errors are detected. */
1799#ifdef CONFIG_BUG
1800void netdev_rx_csum_fault(struct net_device *dev)
1801{
1802 if (net_ratelimit()) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001803 printk(KERN_ERR "%s: hw csum failure.\n",
Stephen Hemminger246a4212005-12-08 15:21:39 -08001804 dev ? dev->name : "<unknown>");
Herbert Xufb286bb2005-11-10 13:01:24 -08001805 dump_stack();
1806 }
1807}
1808EXPORT_SYMBOL(netdev_rx_csum_fault);
1809#endif
1810
Linus Torvalds1da177e2005-04-16 15:20:36 -07001811/* Actually, we should eliminate this check as soon as we know that:
 1812 * 1. An IOMMU is present and can map all of the memory.
 1813 * 2. No high memory really exists on this machine.
1814 */
1815
Eric Dumazet9092c652010-04-02 13:34:49 -07001816static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001817{
Herbert Xu3d3a8532006-06-27 13:33:10 -07001818#ifdef CONFIG_HIGHMEM
Linus Torvalds1da177e2005-04-16 15:20:36 -07001819 int i;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001820 if (!(dev->features & NETIF_F_HIGHDMA)) {
1821 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1822 if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1823 return 1;
1824 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001825
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001826 if (PCI_DMA_BUS_IS_PHYS) {
1827 struct device *pdev = dev->dev.parent;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001828
Eric Dumazet9092c652010-04-02 13:34:49 -07001829 if (!pdev)
1830 return 0;
FUJITA Tomonori5acbbd42010-03-30 22:35:50 +00001831 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1832 dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1833 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1834 return 1;
1835 }
1836 }
Herbert Xu3d3a8532006-06-27 13:33:10 -07001837#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838 return 0;
1839}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001840
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001841struct dev_gso_cb {
1842 void (*destructor)(struct sk_buff *skb);
1843};
1844
1845#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1846
1847static void dev_gso_skb_destructor(struct sk_buff *skb)
1848{
1849 struct dev_gso_cb *cb;
1850
1851 do {
1852 struct sk_buff *nskb = skb->next;
1853
1854 skb->next = nskb->next;
1855 nskb->next = NULL;
1856 kfree_skb(nskb);
1857 } while (skb->next);
1858
1859 cb = DEV_GSO_CB(skb);
1860 if (cb->destructor)
1861 cb->destructor(skb);
1862}
1863
1864/**
1865 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1866 * @skb: buffer to segment
1867 *
1868 * This function segments the given skb and stores the list of segments
1869 * in skb->next.
1870 */
1871static int dev_gso_segment(struct sk_buff *skb)
1872{
1873 struct net_device *dev = skb->dev;
1874 struct sk_buff *segs;
Herbert Xu576a30e2006-06-27 13:22:38 -07001875 int features = dev->features & ~(illegal_highdma(dev, skb) ?
1876 NETIF_F_SG : 0);
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001877
Herbert Xu576a30e2006-06-27 13:22:38 -07001878 segs = skb_gso_segment(skb, features);
1879
1880 /* Verifying header integrity only. */
1881 if (!segs)
1882 return 0;
1883
Hirofumi Nakagawa801678c2008-04-29 01:03:09 -07001884 if (IS_ERR(segs))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001885 return PTR_ERR(segs);
1886
1887 skb->next = segs;
1888 DEV_GSO_CB(skb)->destructor = skb->destructor;
1889 skb->destructor = dev_gso_skb_destructor;
1890
1891 return 0;
1892}
1893
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001894/*
1895 * Try to orphan skb early, right before transmission by the device.
1896 * We cannot orphan skb if tx timestamp is requested, since
1897 * drivers need to call skb_tstamp_tx() to send the timestamp.
1898 */
1899static inline void skb_orphan_try(struct sk_buff *skb)
1900{
1901 if (!skb_tx(skb)->flags)
1902 skb_orphan(skb);
1903}
1904
David S. Millerfd2ea0a2008-07-17 01:56:23 -07001905int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1906 struct netdev_queue *txq)
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001907{
Stephen Hemminger00829822008-11-20 20:14:53 -08001908 const struct net_device_ops *ops = dev->netdev_ops;
Patrick McHardy572a9d72009-11-10 06:14:14 +00001909 int rc = NETDEV_TX_OK;
Stephen Hemminger00829822008-11-20 20:14:53 -08001910
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001911 if (likely(!skb->next)) {
Stephen Hemminger9be9a6b2007-04-20 17:02:45 -07001912 if (!list_empty(&ptype_all))
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001913 dev_queue_xmit_nit(skb, dev);
1914
Eric Dumazet93f154b2009-05-18 22:19:19 -07001915 /*
 1916 * If the device doesn't need skb->dst, release it right now while
 1917 * it's still hot in this cpu's cache.
1918 */
Eric Dumazetadf30902009-06-02 05:19:30 +00001919 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1920 skb_dst_drop(skb);
1921
Eric Dumazetfc6055a2010-04-16 12:18:22 +00001922 skb_orphan_try(skb);
David S. Miller9ccb8972010-04-22 01:02:07 -07001923
1924 if (netif_needs_gso(dev, skb)) {
1925 if (unlikely(dev_gso_segment(skb)))
1926 goto out_kfree_skb;
1927 if (skb->next)
1928 goto gso;
1929 }
1930
Patrick Ohlyac45f602009-02-12 05:03:37 +00001931 rc = ops->ndo_start_xmit(skb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001932 if (rc == NETDEV_TX_OK)
Eric Dumazet08baf562009-05-25 22:58:01 -07001933 txq_trans_update(txq);
Patrick Ohlyac45f602009-02-12 05:03:37 +00001934 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001935 }
1936
Herbert Xu576a30e2006-06-27 13:22:38 -07001937gso:
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001938 do {
1939 struct sk_buff *nskb = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001940
1941 skb->next = nskb->next;
1942 nskb->next = NULL;
Krishna Kumar068a2de2009-12-09 20:59:58 +00001943
1944 /*
 1945 * If the device doesn't need nskb->dst, release it right now while
 1946 * it's still hot in this cpu's cache.
1947 */
1948 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1949 skb_dst_drop(nskb);
1950
Stephen Hemminger00829822008-11-20 20:14:53 -08001951 rc = ops->ndo_start_xmit(nskb, dev);
Patrick McHardyec634fe2009-07-05 19:23:38 -07001952 if (unlikely(rc != NETDEV_TX_OK)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00001953 if (rc & ~NETDEV_TX_MASK)
1954 goto out_kfree_gso_skb;
Michael Chanf54d9e82006-06-25 23:57:04 -07001955 nskb->next = skb->next;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001956 skb->next = nskb;
1957 return rc;
1958 }
Eric Dumazet08baf562009-05-25 22:58:01 -07001959 txq_trans_update(txq);
David S. Millerfd2ea0a2008-07-17 01:56:23 -07001960 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
Michael Chanf54d9e82006-06-25 23:57:04 -07001961 return NETDEV_TX_BUSY;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001962 } while (skb->next);
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09001963
Patrick McHardy572a9d72009-11-10 06:14:14 +00001964out_kfree_gso_skb:
1965 if (likely(skb->next == NULL))
1966 skb->destructor = DEV_GSO_CB(skb)->destructor;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001967out_kfree_skb:
1968 kfree_skb(skb);
Patrick McHardy572a9d72009-11-10 06:14:14 +00001969 return rc;
Herbert Xuf6a78bf2006-06-22 02:57:17 -07001970}
1971
Tom Herbert0a9627f2010-03-16 08:03:29 +00001972static u32 hashrnd __read_mostly;
David S. Millerb6b2fed2008-07-21 09:48:06 -07001973
Stephen Hemminger92477442009-03-21 13:39:26 -07001974u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
David S. Miller8f0f2222008-07-15 03:47:03 -07001975{
David S. Miller70192982009-01-27 16:34:47 -08001976 u32 hash;
David S. Millerb6b2fed2008-07-21 09:48:06 -07001977
David S. Miller513de112009-05-03 14:43:10 -07001978 if (skb_rx_queue_recorded(skb)) {
1979 hash = skb_get_rx_queue(skb);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07001980 while (unlikely(hash >= dev->real_num_tx_queues))
David S. Miller513de112009-05-03 14:43:10 -07001981 hash -= dev->real_num_tx_queues;
1982 return hash;
1983 }
Eric Dumazetec581f62009-05-01 09:05:06 -07001984
1985 if (skb->sk && skb->sk->sk_hash)
David S. Miller70192982009-01-27 16:34:47 -08001986 hash = skb->sk->sk_hash;
Eric Dumazetec581f62009-05-01 09:05:06 -07001987 else
Eric Dumazetb249dcb2010-04-19 21:56:38 +00001988 hash = (__force u16) skb->protocol;
David S. Millerd5a9e242009-01-27 16:22:11 -08001989
Tom Herbert0a9627f2010-03-16 08:03:29 +00001990 hash = jhash_1word(hash, hashrnd);
David S. Millerd5a9e242009-01-27 16:22:11 -08001991
David S. Millerb6b2fed2008-07-21 09:48:06 -07001992 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
David S. Miller8f0f2222008-07-15 03:47:03 -07001993}
Stephen Hemminger92477442009-03-21 13:39:26 -07001994EXPORT_SYMBOL(skb_tx_hash);
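
/*
 * Example (sketch): a multiqueue driver with no special queueing policy
 * can simply reuse the stack's hash from its ndo_select_queue hook so
 * that all packets of a flow stay on one tx queue.  This matches what
 * dev_pick_tx() below does by default when no hook is provided.
 */
static u16 example_select_queue(struct net_device *dev, struct sk_buff *skb)
{
        return skb_tx_hash(dev, skb);
}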
David S. Miller8f0f2222008-07-15 03:47:03 -07001995
Eric Dumazeted046422009-11-13 21:54:04 +00001996static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1997{
1998 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1999 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00002000 pr_warning("%s selects TX queue %d, but "
2001 "real number of TX queues is %d\n",
2002 dev->name, queue_index, dev->real_num_tx_queues);
Eric Dumazeted046422009-11-13 21:54:04 +00002003 }
2004 return 0;
2005 }
2006 return queue_index;
2007}
2008
David S. Millere8a04642008-07-17 00:34:19 -07002009static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2010 struct sk_buff *skb)
2011{
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002012 u16 queue_index;
2013 struct sock *sk = skb->sk;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002014
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002015 if (sk_tx_queue_recorded(sk)) {
2016 queue_index = sk_tx_queue_get(sk);
2017 } else {
2018 const struct net_device_ops *ops = dev->netdev_ops;
2019
2020 if (ops->ndo_select_queue) {
2021 queue_index = ops->ndo_select_queue(dev, skb);
Eric Dumazeted046422009-11-13 21:54:04 +00002022 queue_index = dev_cap_txqueue(dev, queue_index);
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002023 } else {
2024 queue_index = 0;
2025 if (dev->real_num_tx_queues > 1)
2026 queue_index = skb_tx_hash(dev, skb);
2027
Eric Dumazet8728c542010-04-11 21:18:17 +00002028 if (sk) {
David S. Miller87eb3672010-04-21 01:14:25 -07002029 struct dst_entry *dst = rcu_dereference_check(sk->sk_dst_cache, 1);
Eric Dumazet8728c542010-04-11 21:18:17 +00002030
2031 if (dst && skb_dst(skb) == dst)
2032 sk_tx_queue_set(sk, queue_index);
2033 }
Krishna Kumara4ee3ce2009-10-19 23:50:07 +00002034 }
2035 }
David S. Millereae792b2008-07-15 03:03:33 -07002036
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002037 skb_set_queue_mapping(skb, queue_index);
2038 return netdev_get_tx_queue(dev, queue_index);
David S. Millere8a04642008-07-17 00:34:19 -07002039}
2040
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002041static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2042 struct net_device *dev,
2043 struct netdev_queue *txq)
2044{
2045 spinlock_t *root_lock = qdisc_lock(q);
2046 int rc;
2047
2048 spin_lock(root_lock);
2049 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2050 kfree_skb(skb);
2051 rc = NET_XMIT_DROP;
2052 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2053 !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2054 /*
2055 * This is a work-conserving queue; there are no old skbs
2056 * waiting to be sent out; and the qdisc is not running -
2057 * xmit the skb directly.
2058 */
Eric Dumazet7fee2262010-05-11 23:19:48 +00002059 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2060 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002061 __qdisc_update_bstats(q, skb->len);
2062 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2063 __qdisc_run(q);
2064 else
2065 clear_bit(__QDISC_STATE_RUNNING, &q->state);
2066
2067 rc = NET_XMIT_SUCCESS;
2068 } else {
Eric Dumazet7fee2262010-05-11 23:19:48 +00002069 skb_dst_force(skb);
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002070 rc = qdisc_enqueue_root(skb, q);
2071 qdisc_run(q);
2072 }
2073 spin_unlock(root_lock);
2074
2075 return rc;
2076}
2077
Krishna Kumar4b258462010-01-21 01:26:29 -08002078/*
2079 * Returns true if either:
2080 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2081 * 2. skb is fragmented and the device does not support SG, or if
 2082 * at least one of the fragments is in highmem and the device does not
2083 * support DMA from it.
2084 */
2085static inline int skb_needs_linearize(struct sk_buff *skb,
2086 struct net_device *dev)
2087{
2088 return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2089 (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2090 illegal_highdma(dev, skb)));
2091}
2092
Dave Jonesd29f7492008-07-22 14:09:06 -07002093/**
2094 * dev_queue_xmit - transmit a buffer
2095 * @skb: buffer to transmit
2096 *
2097 * Queue a buffer for transmission to a network device. The caller must
2098 * have set the device and priority and built the buffer before calling
2099 * this function. The function can be called from an interrupt.
2100 *
2101 * A negative errno code is returned on a failure. A success does not
2102 * guarantee the frame will be transmitted as it may be dropped due
2103 * to congestion or traffic shaping.
2104 *
2105 * -----------------------------------------------------------------------------------
2106 * I notice this method can also return errors from the queue disciplines,
2107 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2108 * be positive.
2109 *
2110 * Regardless of the return value, the skb is consumed, so it is currently
2111 * difficult to retry a send to this method. (You can bump the ref count
2112 * before sending to hold a reference for retry if you are careful.)
2113 *
2114 * When calling this method, interrupts MUST be enabled. This is because
2115 * the BH enable code must have IRQs enabled so that it will not deadlock.
2116 * --BLG
2117 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002118int dev_queue_xmit(struct sk_buff *skb)
2119{
2120 struct net_device *dev = skb->dev;
David S. Millerdc2b4842008-07-08 17:18:23 -07002121 struct netdev_queue *txq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002122 struct Qdisc *q;
2123 int rc = -ENOMEM;
2124
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002125 /* GSO will handle the following emulations directly. */
2126 if (netif_needs_gso(dev, skb))
2127 goto gso;
2128
Krishna Kumar4b258462010-01-21 01:26:29 -08002129 /* Convert a paged skb to linear, if required */
2130 if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002131 goto out_kfree_skb;
2132
2133 /* If packet is not checksummed and device does not support
2134 * checksumming for this protocol, complete checksumming here.
2135 */
Herbert Xu663ead32007-04-09 11:59:07 -07002136 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2137 skb_set_transport_header(skb, skb->csum_start -
2138 skb_headroom(skb));
Ben Hutchings6de329e2008-06-16 17:02:28 -07002139 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2140 goto out_kfree_skb;
Herbert Xu663ead32007-04-09 11:59:07 -07002141 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002142
Herbert Xuf6a78bf2006-06-22 02:57:17 -07002143gso:
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002144 /* Disable soft irqs for various locks below. Also
2145 * stops preemption for RCU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002146 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002147 rcu_read_lock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148
David S. Millereae792b2008-07-15 03:03:33 -07002149 txq = dev_pick_tx(dev, skb);
Paul E. McKenneya898def2010-02-22 17:04:49 -08002150 q = rcu_dereference_bh(txq->qdisc);
David S. Miller37437bb2008-07-16 02:15:04 -07002151
Linus Torvalds1da177e2005-04-16 15:20:36 -07002152#ifdef CONFIG_NET_CLS_ACT
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002153 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002154#endif
2155 if (q->enqueue) {
Krishna Kumarbbd8a0d2009-08-06 01:44:21 +00002156 rc = __dev_xmit_skb(skb, q, dev, txq);
David S. Miller37437bb2008-07-16 02:15:04 -07002157 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002158 }
2159
2160 /* The device has no queue. Common case for software devices:
 2161 loopback, and all sorts of tunnels...
 2162
Herbert Xu932ff272006-06-09 12:20:56 -07002163 Really, it is unlikely that netif_tx_lock protection is necessary
 2164 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
Linus Torvalds1da177e2005-04-16 15:20:36 -07002165 counters.)
 2166 However, it is possible that they rely on the protection
 2167 we provide here.
 2168
 2169 Check this and shoot the lock. It is not prone to deadlocks.
 2170 Or shoot the noqueue qdisc instead; that is even simpler 8)
2171 */
2172 if (dev->flags & IFF_UP) {
2173 int cpu = smp_processor_id(); /* ok because BHs are off */
2174
David S. Millerc773e842008-07-08 23:13:53 -07002175 if (txq->xmit_lock_owner != cpu) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002176
David S. Millerc773e842008-07-08 23:13:53 -07002177 HARD_TX_LOCK(dev, txq, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002178
David S. Millerfd2ea0a2008-07-17 01:56:23 -07002179 if (!netif_tx_queue_stopped(txq)) {
Patrick McHardy572a9d72009-11-10 06:14:14 +00002180 rc = dev_hard_start_xmit(skb, dev, txq);
2181 if (dev_xmit_complete(rc)) {
David S. Millerc773e842008-07-08 23:13:53 -07002182 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002183 goto out;
2184 }
2185 }
David S. Millerc773e842008-07-08 23:13:53 -07002186 HARD_TX_UNLOCK(dev, txq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002187 if (net_ratelimit())
2188 printk(KERN_CRIT "Virtual device %s asks to "
2189 "queue packet!\n", dev->name);
2190 } else {
 2191 /* Recursion detected! It is possible,
 2192 * unfortunately. */
2193 if (net_ratelimit())
2194 printk(KERN_CRIT "Dead loop on virtual device "
2195 "%s, fix it urgently!\n", dev->name);
2196 }
2197 }
2198
2199 rc = -ENETDOWN;
Herbert Xud4828d82006-06-22 02:28:18 -07002200 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002201
2202out_kfree_skb:
2203 kfree_skb(skb);
2204 return rc;
2205out:
Herbert Xud4828d82006-06-22 02:28:18 -07002206 rcu_read_unlock_bh();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002207 return rc;
2208}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002209EXPORT_SYMBOL(dev_queue_xmit);
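
/*
 * Example (sketch): building and transmitting a frame from kernel code.
 * Filling in the link-layer header is omitted, and ETH_P_IP plus the
 * example_* name are assumptions made for the example.  Note that
 * dev_queue_xmit() consumes the skb whatever the outcome, so the caller
 * must not kfree_skb() it afterwards.
 */
static int example_send(struct net_device *dev, const void *data, size_t len)
{
        struct sk_buff *skb;

        skb = alloc_skb(LL_RESERVED_SPACE(dev) + len, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        memcpy(skb_put(skb, len), data, len);
        skb->dev = dev;
        skb->protocol = htons(ETH_P_IP);

        return dev_queue_xmit(skb);
}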
Linus Torvalds1da177e2005-04-16 15:20:36 -07002210
2211
2212/*=======================================================================
2213 Receiver routines
2214 =======================================================================*/
2215
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002216int netdev_max_backlog __read_mostly = 1000;
Eric Dumazet3b098e22010-05-15 23:57:10 -07002217int netdev_tstamp_prequeue __read_mostly = 1;
Stephen Hemminger6b2bedc2007-03-12 14:33:50 -07002218int netdev_budget __read_mostly = 300;
2219int weight_p __read_mostly = 64; /* old backlog weight */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002220
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002221/* Called with irq disabled */
2222static inline void ____napi_schedule(struct softnet_data *sd,
2223 struct napi_struct *napi)
2224{
2225 list_add_tail(&napi->poll_list, &sd->poll_list);
2226 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2227}
2228
Eric Dumazetdf334542010-03-24 19:13:54 +00002229#ifdef CONFIG_RPS
Tom Herbertfec5e652010-04-16 16:01:27 -07002230
2231/* One global table that all flow-based protocols share. */
Eric Dumazet8770acf2010-04-17 00:54:36 -07002232struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
Tom Herbertfec5e652010-04-16 16:01:27 -07002233EXPORT_SYMBOL(rps_sock_flow_table);
2234
Tom Herbert0a9627f2010-03-16 08:03:29 +00002235/*
2236 * get_rps_cpu is called from netif_receive_skb and returns the target
2237 * CPU from the RPS map of the receiving queue for a given skb.
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002238 * rcu_read_lock must be held on entry.
Tom Herbert0a9627f2010-03-16 08:03:29 +00002239 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002240static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2241 struct rps_dev_flow **rflowp)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002242{
2243 struct ipv6hdr *ip6;
2244 struct iphdr *ip;
2245 struct netdev_rx_queue *rxqueue;
2246 struct rps_map *map;
Tom Herbertfec5e652010-04-16 16:01:27 -07002247 struct rps_dev_flow_table *flow_table;
2248 struct rps_sock_flow_table *sock_flow_table;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002249 int cpu = -1;
2250 u8 ip_proto;
Tom Herbertfec5e652010-04-16 16:01:27 -07002251 u16 tcpu;
Changli Gao8c52d502010-04-24 22:50:10 -07002252 u32 addr1, addr2, ihl;
2253 union {
2254 u32 v32;
2255 u16 v16[2];
2256 } ports;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002257
Tom Herbert0a9627f2010-03-16 08:03:29 +00002258 if (skb_rx_queue_recorded(skb)) {
2259 u16 index = skb_get_rx_queue(skb);
2260 if (unlikely(index >= dev->num_rx_queues)) {
2261 if (net_ratelimit()) {
Eric Dumazet7a161ea2010-04-08 21:26:13 +00002262 pr_warning("%s received packet on queue "
2263 "%u, but number of RX queues is %u\n",
2264 dev->name, index, dev->num_rx_queues);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002265 }
2266 goto done;
2267 }
2268 rxqueue = dev->_rx + index;
2269 } else
2270 rxqueue = dev->_rx;
2271
Tom Herbertfec5e652010-04-16 16:01:27 -07002272 if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002273 goto done;
2274
2275 if (skb->rxhash)
2276 goto got_hash; /* Skip hash computation on packet header */
2277
2278 switch (skb->protocol) {
2279 case __constant_htons(ETH_P_IP):
2280 if (!pskb_may_pull(skb, sizeof(*ip)))
2281 goto done;
2282
2283 ip = (struct iphdr *) skb->data;
2284 ip_proto = ip->protocol;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002285 addr1 = (__force u32) ip->saddr;
2286 addr2 = (__force u32) ip->daddr;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002287 ihl = ip->ihl;
2288 break;
2289 case __constant_htons(ETH_P_IPV6):
2290 if (!pskb_may_pull(skb, sizeof(*ip6)))
2291 goto done;
2292
2293 ip6 = (struct ipv6hdr *) skb->data;
2294 ip_proto = ip6->nexthdr;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002295 addr1 = (__force u32) ip6->saddr.s6_addr32[3];
2296 addr2 = (__force u32) ip6->daddr.s6_addr32[3];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002297 ihl = (40 >> 2);
2298 break;
2299 default:
2300 goto done;
2301 }
Tom Herbert0a9627f2010-03-16 08:03:29 +00002302 switch (ip_proto) {
2303 case IPPROTO_TCP:
2304 case IPPROTO_UDP:
2305 case IPPROTO_DCCP:
2306 case IPPROTO_ESP:
2307 case IPPROTO_AH:
2308 case IPPROTO_SCTP:
2309 case IPPROTO_UDPLITE:
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002310 if (pskb_may_pull(skb, (ihl * 4) + 4)) {
Changli Gao8c52d502010-04-24 22:50:10 -07002311 ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
2312 if (ports.v16[1] < ports.v16[0])
2313 swap(ports.v16[0], ports.v16[1]);
2314 break;
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002315 }
Tom Herbert0a9627f2010-03-16 08:03:29 +00002316 default:
Changli Gao8c52d502010-04-24 22:50:10 -07002317 ports.v32 = 0;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002318 break;
2319 }
2320
Eric Dumazetb249dcb2010-04-19 21:56:38 +00002321 /* get a consistent hash (same value on both flow directions) */
2322 if (addr2 < addr1)
2323 swap(addr1, addr2);
Changli Gao8c52d502010-04-24 22:50:10 -07002324 skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002325 if (!skb->rxhash)
2326 skb->rxhash = 1;
2327
2328got_hash:
Tom Herbertfec5e652010-04-16 16:01:27 -07002329 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2330 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2331 if (flow_table && sock_flow_table) {
2332 u16 next_cpu;
2333 struct rps_dev_flow *rflow;
2334
2335 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2336 tcpu = rflow->cpu;
2337
2338 next_cpu = sock_flow_table->ents[skb->rxhash &
2339 sock_flow_table->mask];
2340
2341 /*
2342 * If the desired CPU (where last recvmsg was done) is
2343 * different from current CPU (one in the rx-queue flow
2344 * table entry), switch if one of the following holds:
2345 * - Current CPU is unset (equal to RPS_NO_CPU).
2346 * - Current CPU is offline.
2347 * - The current CPU's queue tail has advanced beyond the
2348 * last packet that was enqueued using this table entry.
2349 * This guarantees that all previous packets for the flow
2350 * have been dequeued, thus preserving in order delivery.
2351 */
2352 if (unlikely(tcpu != next_cpu) &&
2353 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2354 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2355 rflow->last_qtail)) >= 0)) {
2356 tcpu = rflow->cpu = next_cpu;
2357 if (tcpu != RPS_NO_CPU)
2358 rflow->last_qtail = per_cpu(softnet_data,
2359 tcpu).input_queue_head;
2360 }
2361 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2362 *rflowp = rflow;
2363 cpu = tcpu;
2364 goto done;
2365 }
2366 }
2367
Tom Herbert0a9627f2010-03-16 08:03:29 +00002368 map = rcu_dereference(rxqueue->rps_map);
2369 if (map) {
Tom Herbertfec5e652010-04-16 16:01:27 -07002370 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
Tom Herbert0a9627f2010-03-16 08:03:29 +00002371
2372 if (cpu_online(tcpu)) {
2373 cpu = tcpu;
2374 goto done;
2375 }
2376 }
2377
2378done:
Tom Herbert0a9627f2010-03-16 08:03:29 +00002379 return cpu;
2380}
2381
Tom Herbert0a9627f2010-03-16 08:03:29 +00002382/* Called from hardirq (IPI) context */
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002383static void rps_trigger_softirq(void *data)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002384{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002385 struct softnet_data *sd = data;
2386
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002387 ____napi_schedule(sd, &sd->backlog);
Changli Gaodee42872010-05-02 05:42:16 +00002388 sd->received_rps++;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002389}
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002390
Tom Herbertfec5e652010-04-16 16:01:27 -07002391#endif /* CONFIG_RPS */
Tom Herbert0a9627f2010-03-16 08:03:29 +00002392
2393/*
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002394 * Check if this softnet_data structure belongs to another cpu.
 2395 * If yes, queue it to our IPI list and return 1.
 2396 * If no, return 0.
2397 */
2398static int rps_ipi_queued(struct softnet_data *sd)
2399{
2400#ifdef CONFIG_RPS
2401 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2402
2403 if (sd != mysd) {
2404 sd->rps_ipi_next = mysd->rps_ipi_list;
2405 mysd->rps_ipi_list = sd;
2406
2407 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2408 return 1;
2409 }
2410#endif /* CONFIG_RPS */
2411 return 0;
2412}
2413
2414/*
Tom Herbert0a9627f2010-03-16 08:03:29 +00002415 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2416 * queue (may be a remote CPU queue).
2417 */
Tom Herbertfec5e652010-04-16 16:01:27 -07002418static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2419 unsigned int *qtail)
Tom Herbert0a9627f2010-03-16 08:03:29 +00002420{
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002421 struct softnet_data *sd;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002422 unsigned long flags;
2423
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002424 sd = &per_cpu(softnet_data, cpu);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002425
2426 local_irq_save(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002427
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002428 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002429 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2430 if (skb_queue_len(&sd->input_pkt_queue)) {
Tom Herbert0a9627f2010-03-16 08:03:29 +00002431enqueue:
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002432 __skb_queue_tail(&sd->input_pkt_queue, skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002433 input_queue_tail_incr_save(sd, qtail);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002434 rps_unlock(sd);
Changli Gao152102c2010-03-30 20:16:22 +00002435 local_irq_restore(flags);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002436 return NET_RX_SUCCESS;
2437 }
2438
Eric Dumazetebda37c2010-05-06 23:51:21 +00002439 /* Schedule NAPI for the backlog device.
 2440 * We can use a non-atomic operation since we own the queue lock.
2441 */
2442 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002443 if (!rps_ipi_queued(sd))
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07002444 ____napi_schedule(sd, &sd->backlog);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002445 }
2446 goto enqueue;
2447 }
2448
Changli Gaodee42872010-05-02 05:42:16 +00002449 sd->dropped++;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002450 rps_unlock(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00002451
Tom Herbert0a9627f2010-03-16 08:03:29 +00002452 local_irq_restore(flags);
2453
2454 kfree_skb(skb);
2455 return NET_RX_DROP;
2456}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002457
Linus Torvalds1da177e2005-04-16 15:20:36 -07002458/**
2459 * netif_rx - post buffer to the network code
2460 * @skb: buffer to post
2461 *
2462 * This function receives a packet from a device driver and queues it for
2463 * the upper (protocol) levels to process. It always succeeds. The buffer
2464 * may be dropped during processing for congestion control or by the
2465 * protocol layers.
2466 *
2467 * return values:
2468 * NET_RX_SUCCESS (no congestion)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002469 * NET_RX_DROP (packet was dropped)
2470 *
2471 */
2472
2473int netif_rx(struct sk_buff *skb)
2474{
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002475 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002476
2477 /* if netpoll wants it, pretend we never saw it */
2478 if (netpoll_rx(skb))
2479 return NET_RX_DROP;
2480
Eric Dumazet3b098e22010-05-15 23:57:10 -07002481 if (netdev_tstamp_prequeue)
2482 net_timestamp_check(skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002483
Eric Dumazetdf334542010-03-24 19:13:54 +00002484#ifdef CONFIG_RPS
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002485 {
Tom Herbertfec5e652010-04-16 16:01:27 -07002486 struct rps_dev_flow voidflow, *rflow = &voidflow;
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002487 int cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002489 rcu_read_lock();
Tom Herbertfec5e652010-04-16 16:01:27 -07002490
2491 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002492 if (cpu < 0)
2493 cpu = smp_processor_id();
Tom Herbertfec5e652010-04-16 16:01:27 -07002494
2495 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2496
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002497 rcu_read_unlock();
2498 }
2499#else
Tom Herbertfec5e652010-04-16 16:01:27 -07002500 {
2501 unsigned int qtail;
2502 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2503 put_cpu();
2504 }
Eric Dumazetb0e28f12010-04-15 00:14:07 -07002505#endif
2506 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002507}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002508EXPORT_SYMBOL(netif_rx);
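
/*
 * Example (sketch): the rx interrupt path of a simple non-NAPI Ethernet
 * driver.  It copies the received frame into a fresh skb, lets
 * eth_type_trans() set the protocol and packet type, and hands the
 * buffer to netif_rx() for backlog processing.  The example_* name is
 * hypothetical.
 */
static void example_rx_interrupt(struct net_device *dev,
                                 const void *data, unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
        if (!skb) {
                dev->stats.rx_dropped++;
                return;
        }

        skb_reserve(skb, NET_IP_ALIGN); /* align the IP header */
        memcpy(skb_put(skb, len), data, len);
        skb->protocol = eth_type_trans(skb, dev);
        netif_rx(skb);
}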
Linus Torvalds1da177e2005-04-16 15:20:36 -07002509
2510int netif_rx_ni(struct sk_buff *skb)
2511{
2512 int err;
2513
2514 preempt_disable();
2515 err = netif_rx(skb);
2516 if (local_softirq_pending())
2517 do_softirq();
2518 preempt_enable();
2519
2520 return err;
2521}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002522EXPORT_SYMBOL(netif_rx_ni);
2523
Linus Torvalds1da177e2005-04-16 15:20:36 -07002524static void net_tx_action(struct softirq_action *h)
2525{
2526 struct softnet_data *sd = &__get_cpu_var(softnet_data);
2527
2528 if (sd->completion_queue) {
2529 struct sk_buff *clist;
2530
2531 local_irq_disable();
2532 clist = sd->completion_queue;
2533 sd->completion_queue = NULL;
2534 local_irq_enable();
2535
2536 while (clist) {
2537 struct sk_buff *skb = clist;
2538 clist = clist->next;
2539
Ilpo Järvinen547b7922008-07-25 21:43:18 -07002540 WARN_ON(atomic_read(&skb->users));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002541 __kfree_skb(skb);
2542 }
2543 }
2544
2545 if (sd->output_queue) {
David S. Miller37437bb2008-07-16 02:15:04 -07002546 struct Qdisc *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002547
2548 local_irq_disable();
2549 head = sd->output_queue;
2550 sd->output_queue = NULL;
Changli Gaoa9cbd582010-04-26 23:06:24 +00002551 sd->output_queue_tailp = &sd->output_queue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002552 local_irq_enable();
2553
2554 while (head) {
David S. Miller37437bb2008-07-16 02:15:04 -07002555 struct Qdisc *q = head;
2556 spinlock_t *root_lock;
2557
Linus Torvalds1da177e2005-04-16 15:20:36 -07002558 head = head->next_sched;
2559
David S. Miller5fb66222008-08-02 20:02:43 -07002560 root_lock = qdisc_lock(q);
David S. Miller37437bb2008-07-16 02:15:04 -07002561 if (spin_trylock(root_lock)) {
Jarek Poplawskidef82a12008-08-17 21:54:43 -07002562 smp_mb__before_clear_bit();
2563 clear_bit(__QDISC_STATE_SCHED,
2564 &q->state);
David S. Miller37437bb2008-07-16 02:15:04 -07002565 qdisc_run(q);
2566 spin_unlock(root_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002567 } else {
David S. Miller195648b2008-08-19 04:00:36 -07002568 if (!test_bit(__QDISC_STATE_DEACTIVATED,
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002569 &q->state)) {
David S. Miller195648b2008-08-19 04:00:36 -07002570 __netif_reschedule(q);
Jarek Poplawskie8a83e12008-09-07 18:41:21 -07002571 } else {
2572 smp_mb__before_clear_bit();
2573 clear_bit(__QDISC_STATE_SCHED,
2574 &q->state);
2575 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002576 }
2577 }
2578 }
2579}
2580
Stephen Hemminger6f05f622007-03-08 20:46:03 -08002581static inline int deliver_skb(struct sk_buff *skb,
2582 struct packet_type *pt_prev,
2583 struct net_device *orig_dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002584{
2585 atomic_inc(&skb->users);
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002586 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002587}
2588
2589#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
Michał Mirosławda678292009-06-05 05:35:28 +00002590
2591#if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2592/* This hook is defined here for ATM LANE */
2593int (*br_fdb_test_addr_hook)(struct net_device *dev,
2594 unsigned char *addr) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002595EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002596#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002597
Stephen Hemminger6229e362007-03-21 13:38:47 -07002598/*
2599 * If bridge module is loaded call bridging hook.
2600 * returns NULL if packet was consumed.
2601 */
2602struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2603 struct sk_buff *skb) __read_mostly;
Stephen Hemminger4fb019a2009-09-11 11:50:08 -07002604EXPORT_SYMBOL_GPL(br_handle_frame_hook);
Michał Mirosławda678292009-06-05 05:35:28 +00002605
Stephen Hemminger6229e362007-03-21 13:38:47 -07002606static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2607 struct packet_type **pt_prev, int *ret,
2608 struct net_device *orig_dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002609{
2610 struct net_bridge_port *port;
2611
Stephen Hemminger6229e362007-03-21 13:38:47 -07002612 if (skb->pkt_type == PACKET_LOOPBACK ||
2613 (port = rcu_dereference(skb->dev->br_port)) == NULL)
2614 return skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002615
2616 if (*pt_prev) {
Stephen Hemminger6229e362007-03-21 13:38:47 -07002617 *ret = deliver_skb(skb, *pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002618 *pt_prev = NULL;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002619 }
2620
Stephen Hemminger6229e362007-03-21 13:38:47 -07002621 return br_handle_frame_hook(port, skb);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002622}
2623#else
Stephen Hemminger6229e362007-03-21 13:38:47 -07002624#define handle_bridge(skb, pt_prev, ret, orig_dev) (skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002625#endif
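
/*
 * Illustrative sketch (not part of this file): the bridge module attaches
 * itself by assigning br_handle_frame_hook from its init path and clearing
 * it again on unload; exact names on the bridge side may differ.
 *
 *	br_handle_frame_hook = br_handle_frame;		(module init)
 *	...
 *	br_handle_frame_hook = NULL;			(module exit)
 *	synchronize_net();				(flush RCU readers)
 */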
2626
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002627#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
Jiri Pirkoa14462f2010-05-06 01:33:53 +00002628struct sk_buff *(*macvlan_handle_frame_hook)(struct macvlan_port *p,
2629 struct sk_buff *skb) __read_mostly;
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002630EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2631
2632static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2633 struct packet_type **pt_prev,
2634 int *ret,
2635 struct net_device *orig_dev)
2636{
Jiri Pirkoa14462f2010-05-06 01:33:53 +00002637 struct macvlan_port *port;
2638
2639 port = rcu_dereference(skb->dev->macvlan_port);
2640 if (!port)
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002641 return skb;
2642
2643 if (*pt_prev) {
2644 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2645 *pt_prev = NULL;
2646 }
Jiri Pirkoa14462f2010-05-06 01:33:53 +00002647 return macvlan_handle_frame_hook(port, skb);
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002648}
2649#else
2650#define handle_macvlan(skb, pt_prev, ret, orig_dev) (skb)
2651#endif
2652
Linus Torvalds1da177e2005-04-16 15:20:36 -07002653#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is?  Otherwise we pay for a compare and
 * two extra stores on every packet when the ingress scheduler is
 * not attached but CONFIG_NET_CLS_ACT is set.
 * NOTE: This doesn't remove any functionality; if you don't have
 * the ingress scheduler, you just can't add policies on ingress.
 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002662static int ing_filter(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002663{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002664 struct net_device *dev = skb->dev;
Herbert Xuf697c3e2007-10-14 00:38:47 -07002665 u32 ttl = G_TC_RTTL(skb->tc_verd);
David S. Miller555353c2008-07-08 17:33:13 -07002666 struct netdev_queue *rxq;
2667 int result = TC_ACT_OK;
2668 struct Qdisc *q;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002669
Herbert Xuf697c3e2007-10-14 00:38:47 -07002670 if (MAX_RED_LOOP < ttl++) {
2671 printk(KERN_WARNING
2672 "Redir loop detected Dropping packet (%d->%d)\n",
Eric Dumazet8964be42009-11-20 15:35:04 -08002673 skb->skb_iif, dev->ifindex);
Herbert Xuf697c3e2007-10-14 00:38:47 -07002674 return TC_ACT_SHOT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002675 }
2676
Herbert Xuf697c3e2007-10-14 00:38:47 -07002677 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2678 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2679
David S. Miller555353c2008-07-08 17:33:13 -07002680 rxq = &dev->rx_queue;
2681
David S. Miller83874002008-07-17 00:53:03 -07002682 q = rxq->qdisc;
David S. Miller8d50b532008-07-30 02:37:46 -07002683 if (q != &noop_qdisc) {
David S. Miller83874002008-07-17 00:53:03 -07002684 spin_lock(qdisc_lock(q));
David S. Millera9312ae2008-08-17 21:51:03 -07002685 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2686 result = qdisc_enqueue_root(skb, q);
David S. Miller83874002008-07-17 00:53:03 -07002687 spin_unlock(qdisc_lock(q));
2688 }
Herbert Xuf697c3e2007-10-14 00:38:47 -07002689
Linus Torvalds1da177e2005-04-16 15:20:36 -07002690 return result;
2691}
Herbert Xuf697c3e2007-10-14 00:38:47 -07002692
2693static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2694 struct packet_type **pt_prev,
2695 int *ret, struct net_device *orig_dev)
2696{
David S. Miller8d50b532008-07-30 02:37:46 -07002697 if (skb->dev->rx_queue.qdisc == &noop_qdisc)
Herbert Xuf697c3e2007-10-14 00:38:47 -07002698 goto out;
2699
2700 if (*pt_prev) {
2701 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2702 *pt_prev = NULL;
2703 } else {
2704 /* Huh? Why does turning on AF_PACKET affect this? */
2705 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2706 }
2707
2708 switch (ing_filter(skb)) {
2709 case TC_ACT_SHOT:
2710 case TC_ACT_STOLEN:
2711 kfree_skb(skb);
2712 return NULL;
2713 }
2714
2715out:
2716 skb->tc_verd = 0;
2717 return skb;
2718}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002719#endif
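
/*
 * Summary of the ingress path above: handle_ing() is a cheap inline check
 * against noop_qdisc, so the common no-ingress-qdisc case costs almost
 * nothing.  Only when an ingress qdisc is attached does ing_filter() take
 * the qdisc root lock and run the classifier chain.  TC_ACT_SHOT and
 * TC_ACT_STOLEN both end delivery here (the skb is freed); any other
 * verdict lets the packet continue up the stack with tc_verd cleared.
 */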
2720
Patrick McHardybc1d0412008-07-14 22:49:30 -07002721/*
2722 * netif_nit_deliver - deliver received packets to network taps
2723 * @skb: buffer
2724 *
2725 * This function is used to deliver incoming packets to network
2726 * taps. It should be used when the normal netif_receive_skb path
2727 * is bypassed, for example because of VLAN acceleration.
2728 */
2729void netif_nit_deliver(struct sk_buff *skb)
2730{
2731 struct packet_type *ptype;
2732
2733 if (list_empty(&ptype_all))
2734 return;
2735
2736 skb_reset_network_header(skb);
2737 skb_reset_transport_header(skb);
2738 skb->mac_len = skb->network_header - skb->mac_header;
2739
2740 rcu_read_lock();
2741 list_for_each_entry_rcu(ptype, &ptype_all, list) {
2742 if (!ptype->dev || ptype->dev == skb->dev)
2743 deliver_skb(skb, ptype, skb->dev);
2744 }
2745 rcu_read_unlock();
2746}
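
/*
 * Illustrative sketch (not part of this file): the taps fed above are
 * plain ETH_P_ALL packet_type entries on the ptype_all list.  A minimal
 * tap might look like this; "my_tap" and "my_tap_rcv" are hypothetical
 * names, while dev_add_pack()/dev_remove_pack() are the real calls:
 *
 *	static int my_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt,
 *			      struct net_device *orig_dev)
 *	{
 *		...inspect the frame; the tap owns the reference that
 *		deliver_skb() took for it, so release it when done:
 *		kfree_skb(skb);
 *		return 0;
 *	}
 *
 *	static struct packet_type my_tap __read_mostly = {
 *		.type	= htons(ETH_P_ALL),
 *		.func	= my_tap_rcv,
 *	};
 *
 * dev_add_pack(&my_tap) registers it; dev_remove_pack(&my_tap) removes it.
 */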
2747
Eric Dumazetacbbc072010-04-11 06:56:11 +00002748static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2749 struct net_device *master)
2750{
2751 if (skb->pkt_type == PACKET_HOST) {
2752 u16 *dest = (u16 *) eth_hdr(skb)->h_dest;
2753
2754 memcpy(dest, master->dev_addr, ETH_ALEN);
2755 }
2756}
2757
2758/* On bonding slaves other than the currently active slave, suppress
2759 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2760 * ARP on active-backup slaves with arp_validate enabled.
2761 */
2762int __skb_bond_should_drop(struct sk_buff *skb, struct net_device *master)
2763{
2764 struct net_device *dev = skb->dev;
2765
2766 if (master->priv_flags & IFF_MASTER_ARPMON)
2767 dev->last_rx = jiffies;
2768
2769 if ((master->priv_flags & IFF_MASTER_ALB) && master->br_port) {
2770 /* Do address unmangle. The local destination address
2771 * will be always the one master has. Provides the right
2772 * functionality in a bridge.
2773 */
2774 skb_bond_set_mac_by_master(skb, master);
2775 }
2776
2777 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
2778 if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
2779 skb->protocol == __cpu_to_be16(ETH_P_ARP))
2780 return 0;
2781
2782 if (master->priv_flags & IFF_MASTER_ALB) {
2783 if (skb->pkt_type != PACKET_BROADCAST &&
2784 skb->pkt_type != PACKET_MULTICAST)
2785 return 0;
2786 }
2787 if (master->priv_flags & IFF_MASTER_8023AD &&
2788 skb->protocol == __cpu_to_be16(ETH_P_SLOW))
2789 return 0;
2790
2791 return 1;
2792 }
2793 return 0;
2794}
2795EXPORT_SYMBOL(__skb_bond_should_drop);
2796
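/*
 * __netif_receive_skb() hands one skb up the stack in a fixed order:
 * timestamping, VLAN/netpoll short-circuits, bonding adjustments, the
 * ptype_all taps, the ingress classifier, the bridge and macvlan hooks,
 * and finally the protocol handlers hashed by skb->protocol.  Any stage
 * may consume the skb, in which case later stages never see it.
 */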
Eric Dumazet10f744d2010-03-28 23:07:20 -07002797static int __netif_receive_skb(struct sk_buff *skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002798{
2799 struct packet_type *ptype, *pt_prev;
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002800 struct net_device *orig_dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002801 struct net_device *master;
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002802 struct net_device *null_or_orig;
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002803 struct net_device *null_or_bond;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804 int ret = NET_RX_DROP;
Al Viro252e3342006-11-14 20:48:11 -08002805 __be16 type;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806
Eric Dumazet3b098e22010-05-15 23:57:10 -07002807 if (!netdev_tstamp_prequeue)
2808 net_timestamp_check(skb);
Eric Dumazet81bbb3d2009-09-30 16:42:42 -07002809
Eric Dumazet05423b22009-10-26 18:40:35 -07002810 if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
Patrick McHardy9b22ea52008-11-04 14:49:57 -08002811 return NET_RX_SUCCESS;
2812
Linus Torvalds1da177e2005-04-16 15:20:36 -07002813 /* if we've gotten here through NAPI, check netpoll */
Stephen Hemmingerbea33482007-10-03 16:41:36 -07002814 if (netpoll_receive_skb(skb))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002815 return NET_RX_DROP;
2816
Eric Dumazet8964be42009-11-20 15:35:04 -08002817 if (!skb->skb_iif)
2818 skb->skb_iif = skb->dev->ifindex;
David S. Miller86e65da2005-08-09 19:36:29 -07002819
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002820 null_or_orig = NULL;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002821 orig_dev = skb->dev;
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002822 master = ACCESS_ONCE(orig_dev->master);
2823 if (master) {
2824 if (skb_bond_should_drop(skb, master))
Joe Eykholt0d7a3682008-07-02 18:22:01 -07002825 null_or_orig = orig_dev; /* deliver only exact match */
2826 else
Eric Dumazet0641e4f2010-03-18 21:16:45 -07002827 skb->dev = master;
Joe Eykholtcc9bd5c2008-07-02 18:22:00 -07002828 }
Jay Vosburgh8f903c72006-02-21 16:36:44 -08002829
Changli Gaodee42872010-05-02 05:42:16 +00002830 __get_cpu_var(softnet_data).processed++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002831
Arnaldo Carvalho de Meloc1d2bbe2007-04-10 20:45:18 -07002832 skb_reset_network_header(skb);
Arnaldo Carvalho de Melobadff6d2007-03-13 13:06:52 -03002833 skb_reset_transport_header(skb);
Arnaldo Carvalho de Melob0e380b2007-04-10 21:21:55 -07002834 skb->mac_len = skb->network_header - skb->mac_header;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002835
2836 pt_prev = NULL;
2837
2838 rcu_read_lock();
2839
2840#ifdef CONFIG_NET_CLS_ACT
2841 if (skb->tc_verd & TC_NCLS) {
2842 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2843 goto ncls;
2844 }
2845#endif
2846
2847 list_for_each_entry_rcu(ptype, &ptype_all, list) {
Joe Eykholtf9823072008-07-02 18:22:02 -07002848 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2849 ptype->dev == orig_dev) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002850 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002851 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002852 pt_prev = ptype;
2853 }
2854 }
2855
2856#ifdef CONFIG_NET_CLS_ACT
Herbert Xuf697c3e2007-10-14 00:38:47 -07002857 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2858 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002859 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002860ncls:
2861#endif
2862
Stephen Hemminger6229e362007-03-21 13:38:47 -07002863 skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2864 if (!skb)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002865 goto out;
Patrick McHardyb863ceb2007-07-14 18:55:06 -07002866 skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2867 if (!skb)
2868 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002869
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002870 /*
2871 * Make sure frames received on VLAN interfaces stacked on
2872 * bonding interfaces still make their way to any base bonding
2873 * device that may have registered for a specific ptype. The
2874 * handler may have to adjust skb->dev and orig_dev.
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002875 */
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002876 null_or_bond = NULL;
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002877 if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2878 (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002879 null_or_bond = vlan_dev_real_dev(skb->dev);
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002880 }
2881
Linus Torvalds1da177e2005-04-16 15:20:36 -07002882 type = skb->protocol;
Pavel Emelyanov82d8a862007-11-26 20:12:58 +08002883 list_for_each_entry_rcu(ptype,
2884 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
Andy Gospodarek1f3c8802009-12-14 10:48:58 +00002885 if (ptype->type == type && (ptype->dev == null_or_orig ||
Andy Gospodarekca8d9ea2010-01-06 12:56:37 +00002886 ptype->dev == skb->dev || ptype->dev == orig_dev ||
2887 ptype->dev == null_or_bond)) {
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09002888 if (pt_prev)
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002889 ret = deliver_skb(skb, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002890 pt_prev = ptype;
2891 }
2892 }
2893
2894 if (pt_prev) {
David S. Millerf2ccd8f2005-08-09 19:34:12 -07002895 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002896 } else {
2897 kfree_skb(skb);
		/* Jamal, now you will not be able to escape explaining
		 * to me how you were going to use this. :-)
2900 */
2901 ret = NET_RX_DROP;
2902 }
2903
2904out:
2905 rcu_read_unlock();
2906 return ret;
2907}
Tom Herbert0a9627f2010-03-16 08:03:29 +00002908
2909/**
2910 * netif_receive_skb - process receive buffer from network
2911 * @skb: buffer to process
2912 *
2913 * netif_receive_skb() is the main receive data processing function.
2914 * It always succeeds. The buffer may be dropped during processing
2915 * for congestion control or by the protocol layers.
2916 *
2917 * This function may only be called from softirq context and interrupts
2918 * should be enabled.
2919 *
2920 * Return values (usually ignored):
2921 * NET_RX_SUCCESS: no congestion
2922 * NET_RX_DROP: packet was dropped
2923 */
2924int netif_receive_skb(struct sk_buff *skb)
2925{
Eric Dumazet3b098e22010-05-15 23:57:10 -07002926 if (netdev_tstamp_prequeue)
2927 net_timestamp_check(skb);
2928
Eric Dumazetdf334542010-03-24 19:13:54 +00002929#ifdef CONFIG_RPS
Eric Dumazet3b098e22010-05-15 23:57:10 -07002930 {
2931 struct rps_dev_flow voidflow, *rflow = &voidflow;
2932 int cpu, ret;
Tom Herbert0a9627f2010-03-16 08:03:29 +00002933
Eric Dumazet3b098e22010-05-15 23:57:10 -07002934 rcu_read_lock();
Tom Herbert0a9627f2010-03-16 08:03:29 +00002935
Eric Dumazet3b098e22010-05-15 23:57:10 -07002936 cpu = get_rps_cpu(skb->dev, skb, &rflow);
Tom Herbertfec5e652010-04-16 16:01:27 -07002937
Eric Dumazet3b098e22010-05-15 23:57:10 -07002938 if (cpu >= 0) {
2939 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2940 rcu_read_unlock();
2941 } else {
2942 rcu_read_unlock();
2943 ret = __netif_receive_skb(skb);
2944 }
2945
2946 return ret;
Tom Herbertfec5e652010-04-16 16:01:27 -07002947 }
Tom Herbert1e94d722010-03-18 17:45:44 -07002948#else
2949 return __netif_receive_skb(skb);
2950#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00002951}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07002952EXPORT_SYMBOL(netif_receive_skb);
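
/*
 * Note that netif_receive_skb() is what NAPI drivers call from their
 * ->poll() routine; legacy interrupt-driven drivers go through netif_rx()
 * instead.  With CONFIG_RPS the packet may be enqueued to another CPU's
 * backlog here rather than processed synchronously.
 */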
Linus Torvalds1da177e2005-04-16 15:20:36 -07002953
/* Network device is going away; flush any packets still pending.
 * Called with irqs disabled.
2956 */
Changli Gao152102c2010-03-30 20:16:22 +00002957static void flush_backlog(void *arg)
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002958{
Changli Gao152102c2010-03-30 20:16:22 +00002959 struct net_device *dev = arg;
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002960 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002961 struct sk_buff *skb, *tmp;
2962
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002963 rps_lock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002964 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002965 if (skb->dev == dev) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002966 __skb_unlink(skb, &sd->input_pkt_queue);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002967 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002968 input_queue_head_incr(sd);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002969 }
Changli Gao6e7676c2010-04-27 15:07:33 -07002970 }
Eric Dumazete36fa2f2010-04-19 21:17:14 +00002971 rps_unlock(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002972
2973 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
2974 if (skb->dev == dev) {
2975 __skb_unlink(skb, &sd->process_queue);
2976 kfree_skb(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00002977 input_queue_head_incr(sd);
Changli Gao6e7676c2010-04-27 15:07:33 -07002978 }
2979 }
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07002980}
2981
Herbert Xud565b0a2008-12-15 23:38:52 -08002982static int napi_gro_complete(struct sk_buff *skb)
2983{
2984 struct packet_type *ptype;
2985 __be16 type = skb->protocol;
2986 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2987 int err = -ENOENT;
2988
Herbert Xufc59f9a2009-04-14 15:11:06 -07002989 if (NAPI_GRO_CB(skb)->count == 1) {
2990 skb_shinfo(skb)->gso_size = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08002991 goto out;
Herbert Xufc59f9a2009-04-14 15:11:06 -07002992 }
Herbert Xud565b0a2008-12-15 23:38:52 -08002993
2994 rcu_read_lock();
2995 list_for_each_entry_rcu(ptype, head, list) {
2996 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2997 continue;
2998
2999 err = ptype->gro_complete(skb);
3000 break;
3001 }
3002 rcu_read_unlock();
3003
3004 if (err) {
3005 WARN_ON(&ptype->list == head);
3006 kfree_skb(skb);
3007 return NET_RX_SUCCESS;
3008 }
3009
3010out:
Herbert Xud565b0a2008-12-15 23:38:52 -08003011 return netif_receive_skb(skb);
3012}
3013
David S. Miller11380a42010-01-19 13:46:10 -08003014static void napi_gro_flush(struct napi_struct *napi)
Herbert Xud565b0a2008-12-15 23:38:52 -08003015{
3016 struct sk_buff *skb, *next;
3017
3018 for (skb = napi->gro_list; skb; skb = next) {
3019 next = skb->next;
3020 skb->next = NULL;
3021 napi_gro_complete(skb);
3022 }
3023
Herbert Xu4ae55442009-02-08 18:00:36 +00003024 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003025 napi->gro_list = NULL;
3026}
Herbert Xud565b0a2008-12-15 23:38:52 -08003027
Ben Hutchings5b252f02009-10-29 07:17:09 +00003028enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xud565b0a2008-12-15 23:38:52 -08003029{
3030 struct sk_buff **pp = NULL;
3031 struct packet_type *ptype;
3032 __be16 type = skb->protocol;
3033 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
Herbert Xu0da2afd2008-12-26 14:57:42 -08003034 int same_flow;
Herbert Xud565b0a2008-12-15 23:38:52 -08003035 int mac_len;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003036 enum gro_result ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003037
3038 if (!(skb->dev->features & NETIF_F_GRO))
3039 goto normal;
3040
David S. Miller4cf704f2009-06-09 00:18:51 -07003041 if (skb_is_gso(skb) || skb_has_frags(skb))
Herbert Xuf17f5c92009-01-14 14:36:12 -08003042 goto normal;
3043
Herbert Xud565b0a2008-12-15 23:38:52 -08003044 rcu_read_lock();
3045 list_for_each_entry_rcu(ptype, head, list) {
Herbert Xud565b0a2008-12-15 23:38:52 -08003046 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3047 continue;
3048
Herbert Xu86911732009-01-29 14:19:50 +00003049 skb_set_network_header(skb, skb_gro_offset(skb));
Herbert Xud565b0a2008-12-15 23:38:52 -08003050 mac_len = skb->network_header - skb->mac_header;
3051 skb->mac_len = mac_len;
3052 NAPI_GRO_CB(skb)->same_flow = 0;
3053 NAPI_GRO_CB(skb)->flush = 0;
Herbert Xu5d38a072009-01-04 16:13:40 -08003054 NAPI_GRO_CB(skb)->free = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003055
Herbert Xud565b0a2008-12-15 23:38:52 -08003056 pp = ptype->gro_receive(&napi->gro_list, skb);
3057 break;
3058 }
3059 rcu_read_unlock();
3060
3061 if (&ptype->list == head)
3062 goto normal;
3063
Herbert Xu0da2afd2008-12-26 14:57:42 -08003064 same_flow = NAPI_GRO_CB(skb)->same_flow;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003065 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
Herbert Xu0da2afd2008-12-26 14:57:42 -08003066
Herbert Xud565b0a2008-12-15 23:38:52 -08003067 if (pp) {
3068 struct sk_buff *nskb = *pp;
3069
3070 *pp = nskb->next;
3071 nskb->next = NULL;
3072 napi_gro_complete(nskb);
Herbert Xu4ae55442009-02-08 18:00:36 +00003073 napi->gro_count--;
Herbert Xud565b0a2008-12-15 23:38:52 -08003074 }
3075
Herbert Xu0da2afd2008-12-26 14:57:42 -08003076 if (same_flow)
Herbert Xud565b0a2008-12-15 23:38:52 -08003077 goto ok;
3078
Herbert Xu4ae55442009-02-08 18:00:36 +00003079 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
Herbert Xud565b0a2008-12-15 23:38:52 -08003080 goto normal;
Herbert Xud565b0a2008-12-15 23:38:52 -08003081
Herbert Xu4ae55442009-02-08 18:00:36 +00003082 napi->gro_count++;
Herbert Xud565b0a2008-12-15 23:38:52 -08003083 NAPI_GRO_CB(skb)->count = 1;
Herbert Xu86911732009-01-29 14:19:50 +00003084 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003085 skb->next = napi->gro_list;
3086 napi->gro_list = skb;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003087 ret = GRO_HELD;
Herbert Xud565b0a2008-12-15 23:38:52 -08003088
Herbert Xuad0f9902009-02-01 01:24:55 -08003089pull:
Herbert Xucb189782009-05-26 18:50:31 +00003090 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3091 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3092
3093 BUG_ON(skb->end - skb->tail < grow);
3094
3095 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3096
3097 skb->tail += grow;
3098 skb->data_len -= grow;
3099
3100 skb_shinfo(skb)->frags[0].page_offset += grow;
3101 skb_shinfo(skb)->frags[0].size -= grow;
3102
3103 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
3104 put_page(skb_shinfo(skb)->frags[0].page);
3105 memmove(skb_shinfo(skb)->frags,
3106 skb_shinfo(skb)->frags + 1,
3107 --skb_shinfo(skb)->nr_frags);
3108 }
Herbert Xuad0f9902009-02-01 01:24:55 -08003109 }
3110
Herbert Xud565b0a2008-12-15 23:38:52 -08003111ok:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003112 return ret;
Herbert Xud565b0a2008-12-15 23:38:52 -08003113
3114normal:
Herbert Xuad0f9902009-02-01 01:24:55 -08003115 ret = GRO_NORMAL;
3116 goto pull;
Herbert Xu5d38a072009-01-04 16:13:40 -08003117}
Herbert Xu96e93ea2009-01-06 10:49:34 -08003118EXPORT_SYMBOL(dev_gro_receive);
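
/*
 * Verdicts returned above, as consumed by napi_skb_finish():
 *   GRO_MERGED      - data merged into a held skb; do not free this one
 *   GRO_MERGED_FREE - data merged; the skb shell can be freed
 *   GRO_HELD        - skb parked on napi->gro_list awaiting more segments
 *   GRO_NORMAL      - GRO does not apply; deliver via netif_receive_skb()
 *   GRO_DROP        - the skb should be dropped and freed
 */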
3119
Ben Hutchings5b252f02009-10-29 07:17:09 +00003120static gro_result_t
3121__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003122{
3123 struct sk_buff *p;
3124
Herbert Xud1c76af2009-03-16 10:50:02 -07003125 if (netpoll_rx_on(skb))
3126 return GRO_NORMAL;
3127
Herbert Xu96e93ea2009-01-06 10:49:34 -08003128 for (p = napi->gro_list; p; p = p->next) {
Joe Perchesf64f9e72009-11-29 16:55:45 -08003129 NAPI_GRO_CB(p)->same_flow =
3130 (p->dev == skb->dev) &&
3131 !compare_ether_header(skb_mac_header(p),
3132 skb_gro_mac_header(skb));
Herbert Xu96e93ea2009-01-06 10:49:34 -08003133 NAPI_GRO_CB(p)->flush = 0;
3134 }
3135
3136 return dev_gro_receive(napi, skb);
3137}
Herbert Xu5d38a072009-01-04 16:13:40 -08003138
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003139gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
Herbert Xu5d38a072009-01-04 16:13:40 -08003140{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003141 switch (ret) {
3142 case GRO_NORMAL:
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003143 if (netif_receive_skb(skb))
3144 ret = GRO_DROP;
3145 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003146
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003147 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003148 case GRO_MERGED_FREE:
Herbert Xu5d38a072009-01-04 16:13:40 -08003149 kfree_skb(skb);
3150 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003151
3152 case GRO_HELD:
3153 case GRO_MERGED:
3154 break;
Herbert Xu5d38a072009-01-04 16:13:40 -08003155 }
3156
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003157 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003158}
3159EXPORT_SYMBOL(napi_skb_finish);
3160
Herbert Xu78a478d2009-05-26 18:50:21 +00003161void skb_gro_reset_offset(struct sk_buff *skb)
3162{
3163 NAPI_GRO_CB(skb)->data_offset = 0;
3164 NAPI_GRO_CB(skb)->frag0 = NULL;
Herbert Xu74895942009-05-26 18:50:27 +00003165 NAPI_GRO_CB(skb)->frag0_len = 0;
Herbert Xu78a478d2009-05-26 18:50:21 +00003166
Herbert Xu78d3fd02009-05-26 18:50:23 +00003167 if (skb->mac_header == skb->tail &&
Herbert Xu74895942009-05-26 18:50:27 +00003168 !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
Herbert Xu78a478d2009-05-26 18:50:21 +00003169 NAPI_GRO_CB(skb)->frag0 =
3170 page_address(skb_shinfo(skb)->frags[0].page) +
3171 skb_shinfo(skb)->frags[0].page_offset;
Herbert Xu74895942009-05-26 18:50:27 +00003172 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
3173 }
Herbert Xu78a478d2009-05-26 18:50:21 +00003174}
3175EXPORT_SYMBOL(skb_gro_reset_offset);
3176
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003177gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003178{
Herbert Xu86911732009-01-29 14:19:50 +00003179 skb_gro_reset_offset(skb);
3180
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003181 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
Herbert Xud565b0a2008-12-15 23:38:52 -08003182}
3183EXPORT_SYMBOL(napi_gro_receive);
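
/*
 * Illustrative sketch (not part of this file): the usual shape of a NAPI
 * driver's ->poll() routine feeding napi_gro_receive().  The "my_*" names
 * are hypothetical; eth_type_trans(), napi_gro_receive() and
 * napi_complete() are the real calls.
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		struct sk_buff *skb;
 *		int work = 0;
 *
 *		while (work < budget && (skb = my_fetch_rx_frame(napi))) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			napi_gro_receive(napi, skb);
 *			work++;
 *		}
 *		if (work < budget) {
 *			napi_complete(napi);
 *			my_enable_rx_irq(napi->dev);
 *		}
 *		return work;
 *	}
 */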
3184
Herbert Xu96e93ea2009-01-06 10:49:34 -08003185void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3186{
Herbert Xu96e93ea2009-01-06 10:49:34 -08003187 __skb_pull(skb, skb_headlen(skb));
3188 skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3189
3190 napi->skb = skb;
3191}
3192EXPORT_SYMBOL(napi_reuse_skb);
3193
Herbert Xu76620aa2009-04-16 02:02:07 -07003194struct sk_buff *napi_get_frags(struct napi_struct *napi)
Herbert Xu5d38a072009-01-04 16:13:40 -08003195{
Herbert Xu5d38a072009-01-04 16:13:40 -08003196 struct sk_buff *skb = napi->skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003197
3198 if (!skb) {
Eric Dumazet89d71a62009-10-13 05:34:20 +00003199 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3200 if (skb)
3201 napi->skb = skb;
Herbert Xu5d38a072009-01-04 16:13:40 -08003202 }
Herbert Xu96e93ea2009-01-06 10:49:34 -08003203 return skb;
3204}
Herbert Xu76620aa2009-04-16 02:02:07 -07003205EXPORT_SYMBOL(napi_get_frags);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003206
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003207gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3208 gro_result_t ret)
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003209{
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003210 switch (ret) {
3211 case GRO_NORMAL:
Herbert Xu86911732009-01-29 14:19:50 +00003212 case GRO_HELD:
Ajit Khapardee76b69c2010-02-16 20:25:43 +00003213 skb->protocol = eth_type_trans(skb, skb->dev);
Herbert Xu86911732009-01-29 14:19:50 +00003214
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003215 if (ret == GRO_HELD)
3216 skb_gro_pull(skb, -ETH_HLEN);
3217 else if (netif_receive_skb(skb))
3218 ret = GRO_DROP;
Herbert Xu86911732009-01-29 14:19:50 +00003219 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003220
3221 case GRO_DROP:
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003222 case GRO_MERGED_FREE:
3223 napi_reuse_skb(napi, skb);
3224 break;
Ben Hutchings5b252f02009-10-29 07:17:09 +00003225
3226 case GRO_MERGED:
3227 break;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003228 }
3229
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003230 return ret;
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003231}
3232EXPORT_SYMBOL(napi_frags_finish);
3233
Herbert Xu76620aa2009-04-16 02:02:07 -07003234struct sk_buff *napi_frags_skb(struct napi_struct *napi)
Herbert Xu96e93ea2009-01-06 10:49:34 -08003235{
Herbert Xu76620aa2009-04-16 02:02:07 -07003236 struct sk_buff *skb = napi->skb;
3237 struct ethhdr *eth;
Herbert Xua5b1cf22009-05-26 18:50:28 +00003238 unsigned int hlen;
3239 unsigned int off;
Herbert Xu76620aa2009-04-16 02:02:07 -07003240
3241 napi->skb = NULL;
3242
3243 skb_reset_mac_header(skb);
3244 skb_gro_reset_offset(skb);
3245
Herbert Xua5b1cf22009-05-26 18:50:28 +00003246 off = skb_gro_offset(skb);
3247 hlen = off + sizeof(*eth);
3248 eth = skb_gro_header_fast(skb, off);
3249 if (skb_gro_header_hard(skb, hlen)) {
3250 eth = skb_gro_header_slow(skb, hlen, off);
3251 if (unlikely(!eth)) {
3252 napi_reuse_skb(napi, skb);
3253 skb = NULL;
3254 goto out;
3255 }
Herbert Xu76620aa2009-04-16 02:02:07 -07003256 }
3257
3258 skb_gro_pull(skb, sizeof(*eth));
3259
3260 /*
3261 * This works because the only protocols we care about don't require
3262 * special handling. We'll fix it up properly at the end.
3263 */
3264 skb->protocol = eth->h_proto;
3265
3266out:
3267 return skb;
3268}
3269EXPORT_SYMBOL(napi_frags_skb);
3270
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003271gro_result_t napi_gro_frags(struct napi_struct *napi)
Herbert Xu76620aa2009-04-16 02:02:07 -07003272{
3273 struct sk_buff *skb = napi_frags_skb(napi);
Herbert Xu96e93ea2009-01-06 10:49:34 -08003274
3275 if (!skb)
Ben Hutchingsc7c4b3b2009-10-29 21:36:53 -07003276 return GRO_DROP;
Herbert Xu96e93ea2009-01-06 10:49:34 -08003277
Herbert Xu5d0d9be2009-01-29 14:19:48 +00003278 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
Herbert Xu5d38a072009-01-04 16:13:40 -08003279}
3280EXPORT_SYMBOL(napi_gro_frags);
3281
Eric Dumazete326bed2010-04-22 00:22:45 -07003282/*
 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
3284 * Note: called with local irq disabled, but exits with local irq enabled.
3285 */
3286static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3287{
3288#ifdef CONFIG_RPS
3289 struct softnet_data *remsd = sd->rps_ipi_list;
3290
3291 if (remsd) {
3292 sd->rps_ipi_list = NULL;
3293
3294 local_irq_enable();
3295
		/* Send pending IPIs to kick RPS processing on remote CPUs. */
3297 while (remsd) {
3298 struct softnet_data *next = remsd->rps_ipi_next;
3299
3300 if (cpu_online(remsd->cpu))
3301 __smp_call_function_single(remsd->cpu,
3302 &remsd->csd, 0);
3303 remsd = next;
3304 }
3305 } else
3306#endif
3307 local_irq_enable();
3308}
3309
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003310static int process_backlog(struct napi_struct *napi, int quota)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003311{
3312 int work = 0;
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003313 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003314
Eric Dumazete326bed2010-04-22 00:22:45 -07003315#ifdef CONFIG_RPS
	/* Check if we have pending IPIs; it is better to send them now
	 * rather than waiting for net_rx_action() to end.
3318 */
3319 if (sd->rps_ipi_list) {
3320 local_irq_disable();
3321 net_rps_action_and_irq_enable(sd);
3322 }
3323#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003324 napi->weight = weight_p;
Changli Gao6e7676c2010-04-27 15:07:33 -07003325 local_irq_disable();
3326 while (work < quota) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003327 struct sk_buff *skb;
Changli Gao6e7676c2010-04-27 15:07:33 -07003328 unsigned int qlen;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003329
Changli Gao6e7676c2010-04-27 15:07:33 -07003330 while ((skb = __skb_dequeue(&sd->process_queue))) {
Eric Dumazete4008272010-04-05 15:42:39 -07003331 local_irq_enable();
Changli Gao6e7676c2010-04-27 15:07:33 -07003332 __netif_receive_skb(skb);
Changli Gao6e7676c2010-04-27 15:07:33 -07003333 local_irq_disable();
Tom Herbert76cc8b12010-05-20 18:37:59 +00003334 input_queue_head_incr(sd);
3335 if (++work >= quota) {
3336 local_irq_enable();
3337 return work;
3338 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003339 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340
Changli Gao6e7676c2010-04-27 15:07:33 -07003341 rps_lock(sd);
3342 qlen = skb_queue_len(&sd->input_pkt_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003343 if (qlen)
Changli Gao6e7676c2010-04-27 15:07:33 -07003344 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3345 &sd->process_queue);
Tom Herbert76cc8b12010-05-20 18:37:59 +00003346
Changli Gao6e7676c2010-04-27 15:07:33 -07003347 if (qlen < quota - work) {
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003348 /*
			 * Inline a custom version of __napi_complete().
			 * Only the current CPU owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on the backlog, so we can use a plain write instead
			 * of clear_bit() and don't need an smp_mb() barrier.
3354 */
3355 list_del(&napi->poll_list);
3356 napi->state = 0;
3357
Changli Gao6e7676c2010-04-27 15:07:33 -07003358 quota = work + qlen;
3359 }
3360 rps_unlock(sd);
3361 }
3362 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003363
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003364 return work;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003365}
3366
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003367/**
3368 * __napi_schedule - schedule for receive
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07003369 * @n: entry to schedule
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003370 *
3371 * The entry's receive function will be scheduled to run
3372 */
Harvey Harrisonb5606c22008-02-13 15:03:16 -08003373void __napi_schedule(struct napi_struct *n)
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003374{
3375 unsigned long flags;
3376
3377 local_irq_save(flags);
Eric Dumazeteecfd7c2010-05-06 22:07:48 -07003378 ____napi_schedule(&__get_cpu_var(softnet_data), n);
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003379 local_irq_restore(flags);
3380}
3381EXPORT_SYMBOL(__napi_schedule);
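
/*
 * Illustrative sketch (not part of this file): how a driver's interrupt
 * handler schedules NAPI.  Most drivers use the napi_schedule() wrapper,
 * which performs exactly this prep-test-then-__napi_schedule() sequence.
 * The "my_*" names are hypothetical.
 *
 *	static irqreturn_t my_isr(int irq, void *dev_id)
 *	{
 *		struct my_priv *priv = dev_id;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			my_disable_rx_irq(priv);
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */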
3382
Herbert Xud565b0a2008-12-15 23:38:52 -08003383void __napi_complete(struct napi_struct *n)
3384{
3385 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3386 BUG_ON(n->gro_list);
3387
3388 list_del(&n->poll_list);
3389 smp_mb__before_clear_bit();
3390 clear_bit(NAPI_STATE_SCHED, &n->state);
3391}
3392EXPORT_SYMBOL(__napi_complete);
3393
3394void napi_complete(struct napi_struct *n)
3395{
3396 unsigned long flags;
3397
3398 /*
	 * Don't let napi dequeue from the cpu poll list
	 * just in case it's running on a different cpu.
3401 */
3402 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3403 return;
3404
3405 napi_gro_flush(n);
3406 local_irq_save(flags);
3407 __napi_complete(n);
3408 local_irq_restore(flags);
3409}
3410EXPORT_SYMBOL(napi_complete);
3411
3412void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3413 int (*poll)(struct napi_struct *, int), int weight)
3414{
3415 INIT_LIST_HEAD(&napi->poll_list);
Herbert Xu4ae55442009-02-08 18:00:36 +00003416 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003417 napi->gro_list = NULL;
Herbert Xu5d38a072009-01-04 16:13:40 -08003418 napi->skb = NULL;
Herbert Xud565b0a2008-12-15 23:38:52 -08003419 napi->poll = poll;
3420 napi->weight = weight;
3421 list_add(&napi->dev_list, &dev->napi_list);
Herbert Xud565b0a2008-12-15 23:38:52 -08003422 napi->dev = dev;
Herbert Xu5d38a072009-01-04 16:13:40 -08003423#ifdef CONFIG_NETPOLL
Herbert Xud565b0a2008-12-15 23:38:52 -08003424 spin_lock_init(&napi->poll_lock);
3425 napi->poll_owner = -1;
3426#endif
3427 set_bit(NAPI_STATE_SCHED, &napi->state);
3428}
3429EXPORT_SYMBOL(netif_napi_add);
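
/*
 * Typical pairing (sketch; "priv" and "my_poll" are hypothetical): a
 * driver calls netif_napi_add(netdev, &priv->napi, my_poll, 64) at probe
 * time and netif_napi_del(&priv->napi) on teardown, once the device has
 * been stopped and the NAPI instance disabled.
 */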
3430
3431void netif_napi_del(struct napi_struct *napi)
3432{
3433 struct sk_buff *skb, *next;
3434
Peter P Waskiewicz Jrd7b06632008-12-26 01:35:35 -08003435 list_del_init(&napi->dev_list);
Herbert Xu76620aa2009-04-16 02:02:07 -07003436 napi_free_frags(napi);
Herbert Xud565b0a2008-12-15 23:38:52 -08003437
3438 for (skb = napi->gro_list; skb; skb = next) {
3439 next = skb->next;
3440 skb->next = NULL;
3441 kfree_skb(skb);
3442 }
3443
3444 napi->gro_list = NULL;
Herbert Xu4ae55442009-02-08 18:00:36 +00003445 napi->gro_count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08003446}
3447EXPORT_SYMBOL(netif_napi_del);
3448
Linus Torvalds1da177e2005-04-16 15:20:36 -07003449static void net_rx_action(struct softirq_action *h)
3450{
Eric Dumazete326bed2010-04-22 00:22:45 -07003451 struct softnet_data *sd = &__get_cpu_var(softnet_data);
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003452 unsigned long time_limit = jiffies + 2;
Stephen Hemminger51b0bde2005-06-23 20:14:40 -07003453 int budget = netdev_budget;
Matt Mackall53fb95d2005-08-11 19:27:43 -07003454 void *have;
3455
Linus Torvalds1da177e2005-04-16 15:20:36 -07003456 local_irq_disable();
3457
Eric Dumazete326bed2010-04-22 00:22:45 -07003458 while (!list_empty(&sd->poll_list)) {
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003459 struct napi_struct *n;
3460 int work, weight;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003461
		/* If the softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which allows
		 * an average latency of 1.5/HZ.
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003465 */
Stephen Hemminger24f8b232008-11-03 17:14:38 -08003466 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003467 goto softnet_break;
3468
3469 local_irq_enable();
3470
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003471 /* Even though interrupts have been re-enabled, this
3472 * access is safe because interrupts can only add new
3473 * entries to the tail of this list, and only ->poll()
3474 * calls can remove this head entry from the list.
3475 */
Eric Dumazete326bed2010-04-22 00:22:45 -07003476 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003477
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003478 have = netpoll_poll_lock(n);
3479
3480 weight = n->weight;
3481
David S. Miller0a7606c2007-10-29 21:28:47 -07003482 /* This NAPI_STATE_SCHED test is for avoiding a race
3483 * with netpoll's poll_napi(). Only the entity which
3484 * obtains the lock and sees NAPI_STATE_SCHED set will
3485 * actually make the ->poll() call. Therefore we avoid
		 * accidentally calling ->poll() when NAPI is not scheduled.
3487 */
3488 work = 0;
Neil Horman4ea7e382009-05-21 07:36:08 +00003489 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
David S. Miller0a7606c2007-10-29 21:28:47 -07003490 work = n->poll(n, weight);
Neil Horman4ea7e382009-05-21 07:36:08 +00003491 trace_napi_poll(n);
3492 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003493
3494 WARN_ON_ONCE(work > weight);
3495
3496 budget -= work;
3497
3498 local_irq_disable();
3499
3500 /* Drivers must not modify the NAPI state if they
3501 * consume the entire weight. In such cases this code
3502 * still "owns" the NAPI instance and therefore can
3503 * move the instance around on the list at-will.
3504 */
David S. Millerfed17f32008-01-07 21:00:40 -08003505 if (unlikely(work == weight)) {
Herbert Xuff780cd2009-06-26 19:27:04 -07003506 if (unlikely(napi_disable_pending(n))) {
3507 local_irq_enable();
3508 napi_complete(n);
3509 local_irq_disable();
3510 } else
Eric Dumazete326bed2010-04-22 00:22:45 -07003511 list_move_tail(&n->poll_list, &sd->poll_list);
David S. Millerfed17f32008-01-07 21:00:40 -08003512 }
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003513
3514 netpoll_poll_unlock(have);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003515 }
3516out:
Eric Dumazete326bed2010-04-22 00:22:45 -07003517 net_rps_action_and_irq_enable(sd);
Tom Herbert0a9627f2010-03-16 08:03:29 +00003518
Chris Leechdb217332006-06-17 21:24:58 -07003519#ifdef CONFIG_NET_DMA
3520 /*
3521 * There may not be any more sk_buffs coming right now, so push
3522 * any pending DMA copies to hardware
3523 */
Dan Williams2ba05622009-01-06 11:38:14 -07003524 dma_issue_pending_all();
Chris Leechdb217332006-06-17 21:24:58 -07003525#endif
Stephen Hemmingerbea33482007-10-03 16:41:36 -07003526
Linus Torvalds1da177e2005-04-16 15:20:36 -07003527 return;
3528
3529softnet_break:
Changli Gaodee42872010-05-02 05:42:16 +00003530 sd->time_squeeze++;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003531 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3532 goto out;
3533}
3534
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003535static gifconf_func_t *gifconf_list[NPROTO];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003536
3537/**
3538 * register_gifconf - register a SIOCGIF handler
3539 * @family: Address family
3540 * @gifconf: Function handler
3541 *
3542 * Register protocol dependent address dumping routines. The handler
3543 * that is passed must not be freed or reused until it has been replaced
3544 * by another handler.
3545 */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003546int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003547{
3548 if (family >= NPROTO)
3549 return -EINVAL;
3550 gifconf_list[family] = gifconf;
3551 return 0;
3552}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07003553EXPORT_SYMBOL(register_gifconf);
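
/*
 * Example (sketch): address families register their SIOCGIFCONF helper at
 * init time; IPv4, for instance, does roughly
 *
 *	register_gifconf(PF_INET, inet_gifconf);
 *
 * where inet_gifconf() writes one ifreq per configured address.
 */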
Linus Torvalds1da177e2005-04-16 15:20:36 -07003554
3555
3556/*
3557 * Map an interface index to its name (SIOCGIFNAME)
3558 */
3559
3560/*
3561 * We need this ioctl for efficient implementation of the
3562 * if_indextoname() function required by the IPv6 API. Without
3563 * it, we would have to search all the interfaces to find a
3564 * match. --pb
3565 */
3566
Eric W. Biederman881d9662007-09-17 11:56:21 -07003567static int dev_ifname(struct net *net, struct ifreq __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003568{
3569 struct net_device *dev;
3570 struct ifreq ifr;
3571
3572 /*
3573 * Fetch the caller's info block.
3574 */
3575
3576 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3577 return -EFAULT;
3578
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003579 rcu_read_lock();
3580 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003581 if (!dev) {
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003582 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003583 return -ENODEV;
3584 }
3585
3586 strcpy(ifr.ifr_name, dev->name);
Eric Dumazetfb699dfd2009-10-19 19:18:49 +00003587 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003588
3589 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3590 return -EFAULT;
3591 return 0;
3592}
3593
3594/*
3595 * Perform a SIOCGIFCONF call. This structure will change
3596 * size eventually, and there is nothing I can do about it.
3597 * Thus we will need a 'compatibility mode'.
3598 */
3599
Eric W. Biederman881d9662007-09-17 11:56:21 -07003600static int dev_ifconf(struct net *net, char __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003601{
3602 struct ifconf ifc;
3603 struct net_device *dev;
3604 char __user *pos;
3605 int len;
3606 int total;
3607 int i;
3608
3609 /*
3610 * Fetch the caller's info block.
3611 */
3612
3613 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3614 return -EFAULT;
3615
3616 pos = ifc.ifc_buf;
3617 len = ifc.ifc_len;
3618
3619 /*
3620 * Loop over the interfaces, and write an info block for each.
3621 */
3622
3623 total = 0;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003624 for_each_netdev(net, dev) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003625 for (i = 0; i < NPROTO; i++) {
3626 if (gifconf_list[i]) {
3627 int done;
3628 if (!pos)
3629 done = gifconf_list[i](dev, NULL, 0);
3630 else
3631 done = gifconf_list[i](dev, pos + total,
3632 len - total);
3633 if (done < 0)
3634 return -EFAULT;
3635 total += done;
3636 }
3637 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003638 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003639
3640 /*
3641 * All done. Write the updated control block back to the caller.
3642 */
3643 ifc.ifc_len = total;
3644
3645 /*
3646 * Both BSD and Solaris return 0 here, so we do too.
3647 */
3648 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3649}
3650
3651#ifdef CONFIG_PROC_FS
3652/*
3653 * This is invoked by the /proc filesystem handler to display a device
3654 * in detail.
3655 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003656void *dev_seq_start(struct seq_file *seq, loff_t *pos)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003657 __acquires(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003658{
Denis V. Luneve372c412007-11-19 22:31:54 -08003659 struct net *net = seq_file_net(seq);
Pavel Emelianov7562f872007-05-03 15:13:45 -07003660 loff_t off;
3661 struct net_device *dev;
3662
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003663 rcu_read_lock();
Pavel Emelianov7562f872007-05-03 15:13:45 -07003664 if (!*pos)
3665 return SEQ_START_TOKEN;
3666
3667 off = 1;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003668 for_each_netdev_rcu(net, dev)
Pavel Emelianov7562f872007-05-03 15:13:45 -07003669 if (off++ == *pos)
3670 return dev;
3671
3672 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003673}
3674
3675void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3676{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003677 struct net_device *dev = (v == SEQ_START_TOKEN) ?
3678 first_net_device(seq_file_net(seq)) :
3679 next_net_device((struct net_device *)v);
3680
Linus Torvalds1da177e2005-04-16 15:20:36 -07003681 ++*pos;
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003682 return rcu_dereference(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003683}
3684
3685void dev_seq_stop(struct seq_file *seq, void *v)
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003686 __releases(RCU)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003687{
Eric Dumazetc6d14c82009-11-04 05:43:23 -08003688 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003689}
3690
3691static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3692{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08003693 const struct net_device_stats *stats = dev_get_stats(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003694
Jesper Dangaard Brouer2d13baf2010-01-05 05:50:52 +00003695 seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
Rusty Russell5a1b5892007-04-28 21:04:03 -07003696 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3697 dev->name, stats->rx_bytes, stats->rx_packets,
3698 stats->rx_errors,
3699 stats->rx_dropped + stats->rx_missed_errors,
3700 stats->rx_fifo_errors,
3701 stats->rx_length_errors + stats->rx_over_errors +
3702 stats->rx_crc_errors + stats->rx_frame_errors,
3703 stats->rx_compressed, stats->multicast,
3704 stats->tx_bytes, stats->tx_packets,
3705 stats->tx_errors, stats->tx_dropped,
3706 stats->tx_fifo_errors, stats->collisions,
3707 stats->tx_carrier_errors +
3708 stats->tx_aborted_errors +
3709 stats->tx_window_errors +
3710 stats->tx_heartbeat_errors,
3711 stats->tx_compressed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003712}
3713
3714/*
 * Called from the procfs module. This now uses the new arbitrarily sized
 * /proc/net interface to create /proc/net/dev.
3717 */
3718static int dev_seq_show(struct seq_file *seq, void *v)
3719{
3720 if (v == SEQ_START_TOKEN)
3721 seq_puts(seq, "Inter-| Receive "
3722 " | Transmit\n"
3723 " face |bytes packets errs drop fifo frame "
3724 "compressed multicast|bytes packets errs "
3725 "drop fifo colls carrier compressed\n");
3726 else
3727 dev_seq_printf_stats(seq, v);
3728 return 0;
3729}
3730
Changli Gaodee42872010-05-02 05:42:16 +00003731static struct softnet_data *softnet_get_online(loff_t *pos)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003732{
Changli Gaodee42872010-05-02 05:42:16 +00003733 struct softnet_data *sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003734
Mike Travis0c0b0ac2008-05-02 16:43:08 -07003735 while (*pos < nr_cpu_ids)
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003736 if (cpu_online(*pos)) {
Changli Gaodee42872010-05-02 05:42:16 +00003737 sd = &per_cpu(softnet_data, *pos);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003738 break;
3739 } else
3740 ++*pos;
Changli Gaodee42872010-05-02 05:42:16 +00003741 return sd;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003742}
3743
3744static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3745{
3746 return softnet_get_online(pos);
3747}
3748
3749static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3750{
3751 ++*pos;
3752 return softnet_get_online(pos);
3753}
3754
3755static void softnet_seq_stop(struct seq_file *seq, void *v)
3756{
3757}
3758
3759static int softnet_seq_show(struct seq_file *seq, void *v)
3760{
Changli Gaodee42872010-05-02 05:42:16 +00003761 struct softnet_data *sd = v;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003762
Tom Herbert0a9627f2010-03-16 08:03:29 +00003763 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
Changli Gaodee42872010-05-02 05:42:16 +00003764 sd->processed, sd->dropped, sd->time_squeeze, 0,
Stephen Hemmingerc1ebcdb2005-06-23 20:08:59 -07003765 0, 0, 0, 0, /* was fastroute */
Changli Gaodee42872010-05-02 05:42:16 +00003766 sd->cpu_collision, sd->received_rps);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003767 return 0;
3768}
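
/*
 * Column map for the row printed above (one row per online CPU, all
 * values in hex): processed, dropped, time_squeeze, five unused zeroes
 * (some were once fastroute counters), cpu_collision, received_rps.
 */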
3769
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003770static const struct seq_operations dev_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003771 .start = dev_seq_start,
3772 .next = dev_seq_next,
3773 .stop = dev_seq_stop,
3774 .show = dev_seq_show,
3775};
3776
3777static int dev_seq_open(struct inode *inode, struct file *file)
3778{
Denis V. Luneve372c412007-11-19 22:31:54 -08003779 return seq_open_net(inode, file, &dev_seq_ops,
3780 sizeof(struct seq_net_private));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003781}
3782
Arjan van de Ven9a321442007-02-12 00:55:35 -08003783static const struct file_operations dev_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003784 .owner = THIS_MODULE,
3785 .open = dev_seq_open,
3786 .read = seq_read,
3787 .llseek = seq_lseek,
Denis V. Luneve372c412007-11-19 22:31:54 -08003788 .release = seq_release_net,
Linus Torvalds1da177e2005-04-16 15:20:36 -07003789};
3790
Stephen Hemmingerf6908082007-03-12 14:34:29 -07003791static const struct seq_operations softnet_seq_ops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003792 .start = softnet_seq_start,
3793 .next = softnet_seq_next,
3794 .stop = softnet_seq_stop,
3795 .show = softnet_seq_show,
3796};
3797
3798static int softnet_seq_open(struct inode *inode, struct file *file)
3799{
3800 return seq_open(file, &softnet_seq_ops);
3801}
3802
Arjan van de Ven9a321442007-02-12 00:55:35 -08003803static const struct file_operations softnet_seq_fops = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003804 .owner = THIS_MODULE,
3805 .open = softnet_seq_open,
3806 .read = seq_read,
3807 .llseek = seq_lseek,
3808 .release = seq_release,
3809};
3810
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003811static void *ptype_get_idx(loff_t pos)
3812{
3813 struct packet_type *pt = NULL;
3814 loff_t i = 0;
3815 int t;
3816
3817 list_for_each_entry_rcu(pt, &ptype_all, list) {
3818 if (i == pos)
3819 return pt;
3820 ++i;
3821 }
3822
Pavel Emelyanov82d8a862007-11-26 20:12:58 +08003823 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003824 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3825 if (i == pos)
3826 return pt;
3827 ++i;
3828 }
3829 }
3830 return NULL;
3831}
3832
3833static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003834 __acquires(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003835{
3836 rcu_read_lock();
3837 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3838}
3839
3840static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3841{
3842 struct packet_type *pt;
3843 struct list_head *nxt;
3844 int hash;
3845
3846 ++*pos;
3847 if (v == SEQ_START_TOKEN)
3848 return ptype_get_idx(0);
3849
3850 pt = v;
3851 nxt = pt->list.next;
3852 if (pt->type == htons(ETH_P_ALL)) {
3853 if (nxt != &ptype_all)
3854 goto found;
3855 hash = 0;
3856 nxt = ptype_base[0].next;
3857 } else
Pavel Emelyanov82d8a862007-11-26 20:12:58 +08003858 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003859
3860 while (nxt == &ptype_base[hash]) {
Pavel Emelyanov82d8a862007-11-26 20:12:58 +08003861 if (++hash >= PTYPE_HASH_SIZE)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003862 return NULL;
3863 nxt = ptype_base[hash].next;
3864 }
3865found:
3866 return list_entry(nxt, struct packet_type, list);
3867}
3868
3869static void ptype_seq_stop(struct seq_file *seq, void *v)
Stephen Hemminger72348a42008-01-21 02:27:29 -08003870 __releases(RCU)
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003871{
3872 rcu_read_unlock();
3873}
3874
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003875static int ptype_seq_show(struct seq_file *seq, void *v)
3876{
3877 struct packet_type *pt = v;
3878
3879 if (v == SEQ_START_TOKEN)
3880 seq_puts(seq, "Type Device Function\n");
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09003881 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003882 if (pt->type == htons(ETH_P_ALL))
3883 seq_puts(seq, "ALL ");
3884 else
3885 seq_printf(seq, "%04x", ntohs(pt->type));
3886
Alexey Dobriyan908cd2d2008-11-16 19:50:35 -08003887 seq_printf(seq, " %-8s %pF\n",
3888 pt->dev ? pt->dev->name : "", pt->func);
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003889 }
3890
3891 return 0;
3892}
3893
3894static const struct seq_operations ptype_seq_ops = {
3895 .start = ptype_seq_start,
3896 .next = ptype_seq_next,
3897 .stop = ptype_seq_stop,
3898 .show = ptype_seq_show,
3899};
3900
3901static int ptype_seq_open(struct inode *inode, struct file *file)
3902{
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003903 return seq_open_net(inode, file, &ptype_seq_ops,
3904 sizeof(struct seq_net_private));
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003905}
3906
3907static const struct file_operations ptype_seq_fops = {
3908 .owner = THIS_MODULE,
3909 .open = ptype_seq_open,
3910 .read = seq_read,
3911 .llseek = seq_lseek,
Pavel Emelyanov2feb27d2008-03-24 14:57:45 -07003912 .release = seq_release_net,
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003913};
3914
3915
Pavel Emelyanov46650792007-10-08 20:38:39 -07003916static int __net_init dev_proc_net_init(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003917{
3918 int rc = -ENOMEM;
3919
Eric W. Biederman881d9662007-09-17 11:56:21 -07003920 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003921 goto out;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003922 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003923 goto out_dev;
Eric W. Biederman881d9662007-09-17 11:56:21 -07003924 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003925 goto out_softnet;
Stephen Hemminger0e1256f2007-03-12 14:35:37 -07003926
Eric W. Biederman881d9662007-09-17 11:56:21 -07003927 if (wext_proc_init(net))
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003928 goto out_ptype;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003929 rc = 0;
3930out:
3931 return rc;
Eric W. Biederman457c4cb2007-09-12 12:01:34 +02003932out_ptype:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003933 proc_net_remove(net, "ptype");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003934out_softnet:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003935 proc_net_remove(net, "softnet_stat");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003936out_dev:
Eric W. Biederman881d9662007-09-17 11:56:21 -07003937 proc_net_remove(net, "dev");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003938 goto out;
3939}
Eric W. Biederman881d9662007-09-17 11:56:21 -07003940
Pavel Emelyanov46650792007-10-08 20:38:39 -07003941static void __net_exit dev_proc_net_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07003942{
3943 wext_proc_exit(net);
3944
3945 proc_net_remove(net, "ptype");
3946 proc_net_remove(net, "softnet_stat");
3947 proc_net_remove(net, "dev");
3948}
3949
Denis V. Lunev022cbae2007-11-13 03:23:50 -08003950static struct pernet_operations __net_initdata dev_proc_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07003951 .init = dev_proc_net_init,
3952 .exit = dev_proc_net_exit,
3953};
3954
3955static int __init dev_proc_init(void)
3956{
3957 return register_pernet_subsys(&dev_proc_ops);
3958}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003959#else
3960#define dev_proc_init() 0
3961#endif /* CONFIG_PROC_FS */
3962
3963
3964/**
3965 * netdev_set_master - set up master/slave pair
3966 * @slave: slave device
3967 * @master: new master device
3968 *
3969 * Changes the master device of the slave. Pass %NULL to break the
3970 * bonding. The caller must hold the RTNL semaphore. On a failure
3971 * a negative errno code is returned. On success the reference counts
3972 * are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3973 * function returns zero.
3974 */
3975int netdev_set_master(struct net_device *slave, struct net_device *master)
3976{
3977 struct net_device *old = slave->master;
3978
3979 ASSERT_RTNL();
3980
3981 if (master) {
3982 if (old)
3983 return -EBUSY;
3984 dev_hold(master);
3985 }
3986
3987 slave->master = master;
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09003988
Eric Dumazet283f2fe2010-03-18 13:37:40 +00003989 if (old) {
3990 synchronize_net();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003991 dev_put(old);
Eric Dumazet283f2fe2010-03-18 13:37:40 +00003992 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003993 if (master)
3994 slave->flags |= IFF_SLAVE;
3995 else
3996 slave->flags &= ~IFF_SLAVE;
3997
3998 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3999 return 0;
4000}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004001EXPORT_SYMBOL(netdev_set_master);
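
/*
 * Typical caller (sketch): the bonding driver's enslave path does
 * netdev_set_master(slave_dev, bond_dev) under RTNL, and passes %NULL
 * to detach the slave again when it is released.
 */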
Linus Torvalds1da177e2005-04-16 15:20:36 -07004002
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004003static void dev_change_rx_flags(struct net_device *dev, int flags)
4004{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004005 const struct net_device_ops *ops = dev->netdev_ops;
4006
4007 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4008 ops->ndo_change_rx_flags(dev, flags);
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004009}
4010
Wang Chendad9b332008-06-18 01:48:28 -07004011static int __dev_set_promiscuity(struct net_device *dev, int inc)
Patrick McHardy4417da62007-06-27 01:28:10 -07004012{
4013 unsigned short old_flags = dev->flags;
David Howells8192b0c2008-11-14 10:39:10 +11004014 uid_t uid;
4015 gid_t gid;
Patrick McHardy4417da62007-06-27 01:28:10 -07004016
Patrick McHardy24023452007-07-14 18:51:31 -07004017 ASSERT_RTNL();
4018
Wang Chendad9b332008-06-18 01:48:28 -07004019 dev->flags |= IFF_PROMISC;
4020 dev->promiscuity += inc;
4021 if (dev->promiscuity == 0) {
4022 /*
4023 * Avoid overflow.
4024	 * If inc causes overflow, leave promiscuity untouched and return an error.
4025 */
4026 if (inc < 0)
4027 dev->flags &= ~IFF_PROMISC;
4028 else {
4029 dev->promiscuity -= inc;
4030			printk(KERN_WARNING "%s: promiscuity counter overflowed, "
4031 "set promiscuity failed, promiscuity feature "
4032 "of device might be broken.\n", dev->name);
4033 return -EOVERFLOW;
4034 }
4035 }
Patrick McHardy4417da62007-06-27 01:28:10 -07004036 if (dev->flags != old_flags) {
4037 printk(KERN_INFO "device %s %s promiscuous mode\n",
4038 dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
4039 "left");
David Howells8192b0c2008-11-14 10:39:10 +11004040 if (audit_enabled) {
4041 current_uid_gid(&uid, &gid);
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004042 audit_log(current->audit_context, GFP_ATOMIC,
4043 AUDIT_ANOM_PROMISCUOUS,
4044 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4045 dev->name, (dev->flags & IFF_PROMISC),
4046 (old_flags & IFF_PROMISC),
4047 audit_get_loginuid(current),
David Howells8192b0c2008-11-14 10:39:10 +11004048 uid, gid,
Klaus Heinrich Kiwi7759db82008-01-23 22:57:45 -05004049 audit_get_sessionid(current));
David Howells8192b0c2008-11-14 10:39:10 +11004050 }
Patrick McHardy24023452007-07-14 18:51:31 -07004051
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004052 dev_change_rx_flags(dev, IFF_PROMISC);
Patrick McHardy4417da62007-06-27 01:28:10 -07004053 }
Wang Chendad9b332008-06-18 01:48:28 -07004054 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004055}
4056
Linus Torvalds1da177e2005-04-16 15:20:36 -07004057/**
4058 * dev_set_promiscuity - update promiscuity count on a device
4059 * @dev: device
4060 * @inc: modifier
4061 *
Stephen Hemminger3041a062006-05-26 13:25:24 -07004062 * Add or remove promiscuity from a device. While the count in the device
Linus Torvalds1da177e2005-04-16 15:20:36 -07004063 * remains above zero the interface remains promiscuous. Once it hits zero
4064 * the device reverts to normal filtering operation. A negative @inc
4065 * value is used to drop promiscuity on the device.
Wang Chendad9b332008-06-18 01:48:28 -07004066 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004067 */
Wang Chendad9b332008-06-18 01:48:28 -07004068int dev_set_promiscuity(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004069{
4070 unsigned short old_flags = dev->flags;
Wang Chendad9b332008-06-18 01:48:28 -07004071 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004072
Wang Chendad9b332008-06-18 01:48:28 -07004073 err = __dev_set_promiscuity(dev, inc);
Patrick McHardy4b5a6982008-07-06 15:49:08 -07004074 if (err < 0)
Wang Chendad9b332008-06-18 01:48:28 -07004075 return err;
Patrick McHardy4417da62007-06-27 01:28:10 -07004076 if (dev->flags != old_flags)
4077 dev_set_rx_mode(dev);
Wang Chendad9b332008-06-18 01:48:28 -07004078 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004079}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004080EXPORT_SYMBOL(dev_set_promiscuity);
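
/*
 * Hypothetical usage sketch (editor's addition): the promiscuity count is
 * a reference count, so every +1 must later be balanced by a -1. A packet
 * capture protocol could wrap the calls like this; names are made up.
 */
#if 0	/* illustrative only */
static int example_capture_open(struct net_device *dev)
{
	int err;

	rtnl_lock();				/* required: ASSERT_RTNL() above */
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void example_capture_close(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);		/* may drop out of promisc mode */
	rtnl_unlock();
}
#endif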
Linus Torvalds1da177e2005-04-16 15:20:36 -07004081
4082/**
4083 * dev_set_allmulti - update allmulti count on a device
4084 * @dev: device
4085 * @inc: modifier
4086 *
4087 * Add or remove reception of all multicast frames to a device. While the
4088 * count in the device remains above zero the interface remains listening
4089 * to all multicast frames. Once it hits zero the device reverts to normal
4090 * filtering operation. A negative @inc value is used to drop the counter
4091 * when releasing a resource needing all multicasts.
Wang Chendad9b332008-06-18 01:48:28 -07004092 * Return 0 if successful or a negative errno code on error.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004093 */
4094
Wang Chendad9b332008-06-18 01:48:28 -07004095int dev_set_allmulti(struct net_device *dev, int inc)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004096{
4097 unsigned short old_flags = dev->flags;
4098
Patrick McHardy24023452007-07-14 18:51:31 -07004099 ASSERT_RTNL();
4100
Linus Torvalds1da177e2005-04-16 15:20:36 -07004101 dev->flags |= IFF_ALLMULTI;
Wang Chendad9b332008-06-18 01:48:28 -07004102 dev->allmulti += inc;
4103 if (dev->allmulti == 0) {
4104 /*
4105 * Avoid overflow.
4106	 * If inc causes overflow, leave allmulti untouched and return an error.
4107 */
4108 if (inc < 0)
4109 dev->flags &= ~IFF_ALLMULTI;
4110 else {
4111 dev->allmulti -= inc;
4112			printk(KERN_WARNING "%s: allmulti counter overflowed, "
4113 "set allmulti failed, allmulti feature of "
4114 "device might be broken.\n", dev->name);
4115 return -EOVERFLOW;
4116 }
4117 }
Patrick McHardy24023452007-07-14 18:51:31 -07004118 if (dev->flags ^ old_flags) {
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004119 dev_change_rx_flags(dev, IFF_ALLMULTI);
Patrick McHardy4417da62007-06-27 01:28:10 -07004120 dev_set_rx_mode(dev);
Patrick McHardy24023452007-07-14 18:51:31 -07004121 }
Wang Chendad9b332008-06-18 01:48:28 -07004122 return 0;
Patrick McHardy4417da62007-06-27 01:28:10 -07004123}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004124EXPORT_SYMBOL(dev_set_allmulti);
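
/*
 * Hypothetical usage sketch (editor's addition): dev_set_allmulti() follows
 * the same counting discipline as dev_set_promiscuity(); a multicast
 * routing user might pair the calls like this (names are made up).
 */
#if 0	/* illustrative only */
static int example_mroute_attach(struct net_device *dev)
{
	ASSERT_RTNL();				/* caller must hold RTNL */
	return dev_set_allmulti(dev, 1);	/* start receiving all multicast */
}

static void example_mroute_detach(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_allmulti(dev, -1);		/* balance the earlier +1 */
}
#endif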
Patrick McHardy4417da62007-06-27 01:28:10 -07004125
4126/*
4127 * Upload unicast and multicast address lists to device and
4128 * configure RX filtering. When the device doesn't support unicast
Joe Perches53ccaae2007-12-20 14:02:06 -08004129 * filtering it is put in promiscuous mode while unicast addresses
Patrick McHardy4417da62007-06-27 01:28:10 -07004130 * are present.
4131 */
4132void __dev_set_rx_mode(struct net_device *dev)
4133{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004134 const struct net_device_ops *ops = dev->netdev_ops;
4135
Patrick McHardy4417da62007-06-27 01:28:10 -07004136 /* dev_open will call this function so the list will stay sane. */
4137 if (!(dev->flags&IFF_UP))
4138 return;
4139
4140 if (!netif_device_present(dev))
YOSHIFUJI Hideaki40b77c92007-07-19 10:43:23 +09004141 return;
Patrick McHardy4417da62007-06-27 01:28:10 -07004142
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004143 if (ops->ndo_set_rx_mode)
4144 ops->ndo_set_rx_mode(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004145 else {
4146		/* Unicast address changes may only happen under the rtnl,
4147 * therefore calling __dev_set_promiscuity here is safe.
4148 */
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004149 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004150 __dev_set_promiscuity(dev, 1);
4151 dev->uc_promisc = 1;
Jiri Pirko32e7bfc2010-01-25 13:36:10 -08004152 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
Patrick McHardy4417da62007-06-27 01:28:10 -07004153 __dev_set_promiscuity(dev, -1);
4154 dev->uc_promisc = 0;
4155 }
4156
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004157 if (ops->ndo_set_multicast_list)
4158 ops->ndo_set_multicast_list(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004159 }
4160}
4161
4162void dev_set_rx_mode(struct net_device *dev)
4163{
David S. Millerb9e40852008-07-15 00:15:08 -07004164 netif_addr_lock_bh(dev);
Patrick McHardy4417da62007-06-27 01:28:10 -07004165 __dev_set_rx_mode(dev);
David S. Millerb9e40852008-07-15 00:15:08 -07004166 netif_addr_unlock_bh(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004167}
4168
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004169/**
4170 * dev_get_flags - get flags reported to userspace
4171 * @dev: device
4172 *
4173 * Get the combination of flag bits exported through APIs to userspace.
4174 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004175unsigned dev_get_flags(const struct net_device *dev)
4176{
4177 unsigned flags;
4178
4179 flags = (dev->flags & ~(IFF_PROMISC |
4180 IFF_ALLMULTI |
Stefan Rompfb00055a2006-03-20 17:09:11 -08004181 IFF_RUNNING |
4182 IFF_LOWER_UP |
4183 IFF_DORMANT)) |
Linus Torvalds1da177e2005-04-16 15:20:36 -07004184 (dev->gflags & (IFF_PROMISC |
4185 IFF_ALLMULTI));
4186
Stefan Rompfb00055a2006-03-20 17:09:11 -08004187 if (netif_running(dev)) {
4188 if (netif_oper_up(dev))
4189 flags |= IFF_RUNNING;
4190 if (netif_carrier_ok(dev))
4191 flags |= IFF_LOWER_UP;
4192 if (netif_dormant(dev))
4193 flags |= IFF_DORMANT;
4194 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004195
4196 return flags;
4197}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004198EXPORT_SYMBOL(dev_get_flags);
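
/*
 * Hypothetical usage sketch (editor's addition): dev_get_flags() folds the
 * operstate bits into the legacy flag word, so "link is usable" can be
 * tested without touching dev->state directly. The helper name is made up.
 */
#if 0	/* illustrative only */
static bool example_link_usable(const struct net_device *dev)
{
	unsigned flags = dev_get_flags(dev);

	/* administratively up and operationally running */
	return (flags & (IFF_UP | IFF_RUNNING)) == (IFF_UP | IFF_RUNNING);
}
#endif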
Linus Torvalds1da177e2005-04-16 15:20:36 -07004199
Patrick McHardybd380812010-02-26 06:34:53 +00004200int __dev_change_flags(struct net_device *dev, unsigned int flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004201{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004202 int old_flags = dev->flags;
Patrick McHardybd380812010-02-26 06:34:53 +00004203 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004204
Patrick McHardy24023452007-07-14 18:51:31 -07004205 ASSERT_RTNL();
4206
Linus Torvalds1da177e2005-04-16 15:20:36 -07004207 /*
4208 * Set the flags on our device.
4209 */
4210
4211 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4212 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4213 IFF_AUTOMEDIA)) |
4214 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4215 IFF_ALLMULTI));
4216
4217 /*
4218 * Load in the correct multicast list now the flags have changed.
4219 */
4220
Patrick McHardyb6c40d62008-10-07 15:26:48 -07004221 if ((old_flags ^ flags) & IFF_MULTICAST)
4222 dev_change_rx_flags(dev, IFF_MULTICAST);
Patrick McHardy24023452007-07-14 18:51:31 -07004223
Patrick McHardy4417da62007-06-27 01:28:10 -07004224 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004225
4226 /*
4227	 * Have we downed the interface? We handle IFF_UP ourselves
4228 * according to user attempts to set it, rather than blindly
4229 * setting it.
4230 */
4231
4232 ret = 0;
4233 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
Patrick McHardybd380812010-02-26 06:34:53 +00004234 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004235
4236 if (!ret)
Patrick McHardy4417da62007-06-27 01:28:10 -07004237 dev_set_rx_mode(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004238 }
4239
Linus Torvalds1da177e2005-04-16 15:20:36 -07004240 if ((flags ^ dev->gflags) & IFF_PROMISC) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004241 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4242
Linus Torvalds1da177e2005-04-16 15:20:36 -07004243 dev->gflags ^= IFF_PROMISC;
4244 dev_set_promiscuity(dev, inc);
4245 }
4246
4247 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4248	   is important. Some (broken) drivers set IFF_PROMISC when
4249	   IFF_ALLMULTI is requested, without asking us and without reporting.
4250 */
4251 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004252 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4253
Linus Torvalds1da177e2005-04-16 15:20:36 -07004254 dev->gflags ^= IFF_ALLMULTI;
4255 dev_set_allmulti(dev, inc);
4256 }
4257
Patrick McHardybd380812010-02-26 06:34:53 +00004258 return ret;
4259}
4260
4261void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4262{
4263 unsigned int changes = dev->flags ^ old_flags;
4264
4265 if (changes & IFF_UP) {
4266 if (dev->flags & IFF_UP)
4267 call_netdevice_notifiers(NETDEV_UP, dev);
4268 else
4269 call_netdevice_notifiers(NETDEV_DOWN, dev);
4270 }
4271
4272 if (dev->flags & IFF_UP &&
4273 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4274 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4275}
4276
4277/**
4278 * dev_change_flags - change device settings
4279 * @dev: device
4280 * @flags: device state flags
4281 *
4282 * Change settings on a device based on state flags. The flags are
4283 * in the userspace-exported format.
4284 */
4285int dev_change_flags(struct net_device *dev, unsigned flags)
4286{
4287 int ret, changes;
4288 int old_flags = dev->flags;
4289
4290 ret = __dev_change_flags(dev, flags);
4291 if (ret < 0)
4292 return ret;
4293
4294 changes = old_flags ^ dev->flags;
Thomas Graf7c355f52007-06-05 16:03:03 -07004295 if (changes)
4296 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004297
Patrick McHardybd380812010-02-26 06:34:53 +00004298 __dev_notify_flags(dev, old_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004299 return ret;
4300}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004301EXPORT_SYMBOL(dev_change_flags);
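
/*
 * Hypothetical usage sketch (editor's addition): bringing an interface up
 * from inside the kernel is just dev_change_flags() with IFF_UP set, under
 * RTNL, much as the SIOCSIFFLAGS path in dev_ifsioc() below does for
 * userspace. The function name is made up.
 */
#if 0	/* illustrative only */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}
#endif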
Linus Torvalds1da177e2005-04-16 15:20:36 -07004302
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004303/**
4304 * dev_set_mtu - Change maximum transfer unit
4305 * @dev: device
4306 * @new_mtu: new transfer unit
4307 *
4308 * Change the maximum transfer size of the network device.
4309 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004310int dev_set_mtu(struct net_device *dev, int new_mtu)
4311{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004312 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004313 int err;
4314
4315 if (new_mtu == dev->mtu)
4316 return 0;
4317
4318 /* MTU must be positive. */
4319 if (new_mtu < 0)
4320 return -EINVAL;
4321
4322 if (!netif_device_present(dev))
4323 return -ENODEV;
4324
4325 err = 0;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004326 if (ops->ndo_change_mtu)
4327 err = ops->ndo_change_mtu(dev, new_mtu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004328 else
4329 dev->mtu = new_mtu;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004330
Linus Torvalds1da177e2005-04-16 15:20:36 -07004331 if (!err && dev->flags & IFF_UP)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004332 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004333 return err;
4334}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004335EXPORT_SYMBOL(dev_set_mtu);
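
/*
 * Hypothetical usage sketch (editor's addition): a tunnel-style driver
 * shrinking its MTU to fit inside an underlying device. The overhead
 * constant and function name are made up.
 */
#if 0	/* illustrative only */
#define EXAMPLE_TUNNEL_OVERHEAD	40	/* hypothetical encapsulation cost */

static int example_fit_mtu(struct net_device *lower, struct net_device *tun)
{
	int err;

	rtnl_lock();				/* NETDEV_CHANGEMTU expects RTNL */
	err = dev_set_mtu(tun, lower->mtu - EXAMPLE_TUNNEL_OVERHEAD);
	rtnl_unlock();
	return err;
}
#endif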
Linus Torvalds1da177e2005-04-16 15:20:36 -07004336
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07004337/**
4338 * dev_set_mac_address - Change Media Access Control Address
4339 * @dev: device
4340 * @sa: new address
4341 *
4342 * Change the hardware (MAC) address of the device
4343 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004344int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4345{
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004346 const struct net_device_ops *ops = dev->netdev_ops;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004347 int err;
4348
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004349 if (!ops->ndo_set_mac_address)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004350 return -EOPNOTSUPP;
4351 if (sa->sa_family != dev->type)
4352 return -EINVAL;
4353 if (!netif_device_present(dev))
4354 return -ENODEV;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004355 err = ops->ndo_set_mac_address(dev, sa);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004356 if (!err)
Pavel Emelyanov056925a2007-09-16 15:42:43 -07004357 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004358 return err;
4359}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004360EXPORT_SYMBOL(dev_set_mac_address);
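
/*
 * Hypothetical usage sketch (editor's addition): the new address is passed
 * in a struct sockaddr whose sa_family must equal dev->type. The locally
 * administered address used here is made up.
 */
#if 0	/* illustrative only */
static int example_set_mac(struct net_device *dev)
{
	static const u8 addr[ETH_ALEN] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;		/* e.g. ARPHRD_ETHER */
	memcpy(sa.sa_data, addr, ETH_ALEN);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}
#endif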
Linus Torvalds1da177e2005-04-16 15:20:36 -07004361
4362/*
Eric Dumazet3710bec2009-11-01 19:42:09 +00004363 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
Linus Torvalds1da177e2005-04-16 15:20:36 -07004364 */
Jeff Garzik14e3e072007-10-08 00:06:32 -07004365static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004366{
4367 int err;
Eric Dumazet3710bec2009-11-01 19:42:09 +00004368 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004369
4370 if (!dev)
4371 return -ENODEV;
4372
4373 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004374 case SIOCGIFFLAGS: /* Get interface flags */
4375 ifr->ifr_flags = (short) dev_get_flags(dev);
4376 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004377
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004378 case SIOCGIFMETRIC: /* Get the metric on the interface
4379 (currently unused) */
4380 ifr->ifr_metric = 0;
4381 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004382
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004383 case SIOCGIFMTU: /* Get the MTU of a device */
4384 ifr->ifr_mtu = dev->mtu;
4385 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004386
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004387 case SIOCGIFHWADDR:
4388 if (!dev->addr_len)
4389 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4390 else
4391 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4392 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4393 ifr->ifr_hwaddr.sa_family = dev->type;
4394 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004395
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004396 case SIOCGIFSLAVE:
4397 err = -EINVAL;
4398 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004399
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004400 case SIOCGIFMAP:
4401 ifr->ifr_map.mem_start = dev->mem_start;
4402 ifr->ifr_map.mem_end = dev->mem_end;
4403 ifr->ifr_map.base_addr = dev->base_addr;
4404 ifr->ifr_map.irq = dev->irq;
4405 ifr->ifr_map.dma = dev->dma;
4406 ifr->ifr_map.port = dev->if_port;
4407 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004408
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004409 case SIOCGIFINDEX:
4410 ifr->ifr_ifindex = dev->ifindex;
4411 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004412
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004413 case SIOCGIFTXQLEN:
4414 ifr->ifr_qlen = dev->tx_queue_len;
4415 return 0;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004416
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004417 default:
4418 /* dev_ioctl() should ensure this case
4419 * is never reached
4420 */
4421 WARN_ON(1);
4422 err = -EINVAL;
4423 break;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004424
4425 }
4426 return err;
4427}
4428
4429/*
4430 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4431 */
4432static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4433{
4434 int err;
4435 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004436 const struct net_device_ops *ops;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004437
4438 if (!dev)
4439 return -ENODEV;
4440
Jarek Poplawski5f2f6da2008-12-22 19:35:28 -08004441 ops = dev->netdev_ops;
4442
Jeff Garzik14e3e072007-10-08 00:06:32 -07004443 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004444 case SIOCSIFFLAGS: /* Set interface flags */
4445 return dev_change_flags(dev, ifr->ifr_flags);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004446
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004447 case SIOCSIFMETRIC: /* Set the metric on the interface
4448 (currently unused) */
4449 return -EOPNOTSUPP;
Jeff Garzik14e3e072007-10-08 00:06:32 -07004450
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004451 case SIOCSIFMTU: /* Set the MTU of a device */
4452 return dev_set_mtu(dev, ifr->ifr_mtu);
Jeff Garzik14e3e072007-10-08 00:06:32 -07004453
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004454 case SIOCSIFHWADDR:
4455 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004456
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004457 case SIOCSIFHWBROADCAST:
4458 if (ifr->ifr_hwaddr.sa_family != dev->type)
4459 return -EINVAL;
4460 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4461 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4462 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4463 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004464
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004465 case SIOCSIFMAP:
4466 if (ops->ndo_set_config) {
4467 if (!netif_device_present(dev))
4468 return -ENODEV;
4469 return ops->ndo_set_config(dev, &ifr->ifr_map);
4470 }
4471 return -EOPNOTSUPP;
4472
4473 case SIOCADDMULTI:
4474 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4475 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4476 return -EINVAL;
4477 if (!netif_device_present(dev))
4478 return -ENODEV;
Jiri Pirko22bedad2010-04-01 21:22:57 +00004479 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004480
4481 case SIOCDELMULTI:
4482 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4483 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4484 return -EINVAL;
4485 if (!netif_device_present(dev))
4486 return -ENODEV;
Jiri Pirko22bedad2010-04-01 21:22:57 +00004487 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004488
4489 case SIOCSIFTXQLEN:
4490 if (ifr->ifr_qlen < 0)
4491 return -EINVAL;
4492 dev->tx_queue_len = ifr->ifr_qlen;
4493 return 0;
4494
4495 case SIOCSIFNAME:
4496 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4497 return dev_change_name(dev, ifr->ifr_newname);
4498
4499 /*
4500 * Unknown or private ioctl
4501 */
4502 default:
4503 if ((cmd >= SIOCDEVPRIVATE &&
4504 cmd <= SIOCDEVPRIVATE + 15) ||
4505 cmd == SIOCBONDENSLAVE ||
4506 cmd == SIOCBONDRELEASE ||
4507 cmd == SIOCBONDSETHWADDR ||
4508 cmd == SIOCBONDSLAVEINFOQUERY ||
4509 cmd == SIOCBONDINFOQUERY ||
4510 cmd == SIOCBONDCHANGEACTIVE ||
4511 cmd == SIOCGMIIPHY ||
4512 cmd == SIOCGMIIREG ||
4513 cmd == SIOCSMIIREG ||
4514 cmd == SIOCBRADDIF ||
4515 cmd == SIOCBRDELIF ||
4516 cmd == SIOCSHWTSTAMP ||
4517 cmd == SIOCWANDEV) {
4518 err = -EOPNOTSUPP;
4519 if (ops->ndo_do_ioctl) {
4520 if (netif_device_present(dev))
4521 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4522 else
4523 err = -ENODEV;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004524 }
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004525 } else
4526 err = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004527
4528 }
4529 return err;
4530}
4531
4532/*
4533 * This function handles all "interface"-type I/O control requests. The actual
4534 * 'doing' part of this is dev_ifsioc above.
4535 */
4536
4537/**
4538 * dev_ioctl - network device ioctl
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004539 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004540 * @cmd: command to issue
4541 * @arg: pointer to a struct ifreq in user space
4542 *
4543 * Issue ioctl functions to devices. This is normally called by the
4544 * user space syscall interfaces but can sometimes be useful for
4545 * other purposes. The return value is the return from the syscall if
4546 * positive or a negative errno code on error.
4547 */
4548
Eric W. Biederman881d9662007-09-17 11:56:21 -07004549int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004550{
4551 struct ifreq ifr;
4552 int ret;
4553 char *colon;
4554
4555	/* One special case: SIOCGIFCONF takes an ifconf argument
4556	   and requires a shared lock, because it sleeps writing
4557 to user space.
4558 */
4559
4560 if (cmd == SIOCGIFCONF) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004561 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004562 ret = dev_ifconf(net, (char __user *) arg);
Stephen Hemminger6756ae42006-03-20 22:23:58 -08004563 rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004564 return ret;
4565 }
4566 if (cmd == SIOCGIFNAME)
Eric W. Biederman881d9662007-09-17 11:56:21 -07004567 return dev_ifname(net, (struct ifreq __user *)arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004568
4569 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4570 return -EFAULT;
4571
4572 ifr.ifr_name[IFNAMSIZ-1] = 0;
4573
4574 colon = strchr(ifr.ifr_name, ':');
4575 if (colon)
4576 *colon = 0;
4577
4578 /*
4579 * See which interface the caller is talking about.
4580 */
4581
4582 switch (cmd) {
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004583 /*
4584 * These ioctl calls:
4585 * - can be done by all.
4586 * - atomic and do not require locking.
4587 * - return a value
4588 */
4589 case SIOCGIFFLAGS:
4590 case SIOCGIFMETRIC:
4591 case SIOCGIFMTU:
4592 case SIOCGIFHWADDR:
4593 case SIOCGIFSLAVE:
4594 case SIOCGIFMAP:
4595 case SIOCGIFINDEX:
4596 case SIOCGIFTXQLEN:
4597 dev_load(net, ifr.ifr_name);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004598 rcu_read_lock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004599 ret = dev_ifsioc_locked(net, &ifr, cmd);
Eric Dumazet3710bec2009-11-01 19:42:09 +00004600 rcu_read_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004601 if (!ret) {
4602 if (colon)
4603 *colon = ':';
4604 if (copy_to_user(arg, &ifr,
4605 sizeof(struct ifreq)))
4606 ret = -EFAULT;
4607 }
4608 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004609
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004610 case SIOCETHTOOL:
4611 dev_load(net, ifr.ifr_name);
4612 rtnl_lock();
4613 ret = dev_ethtool(net, &ifr);
4614 rtnl_unlock();
4615 if (!ret) {
4616 if (colon)
4617 *colon = ':';
4618 if (copy_to_user(arg, &ifr,
4619 sizeof(struct ifreq)))
4620 ret = -EFAULT;
4621 }
4622 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004623
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004624 /*
4625 * These ioctl calls:
4626 * - require superuser power.
4627 * - require strict serialization.
4628 * - return a value
4629 */
4630 case SIOCGMIIPHY:
4631 case SIOCGMIIREG:
4632 case SIOCSIFNAME:
4633 if (!capable(CAP_NET_ADMIN))
4634 return -EPERM;
4635 dev_load(net, ifr.ifr_name);
4636 rtnl_lock();
4637 ret = dev_ifsioc(net, &ifr, cmd);
4638 rtnl_unlock();
4639 if (!ret) {
4640 if (colon)
4641 *colon = ':';
4642 if (copy_to_user(arg, &ifr,
4643 sizeof(struct ifreq)))
4644 ret = -EFAULT;
4645 }
4646 return ret;
4647
4648 /*
4649 * These ioctl calls:
4650 * - require superuser power.
4651 * - require strict serialization.
4652 * - do not return a value
4653 */
4654 case SIOCSIFFLAGS:
4655 case SIOCSIFMETRIC:
4656 case SIOCSIFMTU:
4657 case SIOCSIFMAP:
4658 case SIOCSIFHWADDR:
4659 case SIOCSIFSLAVE:
4660 case SIOCADDMULTI:
4661 case SIOCDELMULTI:
4662 case SIOCSIFHWBROADCAST:
4663 case SIOCSIFTXQLEN:
4664 case SIOCSMIIREG:
4665 case SIOCBONDENSLAVE:
4666 case SIOCBONDRELEASE:
4667 case SIOCBONDSETHWADDR:
4668 case SIOCBONDCHANGEACTIVE:
4669 case SIOCBRADDIF:
4670 case SIOCBRDELIF:
4671 case SIOCSHWTSTAMP:
4672 if (!capable(CAP_NET_ADMIN))
4673 return -EPERM;
4674 /* fall through */
4675 case SIOCBONDSLAVEINFOQUERY:
4676 case SIOCBONDINFOQUERY:
4677 dev_load(net, ifr.ifr_name);
4678 rtnl_lock();
4679 ret = dev_ifsioc(net, &ifr, cmd);
4680 rtnl_unlock();
4681 return ret;
4682
4683 case SIOCGIFMEM:
4684	/* Get the per-device memory space. We can add this but
4685 * currently do not support it */
4686 case SIOCSIFMEM:
4687	/* Set the per-device memory buffer space.
4688 * Not applicable in our case */
4689 case SIOCSIFLINK:
4690 return -EINVAL;
4691
4692 /*
4693 * Unknown or private ioctl.
4694 */
4695 default:
4696 if (cmd == SIOCWANDEV ||
4697 (cmd >= SIOCDEVPRIVATE &&
4698 cmd <= SIOCDEVPRIVATE + 15)) {
Eric W. Biederman881d9662007-09-17 11:56:21 -07004699 dev_load(net, ifr.ifr_name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004700 rtnl_lock();
Eric W. Biederman881d9662007-09-17 11:56:21 -07004701 ret = dev_ifsioc(net, &ifr, cmd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004702 rtnl_unlock();
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004703 if (!ret && copy_to_user(arg, &ifr,
Linus Torvalds1da177e2005-04-16 15:20:36 -07004704 sizeof(struct ifreq)))
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004705 ret = -EFAULT;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004706 return ret;
Eric Dumazetd1b19df2009-09-03 01:29:39 -07004707 }
4708 /* Take care of Wireless Extensions */
4709 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4710 return wext_handle_ioctl(net, &ifr, cmd, arg);
4711 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004712 }
4713}
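
/*
 * Hypothetical usage sketch (editor's addition): from userspace the whole
 * path above is reached with an ioctl on any socket. This is ordinary
 * userspace C, not kernel code; the interface name "eth0" is an assumption.
 */
#if 0	/* illustrative only — userspace program */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
	if (ioctl(fd, SIOCGIFMTU, &ifr) == 0)	/* lands in dev_ifsioc_locked() */
		printf("mtu=%d\n", ifr.ifr_mtu);
	close(fd);
	return 0;
}
#endif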
4714
4715
4716/**
4717 * dev_new_index - allocate an ifindex
Randy Dunlapc4ea43c2007-10-12 21:17:49 -07004718 * @net: the applicable net namespace
Linus Torvalds1da177e2005-04-16 15:20:36 -07004719 *
4720 * Returns a suitable unique value for a new device interface
4721 * number. The caller must hold the rtnl semaphore or the
4722 * dev_base_lock to be sure it remains unique.
4723 */
Eric W. Biederman881d9662007-09-17 11:56:21 -07004724static int dev_new_index(struct net *net)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004725{
4726 static int ifindex;
4727 for (;;) {
4728 if (++ifindex <= 0)
4729 ifindex = 1;
Eric W. Biederman881d9662007-09-17 11:56:21 -07004730 if (!__dev_get_by_index(net, ifindex))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004731 return ifindex;
4732 }
4733}
4734
Linus Torvalds1da177e2005-04-16 15:20:36 -07004735/* Delayed registration/unregistration */
Denis Cheng3b5b34f2007-12-07 00:49:17 -08004736static LIST_HEAD(net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004737
Stephen Hemminger6f05f622007-03-08 20:46:03 -08004738static void net_set_todo(struct net_device *dev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004739{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004740 list_add_tail(&dev->todo_list, &net_todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004741}
4742
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004743static void rollback_registered_many(struct list_head *head)
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004744{
Krishna Kumare93737b2009-12-08 22:26:02 +00004745 struct net_device *dev, *tmp;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004746
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004747 BUG_ON(dev_boot_phase);
4748 ASSERT_RTNL();
4749
Krishna Kumare93737b2009-12-08 22:26:02 +00004750 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004751		/* Some devices get here without ever having been
Krishna Kumare93737b2009-12-08 22:26:02 +00004752		 * registered, e.g. during initialization unwind. Remove
4753		 * those devices and proceed with the remaining.
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004754 */
4755 if (dev->reg_state == NETREG_UNINITIALIZED) {
4756 pr_debug("unregister_netdevice: device %s/%p never "
4757 "was registered\n", dev->name, dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004758
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004759 WARN_ON(1);
Krishna Kumare93737b2009-12-08 22:26:02 +00004760 list_del(&dev->unreg_list);
4761 continue;
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004762 }
4763
4764 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4765
4766 /* If device is running, close it first. */
4767 dev_close(dev);
4768
4769 /* And unlink it from device chain. */
4770 unlist_netdevice(dev);
4771
4772 dev->reg_state = NETREG_UNREGISTERING;
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004773 }
4774
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004775 synchronize_net();
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004776
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004777 list_for_each_entry(dev, head, unreg_list) {
4778 /* Shutdown queueing discipline. */
4779 dev_shutdown(dev);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004780
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004781
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004782		/* Notify protocols that we are about to destroy
4783		   this device. They should clean up all of their state.
4784 */
4785 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4786
Patrick McHardya2835762010-02-26 06:34:51 +00004787 if (!dev->rtnl_link_ops ||
4788 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4789 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4790
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004791 /*
4792 * Flush the unicast and multicast chains
4793 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00004794 dev_uc_flush(dev);
Jiri Pirko22bedad2010-04-01 21:22:57 +00004795 dev_mc_flush(dev);
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004796
4797 if (dev->netdev_ops->ndo_uninit)
4798 dev->netdev_ops->ndo_uninit(dev);
4799
4800 /* Notifier chain MUST detach us from master device. */
4801 WARN_ON(dev->master);
4802
4803 /* Remove entries from kobject tree */
4804 netdev_unregister_kobject(dev);
4805 }
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004806
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004807 /* Process any work delayed until the end of the batch */
stephen hemmingere5e26d72010-02-24 14:01:38 +00004808 dev = list_first_entry(head, struct net_device, unreg_list);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004809 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
4810
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004811 synchronize_net();
4812
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00004813 list_for_each_entry(dev, head, unreg_list)
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004814 dev_put(dev);
4815}
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004816
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004817static void rollback_registered(struct net_device *dev)
4818{
4819 LIST_HEAD(single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004820
Eric Dumazet9b5e3832009-10-27 07:04:19 +00004821 list_add(&dev->unreg_list, &single);
4822 rollback_registered_many(&single);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07004823}
4824
David S. Millere8a04642008-07-17 00:34:19 -07004825static void __netdev_init_queue_locks_one(struct net_device *dev,
4826 struct netdev_queue *dev_queue,
4827 void *_unused)
David S. Millerc773e842008-07-08 23:13:53 -07004828{
4829 spin_lock_init(&dev_queue->_xmit_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004830 netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
David S. Millerc773e842008-07-08 23:13:53 -07004831 dev_queue->xmit_lock_owner = -1;
4832}
4833
4834static void netdev_init_queue_locks(struct net_device *dev)
4835{
David S. Millere8a04642008-07-17 00:34:19 -07004836 netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4837 __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
David S. Millerc773e842008-07-08 23:13:53 -07004838}
4839
Herbert Xub63365a2008-10-23 01:11:29 -07004840unsigned long netdev_fix_features(unsigned long features, const char *name)
4841{
4842 /* Fix illegal SG+CSUM combinations. */
4843 if ((features & NETIF_F_SG) &&
4844 !(features & NETIF_F_ALL_CSUM)) {
4845 if (name)
4846 printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4847 "checksum feature.\n", name);
4848 features &= ~NETIF_F_SG;
4849 }
4850
4851 /* TSO requires that SG is present as well. */
4852 if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4853 if (name)
4854 printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4855 "SG feature.\n", name);
4856 features &= ~NETIF_F_TSO;
4857 }
4858
4859 if (features & NETIF_F_UFO) {
4860 if (!(features & NETIF_F_GEN_CSUM)) {
4861 if (name)
4862 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4863 "since no NETIF_F_HW_CSUM feature.\n",
4864 name);
4865 features &= ~NETIF_F_UFO;
4866 }
4867
4868 if (!(features & NETIF_F_SG)) {
4869 if (name)
4870 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4871 "since no NETIF_F_SG feature.\n", name);
4872 features &= ~NETIF_F_UFO;
4873 }
4874 }
4875
4876 return features;
4877}
4878EXPORT_SYMBOL(netdev_fix_features);
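
/*
 * Hypothetical usage sketch (editor's addition): a driver can sanitize its
 * advertised feature mask before registration; here TSO without SG is an
 * invalid combination that netdev_fix_features() drops with a notice.
 */
#if 0	/* illustrative only */
static void example_sanitize_features(struct net_device *dev)
{
	dev->features = NETIF_F_TSO | NETIF_F_HW_CSUM;	/* TSO but no SG */
	dev->features = netdev_fix_features(dev->features, dev->name);
	/* NETIF_F_TSO has now been cleared and a KERN_NOTICE logged */
}
#endif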
4879
Linus Torvalds1da177e2005-04-16 15:20:36 -07004880/**
Patrick Mullaneyfc4a7482009-12-03 15:59:22 -08004881 * netif_stacked_transfer_operstate - transfer operstate
4882 * @rootdev: the root or lower level device to transfer state from
4883 * @dev: the device to transfer operstate to
4884 *
4885 * Transfer operational state from root to device. This is normally
4886 * called when a stacking relationship exists between the root
4887 * device and the device (a leaf device).
4888 */
4889void netif_stacked_transfer_operstate(const struct net_device *rootdev,
4890 struct net_device *dev)
4891{
4892 if (rootdev->operstate == IF_OPER_DORMANT)
4893 netif_dormant_on(dev);
4894 else
4895 netif_dormant_off(dev);
4896
4897 if (netif_carrier_ok(rootdev)) {
4898 if (!netif_carrier_ok(dev))
4899 netif_carrier_on(dev);
4900 } else {
4901 if (netif_carrier_ok(dev))
4902 netif_carrier_off(dev);
4903 }
4904}
4905EXPORT_SYMBOL(netif_stacked_transfer_operstate);
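
/*
 * Hypothetical usage sketch (editor's addition): a VLAN-like stacked driver
 * mirroring the lower device's state into its upper device from a netdev
 * notifier. example_find_upper() is a made-up lookup helper.
 */
#if 0	/* illustrative only */
static int example_notifier(struct notifier_block *nb,
			    unsigned long event, void *ptr)
{
	struct net_device *lower = ptr;
	struct net_device *upper = example_find_upper(lower);

	if (upper && event == NETDEV_CHANGE)
		netif_stacked_transfer_operstate(lower, upper);
	return NOTIFY_DONE;
}
#endif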
4906
4907/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07004908 * register_netdevice - register a network device
4909 * @dev: device to register
4910 *
4911 * Take a completed network device structure and add it to the kernel
4912 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4913 * chain. 0 is returned on success. A negative errno code is returned
4914 * on a failure to set up the device, or if the name is a duplicate.
4915 *
4916 * Callers must hold the rtnl semaphore. You may want
4917 * register_netdev() instead of this.
4918 *
4919 * BUGS:
4920 * The locking appears insufficient to guarantee two parallel registers
4921 * will not get the same name.
4922 */
4923
4924int register_netdevice(struct net_device *dev)
4925{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004926 int ret;
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004927 struct net *net = dev_net(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004928
4929 BUG_ON(dev_boot_phase);
4930 ASSERT_RTNL();
4931
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07004932 might_sleep();
4933
Linus Torvalds1da177e2005-04-16 15:20:36 -07004934 /* When net_device's are persistent, this will be fatal. */
4935 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004936 BUG_ON(!net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004937
David S. Millerf1f28aa2008-07-15 00:08:33 -07004938 spin_lock_init(&dev->addr_list_lock);
David S. Millercf508b12008-07-22 14:16:42 -07004939 netdev_set_addr_lockdep_class(dev);
David S. Millerc773e842008-07-08 23:13:53 -07004940 netdev_init_queue_locks(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004941
Linus Torvalds1da177e2005-04-16 15:20:36 -07004942 dev->iflink = -1;
4943
Eric Dumazetdf334542010-03-24 19:13:54 +00004944#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00004945 if (!dev->num_rx_queues) {
4946 /*
4947 * Allocate a single RX queue if driver never called
4948 * alloc_netdev_mq
4949 */
4950
4951 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
4952 if (!dev->_rx) {
4953 ret = -ENOMEM;
4954 goto out;
4955 }
4956
4957 dev->_rx->first = dev->_rx;
4958 atomic_set(&dev->_rx->count, 1);
4959 dev->num_rx_queues = 1;
4960 }
Eric Dumazetdf334542010-03-24 19:13:54 +00004961#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004962 /* Init, if this function is available */
Stephen Hemmingerd3147742008-11-19 21:32:24 -08004963 if (dev->netdev_ops->ndo_init) {
4964 ret = dev->netdev_ops->ndo_init(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004965 if (ret) {
4966 if (ret > 0)
4967 ret = -EIO;
Adrian Bunk90833aa2006-11-13 16:02:22 -08004968 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004969 }
4970 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09004971
Daniel Lezcano8ce6ceb2010-05-19 10:12:19 +00004972 ret = dev_get_valid_name(dev, dev->name, 0);
Octavian Purdilad9031022009-11-18 02:36:59 +00004973 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07004974 goto err_uninit;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004975
Eric W. Biederman881d9662007-09-17 11:56:21 -07004976 dev->ifindex = dev_new_index(net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004977 if (dev->iflink == -1)
4978 dev->iflink = dev->ifindex;
4979
Stephen Hemmingerd212f872007-06-27 00:47:37 -07004980 /* Fix illegal checksum combinations */
4981 if ((dev->features & NETIF_F_HW_CSUM) &&
4982 (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4983 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4984 dev->name);
4985 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4986 }
4987
4988 if ((dev->features & NETIF_F_NO_CSUM) &&
4989 (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4990 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4991 dev->name);
4992 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4993 }
4994
Herbert Xub63365a2008-10-23 01:11:29 -07004995 dev->features = netdev_fix_features(dev->features, dev->name);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004996
Lennert Buytenheke5a4a722008-08-03 01:23:10 -07004997 /* Enable software GSO if SG is supported. */
4998 if (dev->features & NETIF_F_SG)
4999 dev->features |= NETIF_F_GSO;
5000
Daniel Lezcanoaaf8cdc2008-05-02 17:00:58 -07005001 netdev_initialize_kobject(dev);
Johannes Berg7ffbe3f2009-10-02 05:15:27 +00005002
5003 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5004 ret = notifier_to_errno(ret);
5005 if (ret)
5006 goto err_uninit;
5007
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005008 ret = netdev_register_kobject(dev);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005009 if (ret)
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005010 goto err_uninit;
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005011 dev->reg_state = NETREG_REGISTERED;
5012
Linus Torvalds1da177e2005-04-16 15:20:36 -07005013 /*
5014	 *	Default initial state at registration is that the
5015 * device is present.
5016 */
5017
5018 set_bit(__LINK_STATE_PRESENT, &dev->state);
5019
Linus Torvalds1da177e2005-04-16 15:20:36 -07005020 dev_init_scheduler(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005021 dev_hold(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005022 list_netdevice(dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005023
5024	/* Notify protocols that a new device appeared. */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005025 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
Herbert Xufcc5a032007-07-30 17:03:38 -07005026 ret = notifier_to_errno(ret);
Daniel Lezcano93ee31f2007-10-30 15:38:18 -07005027 if (ret) {
5028 rollback_registered(dev);
5029 dev->reg_state = NETREG_UNREGISTERED;
5030 }
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005031 /*
5032 * Prevent userspace races by waiting until the network
5033	 * device is fully set up before sending notifications.
5034 */
Patrick McHardya2835762010-02-26 06:34:51 +00005035 if (!dev->rtnl_link_ops ||
5036 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5037 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005038
5039out:
5040 return ret;
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005041
5042err_uninit:
Stephen Hemmingerd3147742008-11-19 21:32:24 -08005043 if (dev->netdev_ops->ndo_uninit)
5044 dev->netdev_ops->ndo_uninit(dev);
Herbert Xu7ce1b0e2007-07-30 16:29:40 -07005045 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005046}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005047EXPORT_SYMBOL(register_netdevice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005048
5049/**
Benjamin Herrenschmidt937f1ba2009-01-14 21:05:05 -08005050 * init_dummy_netdev - init a dummy network device for NAPI
5051 * @dev: device to init
5052 *
5053 * This takes a network device structure and initializes the minimum
5054 * number of fields so it can be used to schedule NAPI polls without
5055 * registering a full-blown interface. This is to be used by drivers
5056 * that need to tie several hardware interfaces to a single NAPI
5057 * poll scheduler due to HW limitations.
5058 */
5059int init_dummy_netdev(struct net_device *dev)
5060{
5061 /* Clear everything. Note we don't initialize spinlocks
5062	 * as they aren't supposed to be taken by any of the
5063 * NAPI code and this dummy netdev is supposed to be
5064 * only ever used for NAPI polls
5065 */
5066 memset(dev, 0, sizeof(struct net_device));
5067
5068 /* make sure we BUG if trying to hit standard
5069 * register/unregister code path
5070 */
5071 dev->reg_state = NETREG_DUMMY;
5072
5073 /* initialize the ref count */
5074 atomic_set(&dev->refcnt, 1);
5075
5076 /* NAPI wants this */
5077 INIT_LIST_HEAD(&dev->napi_list);
5078
5079 /* a dummy interface is started by default */
5080 set_bit(__LINK_STATE_PRESENT, &dev->state);
5081 set_bit(__LINK_STATE_START, &dev->state);
5082
5083 return 0;
5084}
5085EXPORT_SYMBOL_GPL(init_dummy_netdev);
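
/*
 * Hypothetical usage sketch (editor's addition): a driver with one piece of
 * hardware but several NAPI contexts can anchor them all on a dummy netdev.
 * The adapter struct and poll weight are made up.
 */
#if 0	/* illustrative only */
struct example_adapter {
	struct net_device napi_dev;	/* never registered; NAPI anchor only */
	struct napi_struct napi;
};

static void example_setup_napi(struct example_adapter *ad,
			       int (*poll)(struct napi_struct *, int))
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, poll, 64);
}
#endif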
5086
5087
5088/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005089 * register_netdev - register a network device
5090 * @dev: device to register
5091 *
5092 * Take a completed network device structure and add it to the kernel
5093 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5094 * chain. 0 is returned on success. A negative errno code is returned
5095 * on a failure to set up the device, or if the name is a duplicate.
5096 *
Borislav Petkov38b4da32007-04-20 22:14:10 -07005097 * This is a wrapper around register_netdevice that takes the rtnl semaphore
Linus Torvalds1da177e2005-04-16 15:20:36 -07005098 * and expands the device name if you passed a format string to
5099 * alloc_netdev.
5100 */
5101int register_netdev(struct net_device *dev)
5102{
5103 int err;
5104
5105 rtnl_lock();
5106
5107 /*
5108 * If the name is a format string the caller wants us to do a
5109 * name allocation.
5110 */
5111 if (strchr(dev->name, '%')) {
5112 err = dev_alloc_name(dev, dev->name);
5113 if (err < 0)
5114 goto out;
5115 }
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005116
Linus Torvalds1da177e2005-04-16 15:20:36 -07005117 err = register_netdevice(dev);
5118out:
5119 rtnl_unlock();
5120 return err;
5121}
5122EXPORT_SYMBOL(register_netdev);
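
/*
 * Hypothetical usage sketch (editor's addition): the canonical probe-time
 * sequence built on the registration machinery above. The priv struct and
 * ops table are made up.
 */
#if 0	/* illustrative only */
struct example_priv {
	int dummy;
};

static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &example_netdev_ops;	/* hypothetical ops table */

	err = register_netdev(dev);		/* takes rtnl, expands "eth%d" */
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}
#endif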
5123
5124/*
5125 * netdev_wait_allrefs - wait until all references are gone.
5126 *
5127 * This is called when unregistering network devices.
5128 *
5129 * Any protocol or device that holds a reference should register
5130 * for netdevice notification, and clean up and put back the
5131 * reference if they receive an UNREGISTER event.
5132 * We can get stuck here if buggy protocols don't correctly
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005133 * call dev_put.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005134 */
5135static void netdev_wait_allrefs(struct net_device *dev)
5136{
5137 unsigned long rebroadcast_time, warning_time;
5138
Eric Dumazete014deb2009-11-17 05:59:21 +00005139 linkwatch_forget_dev(dev);
5140
Linus Torvalds1da177e2005-04-16 15:20:36 -07005141 rebroadcast_time = warning_time = jiffies;
5142 while (atomic_read(&dev->refcnt) != 0) {
5143 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005144 rtnl_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005145
5146 /* Rebroadcast unregister notification */
Pavel Emelyanov056925a2007-09-16 15:42:43 -07005147 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005148 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
Octavian Purdila395264d2009-11-16 13:49:35 +00005149	 * should have already handled it the first time */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005150
5151 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5152 &dev->state)) {
5153 /* We must not have linkwatch events
5154 * pending on unregister. If this
5155 * happens, we simply run the queue
5156 * unscheduled, resulting in a noop
5157 * for this device.
5158 */
5159 linkwatch_run_queue();
5160 }
5161
Stephen Hemminger6756ae42006-03-20 22:23:58 -08005162 __rtnl_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005163
5164 rebroadcast_time = jiffies;
5165 }
5166
5167 msleep(250);
5168
5169 if (time_after(jiffies, warning_time + 10 * HZ)) {
5170 printk(KERN_EMERG "unregister_netdevice: "
5171 "waiting for %s to become free. Usage "
5172 "count = %d\n",
5173 dev->name, atomic_read(&dev->refcnt));
5174 warning_time = jiffies;
5175 }
5176 }
5177}
5178
5179/* The sequence is:
5180 *
5181 * rtnl_lock();
5182 * ...
5183 * register_netdevice(x1);
5184 * register_netdevice(x2);
5185 * ...
5186 * unregister_netdevice(y1);
5187 * unregister_netdevice(y2);
5188 * ...
5189 * rtnl_unlock();
5190 * free_netdev(y1);
5191 * free_netdev(y2);
5192 *
Herbert Xu58ec3b42008-10-07 15:50:03 -07005193 * We are invoked by rtnl_unlock().
Linus Torvalds1da177e2005-04-16 15:20:36 -07005194 * This allows us to deal with problems:
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005195 * 1) We can delete sysfs objects which invoke hotplug
Linus Torvalds1da177e2005-04-16 15:20:36 -07005196 * without deadlocking with linkwatch via keventd.
5197 * 2) Since we run with the RTNL semaphore not held, we can sleep
5198 * safely in order to wait for the netdev refcnt to drop to zero.
Herbert Xu58ec3b42008-10-07 15:50:03 -07005199 *
5200 * We must not return until all unregister events added during
5201 * the interval the lock was held have been completed.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005202 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005203void netdev_run_todo(void)
5204{
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005205 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005206
Linus Torvalds1da177e2005-04-16 15:20:36 -07005207 /* Snapshot list, allow later requests */
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005208 list_replace_init(&net_todo_list, &list);
Herbert Xu58ec3b42008-10-07 15:50:03 -07005209
5210 __rtnl_unlock();
Oleg Nesterov626ab0e2006-06-23 02:05:55 -07005211
Linus Torvalds1da177e2005-04-16 15:20:36 -07005212 while (!list_empty(&list)) {
5213 struct net_device *dev
stephen hemmingere5e26d72010-02-24 14:01:38 +00005214 = list_first_entry(&list, struct net_device, todo_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005215 list_del(&dev->todo_list);
5216
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005217 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005218 printk(KERN_ERR "network todo '%s' but state %d\n",
5219 dev->name, dev->reg_state);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005220 dump_stack();
5221 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005222 }
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005223
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005224 dev->reg_state = NETREG_UNREGISTERED;
5225
Changli Gao152102c2010-03-30 20:16:22 +00005226 on_each_cpu(flush_backlog, dev, 1);
Stephen Hemminger6e583ce2008-08-03 21:29:57 -07005227
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005228 netdev_wait_allrefs(dev);
5229
5230 /* paranoia */
5231 BUG_ON(atomic_read(&dev->refcnt));
Ilpo Järvinen547b7922008-07-25 21:43:18 -07005232 WARN_ON(dev->ip_ptr);
5233 WARN_ON(dev->ip6_ptr);
5234 WARN_ON(dev->dn_ptr);
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005235
Stephen Hemmingerb17a7c12006-05-10 13:21:17 -07005236 if (dev->destructor)
5237 dev->destructor(dev);
Stephen Hemminger9093bbb2007-05-19 15:39:25 -07005238
5239 /* Free network device */
5240 kobject_put(&dev->dev.kobj);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005241 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005242}
5243
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005244/**
Eric Dumazetd83345a2009-11-16 03:36:51 +00005245 * dev_txq_stats_fold - fold tx_queues stats
5246 * @dev: device to get statistics from
5247 * @stats: struct net_device_stats to hold results
5248 */
5249void dev_txq_stats_fold(const struct net_device *dev,
5250 struct net_device_stats *stats)
5251{
5252 unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5253 unsigned int i;
5254 struct netdev_queue *txq;
5255
5256 for (i = 0; i < dev->num_tx_queues; i++) {
5257 txq = netdev_get_tx_queue(dev, i);
5258 tx_bytes += txq->tx_bytes;
5259 tx_packets += txq->tx_packets;
5260 tx_dropped += txq->tx_dropped;
5261 }
5262 if (tx_bytes || tx_packets || tx_dropped) {
5263 stats->tx_bytes = tx_bytes;
5264 stats->tx_packets = tx_packets;
5265 stats->tx_dropped = tx_dropped;
5266 }
5267}
5268EXPORT_SYMBOL(dev_txq_stats_fold);
5269
5270/**
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005271 * dev_get_stats - get network device statistics
5272 * @dev: device to get statistics from
5273 *
5274 * Get network statistics from device. The device driver may provide
5275 * its own method by setting dev->netdev_ops->get_stats; otherwise
5276 * the internal statistics structure is used.
5277 */
5278const struct net_device_stats *dev_get_stats(struct net_device *dev)
Eric Dumazet7004bf22009-05-18 00:34:33 +00005279{
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005280 const struct net_device_ops *ops = dev->netdev_ops;
5281
5282 if (ops->ndo_get_stats)
5283 return ops->ndo_get_stats(dev);
Eric Dumazet7004bf22009-05-18 00:34:33 +00005284
Eric Dumazetd83345a2009-11-16 03:36:51 +00005285 dev_txq_stats_fold(dev, &dev->stats);
5286 return &dev->stats;
Rusty Russellc45d2862007-03-28 14:29:08 -07005287}
Stephen Hemmingereeda3fd2008-11-19 21:40:23 -08005288EXPORT_SYMBOL(dev_get_stats);
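
/*
 * Hypothetical usage sketch (editor's addition): readers go through
 * dev_get_stats(), which either calls the driver's ndo_get_stats or folds
 * the per-queue TX counters into dev->stats as above. Name is made up.
 */
#if 0	/* illustrative only */
static void example_log_stats(struct net_device *dev)
{
	const struct net_device_stats *stats = dev_get_stats(dev);

	printk(KERN_DEBUG "%s: rx %lu pkts, tx %lu pkts\n",
	       dev->name, stats->rx_packets, stats->tx_packets);
}
#endif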
Rusty Russellc45d2862007-03-28 14:29:08 -07005289
David S. Millerdc2b4842008-07-08 17:18:23 -07005290static void netdev_init_one_queue(struct net_device *dev,
David S. Millere8a04642008-07-17 00:34:19 -07005291 struct netdev_queue *queue,
5292 void *_unused)
David S. Millerdc2b4842008-07-08 17:18:23 -07005293{
David S. Millerdc2b4842008-07-08 17:18:23 -07005294 queue->dev = dev;
5295}
5296
David S. Millerbb949fb2008-07-08 16:55:56 -07005297static void netdev_init_queues(struct net_device *dev)
5298{
David S. Millere8a04642008-07-17 00:34:19 -07005299 netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5300 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
David S. Millerc3f26a22008-07-31 16:58:50 -07005301 spin_lock_init(&dev->tx_global_lock);
David S. Millerbb949fb2008-07-08 16:55:56 -07005302}
5303
Linus Torvalds1da177e2005-04-16 15:20:36 -07005304/**
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005305 * alloc_netdev_mq - allocate network device
Linus Torvalds1da177e2005-04-16 15:20:36 -07005306 * @sizeof_priv: size of private data to allocate space for
5307 * @name: device name format string
5308 * @setup: callback to initialize device
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005309 * @queue_count: the number of subqueues to allocate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005310 *
5311 * Allocates a struct net_device with private data area for driver use
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005312 * and performs basic initialization. Also allocates subqueue structs
5313 * for each queue on the device at the end of the netdevice.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005314 */
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005315struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5316 void (*setup)(struct net_device *), unsigned int queue_count)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005317{
David S. Millere8a04642008-07-17 00:34:19 -07005318 struct netdev_queue *tx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005319 struct net_device *dev;
Stephen Hemminger79439862008-07-21 13:28:44 -07005320 size_t alloc_size;
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005321 struct net_device *p;
Eric Dumazetdf334542010-03-24 19:13:54 +00005322#ifdef CONFIG_RPS
5323 struct netdev_rx_queue *rx;
Tom Herbert0a9627f2010-03-16 08:03:29 +00005324 int i;
Eric Dumazetdf334542010-03-24 19:13:54 +00005325#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005326
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005327 BUG_ON(strlen(name) >= sizeof(dev->name));
5328
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005329 alloc_size = sizeof(struct net_device);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005330 if (sizeof_priv) {
5331 /* ensure 32-byte alignment of private area */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005332 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
Alexey Dobriyand1643d22008-04-18 15:43:32 -07005333 alloc_size += sizeof_priv;
5334 }
5335 /* ensure 32-byte alignment of whole construct */
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005336 alloc_size += NETDEV_ALIGN - 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005337
Paolo 'Blaisorblade' Giarrusso31380de2006-04-06 22:38:28 -07005338 p = kzalloc(alloc_size, GFP_KERNEL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005339 if (!p) {
Stephen Hemmingerb6fe17d2006-08-29 17:06:13 -07005340 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005341 return NULL;
5342 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005343
Stephen Hemminger79439862008-07-21 13:28:44 -07005344 tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
David S. Millere8a04642008-07-17 00:34:19 -07005345 if (!tx) {
5346 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5347 "tx qdiscs.\n");
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005348 goto free_p;
David S. Millere8a04642008-07-17 00:34:19 -07005349 }
5350
Eric Dumazetdf334542010-03-24 19:13:54 +00005351#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005352 rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5353 if (!rx) {
5354 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5355 "rx queues.\n");
5356 goto free_tx;
5357 }
5358
5359 atomic_set(&rx->count, queue_count);
5360
5361 /*
5362 * Set a pointer to the first element in the array, which holds the
5363 * reference count.
5364 */
5365 for (i = 0; i < queue_count; i++)
5366 rx[i].first = rx;
Eric Dumazetdf334542010-03-24 19:13:54 +00005367#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005368
Eric Dumazet1ce8e7b2009-05-27 04:42:37 +00005369 dev = PTR_ALIGN(p, NETDEV_ALIGN);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005370 dev->padded = (char *)dev - (char *)p;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005371
5372 if (dev_addr_init(dev))
Tom Herbert0a9627f2010-03-16 08:03:29 +00005373 goto free_rx;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005374
Jiri Pirko22bedad2010-04-01 21:22:57 +00005375 dev_mc_init(dev);
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005376 dev_uc_init(dev);
Jiri Pirkoccffad22009-05-22 23:22:17 +00005377
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005378 dev_net_set(dev, &init_net);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005379
David S. Millere8a04642008-07-17 00:34:19 -07005380 dev->_tx = tx;
5381 dev->num_tx_queues = queue_count;
David S. Millerfd2ea0a2008-07-17 01:56:23 -07005382 dev->real_num_tx_queues = queue_count;
David S. Millere8a04642008-07-17 00:34:19 -07005383
Eric Dumazetdf334542010-03-24 19:13:54 +00005384#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005385 dev->_rx = rx;
5386 dev->num_rx_queues = queue_count;
Eric Dumazetdf334542010-03-24 19:13:54 +00005387#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005388
Peter P Waskiewicz Jr82cc1a72008-03-21 03:43:19 -07005389 dev->gso_max_size = GSO_MAX_SIZE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005390
David S. Millerbb949fb2008-07-08 16:55:56 -07005391 netdev_init_queues(dev);
5392
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005393 INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5394 dev->ethtool_ntuple_list.count = 0;
Herbert Xud565b0a2008-12-15 23:38:52 -08005395 INIT_LIST_HEAD(&dev->napi_list);
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005396 INIT_LIST_HEAD(&dev->unreg_list);
Eric Dumazete014deb2009-11-17 05:59:21 +00005397 INIT_LIST_HEAD(&dev->link_watch_list);
Eric Dumazet93f154b2009-05-18 22:19:19 -07005398 dev->priv_flags = IFF_XMIT_DST_RELEASE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005399 setup(dev);
5400 strcpy(dev->name, name);
5401 return dev;
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005402
Tom Herbert0a9627f2010-03-16 08:03:29 +00005403free_rx:
Eric Dumazetdf334542010-03-24 19:13:54 +00005404#ifdef CONFIG_RPS
Tom Herbert0a9627f2010-03-16 08:03:29 +00005405 kfree(rx);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005406free_tx:
Eric Dumazetdf334542010-03-24 19:13:54 +00005407#endif
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005408 kfree(tx);
Jiri Pirkoab9c73c2009-05-08 13:30:17 +00005409free_p:
5410 kfree(p);
5411 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005412}
Peter P Waskiewicz Jrf25f4e42007-07-06 13:36:20 -07005413EXPORT_SYMBOL(alloc_netdev_mq);
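/*
 * Illustrative sketch, not part of this file: a driver would typically
 * pair alloc_netdev_mq() with register_netdev(), and use free_netdev()
 * on the error path. struct my_priv, my_setup() and MY_NUM_QUEUES are
 * hypothetical placeholders.
 *
 *	struct net_device *dev;
 *	int err;
 *
 *	dev = alloc_netdev_mq(sizeof(struct my_priv), "myeth%d",
 *			      my_setup, MY_NUM_QUEUES);
 *	if (!dev)
 *		return -ENOMEM;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);	// never registered: freed directly
 *		return err;
 *	}
 */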
Linus Torvalds1da177e2005-04-16 15:20:36 -07005414
5415/**
5416 * free_netdev - free network device
5417 * @dev: device
5418 *
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005419 * This function does the last stage of destroying an allocated device
5420 * interface. The reference to the device object is released.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005421 * If this is the last reference then it will be freed.
5422 */
5423void free_netdev(struct net_device *dev)
5424{
Herbert Xud565b0a2008-12-15 23:38:52 -08005425 struct napi_struct *p, *n;
5426
Denis V. Lunevf3005d72008-04-16 02:02:18 -07005427 release_net(dev_net(dev));
5428
David S. Millere8a04642008-07-17 00:34:19 -07005429 kfree(dev->_tx);
5430
Jiri Pirkof001fde2009-05-05 02:48:28 +00005431 /* Flush device addresses */
5432 dev_addr_flush(dev);
5433
Peter P Waskiewicz Jr15682bc2010-02-10 20:03:05 -08005434 /* Clear ethtool n-tuple list */
5435 ethtool_ntuple_flush(dev);
5436
Herbert Xud565b0a2008-12-15 23:38:52 -08005437 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5438 netif_napi_del(p);
5439
Stephen Hemminger3041a062006-05-26 13:25:24 -07005440 /* Compatibility with error handling in drivers */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005441 if (dev->reg_state == NETREG_UNINITIALIZED) {
5442 kfree((char *)dev - dev->padded);
5443 return;
5444 }
5445
5446 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5447 dev->reg_state = NETREG_RELEASED;
5448
Greg Kroah-Hartman43cb76d2002-04-09 12:14:34 -07005449 /* will free via device release */
5450 put_device(&dev->dev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005451}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005452EXPORT_SYMBOL(free_netdev);
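/*
 * Illustrative sketch: free_netdev() walks dev->napi_list and calls
 * netif_napi_del() on every instance, so a driver that registered NAPI
 * contexts with netif_napi_add() need not delete them one by one before
 * freeing. my_priv and my_poll are hypothetical.
 *
 *	struct my_priv *priv = netdev_priv(dev);
 *
 *	netif_napi_add(dev, &priv->napi, my_poll, 64);
 *	// ... device lifetime ...
 *	unregister_netdev(dev);
 *	free_netdev(dev);	// implicitly netif_napi_del(&priv->napi)
 */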
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005453
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005454/**
5455 * synchronize_net - Synchronize with packet receive processing
5456 *
5457 * Wait for packets currently being received to be done.
5458 * Does not block later packets from starting.
5459 */
YOSHIFUJI Hideaki4ec93ed2007-02-09 23:24:36 +09005460void synchronize_net(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005461{
5462 might_sleep();
Paul E. McKenneyfbd568a3e2005-05-01 08:59:04 -07005463 synchronize_rcu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005464}
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005465EXPORT_SYMBOL(synchronize_net);
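/*
 * Illustrative sketch: a typical caller unhooks itself from the receive
 * path and then waits out in-flight packets before freeing shared state;
 * dev_remove_pack() in this file wraps exactly this pair. my_pt and
 * my_state are hypothetical.
 *
 *	__dev_remove_pack(&my_pt);
 *	synchronize_net();	// wait for in-flight receivers to finish
 *	kfree(my_state);	// now no CPU can still be using my_pt
 */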
Linus Torvalds1da177e2005-04-16 15:20:36 -07005466
5467/**
Eric Dumazet44a08732009-10-27 07:03:04 +00005468 * unregister_netdevice_queue - remove device from the kernel
Linus Torvalds1da177e2005-04-16 15:20:36 -07005469 * @dev: device
Eric Dumazet44a08732009-10-27 07:03:04 +00005470 * @head: list
Jaswinder Singh Rajput6ebfbc02009-11-22 20:43:13 -08005471 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07005472 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005473 * from the kernel tables.
Eric Dumazet44a08732009-10-27 07:03:04 +00005474 * If @head is not NULL, the device is queued to be unregistered later.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005475 *
5476 * Callers must hold the rtnl semaphore. You may want
5477 * unregister_netdev() instead of this.
5478 */
5479
Eric Dumazet44a08732009-10-27 07:03:04 +00005480void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005481{
Herbert Xua6620712007-12-12 19:21:56 -08005482 ASSERT_RTNL();
5483
Eric Dumazet44a08732009-10-27 07:03:04 +00005484 if (head) {
Eric W. Biederman9fdce092009-10-30 14:51:13 +00005485 list_move_tail(&dev->unreg_list, head);
Eric Dumazet44a08732009-10-27 07:03:04 +00005486 } else {
5487 rollback_registered(dev);
5488 /* Finish processing unregister after unlock */
5489 net_set_todo(dev);
5490 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005491}
Eric Dumazet44a08732009-10-27 07:03:04 +00005492EXPORT_SYMBOL(unregister_netdevice_queue);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005493
5494/**
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005495 * unregister_netdevice_many - unregister many devices
5496 * @head: list of devices
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005497 */
5498void unregister_netdevice_many(struct list_head *head)
5499{
5500 struct net_device *dev;
5501
5502 if (!list_empty(head)) {
5503 rollback_registered_many(head);
5504 list_for_each_entry(dev, head, unreg_list)
5505 net_set_todo(dev);
5506 }
5507}
Eric Dumazet63c80992009-10-27 07:06:49 +00005508EXPORT_SYMBOL(unregister_netdevice_many);
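/*
 * Illustrative sketch: queueing several devices and unregistering them in
 * one batch amortizes the rollback and synchronization cost under a single
 * RTNL hold (compare default_device_exit_batch() below). devs[] and n are
 * hypothetical.
 *
 *	LIST_HEAD(kill_list);
 *	int i;
 *
 *	rtnl_lock();
 *	for (i = 0; i < n; i++)
 *		unregister_netdevice_queue(devs[i], &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */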
Eric Dumazet9b5e3832009-10-27 07:04:19 +00005509
5510/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07005511 * unregister_netdev - remove device from the kernel
5512 * @dev: device
5513 *
5514 * This function shuts down a device interface and removes it
Wang Chend59b54b2007-12-11 02:28:03 -08005515 * from the kernel tables.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005516 *
5517 * This is just a wrapper for unregister_netdevice that takes
5518 * the rtnl semaphore. In general you want to use this and not
5519 * unregister_netdevice.
5520 */
5521void unregister_netdev(struct net_device *dev)
5522{
5523 rtnl_lock();
5524 unregister_netdevice(dev);
5525 rtnl_unlock();
5526}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005527EXPORT_SYMBOL(unregister_netdev);
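/*
 * Illustrative sketch: the usual module exit sequence, with my_dev a
 * hypothetical module-level pointer obtained from alloc_netdev_mq().
 *
 *	static void __exit my_module_exit(void)
 *	{
 *		unregister_netdev(my_dev);	// takes and releases RTNL itself
 *		free_netdev(my_dev);		// drop the last reference
 *	}
 *	module_exit(my_module_exit);
 */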
5528
Eric W. Biedermance286d32007-09-12 13:53:49 +02005529/**
5530 * dev_change_net_namespace - move device to different nethost namespace
5531 * @dev: device
5532 * @net: network namespace
5533 * @pat: If not NULL name pattern to try if the current device name
5534 * is already taken in the destination network namespace.
5535 *
5536 * This function shuts down a device interface and moves it
5537 * to a new network namespace. On success 0 is returned, on
5538 * a failure a netagive errno code is returned.
5539 *
5540 * Callers must hold the rtnl semaphore.
5541 */
5542
5543int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5544{
Eric W. Biedermance286d32007-09-12 13:53:49 +02005545 int err;
5546
5547 ASSERT_RTNL();
5548
5549 /* Don't allow namespace local devices to be moved. */
5550 err = -EINVAL;
5551 if (dev->features & NETIF_F_NETNS_LOCAL)
5552 goto out;
5553
Eric W. Biederman38918452008-10-27 17:51:47 -07005554#ifdef CONFIG_SYSFS
5555 /* Don't allow real devices to be moved when sysfs
5556 * is enabled.
5557 */
5558 err = -EINVAL;
5559 if (dev->dev.parent)
5560 goto out;
5561#endif
5562
Eric W. Biedermance286d32007-09-12 13:53:49 +02005563 /* Ensure the device has been registered */
5564 err = -EINVAL;
5565 if (dev->reg_state != NETREG_REGISTERED)
5566 goto out;
5567
5568 /* Get out if there is nothing to do */
5569 err = 0;
YOSHIFUJI Hideaki878628f2008-03-26 03:57:35 +09005570 if (net_eq(dev_net(dev), net))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005571 goto out;
5572
5573 /* Pick the destination device name, and ensure
5574 * we can use it in the destination network namespace.
5575 */
5576 err = -EEXIST;
Octavian Purdilad9031022009-11-18 02:36:59 +00005577 if (__dev_get_by_name(net, dev->name)) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005578 /* We get here if we can't use the current device name */
5579 if (!pat)
5580 goto out;
Daniel Lezcano8ce6ceb2010-05-19 10:12:19 +00005581 if (dev_get_valid_name(dev, pat, 1))
Eric W. Biedermance286d32007-09-12 13:53:49 +02005582 goto out;
5583 }
5584
5585 /*
5586 * And now a mini version of register_netdevice and unregister_netdevice.
5587 */
5588
5589 /* If device is running close it first. */
Pavel Emelyanov9b772652007-10-10 02:49:09 -07005590 dev_close(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005591
5592 /* And unlink it from device chain */
5593 err = -ENODEV;
5594 unlist_netdevice(dev);
5595
5596 synchronize_net();
5597
5598 /* Shutdown queueing discipline. */
5599 dev_shutdown(dev);
5600
5601 /* Notify protocols, that we are about to destroy
5602 this device. They should clean all the things.
5603 */
5604 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
Eric W. Biedermana5ee1552009-11-29 15:45:58 +00005605 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005606
5607 /*
5608 * Flush the unicast and multicast chains
5609 */
Jiri Pirkoa748ee22010-04-01 21:22:09 +00005610 dev_uc_flush(dev);
Jiri Pirko22bedad2010-04-01 21:22:57 +00005611 dev_mc_flush(dev);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005612
Eric W. Biederman38918452008-10-27 17:51:47 -07005613 netdev_unregister_kobject(dev);
5614
Eric W. Biedermance286d32007-09-12 13:53:49 +02005615 /* Actually switch the network namespace */
YOSHIFUJI Hideakic346dca2008-03-25 21:47:49 +09005616 dev_net_set(dev, net);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005617
Eric W. Biedermance286d32007-09-12 13:53:49 +02005618 /* If there is an ifindex conflict assign a new one */
5619 if (__dev_get_by_index(net, dev->ifindex)) {
5620 int iflink = (dev->iflink == dev->ifindex);
5621 dev->ifindex = dev_new_index(net);
5622 if (iflink)
5623 dev->iflink = dev->ifindex;
5624 }
5625
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005626 /* Fixup kobjects */
Daniel Lezcanoaaf8cdc2008-05-02 17:00:58 -07005627 err = netdev_register_kobject(dev);
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005628 WARN_ON(err);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005629
5630 /* Add the device back in the hashes */
5631 list_netdevice(dev);
5632
5633 /* Notify protocols that a new device appeared. */
5634 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5635
Eric W. Biedermand90a9092009-12-12 22:11:15 +00005636 /*
5637 * Prevent userspace races by waiting until the network
5638 * device is fully setup before sending notifications.
5639 */
5640 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5641
Eric W. Biedermance286d32007-09-12 13:53:49 +02005642 synchronize_net();
5643 err = 0;
5644out:
5645 return err;
5646}
Johannes Berg463d0182009-07-14 00:33:35 +02005647EXPORT_SYMBOL_GPL(dev_change_net_namespace);
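/*
 * Illustrative sketch: rtnetlink moves a device roughly like this when
 * userspace sets IFLA_NET_NS_PID; the caller already holds RTNL. Passing
 * NULL as @pat makes the move fail on a name conflict. Error handling is
 * elided.
 *
 *	struct net *net = get_net_ns_by_pid(pid);	// grabs a reference
 *
 *	err = dev_change_net_namespace(dev, net, NULL);
 *	put_net(net);
 */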
Eric W. Biedermance286d32007-09-12 13:53:49 +02005648
Linus Torvalds1da177e2005-04-16 15:20:36 -07005649static int dev_cpu_callback(struct notifier_block *nfb,
5650 unsigned long action,
5651 void *ocpu)
5652{
5653 struct sk_buff **list_skb;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005654 struct sk_buff *skb;
5655 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5656 struct softnet_data *sd, *oldsd;
5657
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005658 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005659 return NOTIFY_OK;
5660
5661 local_irq_disable();
5662 cpu = smp_processor_id();
5663 sd = &per_cpu(softnet_data, cpu);
5664 oldsd = &per_cpu(softnet_data, oldcpu);
5665
5666 /* Find end of our completion_queue. */
5667 list_skb = &sd->completion_queue;
5668 while (*list_skb)
5669 list_skb = &(*list_skb)->next;
5670 /* Append completion queue from offline CPU. */
5671 *list_skb = oldsd->completion_queue;
5672 oldsd->completion_queue = NULL;
5673
Linus Torvalds1da177e2005-04-16 15:20:36 -07005674 /* Append output queue from offline CPU. */
Changli Gaoa9cbd582010-04-26 23:06:24 +00005675 if (oldsd->output_queue) {
5676 *sd->output_queue_tailp = oldsd->output_queue;
5677 sd->output_queue_tailp = oldsd->output_queue_tailp;
5678 oldsd->output_queue = NULL;
5679 oldsd->output_queue_tailp = &oldsd->output_queue;
5680 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005681
5682 raise_softirq_irqoff(NET_TX_SOFTIRQ);
5683 local_irq_enable();
5684
5685 /* Process offline CPU's input_pkt_queue */
Tom Herbert76cc8b12010-05-20 18:37:59 +00005686 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
5687 netif_rx(skb);
5688 input_queue_head_incr(oldsd);
5689 }
Tom Herbertfec5e652010-04-16 16:01:27 -07005690 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005691 netif_rx(skb);
Tom Herbert76cc8b12010-05-20 18:37:59 +00005692 input_queue_head_incr(oldsd);
Tom Herbertfec5e652010-04-16 16:01:27 -07005693 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005694
5695 return NOTIFY_OK;
5696}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005697
5698
Herbert Xu7f353bf2007-08-10 15:47:58 -07005699/**
Herbert Xub63365a2008-10-23 01:11:29 -07005700 * netdev_increment_features - increment feature set by one
5701 * @all: current feature set
5702 * @one: new feature set
5703 * @mask: mask feature set
Herbert Xu7f353bf2007-08-10 15:47:58 -07005704 *
5705 * Computes a new feature set after adding a device with feature set
Herbert Xub63365a2008-10-23 01:11:29 -07005706 * @one to the master device with current feature set @all. Will not
5707 * enable anything that is off in @mask. Returns the new feature set.
Herbert Xu7f353bf2007-08-10 15:47:58 -07005708 */
Herbert Xub63365a2008-10-23 01:11:29 -07005709unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5710 unsigned long mask)
Herbert Xu7f353bf2007-08-10 15:47:58 -07005711{
Herbert Xub63365a2008-10-23 01:11:29 -07005712 /* If device needs checksumming, downgrade to it. */
Eric Dumazetd1b19df2009-09-03 01:29:39 -07005713 if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
Herbert Xub63365a2008-10-23 01:11:29 -07005714 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5715 else if (mask & NETIF_F_ALL_CSUM) {
5716 /* If one device supports v4/v6 checksumming, set for all. */
5717 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5718 !(all & NETIF_F_GEN_CSUM)) {
5719 all &= ~NETIF_F_ALL_CSUM;
5720 all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5721 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07005722
Herbert Xub63365a2008-10-23 01:11:29 -07005723 /* If one device supports hw checksumming, set for all. */
5724 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5725 all &= ~NETIF_F_ALL_CSUM;
5726 all |= NETIF_F_HW_CSUM;
5727 }
5728 }
Herbert Xu7f353bf2007-08-10 15:47:58 -07005729
Herbert Xub63365a2008-10-23 01:11:29 -07005730 one |= NETIF_F_ALL_CSUM;
Herbert Xu7f353bf2007-08-10 15:47:58 -07005731
Herbert Xub63365a2008-10-23 01:11:29 -07005732 one |= all & NETIF_F_ONE_FOR_ALL;
Sridhar Samudralad9f59502009-10-07 12:24:25 +00005733 all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
Herbert Xub63365a2008-10-23 01:11:29 -07005734 all |= one & mask & NETIF_F_ONE_FOR_ALL;
Herbert Xu7f353bf2007-08-10 15:47:58 -07005735
5736 return all;
5737}
Herbert Xub63365a2008-10-23 01:11:29 -07005738EXPORT_SYMBOL(netdev_increment_features);
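/*
 * Illustrative sketch: a master driver such as bonding recomputes its
 * feature set by folding in each slave, starting from a set with the
 * one-for-all bits cleared. The slave list iteration here is schematic.
 *
 *	unsigned long features = bond_dev->features & ~NETIF_F_ONE_FOR_ALL;
 *
 *	list_for_each_entry(slave, &slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     NETIF_F_ONE_FOR_ALL);
 *	bond_dev->features = features;
 */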
Herbert Xu7f353bf2007-08-10 15:47:58 -07005739
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005740static struct hlist_head *netdev_create_hash(void)
5741{
5742 int i;
5743 struct hlist_head *hash;
5744
5745 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5746 if (hash != NULL)
5747 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5748 INIT_HLIST_HEAD(&hash[i]);
5749
5750 return hash;
5751}
5752
Eric W. Biederman881d9662007-09-17 11:56:21 -07005753/* Initialize per network namespace state */
Pavel Emelyanov46650792007-10-08 20:38:39 -07005754static int __net_init netdev_init(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07005755{
Eric W. Biederman881d9662007-09-17 11:56:21 -07005756 INIT_LIST_HEAD(&net->dev_base_head);
Eric W. Biederman881d9662007-09-17 11:56:21 -07005757
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005758 net->dev_name_head = netdev_create_hash();
5759 if (net->dev_name_head == NULL)
5760 goto err_name;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005761
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005762 net->dev_index_head = netdev_create_hash();
5763 if (net->dev_index_head == NULL)
5764 goto err_idx;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005765
5766 return 0;
Pavel Emelyanov30d97d32007-09-16 15:40:33 -07005767
5768err_idx:
5769 kfree(net->dev_name_head);
5770err_name:
5771 return -ENOMEM;
Eric W. Biederman881d9662007-09-17 11:56:21 -07005772}
5773
Stephen Hemmingerf0db2752008-09-30 02:23:58 -07005774/**
5775 * netdev_drivername - network driver for the device
5776 * @dev: network device
5777 * @buffer: buffer for resulting name
5778 * @len: size of buffer
5779 *
5780 * Determine network driver for device.
5781 */
Stephen Hemmingercf04a4c2008-09-30 02:22:14 -07005782char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
Arjan van de Ven6579e572008-07-21 13:31:48 -07005783{
Stephen Hemmingercf04a4c2008-09-30 02:22:14 -07005784 const struct device_driver *driver;
5785 const struct device *parent;
Arjan van de Ven6579e572008-07-21 13:31:48 -07005786
5787 if (len <= 0 || !buffer)
5788 return buffer;
5789 buffer[0] = 0;
5790
5791 parent = dev->dev.parent;
5792
5793 if (!parent)
5794 return buffer;
5795
5796 driver = parent->driver;
5797 if (driver && driver->name)
5798 strlcpy(buffer, driver->name, len);
5799 return buffer;
5800}
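/*
 * Illustrative sketch: the TX watchdog in net/sched/sch_generic.c uses
 * this helper to name the offending driver when a queue stalls, roughly:
 *
 *	char drivername[64];
 *
 *	WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): "
 *		  "transmit queue %u timed out\n", dev->name,
 *		  netdev_drivername(dev, drivername, 64), i);
 */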
5801
Pavel Emelyanov46650792007-10-08 20:38:39 -07005802static void __net_exit netdev_exit(struct net *net)
Eric W. Biederman881d9662007-09-17 11:56:21 -07005803{
5804 kfree(net->dev_name_head);
5805 kfree(net->dev_index_head);
5806}
5807
Denis V. Lunev022cbae2007-11-13 03:23:50 -08005808static struct pernet_operations __net_initdata netdev_net_ops = {
Eric W. Biederman881d9662007-09-17 11:56:21 -07005809 .init = netdev_init,
5810 .exit = netdev_exit,
5811};
5812
Pavel Emelyanov46650792007-10-08 20:38:39 -07005813static void __net_exit default_device_exit(struct net *net)
Eric W. Biedermance286d32007-09-12 13:53:49 +02005814{
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005815 struct net_device *dev, *aux;
Eric W. Biedermance286d32007-09-12 13:53:49 +02005816 /*
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005817 * Push all migratable network devices back to the
Eric W. Biedermance286d32007-09-12 13:53:49 +02005818 * initial network namespace
5819 */
5820 rtnl_lock();
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005821 for_each_netdev_safe(net, dev, aux) {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005822 int err;
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005823 char fb_name[IFNAMSIZ];
Eric W. Biedermance286d32007-09-12 13:53:49 +02005824
5825 /* Ignore unmovable devices (i.e. loopback) */
5826 if (dev->features & NETIF_F_NETNS_LOCAL)
5827 continue;
5828
Eric W. Biedermane008b5f2009-11-29 22:25:30 +00005829 /* Leave virtual devices for the generic cleanup */
5830 if (dev->rtnl_link_ops)
5831 continue;
Eric W. Biedermand0c082c2008-11-05 15:59:38 -08005832
Eric W. Biedermance286d32007-09-12 13:53:49 +02005833 /* Push remaining network devices to init_net */
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005834 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5835 err = dev_change_net_namespace(dev, &init_net, fb_name);
Eric W. Biedermance286d32007-09-12 13:53:49 +02005836 if (err) {
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005837 printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
Eric W. Biedermance286d32007-09-12 13:53:49 +02005838 __func__, dev->name, err);
Pavel Emelyanovaca51392008-05-08 01:24:25 -07005839 BUG();
Eric W. Biedermance286d32007-09-12 13:53:49 +02005840 }
5841 }
5842 rtnl_unlock();
5843}
5844
Eric W. Biederman04dc7f62009-12-03 02:29:04 +00005845static void __net_exit default_device_exit_batch(struct list_head *net_list)
5846{
5847 /* At exit all network devices must be removed from the network
5848 * namespace. Do this in the reverse order of registration.
5849 * Do this across as many network namespaces as possible to
5850 * improve batching efficiency.
5851 */
5852 struct net_device *dev;
5853 struct net *net;
5854 LIST_HEAD(dev_kill_list);
5855
5856 rtnl_lock();
5857 list_for_each_entry(net, net_list, exit_list) {
5858 for_each_netdev_reverse(net, dev) {
5859 if (dev->rtnl_link_ops)
5860 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
5861 else
5862 unregister_netdevice_queue(dev, &dev_kill_list);
5863 }
5864 }
5865 unregister_netdevice_many(&dev_kill_list);
5866 rtnl_unlock();
5867}
5868
Denis V. Lunev022cbae2007-11-13 03:23:50 -08005869static struct pernet_operations __net_initdata default_device_ops = {
Eric W. Biedermance286d32007-09-12 13:53:49 +02005870 .exit = default_device_exit,
Eric W. Biederman04dc7f62009-12-03 02:29:04 +00005871 .exit_batch = default_device_exit_batch,
Eric W. Biedermance286d32007-09-12 13:53:49 +02005872};
5873
Linus Torvalds1da177e2005-04-16 15:20:36 -07005874/*
5875 * Initialize the DEV module. At boot time this walks the device list and
5876 * unhooks any devices that fail to initialise (normally hardware not
5877 * present) and leaves us with a valid list of present and active devices.
5878 *
5879 */
5880
5881/*
5882 * This is called single threaded during boot, so no need
5883 * to take the rtnl semaphore.
5884 */
5885static int __init net_dev_init(void)
5886{
5887 int i, rc = -ENOMEM;
5888
5889 BUG_ON(!dev_boot_phase);
5890
Linus Torvalds1da177e2005-04-16 15:20:36 -07005891 if (dev_proc_init())
5892 goto out;
5893
Eric W. Biederman8b41d182007-09-26 22:02:53 -07005894 if (netdev_kobject_init())
Linus Torvalds1da177e2005-04-16 15:20:36 -07005895 goto out;
5896
5897 INIT_LIST_HEAD(&ptype_all);
Pavel Emelyanov82d8a862007-11-26 20:12:58 +08005898 for (i = 0; i < PTYPE_HASH_SIZE; i++)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005899 INIT_LIST_HEAD(&ptype_base[i]);
5900
Eric W. Biederman881d9662007-09-17 11:56:21 -07005901 if (register_pernet_subsys(&netdev_net_ops))
5902 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005903
5904 /*
5905 * Initialise the packet receive queues.
5906 */
5907
KAMEZAWA Hiroyuki6f912042006-04-10 22:52:50 -07005908 for_each_possible_cpu(i) {
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005909 struct softnet_data *sd = &per_cpu(softnet_data, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005910
Changli Gaodee42872010-05-02 05:42:16 +00005911 memset(sd, 0, sizeof(*sd));
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005912 skb_queue_head_init(&sd->input_pkt_queue);
Changli Gao6e7676c2010-04-27 15:07:33 -07005913 skb_queue_head_init(&sd->process_queue);
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005914 sd->completion_queue = NULL;
5915 INIT_LIST_HEAD(&sd->poll_list);
Changli Gaoa9cbd582010-04-26 23:06:24 +00005916 sd->output_queue = NULL;
5917 sd->output_queue_tailp = &sd->output_queue;
Eric Dumazetdf334542010-03-24 19:13:54 +00005918#ifdef CONFIG_RPS
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005919 sd->csd.func = rps_trigger_softirq;
5920 sd->csd.info = sd;
5921 sd->csd.flags = 0;
5922 sd->cpu = i;
Tom Herbert1e94d722010-03-18 17:45:44 -07005923#endif
Tom Herbert0a9627f2010-03-16 08:03:29 +00005924
Eric Dumazete36fa2f2010-04-19 21:17:14 +00005925 sd->backlog.poll = process_backlog;
5926 sd->backlog.weight = weight_p;
5927 sd->backlog.gro_list = NULL;
5928 sd->backlog.gro_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005929 }
5930
Linus Torvalds1da177e2005-04-16 15:20:36 -07005931 dev_boot_phase = 0;
5932
Eric W. Biederman505d4f72008-11-07 22:54:20 -08005933 /* The loopback device is special: if any other network device
5934 * is present in a network namespace, the loopback device must
5935 * be present too. Since we now dynamically allocate and free the
5936 * loopback device, ensure this invariant is maintained by
5937 * keeping the loopback device as the first device on the
5938 * list of network devices, ensuring the loopback device
5939 * is the first device that appears and the last network device
5940 * that disappears.
5941 */
5942 if (register_pernet_device(&loopback_net_ops))
5943 goto out;
5944
5945 if (register_pernet_device(&default_device_ops))
5946 goto out;
5947
Carlos R. Mafra962cf362008-05-15 11:15:37 -03005948 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5949 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005950
5951 hotcpu_notifier(dev_cpu_callback, 0);
5952 dst_init();
5953 dev_mcast_init();
5954 rc = 0;
5955out:
5956 return rc;
5957}
5958
5959subsys_initcall(net_dev_init);
5960
Krishna Kumare88721f2009-02-18 17:55:02 -08005961static int __init initialize_hashrnd(void)
5962{
Tom Herbert0a9627f2010-03-16 08:03:29 +00005963 get_random_bytes(&hashrnd, sizeof(hashrnd));
Krishna Kumare88721f2009-02-18 17:55:02 -08005964 return 0;
5965}
5966
5967late_initcall_sync(initialize_hashrnd);
5968