aboutsummaryrefslogtreecommitdiff
path: root/net/mptcp/protocol.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 11:57:23 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2021-04-29 11:57:23 -0700
commit9d31d2338950293ec19d9b095fbaa9030899dcb4 (patch)
treee688040d0557c24a2eeb9f6c9c223d949f6f7ef9 /net/mptcp/protocol.c
parent635de956a7f5a6ffcb04f29d70630c64c717b56b (diff)
parent4a52dd8fefb45626dace70a63c0738dbd83b7edb (diff)
Merge tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Jakub Kicinski: "Core: - bpf: - allow bpf programs calling kernel functions (initially to reuse TCP congestion control implementations) - enable task local storage for tracing programs - remove the need to store per-task state in hash maps, and allow tracing programs access to task local storage previously added for BPF_LSM - add bpf_for_each_map_elem() helper, allowing programs to walk all map elements in a more robust and easier to verify fashion - sockmap: support UDP and cross-protocol BPF_SK_SKB_VERDICT redirection - lpm: add support for batched ops in LPM trie - add BTF_KIND_FLOAT support - mostly to allow use of BTF on s390 which has floats in its headers files - improve BPF syscall documentation and extend the use of kdoc parsing scripts we already employ for bpf-helpers - libbpf, bpftool: support static linking of BPF ELF files - improve support for encapsulation of L2 packets - xdp: restructure redirect actions to avoid a runtime lookup, improving performance by 4-8% in microbenchmarks - xsk: build skb by page (aka generic zerocopy xmit) - improve performance of software AF_XDP path by 33% for devices which don't need headers in the linear skb part (e.g. virtio) - nexthop: resilient next-hop groups - improve path stability on next-hops group changes (incl. offload for mlxsw) - ipv6: segment routing: add support for IPv4 decapsulation - icmp: add support for RFC 8335 extended PROBE messages - inet: use bigger hash table for IP ID generation - tcp: deal better with delayed TX completions - make sure we don't give up on fast TCP retransmissions only because driver is slow in reporting that it completed transmitting the original - tcp: reorder tcp_congestion_ops for better cache locality - mptcp: - add sockopt support for common TCP options - add support for common TCP msg flags - include multiple address ids in RM_ADDR - add reset option support for resetting one subflow - udp: GRO L4 improvements - improve 'forward' / 'frag_list' co-existence with UDP tunnel GRO, allowing the first to take place correctly even for encapsulated UDP traffic - micro-optimize dev_gro_receive() and flow dissection, avoid retpoline overhead on VLAN and TEB GRO - use less memory for sysctls, add a new sysctl type, to allow using u8 instead of "int" and "long" and shrink networking sysctls - veth: allow GRO without XDP - this allows aggregating UDP packets before handing them off to routing, bridge, OvS, etc. - allow specifing ifindex when device is moved to another namespace - netfilter: - nft_socket: add support for cgroupsv2 - nftables: add catch-all set element - special element used to define a default action in case normal lookup missed - use net_generic infra in many modules to avoid allocating per-ns memory unnecessarily - xps: improve the xps handling to avoid potential out-of-bound accesses and use-after-free when XPS change race with other re-configuration under traffic - add a config knob to turn off per-cpu netdev refcnt to catch underflows in testing Device APIs: - add WWAN subsystem to organize the WWAN interfaces better and hopefully start driving towards more unified and vendor- independent APIs - ethtool: - add interface for reading IEEE MIB stats (incl. mlx5 and bnxt support) - allow network drivers to dump arbitrary SFP EEPROM data, current offset+length API was a poor fit for modern SFP which define EEPROM in terms of pages (incl. mlx5 support) - act_police, flow_offload: add support for packet-per-second policing (incl. offload for nfp) - psample: add additional metadata attributes like transit delay for packets sampled from switch HW (and corresponding egress and policy-based sampling in the mlxsw driver) - dsa: improve support for sandwiched LAGs with bridge and DSA - netfilter: - flowtable: use direct xmit in topologies with IP forwarding, bridging, vlans etc. - nftables: counter hardware offload support - Bluetooth: - improvements for firmware download w/ Intel devices - add support for reading AOSP vendor capabilities - add support for virtio transport driver - mac80211: - allow concurrent monitor iface and ethernet rx decap - set priority and queue mapping for injected frames - phy: add support for Clause-45 PHY Loopback - pci/iov: add sysfs MSI-X vector assignment interface to distribute MSI-X resources to VFs (incl. mlx5 support) New hardware/drivers: - dsa: mv88e6xxx: add support for Marvell mv88e6393x - 11-port Ethernet switch with 8x 1-Gigabit Ethernet and 3x 10-Gigabit interfaces. - dsa: support for legacy Broadcom tags used on BCM5325, BCM5365 and BCM63xx switches - Microchip KSZ8863 and KSZ8873; 3x 10/100Mbps Ethernet switches - ath11k: support for QCN9074 a 802.11ax device - Bluetooth: Broadcom BCM4330 and BMC4334 - phy: Marvell 88X2222 transceiver support - mdio: add BCM6368 MDIO mux bus controller - r8152: support RTL8153 and RTL8156 (USB Ethernet) chips - mana: driver for Microsoft Azure Network Adapter (MANA) - Actions Semi Owl Ethernet MAC - can: driver for ETAS ES58X CAN/USB interfaces Pure driver changes: - add XDP support to: enetc, igc, stmmac - add AF_XDP support to: stmmac - virtio: - page_to_skb() use build_skb when there's sufficient tailroom (21% improvement for 1000B UDP frames) - support XDP even without dedicated Tx queues - share the Tx queues with the stack when necessary - mlx5: - flow rules: add support for mirroring with conntrack, matching on ICMP, GTP, flex filters and more - support packet sampling with flow offloads - persist uplink representor netdev across eswitch mode changes - allow coexistence of CQE compression and HW time-stamping - add ethtool extended link error state reporting - ice, iavf: support flow filters, UDP Segmentation Offload - dpaa2-switch: - move the driver out of staging - add spanning tree (STP) support - add rx copybreak support - add tc flower hardware offload on ingress traffic - ionic: - implement Rx page reuse - support HW PTP time-stamping - octeon: support TC hardware offloads - flower matching on ingress and egress ratelimitting. - stmmac: - add RX frame steering based on VLAN priority in tc flower - support frame preemption (FPE) - intel: add cross time-stamping freq difference adjustment - ocelot: - support forwarding of MRP frames in HW - support multiple bridges - support PTP Sync one-step timestamping - dsa: mv88e6xxx, dpaa2-switch: offload bridge port flags like learning, flooding etc. - ipa: add IPA v4.5, v4.9 and v4.11 support (Qualcomm SDX55, SM8350, SC7280 SoCs) - mt7601u: enable TDLS support - mt76: - add support for 802.3 rx frames (mt7915/mt7615) - mt7915 flash pre-calibration support - mt7921/mt7663 runtime power management fixes" * tag 'net-next-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (2451 commits) net: selftest: fix build issue if INET is disabled net: netrom: nr_in: Remove redundant assignment to ns net: tun: Remove redundant assignment to ret net: phy: marvell: add downshift support for M88E1240 net: dsa: ksz: Make reg_mib_cnt a u8 as it never exceeds 255 net/sched: act_ct: Remove redundant ct get and check icmp: standardize naming of RFC 8335 PROBE constants bpf, selftests: Update array map tests for per-cpu batched ops bpf: Add batched ops support for percpu array bpf: Implement formatted output helpers with bstr_printf seq_file: Add a seq_bprintf function sfc: adjust efx->xdp_tx_queue_count with the real number of initialized queues net:nfc:digital: Fix a double free in digital_tg_recv_dep_req net: fix a concurrency bug in l2tp_tunnel_register() net/smc: Remove redundant assignment to rc mpls: Remove redundant assignment to err llc2: Remove redundant assignment to rc net/tls: Remove redundant initialization of record rds: Remove redundant assignment to nr_sig dt-bindings: net: mdio-gpio: add compatible for microchip,mdio-smi0 ...
Diffstat (limited to 'net/mptcp/protocol.c')
-rw-r--r--net/mptcp/protocol.c381
1 files changed, 131 insertions, 250 deletions
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 4bde960e19dc..29a2d690d8d5 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -25,6 +25,9 @@
#include "protocol.h"
#include "mib.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/mptcp.h>
+
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
struct mptcp6_sock {
struct mptcp_sock msk;
@@ -90,16 +93,6 @@ static bool mptcp_is_tcpsk(struct sock *sk)
return false;
}
-static struct sock *__mptcp_tcp_fallback(struct mptcp_sock *msk)
-{
- sock_owned_by_me((const struct sock *)msk);
-
- if (likely(!__mptcp_check_fallback(msk)))
- return NULL;
-
- return msk->first;
-}
-
static int __mptcp_socket_create(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
@@ -399,6 +392,14 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq)
return false;
}
+static void mptcp_set_datafin_timeout(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ mptcp_sk(sk)->timer_ival = min(TCP_RTO_MAX,
+ TCP_RTO_MIN << icsk->icsk_retransmits);
+}
+
static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
{
long tout = ssk && inet_csk(ssk)->icsk_pending ?
@@ -409,18 +410,6 @@ static void mptcp_set_timeout(const struct sock *sk, const struct sock *ssk)
mptcp_sk(sk)->timer_ival = tout > 0 ? tout : TCP_RTO_MIN;
}
-static bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
-{
- struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
-
- /* can't send if JOIN hasn't completed yet (i.e. is usable for mptcp) */
- if (subflow->request_join && !subflow->fully_established)
- return false;
-
- /* only send if our side has not closed yet */
- return ((1 << ssk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT));
-}
-
static bool tcp_can_send_ack(const struct sock *ssk)
{
return !((1 << inet_sk_state_load(ssk)) &
@@ -491,7 +480,7 @@ static bool mptcp_check_data_fin(struct sock *sk)
u64 rcv_data_fin_seq;
bool ret = false;
- if (__mptcp_check_fallback(msk) || !msk->first)
+ if (__mptcp_check_fallback(msk))
return ret;
/* Need to ack a DATA_FIN received from a peer while this side
@@ -740,18 +729,47 @@ wake:
sk->sk_data_ready(sk);
}
-void __mptcp_flush_join_list(struct mptcp_sock *msk)
+static bool mptcp_do_flush_join_list(struct mptcp_sock *msk)
{
struct mptcp_subflow_context *subflow;
+ bool ret = false;
if (likely(list_empty(&msk->join_list)))
- return;
+ return false;
spin_lock_bh(&msk->join_list_lock);
- list_for_each_entry(subflow, &msk->join_list, node)
+ list_for_each_entry(subflow, &msk->join_list, node) {
+ u32 sseq = READ_ONCE(subflow->setsockopt_seq);
+
mptcp_propagate_sndbuf((struct sock *)msk, mptcp_subflow_tcp_sock(subflow));
+ if (READ_ONCE(msk->setsockopt_seq) != sseq)
+ ret = true;
+ }
list_splice_tail_init(&msk->join_list, &msk->conn_list);
spin_unlock_bh(&msk->join_list_lock);
+
+ return ret;
+}
+
+void __mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+ if (likely(!mptcp_do_flush_join_list(msk)))
+ return;
+
+ if (!test_and_set_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags))
+ mptcp_schedule_work((struct sock *)msk);
+}
+
+static void mptcp_flush_join_list(struct mptcp_sock *msk)
+{
+ bool sync_needed = test_and_clear_bit(MPTCP_WORK_SYNC_SETSOCKOPT, &msk->flags);
+
+ might_sleep();
+
+ if (!mptcp_do_flush_join_list(msk) && !sync_needed)
+ return;
+
+ mptcp_sockopt_sync_all(msk);
}
static bool mptcp_timer_pending(struct sock *sk)
@@ -1052,7 +1070,7 @@ out:
}
if (snd_una == READ_ONCE(msk->snd_nxt)) {
- if (msk->timer_ival)
+ if (msk->timer_ival && !mptcp_data_fin_enabled(msk))
mptcp_stop_timer(sk);
} else {
mptcp_reset_timer(sk);
@@ -1275,7 +1293,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk,
int avail_size;
size_t ret = 0;
- pr_debug("msk=%p ssk=%p sending dfrag at seq=%lld len=%d already sent=%d",
+ pr_debug("msk=%p ssk=%p sending dfrag at seq=%llu len=%u already sent=%u",
msk, ssk, dfrag->data_seq, dfrag->data_len, info->sent);
/* compute send limit */
@@ -1403,6 +1421,7 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
send_info[i].ratio = -1;
}
mptcp_for_each_subflow(msk, subflow) {
+ trace_mptcp_subflow_get_send(subflow);
ssk = mptcp_subflow_tcp_sock(subflow);
if (!mptcp_subflow_active(subflow))
continue;
@@ -1423,10 +1442,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
}
}
- pr_debug("msk=%p nr_active=%d ssk=%p:%lld backup=%p:%lld",
- msk, nr_active, send_info[0].ssk, send_info[0].ratio,
- send_info[1].ssk, send_info[1].ratio);
-
/* pick the best backup if no other subflow is active */
if (!nr_active)
send_info[0].ssk = send_info[1].ssk;
@@ -1467,7 +1482,7 @@ static void __mptcp_push_pending(struct sock *sk, unsigned int flags)
int ret = 0;
prev_ssk = ssk;
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
ssk = mptcp_subflow_get_send(msk);
/* try to keep the subflow socket lock across
@@ -1607,9 +1622,13 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int ret = 0;
long timeo;
- if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
+ /* we don't support FASTOPEN yet */
+ if (msg->msg_flags & MSG_FASTOPEN)
return -EOPNOTSUPP;
+ /* silently ignore everything else */
+ msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
+
mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -1693,7 +1712,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!msk->first_pending)
WRITE_ONCE(msk->first_pending, dfrag);
}
- pr_debug("msk=%p dfrag at seq=%lld len=%d sent=%d new=%d", msk,
+ pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
!dfrag_collapsed);
@@ -1732,36 +1751,41 @@ static void mptcp_wait_data(struct sock *sk, long *timeo)
static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk,
struct msghdr *msg,
- size_t len)
+ size_t len, int flags)
{
- struct sk_buff *skb;
+ struct sk_buff *skb, *tmp;
int copied = 0;
- while ((skb = skb_peek(&msk->receive_queue)) != NULL) {
+ skb_queue_walk_safe(&msk->receive_queue, skb, tmp) {
u32 offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
u32 count = min_t(size_t, len - copied, data_len);
int err;
- err = skb_copy_datagram_msg(skb, offset, msg, count);
- if (unlikely(err < 0)) {
- if (!copied)
- return err;
- break;
+ if (!(flags & MSG_TRUNC)) {
+ err = skb_copy_datagram_msg(skb, offset, msg, count);
+ if (unlikely(err < 0)) {
+ if (!copied)
+ return err;
+ break;
+ }
}
copied += count;
if (count < data_len) {
- MPTCP_SKB_CB(skb)->offset += count;
+ if (!(flags & MSG_PEEK))
+ MPTCP_SKB_CB(skb)->offset += count;
break;
}
- /* we will bulk release the skb memory later */
- skb->destructor = NULL;
- msk->rmem_released += skb->truesize;
- __skb_unlink(skb, &msk->receive_queue);
- __kfree_skb(skb);
+ if (!(flags & MSG_PEEK)) {
+ /* we will bulk release the skb memory later */
+ skb->destructor = NULL;
+ msk->rmem_released += skb->truesize;
+ __skb_unlink(skb, &msk->receive_queue);
+ __kfree_skb(skb);
+ }
if (copied >= len)
break;
@@ -1893,7 +1917,7 @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk)
unsigned int moved = 0;
bool ret, done;
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
do {
struct sock *ssk = mptcp_subflow_recv_lookup(msk);
bool slowpath;
@@ -1938,8 +1962,9 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int target;
long timeo;
- if (msg->msg_flags & ~(MSG_WAITALL | MSG_DONTWAIT))
- return -EOPNOTSUPP;
+ /* MSG_ERRQUEUE is really a no-op till we support IP_RECVERR */
+ if (unlikely(flags & MSG_ERRQUEUE))
+ return inet_recv_error(sk, msg, len, addr_len);
mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
if (unlikely(sk->sk_state == TCP_LISTEN)) {
@@ -1955,7 +1980,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
while (copied < len) {
int bytes_read;
- bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied);
+ bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags);
if (unlikely(bytes_read < 0)) {
if (!copied)
copied = bytes_read;
@@ -2039,34 +2064,28 @@ out_err:
pr_debug("msk=%p data_ready=%d rx queue empty=%d copied=%d",
msk, test_bit(MPTCP_DATA_READY, &msk->flags),
skb_queue_empty_lockless(&sk->sk_receive_queue), copied);
- mptcp_rcv_space_adjust(msk, copied);
+ if (!(flags & MSG_PEEK))
+ mptcp_rcv_space_adjust(msk, copied);
release_sock(sk);
return copied;
}
-static void mptcp_retransmit_handler(struct sock *sk)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- set_bit(MPTCP_WORK_RTX, &msk->flags);
- mptcp_schedule_work(sk);
-}
-
static void mptcp_retransmit_timer(struct timer_list *t)
{
struct inet_connection_sock *icsk = from_timer(icsk, t,
icsk_retransmit_timer);
struct sock *sk = &icsk->icsk_inet.sk;
+ struct mptcp_sock *msk = mptcp_sk(sk);
bh_lock_sock(sk);
if (!sock_owned_by_user(sk)) {
- mptcp_retransmit_handler(sk);
+ /* we need a process context to retransmit */
+ if (!test_and_set_bit(MPTCP_WORK_RTX, &msk->flags))
+ mptcp_schedule_work(sk);
} else {
/* delegate our work to tcp_release_cb() */
- if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED,
- &sk->sk_tsq_flags))
- sock_hold(sk);
+ set_bit(MPTCP_RETRANSMIT, &msk->flags);
}
bh_unlock_sock(sk);
sock_put(sk);
@@ -2276,8 +2295,19 @@ static void __mptcp_retrans(struct sock *sk)
__mptcp_clean_una_wakeup(sk);
dfrag = mptcp_rtx_head(sk);
- if (!dfrag)
+ if (!dfrag) {
+ if (mptcp_data_fin_enabled(msk)) {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ icsk->icsk_retransmits++;
+ mptcp_set_datafin_timeout(sk);
+ mptcp_send_ack(msk);
+
+ goto reset_timer;
+ }
+
return;
+ }
ssk = mptcp_subflow_get_retrans(msk);
if (!ssk)
@@ -2324,7 +2354,7 @@ static void mptcp_worker(struct work_struct *work)
goto unlock;
mptcp_check_data_fin_ack(sk);
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
mptcp_check_fastclose(msk);
@@ -2387,6 +2417,9 @@ static int __mptcp_init_sock(struct sock *sk)
/* re-use the csk retrans timer for MPTCP-level retrans */
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
timer_setup(&sk->sk_timer, mptcp_timeout_timer, 0);
+
+ tcp_assign_congestion_control(sk);
+
return 0;
}
@@ -2460,6 +2493,8 @@ void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how)
pr_debug("Sending DATA_FIN on subflow %p", ssk);
mptcp_set_timeout(sk, ssk);
tcp_send_ack(ssk);
+ if (!mptcp_timer_pending(sk))
+ mptcp_reset_timer(sk);
}
break;
}
@@ -2524,7 +2559,7 @@ static void __mptcp_check_send_data_fin(struct sock *sk)
}
}
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
@@ -2580,6 +2615,8 @@ static void __mptcp_destroy_sock(struct sock *sk)
WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
+
+ tcp_cleanup_congestion_control(sk);
sk_refcnt_debug_release(sk);
mptcp_dispose_initial_subflow(msk);
sock_put(sk);
@@ -2606,7 +2643,7 @@ static void mptcp_close(struct sock *sk, long timeout)
cleanup:
/* orphan all the subflows */
inet_csk(sk)->icsk_mtup.probe_timestamp = tcp_jiffies32;
- list_for_each_entry(subflow, &mptcp_sk(sk)->conn_list, node) {
+ mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
bool slow = lock_sock_fast(ssk);
@@ -2661,7 +2698,8 @@ static int mptcp_disconnect(struct sock *sk, int flags)
struct mptcp_subflow_context *subflow;
struct mptcp_sock *msk = mptcp_sk(sk);
- __mptcp_flush_join_list(msk);
+ mptcp_do_flush_join_list(msk);
+
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
@@ -2710,6 +2748,7 @@ struct sock *mptcp_sk_clone(const struct sock *sk,
msk->snd_nxt = msk->write_seq;
msk->snd_una = msk->write_seq;
msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
+ msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
if (mp_opt->mp_capable) {
msk->can_ack = true;
@@ -2818,161 +2857,6 @@ static void mptcp_destroy(struct sock *sk)
sk_sockets_allocated_dec(sk);
}
-static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname,
- sockptr_t optval, unsigned int optlen)
-{
- struct sock *sk = (struct sock *)msk;
- struct socket *ssock;
- int ret;
-
- switch (optname) {
- case SO_REUSEPORT:
- case SO_REUSEADDR:
- lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- release_sock(sk);
- return -EINVAL;
- }
-
- ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen);
- if (ret == 0) {
- if (optname == SO_REUSEPORT)
- sk->sk_reuseport = ssock->sk->sk_reuseport;
- else if (optname == SO_REUSEADDR)
- sk->sk_reuse = ssock->sk->sk_reuse;
- }
- release_sock(sk);
- return ret;
- }
-
- return sock_setsockopt(sk->sk_socket, SOL_SOCKET, optname, optval, optlen);
-}
-
-static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname,
- sockptr_t optval, unsigned int optlen)
-{
- struct sock *sk = (struct sock *)msk;
- int ret = -EOPNOTSUPP;
- struct socket *ssock;
-
- switch (optname) {
- case IPV6_V6ONLY:
- lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- release_sock(sk);
- return -EINVAL;
- }
-
- ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen);
- if (ret == 0)
- sk->sk_ipv6only = ssock->sk->sk_ipv6only;
-
- release_sock(sk);
- break;
- }
-
- return ret;
-}
-
-static bool mptcp_unsupported(int level, int optname)
-{
- if (level == SOL_IP) {
- switch (optname) {
- case IP_ADD_MEMBERSHIP:
- case IP_ADD_SOURCE_MEMBERSHIP:
- case IP_DROP_MEMBERSHIP:
- case IP_DROP_SOURCE_MEMBERSHIP:
- case IP_BLOCK_SOURCE:
- case IP_UNBLOCK_SOURCE:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_MSFILTER:
- return true;
- }
- return false;
- }
- if (level == SOL_IPV6) {
- switch (optname) {
- case IPV6_ADDRFORM:
- case IPV6_ADD_MEMBERSHIP:
- case IPV6_DROP_MEMBERSHIP:
- case IPV6_JOIN_ANYCAST:
- case IPV6_LEAVE_ANYCAST:
- case MCAST_JOIN_GROUP:
- case MCAST_LEAVE_GROUP:
- case MCAST_JOIN_SOURCE_GROUP:
- case MCAST_LEAVE_SOURCE_GROUP:
- case MCAST_BLOCK_SOURCE:
- case MCAST_UNBLOCK_SOURCE:
- case MCAST_MSFILTER:
- return true;
- }
- return false;
- }
- return false;
-}
-
-static int mptcp_setsockopt(struct sock *sk, int level, int optname,
- sockptr_t optval, unsigned int optlen)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct sock *ssk;
-
- pr_debug("msk=%p", msk);
-
- if (mptcp_unsupported(level, optname))
- return -ENOPROTOOPT;
-
- if (level == SOL_SOCKET)
- return mptcp_setsockopt_sol_socket(msk, optname, optval, optlen);
-
- /* @@ the meaning of setsockopt() when the socket is connected and
- * there are multiple subflows is not yet defined. It is up to the
- * MPTCP-level socket to configure the subflows until the subflow
- * is in TCP fallback, when TCP socket options are passed through
- * to the one remaining subflow.
- */
- lock_sock(sk);
- ssk = __mptcp_tcp_fallback(msk);
- release_sock(sk);
- if (ssk)
- return tcp_setsockopt(ssk, level, optname, optval, optlen);
-
- if (level == SOL_IPV6)
- return mptcp_setsockopt_v6(msk, optname, optval, optlen);
-
- return -EOPNOTSUPP;
-}
-
-static int mptcp_getsockopt(struct sock *sk, int level, int optname,
- char __user *optval, int __user *option)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
- struct sock *ssk;
-
- pr_debug("msk=%p", msk);
-
- /* @@ the meaning of setsockopt() when the socket is connected and
- * there are multiple subflows is not yet defined. It is up to the
- * MPTCP-level socket to configure the subflows until the subflow
- * is in TCP fallback, when socket options are passed through
- * to the one remaining subflow.
- */
- lock_sock(sk);
- ssk = __mptcp_tcp_fallback(msk);
- release_sock(sk);
- if (ssk)
- return tcp_getsockopt(ssk, level, optname, optval, option);
-
- return -EOPNOTSUPP;
-}
-
void __mptcp_data_acked(struct sock *sk)
{
if (!sock_owned_by_user(sk))
@@ -3001,17 +2885,16 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk)
}
}
-#define MPTCP_DEFERRED_ALL (TCPF_WRITE_TIMER_DEFERRED)
-
/* processes deferred events and flush wmem */
static void mptcp_release_cb(struct sock *sk)
{
- unsigned long flags, nflags;
-
for (;;) {
- flags = 0;
+ unsigned long flags = 0;
+
if (test_and_clear_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->flags))
flags |= BIT(MPTCP_PUSH_PENDING);
+ if (test_and_clear_bit(MPTCP_RETRANSMIT, &mptcp_sk(sk)->flags))
+ flags |= BIT(MPTCP_RETRANSMIT);
if (!flags)
break;
@@ -3026,6 +2909,8 @@ static void mptcp_release_cb(struct sock *sk)
spin_unlock_bh(&sk->sk_lock.slock);
if (flags & BIT(MPTCP_PUSH_PENDING))
__mptcp_push_pending(sk, 0);
+ if (flags & BIT(MPTCP_RETRANSMIT))
+ __mptcp_retrans(sk);
cond_resched();
spin_lock_bh(&sk->sk_lock.slock);
@@ -3041,20 +2926,6 @@ static void mptcp_release_cb(struct sock *sk)
*/
__mptcp_update_wmem(sk);
__mptcp_update_rmem(sk);
-
- do {
- flags = sk->sk_tsq_flags;
- if (!(flags & MPTCP_DEFERRED_ALL))
- return;
- nflags = flags & ~MPTCP_DEFERRED_ALL;
- } while (cmpxchg(&sk->sk_tsq_flags, flags, nflags) != flags);
-
- sock_release_ownership(sk);
-
- if (flags & TCPF_WRITE_TIMER_DEFERRED) {
- mptcp_retransmit_handler(sk);
- __sock_put(sk);
- }
}
void mptcp_subflow_process_delegated(struct sock *ssk)
@@ -3153,14 +3024,18 @@ bool mptcp_finish_join(struct sock *ssk)
pr_debug("msk=%p, subflow=%p", msk, subflow);
/* mptcp socket already closing? */
- if (!mptcp_is_fully_established(parent))
+ if (!mptcp_is_fully_established(parent)) {
+ subflow->reset_reason = MPTCP_RST_EMPTCP;
return false;
+ }
if (!msk->pm.server_side)
goto out;
- if (!mptcp_pm_allow_new_subflow(msk))
+ if (!mptcp_pm_allow_new_subflow(msk)) {
+ subflow->reset_reason = MPTCP_RST_EPROHIBIT;
return false;
+ }
/* active connections are already on conn_list, and we can't acquire
* msk lock here.
@@ -3174,8 +3049,10 @@ bool mptcp_finish_join(struct sock *ssk)
sock_hold(ssk);
}
spin_unlock_bh(&msk->join_list_lock);
- if (!ret)
+ if (!ret) {
+ subflow->reset_reason = MPTCP_RST_EPROHIBIT;
return false;
+ }
/* attach to msk socket only after we are sure he will deal with us
* at close time
@@ -3287,8 +3164,12 @@ static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr,
if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info))
mptcp_subflow_early_fallback(msk, subflow);
#endif
- if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk))
+ if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) {
+ MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT);
mptcp_subflow_early_fallback(msk, subflow);
+ }
+ if (likely(!__mptcp_check_fallback(msk)))
+ MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE);
do_connect:
err = ssock->ops->connect(ssock, uaddr, addr_len, flags);
@@ -3385,7 +3266,7 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
/* set ssk->sk_socket of accept()ed flows to mptcp socket.
* This is needed so NOSPACE flag can be set from tcp stack.
*/
- __mptcp_flush_join_list(msk);
+ mptcp_flush_join_list(msk);
mptcp_for_each_subflow(msk, subflow) {
struct sock *ssk = mptcp_subflow_tcp_sock(subflow);