diff options
author | Tuomas Taipale <tuomas.taipale@nokia.com> | 2022-04-04 12:18:51 +0000 |
---|---|---|
committer | Matias Elo <matias.elo@nokia.com> | 2022-05-02 17:02:50 +0300 |
commit | 40e999eaeb52d34d5289b6b8d6e405f1623d8a53 (patch) | |
tree | bc7ac6aa0c7a5cd6eb14842886e9bd701267a033 | |
parent | 32f878c9b291cc4f24c30819e45aa2605fcd84f4 (diff) |
linux-gen: pktio: add AF_XDP socket based packet IO
Add new AF_XDP socket based packet IO implementation. Information about
AF_XDP and XDP can be found in [1].
ODP packet pool(s) created for packet IO entries are used as the UMEM
areas. Currently, only zero-copy mode is supported; copy-mode support
will be added later.
Depending on the XDP program loaded into the environment's NIC(s), TX
and RX queue selection for packets may differ. By default, only a single
combined queue is supported and this may require changing the
configuration of the NIC accordingly so that packets end up on the
AF_XDP socket created by ODP.
[1] https://www.kernel.org/doc/Documentation/networking/af_xdp.rst
Signed-off-by: Tuomas Taipale <tuomas.taipale@nokia.com>
Reviewed-by: Matias Elo <matias.elo@nokia.com>
-rw-r--r-- | DEPENDENCIES | 66 | ||||
-rw-r--r-- | include/odp/autoheader_internal.h.in | 3 | ||||
-rw-r--r-- | platform/linux-generic/Makefile.am | 3 | ||||
-rw-r--r-- | platform/linux-generic/include/odp_packet_internal.h | 6 | ||||
-rw-r--r-- | platform/linux-generic/include/odp_packet_io_internal.h | 3 | ||||
-rw-r--r-- | platform/linux-generic/libodp-linux.pc.in | 2 | ||||
-rw-r--r-- | platform/linux-generic/m4/configure.m4 | 3 | ||||
-rw-r--r-- | platform/linux-generic/m4/odp_xdp.m4 | 15 | ||||
-rw-r--r-- | platform/linux-generic/odp_pool_mem_src_ops.c | 4 | ||||
-rw-r--r-- | platform/linux-generic/pktio/io_ops.c | 3 | ||||
-rw-r--r-- | platform/linux-generic/pktio/socket_xdp.c | 688 |
11 files changed, 793 insertions, 3 deletions
diff --git a/DEPENDENCIES b/DEPENDENCIES index 85a4cf242..b18279da2 100644 --- a/DEPENDENCIES +++ b/DEPENDENCIES @@ -291,6 +291,72 @@ Prerequisites for building the OpenDataPlane (ODP) API 1024MB of memory: $ sudo ODP_PKTIO_DPDK_PARAMS="-m 1024" ./test/performance/odp_l2fwd -i 0 -c 1 +3.6 AF_XDP socket based packet I/O support (optional) + + Use AF_XDP socket for packet I/O. At the moment, only zero-copy variant is + supported, requiring a kernel version 5.4 or higher. Additionally, if packet + pools are to be shared between packet I/Os, kernel version of 5.10 or higher + is required. + + More information about XDP and AF_XDP can be found here: + https://www.kernel.org/doc/Documentation/networking/af_xdp.rst + + The status of the implementation is **experimental** and may cause issues + e.g. with some packet length, packet segment length and pool size + combinations that would otherwise conform to reported capabilities. + +3.6.1 AF_XDP socket packet I/O requirements + + AF_XDP socket packet I/O implementation requires libxdp and libbpf libraries. + They can be fetched from XDP-project in GitHub: + + $ git clone https://github.com/xdp-project/xdp-tools + + (Contains submodules which should be cloned as well.) + + Additional packages might be needed to be installed as well: llvm-dev and + gcc-multilib. + + $ ./configure + $ make + + After building, libraries should be installed. + + $ cd <path to built libxdp> + $ make install + $ cd <path to built libbpf> + $ make install + +3.6.2 Build ODP with AF_XDP socket packet I/O support + + After building and installing libxdp and libbpf, ODP can be configured to be + built with AF_XDP support (pass PKG_CONFIG_PATH if needed). + + $ ./configure --enable-xdp + +3.6.3 Running ODP with AF_XDP socket packet I/O + + At the moment, each AF_XDP socket packet I/O binds to a single TRX queue, + this means that NIC(s) of the environment have to be configured accordingly. 
+ + $ ethtool -L <if name> combined 1 + + Additionally, with some NICs (e.g. Mellanox), when zero-copy XDP is in use, + the queue configuration is adjusted by the NIC with additional queues on top + of the configured single TRX queue. This requires a forwarding rule: + + $ ethtool -N <if name> flow-type ether dst <mac of if> action 1 + + Which queue to bind to in a given interface can be controlled with an + environment variable when starting an ODP executable: + + $ ODP_PKTIO_XDP_PARAMS="<if name>:<queue index> <if name>:<queue index> ..." ./<odp executable> ... + + parameter being a string of interface-queue index pairs, where interface and + queue is separated by a colon and pairs separated by a whitespace. If no + environment variable is passed, zero (0) queue is chosen for all AF_XDP + interfaces. + 4.0 Packages needed to build API tests CUnit test framework version 2.1-3 is required diff --git a/include/odp/autoheader_internal.h.in b/include/odp/autoheader_internal.h.in index 952675fb5..33d9f280f 100644 --- a/include/odp/autoheader_internal.h.in +++ b/include/odp/autoheader_internal.h.in @@ -32,4 +32,7 @@ /* Define to 1 to enable OpenSSL random data */ #undef _ODP_OPENSSL_RAND +/* Define to 1 to enable XDP support */ +#undef _ODP_PKTIO_XDP + #endif diff --git a/platform/linux-generic/Makefile.am b/platform/linux-generic/Makefile.am index d76dd81e1..6e64df740 100644 --- a/platform/linux-generic/Makefile.am +++ b/platform/linux-generic/Makefile.am @@ -17,6 +17,7 @@ AM_CPPFLAGS += $(NETMAP_CPPFLAGS) AM_CFLAGS += $(AARCH64CRYPTO_CFLAGS) AM_CFLAGS += $(DPDK_CFLAGS) AM_CFLAGS += $(LIBCONFIG_CFLAGS) +AM_CFLAGS += $(LIBXDP_CFLAGS) DISTCLEANFILES = include/odp_libconfig_config.h include/odp_libconfig_config.h: $(top_builddir)/$(rel_default_config_path) $(top_builddir)/config.status @@ -256,6 +257,7 @@ __LIB__libodp_linux_la_SOURCES = \ pktio/pktio_common.c \ pktio/socket.c \ pktio/socket_mmap.c \ + pktio/socket_xdp.c \ pktio/tap.c if WITH_OPENSSL_CRYPTO @@ -418,6 
+420,7 @@ __LIB__libodp_linux_la_LIBADD += $(LIBCONFIG_LIBS) __LIB__libodp_linux_la_LIBADD += $(DPDK_LIBS_LIBODP) __LIB__libodp_linux_la_LIBADD += $(PTHREAD_LIBS) __LIB__libodp_linux_la_LIBADD += $(TIMER_LIBS) +__LIB__libodp_linux_la_LIBADD += $(LIBXDP_LIBS) if ODP_PKTIO_PCAP __LIB__libodp_linux_la_LIBADD += $(PCAP_LIBS) diff --git a/platform/linux-generic/include/odp_packet_internal.h b/platform/linux-generic/include/odp_packet_internal.h index c8dade24c..9a6bc3254 100644 --- a/platform/linux-generic/include/odp_packet_internal.h +++ b/platform/linux-generic/include/odp_packet_internal.h @@ -35,6 +35,7 @@ extern "C" { #include <odp_ipsec_internal.h> #include <odp_pool_internal.h> #include <odp_queue_if.h> +#include <odp_config_internal.h> #include <stdint.h> #include <string.h> @@ -151,6 +152,9 @@ typedef struct ODP_ALIGNED_CACHE odp_packet_hdr_t { /* LSO profile index */ uint8_t lso_profile_idx; + /* Pktio where packet is used as a memory source */ + uint8_t ms_pktio_idx; + union { /* Result for crypto packet op */ odp_crypto_packet_result_t crypto_op_result; @@ -171,6 +175,8 @@ typedef struct ODP_ALIGNED_CACHE odp_packet_hdr_t { * grow over 256 bytes. 
*/ ODP_STATIC_ASSERT(sizeof(odp_packet_hdr_t) <= 256, "PACKET_HDR_SIZE_ERROR"); +ODP_STATIC_ASSERT(ODP_CONFIG_PKTIO_ENTRIES < UINT8_MAX, "MS_PKTIO_IDX_SIZE_ERROR"); + /** * Return the packet header */ diff --git a/platform/linux-generic/include/odp_packet_io_internal.h b/platform/linux-generic/include/odp_packet_io_internal.h index a8697c069..ca9f083da 100644 --- a/platform/linux-generic/include/odp_packet_io_internal.h +++ b/platform/linux-generic/include/odp_packet_io_internal.h @@ -70,7 +70,7 @@ struct pktio_if_ops; #elif defined(_ODP_PKTIO_DPDK) #define PKTIO_PRIVATE_SIZE 5632 #else -#define PKTIO_PRIVATE_SIZE 384 +#define PKTIO_PRIVATE_SIZE 512 #endif struct pktio_entry { @@ -308,6 +308,7 @@ static inline void _odp_pktio_tx_ts_set(pktio_entry_t *entry) extern const pktio_if_ops_t _odp_netmap_pktio_ops; extern const pktio_if_ops_t _odp_dpdk_pktio_ops; +extern const pktio_if_ops_t _odp_sock_xdp_pktio_ops; extern const pktio_if_ops_t _odp_sock_mmsg_pktio_ops; extern const pktio_if_ops_t _odp_sock_mmap_pktio_ops; extern const pktio_if_ops_t _odp_loopback_pktio_ops; diff --git a/platform/linux-generic/libodp-linux.pc.in b/platform/linux-generic/libodp-linux.pc.in index 28c7ac49c..f9a339fb8 100644 --- a/platform/linux-generic/libodp-linux.pc.in +++ b/platform/linux-generic/libodp-linux.pc.in @@ -8,5 +8,5 @@ Description: The ODP packet processing engine Version: @PKGCONFIG_VERSION@ Requires.private: libconfig@AARCH64CRYPTO_PKG@ Libs: -L${libdir} -l@ODP_LIB_NAME@ @ATOMIC_LIBS_NON_ABI_COMPAT@ -Libs.private: @OPENSSL_STATIC_LIBS@ @DPDK_LIBS@ @PCAP_LIBS@ @PTHREAD_LIBS@ @TIMER_LIBS@ -lpthread @ATOMIC_LIBS_ABI_COMPAT@ +Libs.private: @OPENSSL_STATIC_LIBS@ @DPDK_LIBS@ @PCAP_LIBS@ @PTHREAD_LIBS@ @TIMER_LIBS@ @LIBXDP_LIBS@ -lpthread @ATOMIC_LIBS_ABI_COMPAT@ Cflags: -I${includedir} diff --git a/platform/linux-generic/m4/configure.m4 b/platform/linux-generic/m4/configure.m4 index 4ec623a5d..4f3365ea6 100644 --- a/platform/linux-generic/m4/configure.m4 +++ 
b/platform/linux-generic/m4/configure.m4 @@ -28,9 +28,10 @@ m4_include([platform/linux-generic/m4/odp_crypto.m4]) m4_include([platform/linux-generic/m4/odp_pcapng.m4]) m4_include([platform/linux-generic/m4/odp_netmap.m4]) m4_include([platform/linux-generic/m4/odp_dpdk.m4]) +m4_include([platform/linux-generic/m4/odp_xdp.m4]) ODP_SCHEDULER -AS_VAR_APPEND([PLAT_DEP_LIBS], ["${ATOMIC_LIBS} ${AARCH64CRYPTO_LIBS} ${LIBCONFIG_LIBS} ${OPENSSL_LIBS} ${DPDK_LIBS_LT} ${LIBCLI_LIBS}"]) +AS_VAR_APPEND([PLAT_DEP_LIBS], ["${ATOMIC_LIBS} ${AARCH64CRYPTO_LIBS} ${LIBCONFIG_LIBS} ${OPENSSL_LIBS} ${DPDK_LIBS_LT} ${LIBCLI_LIBS} ${LIBXDP_LIBS}"]) # Add text to the end of configure with platform specific settings. # Make sure it's aligned same as other lines in configure.ac. diff --git a/platform/linux-generic/m4/odp_xdp.m4 b/platform/linux-generic/m4/odp_xdp.m4 new file mode 100644 index 000000000..2c6179df9 --- /dev/null +++ b/platform/linux-generic/m4/odp_xdp.m4 @@ -0,0 +1,15 @@ +########################################################################## +# Check for libxdp availability +########################################################################## +AC_ARG_ENABLE([xdp], AS_HELP_STRING([--enable-xdp], + [enable experimental XDP support for Packet I/O [default=disabled] (linux-generic)])) + +AS_IF([test "x$enable_xdp" = "xyes"], [ + PKG_CHECK_MODULES([LIBXDP], [libxdp], + [ + AC_DEFINE(_ODP_PKTIO_XDP, [1], [Define to 1 to enable xdp packet I/O support]) + ], + [ + AS_IF([test "x$enable_xdp" == "xyes"], [AC_MSG_ERROR([libxdp not found])]) + ]) +]) diff --git a/platform/linux-generic/odp_pool_mem_src_ops.c b/platform/linux-generic/odp_pool_mem_src_ops.c index d9b810a6a..2f8dc2078 100644 --- a/platform/linux-generic/odp_pool_mem_src_ops.c +++ b/platform/linux-generic/odp_pool_mem_src_ops.c @@ -8,11 +8,15 @@ #include <odp_pool_internal.h> extern const _odp_pool_mem_src_ops_t _odp_pool_dpdk_mem_src_ops; +extern const _odp_pool_mem_src_ops_t _odp_pool_sock_xdp_mem_src_ops; /* List 
of available ODP packet pool memory source operations. Array must be NULL terminated */ const _odp_pool_mem_src_ops_t * const _odp_pool_mem_src_ops[] = { #ifdef _ODP_PKTIO_DPDK &_odp_pool_dpdk_mem_src_ops, #endif +#ifdef _ODP_PKTIO_XDP + &_odp_pool_sock_xdp_mem_src_ops, +#endif NULL }; diff --git a/platform/linux-generic/pktio/io_ops.c b/platform/linux-generic/pktio/io_ops.c index b5a08b58a..f9ea89f71 100644 --- a/platform/linux-generic/pktio/io_ops.c +++ b/platform/linux-generic/pktio/io_ops.c @@ -16,6 +16,9 @@ const pktio_if_ops_t * const _odp_pktio_if_ops[] = { #ifdef _ODP_PKTIO_DPDK &_odp_dpdk_pktio_ops, #endif +#ifdef _ODP_PKTIO_XDP + &_odp_sock_xdp_pktio_ops, +#endif #ifdef _ODP_PKTIO_NETMAP &_odp_netmap_pktio_ops, #endif diff --git a/platform/linux-generic/pktio/socket_xdp.c b/platform/linux-generic/pktio/socket_xdp.c new file mode 100644 index 000000000..e43e4bf89 --- /dev/null +++ b/platform/linux-generic/pktio/socket_xdp.c @@ -0,0 +1,688 @@ +/* Copyright (c) 2022, Nokia + * All rights reserved. 
+ * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#include <odp/autoheader_internal.h> + +#ifdef _ODP_PKTIO_XDP + +#include <odp_posix_extensions.h> +#include <odp/api/debug.h> +#include <odp/api/hints.h> +#include <odp/api/system_info.h> +#include <odp/api/ticketlock.h> + +#include <odp_debug_internal.h> +#include <odp_macros_internal.h> +#include <odp_packet_io_internal.h> +#include <odp_packet_internal.h> +#include <odp_parse_internal.h> +#include <odp_classification_internal.h> +#include <odp_socket_common.h> + +#include <string.h> +#include <errno.h> +#include <sys/socket.h> +#include <unistd.h> +#include <poll.h> + +#include <xdp/xsk.h> + +#define NUM_XDP_DESCS 1024U +#define MIN_FRAME_SIZE 2048U +#define IF_DELIM " " +#define Q_DELIM ':' + +typedef struct { + struct xsk_ring_prod fill_q; + struct xsk_ring_cons compl_q; + struct xsk_umem *umem; + pool_t *pool; +} xdp_umem_info_t; + +typedef struct { + struct xsk_ring_cons rx; + struct xsk_ring_cons compl_q; + struct xsk_ring_prod tx; + struct xsk_ring_prod fill_q; + xdp_umem_info_t *umem_info; + struct xsk_socket *xsk; + int pktio_idx; + int helper_sock; + uint32_t mtu; + uint32_t max_mtu; +} xdp_sock_info_t; + +typedef struct { + odp_ticketlock_t rx_lock ODP_ALIGNED_CACHE; + odp_ticketlock_t tx_lock ODP_ALIGNED_CACHE; + xdp_sock_info_t sock_info; +} pkt_xdp_t; + +typedef struct { + odp_packet_hdr_t *pkt_hdr; + odp_packet_t pkt; + uint8_t *data; + uint32_t len; +} pkt_data_t; + +ODP_STATIC_ASSERT(PKTIO_PRIVATE_SIZE >= sizeof(pkt_xdp_t), + "PKTIO_PRIVATE_SIZE too small"); + +static odp_bool_t disable_pktio; + +static int sock_xdp_init_global(void) +{ + if (getenv("ODP_PKTIO_DISABLE_SOCKET_XDP")) { + ODP_PRINT("PKTIO: socket xdp skipped," + " enabled export ODP_PKTIO_DISABLE_SOCKET_XDP=1.\n"); + disable_pktio = true; + } else { + ODP_PRINT("PKTIO: initialized socket xdp," + " use export ODP_PKTIO_DISABLE_SOCKET_XDP=1 to disable.\n"); + } + + return 0; +} + +static inline pkt_xdp_t *pkt_priv(pktio_entry_t 
*pktio_entry) +{ + return (pkt_xdp_t *)(uintptr_t)(pktio_entry->s.pkt_priv); +} + +static void fill_socket_config(struct xsk_socket_config *config) +{ + config->rx_size = NUM_XDP_DESCS; + config->tx_size = NUM_XDP_DESCS; + config->libxdp_flags = 0U; + config->xdp_flags = 0U; + config->bind_flags = XDP_ZEROCOPY; /* TODO: XDP_COPY */ +} + +static uint32_t get_bind_queue_index(const char *devname) +{ + const char *param = getenv("ODP_PKTIO_XDP_PARAMS"); + char *tmp_str; + char *tmp; + char *if_str; + int idx = 0; + + if (param == NULL) + goto out; + + tmp_str = strdup(param); + + if (tmp_str == NULL) + goto out; + + tmp = strtok(tmp_str, IF_DELIM); + + if (tmp == NULL) + goto out_str; + + while (tmp) { + if_str = strchr(tmp, Q_DELIM); + + if (if_str != NULL && if_str != &tmp[strlen(tmp) - 1U]) { + if (strncmp(devname, tmp, (uint64_t)(uintptr_t)(if_str - tmp)) == 0) { + idx = _ODP_MAX(atoi(++if_str), 0); + break; + } + } + + tmp = strtok(NULL, IF_DELIM); + } + +out_str: + free(tmp_str); + +out: + return idx; +} + +static odp_bool_t reserve_fill_queue_elements(xdp_sock_info_t *sock_info, int num) +{ + pool_t *pool; + odp_packet_t packets[num]; + int count; + struct xsk_ring_prod *fill_q; + uint32_t start_idx; + int pktio_idx; + uint32_t block_size; + odp_packet_hdr_t *pkt_hdr; + + pool = sock_info->umem_info->pool; + count = odp_packet_alloc_multi(pool->pool_hdl, sock_info->mtu, packets, num); + + if (count <= 0) + return false; + + fill_q = &sock_info->fill_q; + + if (xsk_ring_prod__reserve(fill_q, count, &start_idx) == 0U) { + odp_packet_free_multi(packets, count); + return false; + } + + pktio_idx = sock_info->pktio_idx; + block_size = pool->block_size; + + for (int i = 0; i < count; ++i) { + pkt_hdr = packet_hdr(packets[i]); + pkt_hdr->ms_pktio_idx = pktio_idx; + *xsk_ring_prod__fill_addr(fill_q, start_idx++) = + pkt_hdr->event_hdr.index.event * block_size; + } + + xsk_ring_prod__submit(&sock_info->fill_q, count); + + return true; +} + +static int 
sock_xdp_open(odp_pktio_t pktio, pktio_entry_t *pktio_entry, const char *devname, + odp_pool_t pool_hdl) +{ + pkt_xdp_t *priv; + pool_t *pool; + struct xsk_socket_config config; + uint32_t bind_q; + int ret; + + if (disable_pktio) + return -1; + + priv = pkt_priv(pktio_entry); + memset(priv, 0, sizeof(pkt_xdp_t)); + pool = pool_entry_from_hdl(pool_hdl); + priv->sock_info.umem_info = (xdp_umem_info_t *)pool->mem_src_data; + priv->sock_info.xsk = NULL; + /* Mark transitory kernel-owned packets with the pktio index, so that they can be freed on + * close. */ + priv->sock_info.pktio_idx = 1 + odp_pktio_index(pktio); + fill_socket_config(&config); + bind_q = get_bind_queue_index(devname); + /* With xsk_socket__create_shared(), as only one bind queue index can + * be passed, NIC in use needs to be configured accordingly to have + * only a single combined TX-RX queue, otherwise traffic may not end up + * on the socket. For now, always bind to the first queue (overridable + * with environment variable). */ + ret = xsk_socket__create_shared(&priv->sock_info.xsk, devname, bind_q, + priv->sock_info.umem_info->umem, &priv->sock_info.rx, + &priv->sock_info.tx, &priv->sock_info.fill_q, + &priv->sock_info.compl_q, &config); + + if (ret) { + ODP_ERR("Error creating xdp socket for bind queue %u: %d\n", bind_q, ret); + goto xsk_err; + } + + /* Ring setup/clean up routines seem to be asynchronous with some drivers and might not be + * ready yet after xsk_socket__create_shared(). */ + sleep(1U); + + /* Querying with ioctl() via AF_XDP socket doesn't seem to work, so + * create a helper socket for this. 
*/ + priv->sock_info.helper_sock = -1; + ret = socket(AF_INET, SOCK_DGRAM, 0); + + if (ret == -1) { + ODP_ERR("Error creating helper socket for xdp: %s\n", strerror(errno)); + goto sock_err; + } + + priv->sock_info.helper_sock = ret; + priv->sock_info.mtu = _odp_mtu_get_fd(priv->sock_info.helper_sock, devname); + + if (priv->sock_info.mtu == 0U) + goto res_err; + + priv->sock_info.max_mtu = pool->seg_len; + + if (!reserve_fill_queue_elements(&priv->sock_info, config.rx_size)) { + ODP_ERR("Unable to reserve fill queue descriptors.\n"); + goto res_err; + } + + odp_ticketlock_init(&priv->rx_lock); + odp_ticketlock_init(&priv->tx_lock); + + return 0; + +res_err: + close(priv->sock_info.helper_sock); + priv->sock_info.helper_sock = -1; + +sock_err: + xsk_socket__delete(priv->sock_info.xsk); + priv->sock_info.xsk = NULL; + +xsk_err: + return -1; +} + +static int sock_xdp_close(pktio_entry_t *pktio_entry) +{ + pkt_xdp_t *priv = pkt_priv(pktio_entry); + pool_t *pool = priv->sock_info.umem_info->pool; + odp_packet_hdr_t *pkt_hdr; + + if (priv->sock_info.helper_sock != -1) + close(priv->sock_info.helper_sock); + + if (priv->sock_info.xsk != NULL) + xsk_socket__delete(priv->sock_info.xsk); + + /* Ring setup/clean up routines seem to be asynchronous with some drivers and might not be + * ready yet after xsk_socket__delete(). */ + sleep(1U); + + /* Free all packets that were in fill or completion queues at the time of closing. */ + for (uint32_t i = 0U; i < pool->num + pool->skipped_blocks; ++i) { + pkt_hdr = packet_hdr(packet_from_event_hdr(event_hdr_from_index(pool, i))); + + if (pkt_hdr->ms_pktio_idx == priv->sock_info.pktio_idx) { + pkt_hdr->ms_pktio_idx = 0U; + odp_packet_free(packet_handle(pkt_hdr)); + } + } + + return 0; +} + +static inline void extract_data(const struct xdp_desc *rx_desc, uint8_t *pool_base_addr, + pkt_data_t *pkt_data) +{ + uint64_t frame_off; + uint64_t pkt_off; + + /* UMEM "addresses" are offsets from start of a registered UMEM area. 
+ * Additionally, the packet data offset (where received packet data + * starts within a UMEM frame) is encoded to the UMEM address with + * XSK_UNALIGNED_BUF_OFFSET_SHIFT left bitshift when XDP_ZEROCOPY and + * XDP_UMEM_UNALIGNED_CHUNK_FLAG are enabled. */ + frame_off = rx_desc->addr; + pkt_off = xsk_umem__add_offset_to_addr(frame_off); + frame_off = xsk_umem__extract_addr(frame_off); + pkt_data->pkt_hdr = xsk_umem__get_data(pool_base_addr, frame_off); + pkt_data->pkt = packet_handle(pkt_data->pkt_hdr); + pkt_data->data = xsk_umem__get_data(pool_base_addr, pkt_off); + pkt_data->len = rx_desc->len; +} + +static uint32_t process_received(pktio_entry_t *pktio_entry, xdp_sock_info_t *sock_info, + uint32_t start_idx, odp_packet_t packets[], int num) +{ + pkt_data_t pkt_data; + struct xsk_ring_cons *rx = &sock_info->rx; + uint8_t *base_addr = sock_info->umem_info->pool->base_addr; + const odp_proto_layer_t layer = pktio_entry->s.parse_layer; + const odp_proto_chksums_t in_chksums = pktio_entry->s.in_chksums; + const odp_pktin_config_opt_t opt = pktio_entry->s.config.pktin; + uint64_t l4_part_sum = 0U; + odp_pool_t *pool_hdl = &sock_info->umem_info->pool->pool_hdl; + odp_pktio_t pktio_hdl = pktio_entry->s.handle; + uint32_t num_rx = 0U; + + for (int i = 0; i < num; ++i) { + extract_data(xsk_ring_cons__rx_desc(rx, start_idx++), base_addr, &pkt_data); + pkt_data.pkt_hdr->ms_pktio_idx = 0U; + packet_init(pkt_data.pkt_hdr, pkt_data.len); + + if (layer) { + if (_odp_packet_parse_common(&pkt_data.pkt_hdr->p, pkt_data.data, + pkt_data.len, pkt_data.len, + layer, in_chksums, &l4_part_sum, opt) < 0) { + odp_packet_free(pkt_data.pkt); + continue; + } + + if (pktio_cls_enabled(pktio_entry) && + _odp_cls_classify_packet(pktio_entry, pkt_data.data, pool_hdl, + pkt_data.pkt_hdr)) { + odp_packet_free(pkt_data.pkt); + continue; + } + } + + pkt_data.pkt_hdr->seg_data = pkt_data.data; + pkt_data.pkt_hdr->event_hdr.base_data = pkt_data.data; + pkt_data.pkt_hdr->input = pktio_hdl; + 
packets[num_rx++] = pkt_data.pkt; + } + + return num_rx; +} + +static int sock_xdp_recv(pktio_entry_t *pktio_entry, int index ODP_UNUSED, odp_packet_t packets[], + int num) +{ + pkt_xdp_t *priv; + struct pollfd fd; + uint32_t start_idx = 0U, recvd, procd; + + priv = pkt_priv(pktio_entry); + odp_ticketlock_lock(&priv->rx_lock); + + if (odp_unlikely(xsk_ring_prod__needs_wakeup(&priv->sock_info.fill_q))) { + fd.fd = xsk_socket__fd(priv->sock_info.xsk); + fd.events = POLLIN; + (void)poll(&fd, 1U, 0); + } + + recvd = xsk_ring_cons__peek(&priv->sock_info.rx, num, &start_idx); + + if (recvd == 0U) { + odp_ticketlock_unlock(&priv->rx_lock); + return 0; + } + + procd = process_received(pktio_entry, &priv->sock_info, start_idx, packets, recvd); + xsk_ring_cons__release(&priv->sock_info.rx, recvd); + (void)reserve_fill_queue_elements(&priv->sock_info, recvd); + odp_ticketlock_unlock(&priv->rx_lock); + + return procd; +} + +static inline void populate_tx_desc(pool_t *pool, odp_packet_hdr_t *pkt_hdr, + struct xdp_desc *tx_desc) +{ + uint64_t frame_off; + uint64_t pkt_off; + + frame_off = pkt_hdr->event_hdr.index.event * pool->block_size; + pkt_off = (uint64_t)(uintptr_t)pkt_hdr->event_hdr.base_data + - (uint64_t)(uintptr_t)pool->base_addr - frame_off; + pkt_off <<= XSK_UNALIGNED_BUF_OFFSET_SHIFT; + tx_desc->addr = frame_off | pkt_off; + tx_desc->len = pkt_hdr->frame_len; +} + +static void handle_pending_tx(xdp_sock_info_t *sock_info, int num) +{ + struct xsk_ring_cons *compl_q; + uint32_t sent; + uint8_t *base_addr; + uint32_t start_idx; + uint64_t frame_off; + odp_packet_t pkt; + + if (odp_unlikely(xsk_ring_prod__needs_wakeup(&sock_info->tx))) + (void)sendto(xsk_socket__fd(sock_info->xsk), NULL, 0U, MSG_DONTWAIT, NULL, 0U); + + compl_q = &sock_info->compl_q; + sent = xsk_ring_cons__peek(compl_q, num, &start_idx); + base_addr = sock_info->umem_info->pool->base_addr; + + odp_packet_t packets[sent]; + + if (sent) { + for (uint32_t i = 0U; i < sent; ++i) { + frame_off = 
*xsk_ring_cons__comp_addr(compl_q, start_idx++); + frame_off = xsk_umem__extract_addr(frame_off); + pkt = xsk_umem__get_data(base_addr, frame_off); + packets[i] = pkt; + packet_hdr(packets[i])->ms_pktio_idx = 0U; + } + + odp_packet_free_multi(packets, sent); + xsk_ring_cons__release(compl_q, sent); + } +} + +static int sock_xdp_send(pktio_entry_t *pktio_entry, int index ODP_UNUSED, + const odp_packet_t packets[], int num) +{ + pkt_xdp_t *priv; + xdp_sock_info_t *sock_info; + pool_t *pool; + odp_pool_t pool_hdl; + int pktio_idx, i; + struct xsk_ring_prod *tx; + odp_packet_t pkt; + odp_packet_hdr_t *pkt_hdr; + uint32_t start_idx; + + if (odp_unlikely(num == 0)) + return 0; + + priv = pkt_priv(pktio_entry); + odp_ticketlock_lock(&priv->tx_lock); + sock_info = &priv->sock_info; + pool = sock_info->umem_info->pool; + pool_hdl = pool->pool_hdl; + pktio_idx = sock_info->pktio_idx; + tx = &sock_info->tx; + + for (i = 0; i < num; ++i) { + pkt = ODP_PACKET_INVALID; + + if (odp_unlikely(odp_packet_num_segs(packets[i])) > 1) { + /* TODO: handle segmented packets */ + ODP_ERR("Only single-segment packets supported\n"); + break; + } + + pkt_hdr = packet_hdr(packets[i]); + + if (pkt_hdr->event_hdr.pool_ptr != pool) { + pkt = odp_packet_copy(packets[i], pool_hdl); + + if (odp_unlikely(pkt == ODP_PACKET_INVALID)) + break; + + pkt_hdr = packet_hdr(pkt); + } + + if (xsk_ring_prod__reserve(tx, 1U, &start_idx) == 0U) { + handle_pending_tx(sock_info, NUM_XDP_DESCS); + + if (xsk_ring_prod__reserve(tx, 1U, &start_idx) == 0U) { + if (pkt != ODP_PACKET_INVALID) + odp_packet_free(pkt); + + break; + } + } + + if (pkt != ODP_PACKET_INVALID) + odp_packet_free(packets[i]); + + pkt_hdr->ms_pktio_idx = pktio_idx; + populate_tx_desc(pool, pkt_hdr, xsk_ring_prod__tx_desc(tx, start_idx)); + } + + xsk_ring_prod__submit(tx, i); + handle_pending_tx(sock_info, NUM_XDP_DESCS); + odp_ticketlock_unlock(&priv->tx_lock); + + return i; +} + +static uint32_t sock_xdp_mtu_get(pktio_entry_t *pktio_entry) +{ + 
return pkt_priv(pktio_entry)->sock_info.mtu; +} + +static int sock_xdp_mtu_set(pktio_entry_t *pktio_entry, uint32_t maxlen_input, + uint32_t maxlen_output ODP_UNUSED) +{ + pkt_xdp_t *priv = pkt_priv(pktio_entry); + int ret; + + ret = _odp_mtu_set_fd(priv->sock_info.helper_sock, pktio_entry->s.name, maxlen_input); + if (ret) + return ret; + + priv->sock_info.mtu = maxlen_input; + + return 0; +} + +static int sock_xdp_promisc_mode_set(pktio_entry_t *pktio_entry, int enable) +{ + return _odp_promisc_mode_set_fd(pkt_priv(pktio_entry)->sock_info.helper_sock, + pktio_entry->s.name, enable); +} + +static int sock_xdp_promisc_mode_get(pktio_entry_t *pktio_entry) +{ + return _odp_promisc_mode_get_fd(pkt_priv(pktio_entry)->sock_info.helper_sock, + pktio_entry->s.name); +} + +static int sock_xdp_mac_addr_get(pktio_entry_t *pktio_entry ODP_UNUSED, void *mac_addr) +{ + return _odp_mac_addr_get_fd(pkt_priv(pktio_entry)->sock_info.helper_sock, + pktio_entry->s.name, mac_addr) ? -1 : ETH_ALEN; +} + +static int sock_xdp_link_status(pktio_entry_t *pktio_entry) +{ + return _odp_link_status_fd(pkt_priv(pktio_entry)->sock_info.helper_sock, + pktio_entry->s.name); +} + +static int sock_xdp_link_info(pktio_entry_t *pktio_entry, odp_pktio_link_info_t *info) +{ + return _odp_link_info_fd(pkt_priv(pktio_entry)->sock_info.helper_sock, + pktio_entry->s.name, info); +} + +static int sock_xdp_capability(pktio_entry_t *pktio_entry, odp_pktio_capability_t *capa) +{ + pkt_xdp_t *priv = pkt_priv(pktio_entry); + + memset(capa, 0, sizeof(odp_pktio_capability_t)); + capa->max_input_queues = 1U; + capa->max_output_queues = 1U; + capa->set_op.op.promisc_mode = 1U; + capa->set_op.op.maxlen = 1U; + + capa->maxlen.equal = true; + capa->maxlen.min_input = _ODP_SOCKET_MTU_MIN; + capa->maxlen.max_input = priv->sock_info.max_mtu; + capa->maxlen.min_output = _ODP_SOCKET_MTU_MIN; + capa->maxlen.max_output = priv->sock_info.max_mtu; + + capa->config.parser.layer = ODP_PROTO_LAYER_ALL; + + 
capa->stats.pktio.all_counters = 0U; + capa->stats.pktin_queue.all_counters = 0U; + capa->stats.pktout_queue.all_counters = 0U; + + return 0; +} + +const pktio_if_ops_t _odp_sock_xdp_pktio_ops = { + /* TODO: at least stats */ + .name = "socket_xdp", + .print = NULL, + .init_global = sock_xdp_init_global, + .init_local = NULL, + .term = NULL, + .open = sock_xdp_open, + .close = sock_xdp_close, + .start = NULL, + .stop = NULL, + .stats = NULL, + .stats_reset = NULL, + .pktin_queue_stats = NULL, + .pktout_queue_stats = NULL, + .extra_stat_info = NULL, + .extra_stats = NULL, + .extra_stat_counter = NULL, + .pktio_ts_res = NULL, + .pktio_ts_from_ns = NULL, + .pktio_time = NULL, + .recv = sock_xdp_recv, + .recv_tmo = NULL, + .recv_mq_tmo = NULL, + .fd_set = NULL, + .send = sock_xdp_send, + .maxlen_get = sock_xdp_mtu_get, + .maxlen_set = sock_xdp_mtu_set, + .promisc_mode_set = sock_xdp_promisc_mode_set, + .promisc_mode_get = sock_xdp_promisc_mode_get, + .mac_get = sock_xdp_mac_addr_get, + .mac_set = NULL, + .link_status = sock_xdp_link_status, + .link_info = sock_xdp_link_info, + .capability = sock_xdp_capability, + .config = NULL, + .input_queues_config = NULL, + .output_queues_config = NULL +}; + +static odp_bool_t sock_xdp_is_mem_src_active(void) +{ + return !disable_pktio; +} + +static void sock_xdp_force_mem_src_disable(void) +{ + disable_pktio = true; +} + +static void sock_xdp_adjust_block_size(uint8_t *data ODP_UNUSED, uint32_t *block_size, + uint32_t *block_offset ODP_UNUSED, uint32_t *flags) +{ + const uint32_t size = *block_size + XDP_PACKET_HEADROOM; + const uint64_t ps = odp_sys_page_size(); + /* AF_XDP requires frames to be between 2kB and page size, so with + * XDP_ZEROCOPY, if block size is less than 2kB, adjust it to 2kB, if + * it is larger than page size, make pool creation fail. 
*/ + if (disable_pktio) + return; + + if (size > ps) { + ODP_ERR("Adjusted pool block size larger than page size: %u > %" PRIu64 "\n", + size, ps); + *block_size = 0U; + } + + *flags |= ODP_SHM_HP; + *block_size = _ODP_MAX(size, MIN_FRAME_SIZE); +} + +static int sock_xdp_umem_create(uint8_t *data, pool_t *pool) +{ + struct xsk_umem_config cfg; + xdp_umem_info_t *umem_info = (xdp_umem_info_t *)data; + + umem_info->pool = pool; + /* Fill queue size is recommended to be >= HW RX ring size + AF_XDP RX + * ring size, so use size twice the size of AF_XDP RX ring. */ + cfg.fill_size = NUM_XDP_DESCS * 2U; /* TODO: num descs vs pool size */ + cfg.comp_size = NUM_XDP_DESCS; + cfg.frame_size = pool->block_size; + cfg.frame_headroom = sizeof(odp_packet_hdr_t) + pool->headroom; + cfg.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG; + + return xsk_umem__create(&umem_info->umem, pool->base_addr, pool->shm_size, + &umem_info->fill_q, &umem_info->compl_q, &cfg); +} + +static void sock_xdp_umem_delete(uint8_t *data) +{ + xdp_umem_info_t *umem_info = (xdp_umem_info_t *)data; + + while (xsk_umem__delete(umem_info->umem) == -EBUSY) + continue; +} + +const _odp_pool_mem_src_ops_t _odp_pool_sock_xdp_mem_src_ops = { + .name = "xdp_zc", + .is_active = sock_xdp_is_mem_src_active, + .force_disable = sock_xdp_force_mem_src_disable, + .adjust_size = sock_xdp_adjust_block_size, + .bind = sock_xdp_umem_create, + .unbind = sock_xdp_umem_delete +}; + +#else +/* Avoid warning about empty translation unit */ +typedef int _odp_dummy; +#endif |