aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTuomas Taipale <tuomas.taipale@nokia.com>2022-04-04 12:18:51 +0000
committerMatias Elo <matias.elo@nokia.com>2022-05-02 17:02:50 +0300
commit40e999eaeb52d34d5289b6b8d6e405f1623d8a53 (patch)
treebc7ac6aa0c7a5cd6eb14842886e9bd701267a033
parent32f878c9b291cc4f24c30819e45aa2605fcd84f4 (diff)
linux-gen: pktio: add AF_XDP socket based packet IO
Add new AF_XDP socket based packet IO implementation. Information about AF_XDP and XDP can be found in [1]. ODP packet pool(s) created for packet IO entries are used as the UMEM areas. Currently, only zero-copy mode is supported, copy-mode support will be added later. Depending on the XDP program loaded into the environment's NIC(s), TX and RX queue selection for packets may differ. By default, only a single combined queue is supported and this may require changing the configuration of the NIC accordingly so that packets end up on the AF_XDP socket created by ODP. [1] https://www.kernel.org/doc/Documentation/networking/af_xdp.rst Signed-off-by: Tuomas Taipale <tuomas.taipale@nokia.com> Reviewed-by: Matias Elo <matias.elo@nokia.com>
-rw-r--r--DEPENDENCIES66
-rw-r--r--include/odp/autoheader_internal.h.in3
-rw-r--r--platform/linux-generic/Makefile.am3
-rw-r--r--platform/linux-generic/include/odp_packet_internal.h6
-rw-r--r--platform/linux-generic/include/odp_packet_io_internal.h3
-rw-r--r--platform/linux-generic/libodp-linux.pc.in2
-rw-r--r--platform/linux-generic/m4/configure.m43
-rw-r--r--platform/linux-generic/m4/odp_xdp.m415
-rw-r--r--platform/linux-generic/odp_pool_mem_src_ops.c4
-rw-r--r--platform/linux-generic/pktio/io_ops.c3
-rw-r--r--platform/linux-generic/pktio/socket_xdp.c688
11 files changed, 793 insertions, 3 deletions
diff --git a/DEPENDENCIES b/DEPENDENCIES
index 85a4cf242..b18279da2 100644
--- a/DEPENDENCIES
+++ b/DEPENDENCIES
@@ -291,6 +291,72 @@ Prerequisites for building the OpenDataPlane (ODP) API
1024MB of memory:
$ sudo ODP_PKTIO_DPDK_PARAMS="-m 1024" ./test/performance/odp_l2fwd -i 0 -c 1
+3.6 AF_XDP socket based packet I/O support (optional)
+
+ Use AF_XDP socket for packet I/O. At the moment, only zero-copy variant is
+ supported, requiring a kernel version 5.4 or higher. Additionally, if packet
+ pools are to be shared between packet I/Os, kernel version of 5.10 or higher
+ is required.
+
+ More information about XDP and AF_XDP can be found here:
+ https://www.kernel.org/doc/Documentation/networking/af_xdp.rst
+
+ The status of the implementation is **experimental** and may cause issues
+ e.g. with some packet length, packet segment length and pool size
+ combinations that would otherwise conform to reported capabilities.
+
+3.6.1 AF_XDP socket packet I/O requirements
+
+ AF_XDP socket packet I/O implementation requires libxdp and libbpf libraries.
+ They can be fetched from XDP-project in GitHub:
+
+ $ git clone https://github.com/xdp-project/xdp-tools
+
+ (Contains submodules which should be cloned as well.)
+
+ Additional packages may also need to be installed: llvm-dev and
+ gcc-multilib.
+
+ $ ./configure
+ $ make
+
+ After building, libraries should be installed.
+
+ $ cd <path to built libxdp>
+ $ make install
+ $ cd <path to built libbpf>
+ $ make install
+
+3.6.2 Build ODP with AF_XDP socket packet I/O support
+
+ After building and installing libxdp and libbpf, ODP can be configured to be
+ built with AF_XDP support (pass PKG_CONFIG_PATH if needed).
+
+ $ ./configure --enable-xdp
+
+3.6.3 Running ODP with AF_XDP socket packet I/O
+
+ At the moment, each AF_XDP socket packet I/O binds to a single TRX queue.
+ This means that the NIC(s) of the environment have to be configured accordingly.
+
+ $ ethtool -L <if name> combined 1
+
+ Additionally, with some NICs (e.g. Mellanox), when zero-copy XDP is in use,
+ the queue configuration is adjusted by the NIC with additional queues on top
+ of the configured single TRX queue. This requires a forwarding rule:
+
+ $ ethtool -N <if name> flow-type ether dst <mac of if> action 1
+
+ Which queue to bind to in a given interface can be controlled with an
+ environment variable when starting an ODP executable:
+
+ $ ODP_PKTIO_XDP_PARAMS="<if name>:<queue index> <if name>:<queue index> ..." ./<odp executable> ...
+
+ The parameter is a string of interface-queue index pairs, where the interface
+ and the queue index are separated by a colon and the pairs are separated by
+ whitespace. If no environment variable is passed, queue zero (0) is chosen
+ for all AF_XDP interfaces.
+
4.0 Packages needed to build API tests
CUnit test framework version 2.1-3 is required
diff --git a/include/odp/autoheader_internal.h.in b/include/odp/autoheader_internal.h.in
index 952675fb5..33d9f280f 100644
--- a/include/odp/autoheader_internal.h.in
+++ b/include/odp/autoheader_internal.h.in
@@ -32,4 +32,7 @@
/* Define to 1 to enable OpenSSL random data */
#undef _ODP_OPENSSL_RAND
+/* Define to 1 to enable XDP support */
+#undef _ODP_PKTIO_XDP
+
#endif
diff --git a/platform/linux-generic/Makefile.am b/platform/linux-generic/Makefile.am
index d76dd81e1..6e64df740 100644
--- a/platform/linux-generic/Makefile.am
+++ b/platform/linux-generic/Makefile.am
@@ -17,6 +17,7 @@ AM_CPPFLAGS += $(NETMAP_CPPFLAGS)
AM_CFLAGS += $(AARCH64CRYPTO_CFLAGS)
AM_CFLAGS += $(DPDK_CFLAGS)
AM_CFLAGS += $(LIBCONFIG_CFLAGS)
+AM_CFLAGS += $(LIBXDP_CFLAGS)
DISTCLEANFILES = include/odp_libconfig_config.h
include/odp_libconfig_config.h: $(top_builddir)/$(rel_default_config_path) $(top_builddir)/config.status
@@ -256,6 +257,7 @@ __LIB__libodp_linux_la_SOURCES = \
pktio/pktio_common.c \
pktio/socket.c \
pktio/socket_mmap.c \
+ pktio/socket_xdp.c \
pktio/tap.c
if WITH_OPENSSL_CRYPTO
@@ -418,6 +420,7 @@ __LIB__libodp_linux_la_LIBADD += $(LIBCONFIG_LIBS)
__LIB__libodp_linux_la_LIBADD += $(DPDK_LIBS_LIBODP)
__LIB__libodp_linux_la_LIBADD += $(PTHREAD_LIBS)
__LIB__libodp_linux_la_LIBADD += $(TIMER_LIBS)
+__LIB__libodp_linux_la_LIBADD += $(LIBXDP_LIBS)
if ODP_PKTIO_PCAP
__LIB__libodp_linux_la_LIBADD += $(PCAP_LIBS)
diff --git a/platform/linux-generic/include/odp_packet_internal.h b/platform/linux-generic/include/odp_packet_internal.h
index c8dade24c..9a6bc3254 100644
--- a/platform/linux-generic/include/odp_packet_internal.h
+++ b/platform/linux-generic/include/odp_packet_internal.h
@@ -35,6 +35,7 @@ extern "C" {
#include <odp_ipsec_internal.h>
#include <odp_pool_internal.h>
#include <odp_queue_if.h>
+#include <odp_config_internal.h>
#include <stdint.h>
#include <string.h>
@@ -151,6 +152,9 @@ typedef struct ODP_ALIGNED_CACHE odp_packet_hdr_t {
/* LSO profile index */
uint8_t lso_profile_idx;
+ /* Pktio where packet is used as a memory source */
+ uint8_t ms_pktio_idx;
+
union {
/* Result for crypto packet op */
odp_crypto_packet_result_t crypto_op_result;
@@ -171,6 +175,8 @@ typedef struct ODP_ALIGNED_CACHE odp_packet_hdr_t {
* grow over 256 bytes. */
ODP_STATIC_ASSERT(sizeof(odp_packet_hdr_t) <= 256, "PACKET_HDR_SIZE_ERROR");
+ODP_STATIC_ASSERT(ODP_CONFIG_PKTIO_ENTRIES < UINT8_MAX, "MS_PKTIO_IDX_SIZE_ERROR");
+
/**
* Return the packet header
*/
diff --git a/platform/linux-generic/include/odp_packet_io_internal.h b/platform/linux-generic/include/odp_packet_io_internal.h
index a8697c069..ca9f083da 100644
--- a/platform/linux-generic/include/odp_packet_io_internal.h
+++ b/platform/linux-generic/include/odp_packet_io_internal.h
@@ -70,7 +70,7 @@ struct pktio_if_ops;
#elif defined(_ODP_PKTIO_DPDK)
#define PKTIO_PRIVATE_SIZE 5632
#else
-#define PKTIO_PRIVATE_SIZE 384
+#define PKTIO_PRIVATE_SIZE 512
#endif
struct pktio_entry {
@@ -308,6 +308,7 @@ static inline void _odp_pktio_tx_ts_set(pktio_entry_t *entry)
extern const pktio_if_ops_t _odp_netmap_pktio_ops;
extern const pktio_if_ops_t _odp_dpdk_pktio_ops;
+extern const pktio_if_ops_t _odp_sock_xdp_pktio_ops;
extern const pktio_if_ops_t _odp_sock_mmsg_pktio_ops;
extern const pktio_if_ops_t _odp_sock_mmap_pktio_ops;
extern const pktio_if_ops_t _odp_loopback_pktio_ops;
diff --git a/platform/linux-generic/libodp-linux.pc.in b/platform/linux-generic/libodp-linux.pc.in
index 28c7ac49c..f9a339fb8 100644
--- a/platform/linux-generic/libodp-linux.pc.in
+++ b/platform/linux-generic/libodp-linux.pc.in
@@ -8,5 +8,5 @@ Description: The ODP packet processing engine
Version: @PKGCONFIG_VERSION@
Requires.private: libconfig@AARCH64CRYPTO_PKG@
Libs: -L${libdir} -l@ODP_LIB_NAME@ @ATOMIC_LIBS_NON_ABI_COMPAT@
-Libs.private: @OPENSSL_STATIC_LIBS@ @DPDK_LIBS@ @PCAP_LIBS@ @PTHREAD_LIBS@ @TIMER_LIBS@ -lpthread @ATOMIC_LIBS_ABI_COMPAT@
+Libs.private: @OPENSSL_STATIC_LIBS@ @DPDK_LIBS@ @PCAP_LIBS@ @PTHREAD_LIBS@ @TIMER_LIBS@ @LIBXDP_LIBS@ -lpthread @ATOMIC_LIBS_ABI_COMPAT@
Cflags: -I${includedir}
diff --git a/platform/linux-generic/m4/configure.m4 b/platform/linux-generic/m4/configure.m4
index 4ec623a5d..4f3365ea6 100644
--- a/platform/linux-generic/m4/configure.m4
+++ b/platform/linux-generic/m4/configure.m4
@@ -28,9 +28,10 @@ m4_include([platform/linux-generic/m4/odp_crypto.m4])
m4_include([platform/linux-generic/m4/odp_pcapng.m4])
m4_include([platform/linux-generic/m4/odp_netmap.m4])
m4_include([platform/linux-generic/m4/odp_dpdk.m4])
+m4_include([platform/linux-generic/m4/odp_xdp.m4])
ODP_SCHEDULER
-AS_VAR_APPEND([PLAT_DEP_LIBS], ["${ATOMIC_LIBS} ${AARCH64CRYPTO_LIBS} ${LIBCONFIG_LIBS} ${OPENSSL_LIBS} ${DPDK_LIBS_LT} ${LIBCLI_LIBS}"])
+AS_VAR_APPEND([PLAT_DEP_LIBS], ["${ATOMIC_LIBS} ${AARCH64CRYPTO_LIBS} ${LIBCONFIG_LIBS} ${OPENSSL_LIBS} ${DPDK_LIBS_LT} ${LIBCLI_LIBS} ${LIBXDP_LIBS}"])
# Add text to the end of configure with platform specific settings.
# Make sure it's aligned same as other lines in configure.ac.
diff --git a/platform/linux-generic/m4/odp_xdp.m4 b/platform/linux-generic/m4/odp_xdp.m4
new file mode 100644
index 000000000..2c6179df9
--- /dev/null
+++ b/platform/linux-generic/m4/odp_xdp.m4
@@ -0,0 +1,15 @@
+##########################################################################
+# Check for libxdp availability
+##########################################################################
+AC_ARG_ENABLE([xdp], AS_HELP_STRING([--enable-xdp],
+	[enable experimental XDP support for Packet I/O [default=disabled] (linux-generic)]))
+
+# XDP packet I/O is opt-in: libxdp is probed only when --enable-xdp is given,
+# and a missing library is then a hard configure error. Note that test(1) only
+# guarantees the single '=' string comparison operator.
+AS_IF([test "x$enable_xdp" = "xyes"], [
+	PKG_CHECK_MODULES([LIBXDP], [libxdp],
+	[
+		AC_DEFINE(_ODP_PKTIO_XDP, [1], [Define to 1 to enable xdp packet I/O support])
+	],
+	[
+		AC_MSG_ERROR([libxdp not found])
+	])
+])
diff --git a/platform/linux-generic/odp_pool_mem_src_ops.c b/platform/linux-generic/odp_pool_mem_src_ops.c
index d9b810a6a..2f8dc2078 100644
--- a/platform/linux-generic/odp_pool_mem_src_ops.c
+++ b/platform/linux-generic/odp_pool_mem_src_ops.c
@@ -8,11 +8,15 @@
#include <odp_pool_internal.h>
extern const _odp_pool_mem_src_ops_t _odp_pool_dpdk_mem_src_ops;
+extern const _odp_pool_mem_src_ops_t _odp_pool_sock_xdp_mem_src_ops;
/* List of available ODP packet pool memory source operations. Array must be NULL terminated */
const _odp_pool_mem_src_ops_t * const _odp_pool_mem_src_ops[] = {
#ifdef _ODP_PKTIO_DPDK
&_odp_pool_dpdk_mem_src_ops,
#endif
+#ifdef _ODP_PKTIO_XDP
+ &_odp_pool_sock_xdp_mem_src_ops,
+#endif
NULL
};
diff --git a/platform/linux-generic/pktio/io_ops.c b/platform/linux-generic/pktio/io_ops.c
index b5a08b58a..f9ea89f71 100644
--- a/platform/linux-generic/pktio/io_ops.c
+++ b/platform/linux-generic/pktio/io_ops.c
@@ -16,6 +16,9 @@ const pktio_if_ops_t * const _odp_pktio_if_ops[] = {
#ifdef _ODP_PKTIO_DPDK
&_odp_dpdk_pktio_ops,
#endif
+#ifdef _ODP_PKTIO_XDP
+ &_odp_sock_xdp_pktio_ops,
+#endif
#ifdef _ODP_PKTIO_NETMAP
&_odp_netmap_pktio_ops,
#endif
diff --git a/platform/linux-generic/pktio/socket_xdp.c b/platform/linux-generic/pktio/socket_xdp.c
new file mode 100644
index 000000000..e43e4bf89
--- /dev/null
+++ b/platform/linux-generic/pktio/socket_xdp.c
@@ -0,0 +1,688 @@
+/* Copyright (c) 2022, Nokia
+ * All rights reserved.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <odp/autoheader_internal.h>
+
+#ifdef _ODP_PKTIO_XDP
+
+#include <odp_posix_extensions.h>
+#include <odp/api/debug.h>
+#include <odp/api/hints.h>
+#include <odp/api/system_info.h>
+#include <odp/api/ticketlock.h>
+
+#include <odp_debug_internal.h>
+#include <odp_macros_internal.h>
+#include <odp_packet_io_internal.h>
+#include <odp_packet_internal.h>
+#include <odp_parse_internal.h>
+#include <odp_classification_internal.h>
+#include <odp_socket_common.h>
+
+#include <string.h>
+#include <errno.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <poll.h>
+
+#include <xdp/xsk.h>
+
+#define NUM_XDP_DESCS 1024U /* Base descriptor count used when sizing the AF_XDP rings */
+#define MIN_FRAME_SIZE 2048U /* Lower bound applied to the UMEM frame (pool block) size */
+#define IF_DELIM " " /* Separates "<if name>:<queue index>" pairs in ODP_PKTIO_XDP_PARAMS */
+#define Q_DELIM ':' /* Separates interface name from queue index within one pair */
+
+typedef struct { /* UMEM state kept in a pool's memory source data area */
+ struct xsk_ring_prod fill_q; /* Fill ring handed to xsk_umem__create() */
+ struct xsk_ring_cons compl_q; /* Completion ring handed to xsk_umem__create() */
+ struct xsk_umem *umem; /* UMEM handle created over the pool memory */
+ pool_t *pool; /* Pool whose memory backs the UMEM */
+} xdp_umem_info_t;
+
+typedef struct { /* Per-pktio AF_XDP socket state */
+ struct xsk_ring_cons rx; /* RX ring of the socket */
+ struct xsk_ring_cons compl_q; /* TX completion ring of the socket */
+ struct xsk_ring_prod tx; /* TX ring of the socket */
+ struct xsk_ring_prod fill_q; /* RX fill ring of the socket */
+ xdp_umem_info_t *umem_info; /* UMEM info taken from the pool's mem_src_data */
+ struct xsk_socket *xsk; /* AF_XDP socket handle, NULL when not created */
+ int pktio_idx; /* 1 + pktio index; tags packets currently handed to the kernel */
+ int helper_sock; /* AF_INET datagram socket used for interface queries, -1 if none */
+ uint32_t mtu; /* Current interface MTU */
+ uint32_t max_mtu; /* Largest supported MTU, set from the pool segment length */
+} xdp_sock_info_t;
+
+typedef struct { /* Private data of an AF_XDP pktio entry */
+ odp_ticketlock_t rx_lock ODP_ALIGNED_CACHE; /* Held for the duration of sock_xdp_recv() */
+ odp_ticketlock_t tx_lock ODP_ALIGNED_CACHE; /* Held for the duration of sock_xdp_send() */
+ xdp_sock_info_t sock_info; /* Socket, rings and interface state */
+} pkt_xdp_t;
+
+typedef struct { /* One received frame as decoded by extract_data() */
+ odp_packet_hdr_t *pkt_hdr; /* Packet header located at the start of the UMEM frame */
+ odp_packet_t pkt; /* Packet handle corresponding to pkt_hdr */
+ uint8_t *data; /* Start of the received packet data within the frame */
+ uint32_t len; /* Received data length taken from the RX descriptor */
+} pkt_data_t;
+
+ODP_STATIC_ASSERT(PKTIO_PRIVATE_SIZE >= sizeof(pkt_xdp_t),
+ "PKTIO_PRIVATE_SIZE too small");
+
+static odp_bool_t disable_pktio; /* True when AF_XDP pktio and its pool memory source are off */
+
+/*
+ * Global init hook of the AF_XDP pktio type. The type is disabled for the
+ * whole process when ODP_PKTIO_DISABLE_SOCKET_XDP exists in the environment
+ * (its value is not inspected). Always returns 0.
+ */
+static int sock_xdp_init_global(void)
+{
+	const char *env = getenv("ODP_PKTIO_DISABLE_SOCKET_XDP");
+
+	if (env == NULL) {
+		ODP_PRINT("PKTIO: initialized socket xdp,"
+			  " use export ODP_PKTIO_DISABLE_SOCKET_XDP=1 to disable.\n");
+		return 0;
+	}
+
+	ODP_PRINT("PKTIO: socket xdp skipped,"
+		  " enabled export ODP_PKTIO_DISABLE_SOCKET_XDP=1.\n");
+	disable_pktio = true;
+
+	return 0;
+}
+
+/* Return the AF_XDP private data area embedded in a pktio entry. */
+static inline pkt_xdp_t *pkt_priv(pktio_entry_t *pktio_entry)
+{
+	const uintptr_t priv_addr = (uintptr_t)(pktio_entry->s.pkt_priv);
+
+	return (pkt_xdp_t *)priv_addr;
+}
+
+/*
+ * Populate the socket configuration: NUM_XDP_DESCS descriptors for both RX
+ * and TX rings, no extra libxdp/XDP flags and a zero-copy bind.
+ */
+static void fill_socket_config(struct xsk_socket_config *config)
+{
+	/* Only zero-copy mode for now. TODO: XDP_COPY */
+	config->bind_flags = XDP_ZEROCOPY;
+	config->xdp_flags = 0U;
+	config->libxdp_flags = 0U;
+	config->tx_size = NUM_XDP_DESCS;
+	config->rx_size = NUM_XDP_DESCS;
+}
+
+/*
+ * Look up the queue index that the AF_XDP socket of 'devname' should bind to.
+ *
+ * ODP_PKTIO_XDP_PARAMS holds "<if name>:<queue index>" pairs separated by
+ * IF_DELIM. The first pair whose interface name is exactly 'devname' is used.
+ *
+ * Returns the configured index, or 0 when the variable is unset, no pair
+ * matches, or the index is negative, non-numeric or does not fit in 32 bits.
+ */
+static uint32_t get_bind_queue_index(const char *devname)
+{
+	const char *param = getenv("ODP_PKTIO_XDP_PARAMS");
+	char *params;
+	char *save = NULL;
+	uint32_t idx = 0U;
+
+	if (param == NULL)
+		return 0U;
+
+	/* Tokenizing modifies the string, so work on a private copy. */
+	params = strdup(param);
+
+	if (params == NULL)
+		return 0U;
+
+	/* strtok_r() keeps the tokenizer state local, so concurrent or nested
+	 * tokenizing elsewhere in the process cannot interfere. */
+	for (char *pair = strtok_r(params, IF_DELIM, &save); pair != NULL;
+	     pair = strtok_r(NULL, IF_DELIM, &save)) {
+		const char *delim = strchr(pair, Q_DELIM);
+		size_t name_len;
+		long val;
+
+		/* Ignore pairs without a delimiter or with nothing after it. */
+		if (delim == NULL || delim[1] == '\0')
+			continue;
+
+		name_len = (size_t)(delim - pair);
+
+		/* The whole name has to match: a bare prefix compare would let
+		 * an "eth1:<q>" entry apply to "eth10", and an empty name
+		 * apply to every device. */
+		if (strncmp(devname, pair, name_len) != 0 || devname[name_len] != '\0')
+			continue;
+
+		val = strtol(delim + 1, NULL, 10);
+
+		if (val > 0 && (unsigned long)val <= UINT32_MAX)
+			idx = (uint32_t)val;
+
+		break;
+	}
+
+	free(params);
+
+	return idx;
+}
+
+/*
+ * Allocate up to 'num' packets from the UMEM pool and post them to the fill
+ * ring so that the kernel can receive into them.
+ *
+ * Each posted packet is tagged with the pktio index, which lets
+ * sock_xdp_close() find and free packets that are still on the rings.
+ *
+ * 'num' sizes a stack array and must be greater than zero.
+ *
+ * Returns true when at least one packet was posted, false when no packet
+ * could be allocated or the fill ring had no room for the allocated amount
+ * (the packets are freed again in that case).
+ */
+static odp_bool_t reserve_fill_queue_elements(xdp_sock_info_t *sock_info, int num)
+{
+	pool_t *pool = sock_info->umem_info->pool;
+	struct xsk_ring_prod *fill_q = &sock_info->fill_q;
+	/* Kept as 64-bit so that the frame offset below is computed in 64 bits
+	 * and cannot wrap for UMEM areas larger than 4 GiB. */
+	const uint64_t block_size = pool->block_size;
+	const int pktio_idx = sock_info->pktio_idx;
+	odp_packet_t packets[num];
+	uint32_t start_idx;
+	int count;
+
+	count = odp_packet_alloc_multi(pool->pool_hdl, sock_info->mtu, packets, num);
+
+	if (count <= 0)
+		return false;
+
+	if (xsk_ring_prod__reserve(fill_q, count, &start_idx) == 0U) {
+		odp_packet_free_multi(packets, count);
+		return false;
+	}
+
+	for (int i = 0; i < count; ++i) {
+		odp_packet_hdr_t *pkt_hdr = packet_hdr(packets[i]);
+
+		pkt_hdr->ms_pktio_idx = pktio_idx;
+		/* UMEM address of a frame is its byte offset from the pool base. */
+		*xsk_ring_prod__fill_addr(fill_q, start_idx++) =
+			pkt_hdr->event_hdr.index.event * block_size;
+	}
+
+	xsk_ring_prod__submit(fill_q, count);
+
+	return true;
+}
+
+/*
+ * Open an AF_XDP pktio on interface 'devname' using 'pool_hdl' as the UMEM.
+ *
+ * Creates the AF_XDP socket on the configured bind queue, opens a helper
+ * socket for interface queries, reads the MTU and pre-fills the fill ring.
+ *
+ * Returns 0 on success and -1 on failure or when the pktio type is disabled.
+ */
+static int sock_xdp_open(odp_pktio_t pktio, pktio_entry_t *pktio_entry, const char *devname,
+			 odp_pool_t pool_hdl)
+{
+	struct xsk_socket_config config;
+	xdp_sock_info_t *sock;
+	pkt_xdp_t *priv;
+	pool_t *pool;
+	uint32_t bind_q;
+	int ret;
+
+	if (disable_pktio)
+		return -1;
+
+	priv = pkt_priv(pktio_entry);
+	memset(priv, 0, sizeof(pkt_xdp_t));
+	sock = &priv->sock_info;
+	pool = pool_entry_from_hdl(pool_hdl);
+	sock->umem_info = (xdp_umem_info_t *)pool->mem_src_data;
+	sock->xsk = NULL;
+	/* Packets handed to the kernel are tagged with this value so that they
+	 * can be found and freed on close. Zero is used for "not tagged". */
+	sock->pktio_idx = 1 + odp_pktio_index(pktio);
+	fill_socket_config(&config);
+	bind_q = get_bind_queue_index(devname);
+	/* xsk_socket__create_shared() takes a single bind queue index, so the
+	 * NIC needs to be configured with a single combined TX-RX queue,
+	 * otherwise traffic may not end up on the socket. The first queue is
+	 * used unless overridden with the environment variable. */
+	ret = xsk_socket__create_shared(&sock->xsk, devname, bind_q, sock->umem_info->umem,
+					&sock->rx, &sock->tx, &sock->fill_q, &sock->compl_q,
+					&config);
+
+	if (ret) {
+		ODP_ERR("Error creating xdp socket for bind queue %u: %d\n", bind_q, ret);
+		return -1;
+	}
+
+	/* Ring setup/clean up routines seem to be asynchronous with some drivers and might not be
+	 * ready yet after xsk_socket__create_shared(). */
+	sleep(1U);
+
+	/* ioctl() queries through the AF_XDP socket don't seem to work, hence
+	 * a separate helper socket. */
+	sock->helper_sock = -1;
+	ret = socket(AF_INET, SOCK_DGRAM, 0);
+
+	if (ret == -1) {
+		ODP_ERR("Error creating helper socket for xdp: %s\n", strerror(errno));
+		goto sock_err;
+	}
+
+	sock->helper_sock = ret;
+	sock->mtu = _odp_mtu_get_fd(sock->helper_sock, devname);
+
+	if (sock->mtu == 0U)
+		goto res_err;
+
+	sock->max_mtu = pool->seg_len;
+
+	if (!reserve_fill_queue_elements(sock, config.rx_size)) {
+		ODP_ERR("Unable to reserve fill queue descriptors.\n");
+		goto res_err;
+	}
+
+	odp_ticketlock_init(&priv->rx_lock);
+	odp_ticketlock_init(&priv->tx_lock);
+
+	return 0;
+
+res_err:
+	close(sock->helper_sock);
+	sock->helper_sock = -1;
+
+sock_err:
+	xsk_socket__delete(sock->xsk);
+	sock->xsk = NULL;
+
+	return -1;
+}
+
+/*
+ * Close an AF_XDP pktio: close the helper socket, delete the AF_XDP socket and
+ * free every pool packet that is still tagged as being owned by this pktio.
+ * Always returns 0.
+ */
+static int sock_xdp_close(pktio_entry_t *pktio_entry)
+{
+	xdp_sock_info_t *sock = &pkt_priv(pktio_entry)->sock_info;
+	pool_t *pool = sock->umem_info->pool;
+	uint32_t i = 0U;
+
+	if (sock->helper_sock != -1)
+		close(sock->helper_sock);
+
+	if (sock->xsk != NULL)
+		xsk_socket__delete(sock->xsk);
+
+	/* Ring setup/clean up routines seem to be asynchronous with some drivers and might not be
+	 * ready yet after xsk_socket__delete(). */
+	sleep(1U);
+
+	/* Walk the whole pool and release the packets that were sitting in
+	 * fill or completion queues when the socket was deleted. */
+	while (i < pool->num + pool->skipped_blocks) {
+		odp_packet_hdr_t *hdr =
+			packet_hdr(packet_from_event_hdr(event_hdr_from_index(pool, i)));
+
+		if (hdr->ms_pktio_idx == sock->pktio_idx) {
+			hdr->ms_pktio_idx = 0U;
+			odp_packet_free(packet_handle(hdr));
+		}
+
+		++i;
+	}
+
+	return 0;
+}
+
+/*
+ * Decode an RX descriptor into packet header, packet handle, data pointer and
+ * data length.
+ *
+ * UMEM "addresses" are offsets from the start of the registered UMEM area.
+ * With XDP_ZEROCOPY and XDP_UMEM_UNALIGNED_CHUNK_FLAG the offset of the
+ * received data within the frame is additionally encoded into the address,
+ * shifted left by XSK_UNALIGNED_BUF_OFFSET_SHIFT.
+ */
+static inline void extract_data(const struct xdp_desc *rx_desc, uint8_t *pool_base_addr,
+				pkt_data_t *pkt_data)
+{
+	const uint64_t desc_addr = rx_desc->addr;
+	/* Offset of the frame start, i.e. of the packet header. */
+	const uint64_t hdr_off = xsk_umem__extract_addr(desc_addr);
+	/* Offset of the first received data byte. */
+	const uint64_t data_off = xsk_umem__add_offset_to_addr(desc_addr);
+
+	pkt_data->len = rx_desc->len;
+	pkt_data->data = xsk_umem__get_data(pool_base_addr, data_off);
+	pkt_data->pkt_hdr = xsk_umem__get_data(pool_base_addr, hdr_off);
+	pkt_data->pkt = packet_handle(pkt_data->pkt_hdr);
+}
+
+/*
+ * Turn 'num' RX descriptors, starting from ring index 'start_idx', into ODP
+ * packets.
+ *
+ * Every frame is untagged from the pktio (it is no longer kernel-owned),
+ * re-initialized as a packet and, when a parse layer is configured, parsed and
+ * optionally classified. Packets failing parsing or classification are freed.
+ *
+ * Returns the number of packets stored into 'packets'.
+ */
+static uint32_t process_received(pktio_entry_t *pktio_entry, xdp_sock_info_t *sock_info,
+				 uint32_t start_idx, odp_packet_t packets[], int num)
+{
+	pkt_data_t pkt_data;
+	struct xsk_ring_cons *rx = &sock_info->rx;
+	pool_t *pool = sock_info->umem_info->pool;
+	uint8_t *base_addr = pool->base_addr;
+	const odp_proto_layer_t layer = pktio_entry->s.parse_layer;
+	const odp_proto_chksums_t in_chksums = pktio_entry->s.in_chksums;
+	const odp_pktin_config_opt_t opt = pktio_entry->s.config.pktin;
+	uint64_t l4_part_sum = 0U;
+	odp_pktio_t pktio_hdl = pktio_entry->s.handle;
+	uint32_t num_rx = 0U;
+
+	for (int i = 0; i < num; ++i) {
+		/* The classifier takes a writable pool pointer. Hand it a
+		 * per-packet scratch copy instead of the address of the UMEM
+		 * pool's own handle, so that nothing it stores there can
+		 * modify the pool entry.
+		 * NOTE(review): presumably the classifier writes the selected
+		 * CoS pool through this pointer - confirm. */
+		odp_pool_t cls_pool = pool->pool_hdl;
+
+		extract_data(xsk_ring_cons__rx_desc(rx, start_idx++), base_addr, &pkt_data);
+		pkt_data.pkt_hdr->ms_pktio_idx = 0U;
+		packet_init(pkt_data.pkt_hdr, pkt_data.len);
+
+		if (layer) {
+			if (_odp_packet_parse_common(&pkt_data.pkt_hdr->p, pkt_data.data,
+						     pkt_data.len, pkt_data.len,
+						     layer, in_chksums, &l4_part_sum, opt) < 0) {
+				odp_packet_free(pkt_data.pkt);
+				continue;
+			}
+
+			if (pktio_cls_enabled(pktio_entry) &&
+			    _odp_cls_classify_packet(pktio_entry, pkt_data.data, &cls_pool,
+						     pkt_data.pkt_hdr)) {
+				odp_packet_free(pkt_data.pkt);
+				continue;
+			}
+		}
+
+		/* Data starts where the kernel wrote it within the frame. */
+		pkt_data.pkt_hdr->seg_data = pkt_data.data;
+		pkt_data.pkt_hdr->event_hdr.base_data = pkt_data.data;
+		pkt_data.pkt_hdr->input = pktio_hdl;
+		packets[num_rx++] = pkt_data.pkt;
+	}
+
+	return num_rx;
+}
+
+/*
+ * Receive up to 'num' packets from the AF_XDP socket.
+ *
+ * Wakes the kernel up when the fill ring asks for it, converts the available
+ * RX descriptors into packets and refills the fill ring with as many new
+ * packets as descriptors were consumed. Serialized with the RX lock.
+ *
+ * Returns the number of packets stored into 'packets'.
+ */
+static int sock_xdp_recv(pktio_entry_t *pktio_entry, int index ODP_UNUSED, odp_packet_t packets[],
+			 int num)
+{
+	pkt_xdp_t *priv = pkt_priv(pktio_entry);
+	xdp_sock_info_t *sock = &priv->sock_info;
+	uint32_t first_desc = 0U;
+	uint32_t num_peeked;
+	uint32_t num_ok = 0U;
+
+	odp_ticketlock_lock(&priv->rx_lock);
+
+	if (odp_unlikely(xsk_ring_prod__needs_wakeup(&sock->fill_q))) {
+		struct pollfd fd;
+
+		fd.fd = xsk_socket__fd(sock->xsk);
+		fd.events = POLLIN;
+		(void)poll(&fd, 1U, 0);
+	}
+
+	num_peeked = xsk_ring_cons__peek(&sock->rx, num, &first_desc);
+
+	if (num_peeked != 0U) {
+		num_ok = process_received(pktio_entry, sock, first_desc, packets, num_peeked);
+		xsk_ring_cons__release(&sock->rx, num_peeked);
+		/* Best effort: a failed refill is retried on a later receive. */
+		(void)reserve_fill_queue_elements(sock, num_peeked);
+	}
+
+	odp_ticketlock_unlock(&priv->rx_lock);
+
+	return num_ok;
+}
+
+/*
+ * Fill a TX descriptor for a packet residing in the UMEM pool.
+ *
+ * The descriptor address combines the frame offset from the pool base with
+ * the offset of the packet data within the frame, the latter shifted left by
+ * XSK_UNALIGNED_BUF_OFFSET_SHIFT.
+ */
+static inline void populate_tx_desc(pool_t *pool, odp_packet_hdr_t *pkt_hdr,
+				    struct xdp_desc *tx_desc)
+{
+	uint64_t frame_off;
+	uint64_t pkt_off;
+
+	/* Widen before multiplying so that the offset cannot wrap at 32 bits
+	 * for UMEM areas larger than 4 GiB. */
+	frame_off = (uint64_t)pkt_hdr->event_hdr.index.event * pool->block_size;
+	pkt_off = (uint64_t)(uintptr_t)pkt_hdr->event_hdr.base_data
+		  - (uint64_t)(uintptr_t)pool->base_addr - frame_off;
+	pkt_off <<= XSK_UNALIGNED_BUF_OFFSET_SHIFT;
+	tx_desc->addr = frame_off | pkt_off;
+	tx_desc->len = pkt_hdr->frame_len;
+}
+
+/*
+ * Kick the kernel TX path when the TX ring requests a wakeup, then reclaim up
+ * to 'num' frames from the completion ring and free the corresponding
+ * packets back to the pool.
+ */
+static void handle_pending_tx(xdp_sock_info_t *sock_info, int num)
+{
+	struct xsk_ring_cons *compl_q;
+	uint32_t sent;
+	uint8_t *base_addr;
+	uint32_t start_idx;
+	uint64_t frame_off;
+	odp_packet_t pkt;
+
+	if (odp_unlikely(xsk_ring_prod__needs_wakeup(&sock_info->tx)))
+		(void)sendto(xsk_socket__fd(sock_info->xsk), NULL, 0U, MSG_DONTWAIT, NULL, 0U);
+
+	compl_q = &sock_info->compl_q;
+	sent = xsk_ring_cons__peek(compl_q, num, &start_idx);
+
+	/* Nothing completed. Return before the array below is declared: a
+	 * variable length array of zero elements is undefined behaviour. */
+	if (sent == 0U)
+		return;
+
+	base_addr = sock_info->umem_info->pool->base_addr;
+
+	odp_packet_t packets[sent];
+
+	for (uint32_t i = 0U; i < sent; ++i) {
+		frame_off = *xsk_ring_cons__comp_addr(compl_q, start_idx++);
+		/* Strip the encoded data offset to get the frame start, which
+		 * is where the packet header lives. */
+		frame_off = xsk_umem__extract_addr(frame_off);
+		pkt = xsk_umem__get_data(base_addr, frame_off);
+		packets[i] = pkt;
+		/* The frame is no longer owned by the kernel. */
+		packet_hdr(packets[i])->ms_pktio_idx = 0U;
+	}
+
+	odp_packet_free_multi(packets, sent);
+	xsk_ring_cons__release(compl_q, sent);
+}
+
+/*
+ * Transmit up to 'num' packets through the AF_XDP socket.
+ *
+ * Only single-segment packets are supported. A packet from a pool other than
+ * the UMEM pool is copied into the UMEM pool first and the original is freed
+ * once a TX descriptor has been reserved for the copy. Processing stops at
+ * the first packet that cannot be queued. Serialized with the TX lock.
+ *
+ * Returns the number of packets consumed from 'packets'.
+ */
+static int sock_xdp_send(pktio_entry_t *pktio_entry, int index ODP_UNUSED,
+			 const odp_packet_t packets[], int num)
+{
+	pkt_xdp_t *priv;
+	xdp_sock_info_t *sock_info;
+	pool_t *pool;
+	odp_pool_t pool_hdl;
+	int pktio_idx, i;
+	struct xsk_ring_prod *tx;
+	odp_packet_t pkt;
+	odp_packet_hdr_t *pkt_hdr;
+	uint32_t start_idx;
+
+	if (odp_unlikely(num == 0))
+		return 0;
+
+	priv = pkt_priv(pktio_entry);
+	odp_ticketlock_lock(&priv->tx_lock);
+	sock_info = &priv->sock_info;
+	pool = sock_info->umem_info->pool;
+	pool_hdl = pool->pool_hdl;
+	pktio_idx = sock_info->pktio_idx;
+	tx = &sock_info->tx;
+
+	for (i = 0; i < num; ++i) {
+		/* Stays invalid unless a copy into the UMEM pool is made. */
+		pkt = ODP_PACKET_INVALID;
+
+		/* The comparison belongs inside the hint: previously the hint
+		 * wrapped the segment count itself and the comparison was
+		 * applied to the macro's result. */
+		if (odp_unlikely(odp_packet_num_segs(packets[i]) > 1)) {
+			/* TODO: handle segmented packets */
+			ODP_ERR("Only single-segment packets supported\n");
+			break;
+		}
+
+		pkt_hdr = packet_hdr(packets[i]);
+
+		if (pkt_hdr->event_hdr.pool_ptr != pool) {
+			pkt = odp_packet_copy(packets[i], pool_hdl);
+
+			if (odp_unlikely(pkt == ODP_PACKET_INVALID))
+				break;
+
+			pkt_hdr = packet_hdr(pkt);
+		}
+
+		if (xsk_ring_prod__reserve(tx, 1U, &start_idx) == 0U) {
+			/* TX ring full: reclaim completions and retry once. */
+			handle_pending_tx(sock_info, NUM_XDP_DESCS);
+
+			if (xsk_ring_prod__reserve(tx, 1U, &start_idx) == 0U) {
+				if (pkt != ODP_PACKET_INVALID)
+					odp_packet_free(pkt);
+
+				break;
+			}
+		}
+
+		/* The copy is what gets sent, so the original can go. */
+		if (pkt != ODP_PACKET_INVALID)
+			odp_packet_free(packets[i]);
+
+		/* Tag as kernel-owned until seen on the completion ring. */
+		pkt_hdr->ms_pktio_idx = pktio_idx;
+		populate_tx_desc(pool, pkt_hdr, xsk_ring_prod__tx_desc(tx, start_idx));
+	}
+
+	xsk_ring_prod__submit(tx, i);
+	handle_pending_tx(sock_info, NUM_XDP_DESCS);
+	odp_ticketlock_unlock(&priv->tx_lock);
+
+	return i;
+}
+
+/* Return the MTU currently stored for the pktio. */
+static uint32_t sock_xdp_mtu_get(pktio_entry_t *pktio_entry)
+{
+	const pkt_xdp_t *priv = pkt_priv(pktio_entry);
+
+	return priv->sock_info.mtu;
+}
+
+/*
+ * Set the interface MTU to 'maxlen_input' through the helper socket and, on
+ * success, remember it. Returns 0 on success, otherwise the error returned
+ * by the MTU setter.
+ */
+static int sock_xdp_mtu_set(pktio_entry_t *pktio_entry, uint32_t maxlen_input,
+			    uint32_t maxlen_output ODP_UNUSED)
+{
+	xdp_sock_info_t *sock = &pkt_priv(pktio_entry)->sock_info;
+	const int ret = _odp_mtu_set_fd(sock->helper_sock, pktio_entry->s.name, maxlen_input);
+
+	if (ret == 0)
+		sock->mtu = maxlen_input;
+
+	return ret;
+}
+
+/* Set promiscuous mode of the interface through the helper socket. */
+static int sock_xdp_promisc_mode_set(pktio_entry_t *pktio_entry, int enable)
+{
+	const int fd = pkt_priv(pktio_entry)->sock_info.helper_sock;
+
+	return _odp_promisc_mode_set_fd(fd, pktio_entry->s.name, enable);
+}
+
+/* Query promiscuous mode of the interface through the helper socket. */
+static int sock_xdp_promisc_mode_get(pktio_entry_t *pktio_entry)
+{
+	const int fd = pkt_priv(pktio_entry)->sock_info.helper_sock;
+
+	return _odp_promisc_mode_get_fd(fd, pktio_entry->s.name);
+}
+
+/*
+ * Read the interface MAC address into 'mac_addr' through the helper socket.
+ * Returns ETH_ALEN on success and -1 on failure.
+ */
+static int sock_xdp_mac_addr_get(pktio_entry_t *pktio_entry, void *mac_addr)
+{
+	const int fd = pkt_priv(pktio_entry)->sock_info.helper_sock;
+
+	if (_odp_mac_addr_get_fd(fd, pktio_entry->s.name, mac_addr))
+		return -1;
+
+	return ETH_ALEN;
+}
+
+/* Query link status of the interface through the helper socket. */
+static int sock_xdp_link_status(pktio_entry_t *pktio_entry)
+{
+	const int fd = pkt_priv(pktio_entry)->sock_info.helper_sock;
+
+	return _odp_link_status_fd(fd, pktio_entry->s.name);
+}
+
+/* Query link information of the interface into 'info' through the helper socket. */
+static int sock_xdp_link_info(pktio_entry_t *pktio_entry, odp_pktio_link_info_t *info)
+{
+	const int fd = pkt_priv(pktio_entry)->sock_info.helper_sock;
+
+	return _odp_link_info_fd(fd, pktio_entry->s.name, info);
+}
+
+/*
+ * Report pktio capabilities: one input and one output queue, settable
+ * promiscuous mode and maximum frame length, full parser support and no
+ * statistics counters. Always returns 0.
+ */
+static int sock_xdp_capability(pktio_entry_t *pktio_entry, odp_pktio_capability_t *capa)
+{
+	const uint32_t max_len = pkt_priv(pktio_entry)->sock_info.max_mtu;
+
+	memset(capa, 0, sizeof(odp_pktio_capability_t));
+
+	/* No statistics are implemented yet. */
+	capa->stats.pktout_queue.all_counters = 0U;
+	capa->stats.pktin_queue.all_counters = 0U;
+	capa->stats.pktio.all_counters = 0U;
+
+	capa->config.parser.layer = ODP_PROTO_LAYER_ALL;
+
+	/* Input and output lengths share the same limits. */
+	capa->maxlen.max_output = max_len;
+	capa->maxlen.min_output = _ODP_SOCKET_MTU_MIN;
+	capa->maxlen.max_input = max_len;
+	capa->maxlen.min_input = _ODP_SOCKET_MTU_MIN;
+	capa->maxlen.equal = true;
+
+	capa->set_op.op.maxlen = 1U;
+	capa->set_op.op.promisc_mode = 1U;
+	capa->max_output_queues = 1U;
+	capa->max_input_queues = 1U;
+
+	return 0;
+}
+
+/* Operations table of the AF_XDP socket pktio type. */
+const pktio_if_ops_t _odp_sock_xdp_pktio_ops = {
+	.name = "socket_xdp",
+
+	/* Implemented operations */
+	.init_global = sock_xdp_init_global,
+	.open = sock_xdp_open,
+	.close = sock_xdp_close,
+	.recv = sock_xdp_recv,
+	.send = sock_xdp_send,
+	.maxlen_get = sock_xdp_mtu_get,
+	.maxlen_set = sock_xdp_mtu_set,
+	.promisc_mode_set = sock_xdp_promisc_mode_set,
+	.promisc_mode_get = sock_xdp_promisc_mode_get,
+	.mac_get = sock_xdp_mac_addr_get,
+	.link_status = sock_xdp_link_status,
+	.link_info = sock_xdp_link_info,
+	.capability = sock_xdp_capability,
+
+	/* Not implemented. TODO: at least stats */
+	.stats = NULL,
+	.stats_reset = NULL,
+	.pktin_queue_stats = NULL,
+	.pktout_queue_stats = NULL,
+	.extra_stat_info = NULL,
+	.extra_stats = NULL,
+	.extra_stat_counter = NULL,
+	.print = NULL,
+	.init_local = NULL,
+	.term = NULL,
+	.start = NULL,
+	.stop = NULL,
+	.pktio_ts_res = NULL,
+	.pktio_ts_from_ns = NULL,
+	.pktio_time = NULL,
+	.recv_tmo = NULL,
+	.recv_mq_tmo = NULL,
+	.fd_set = NULL,
+	.mac_set = NULL,
+	.config = NULL,
+	.input_queues_config = NULL,
+	.output_queues_config = NULL
+};
+
+/* Tell whether the AF_XDP pool memory source is in use, i.e. not disabled. */
+static odp_bool_t sock_xdp_is_mem_src_active(void)
+{
+	if (disable_pktio)
+		return false;
+
+	return true;
+}
+
+/* Turn the AF_XDP pool memory source, and with it the pktio type, off. */
+static void sock_xdp_force_mem_src_disable(void)
+{
+	/* Same flag that sock_xdp_open() checks before doing anything. */
+	disable_pktio = true;
+}
+
+/*
+ * Adjust the pool block size so that blocks are usable as AF_XDP UMEM frames.
+ *
+ * AF_XDP requires frames to be between 2kB and the page size. XDP packet
+ * headroom is added to the requested size; a result below 2kB is raised to
+ * 2kB. A result above the page size is reported by setting *block_size to 0,
+ * which is meant to make pool creation fail. Huge page backing is requested
+ * through *flags. Nothing is changed when the pktio type is disabled.
+ */
+static void sock_xdp_adjust_block_size(uint8_t *data ODP_UNUSED, uint32_t *block_size,
+				       uint32_t *block_offset ODP_UNUSED, uint32_t *flags)
+{
+	const uint32_t size = *block_size + XDP_PACKET_HEADROOM;
+	const uint64_t ps = odp_sys_page_size();
+
+	if (disable_pktio)
+		return;
+
+	if (size > ps) {
+		ODP_ERR("Adjusted pool block size larger than page size: %u > %" PRIu64 "\n",
+			size, ps);
+		*block_size = 0U;
+		/* Stop here, otherwise the failure marker is overwritten by
+		 * the adjusted size below and the error goes unnoticed. */
+		return;
+	}
+
+	*flags |= ODP_SHM_HP;
+	*block_size = _ODP_MAX(size, MIN_FRAME_SIZE);
+}
+
+/*
+ * Bind hook of the pool memory source: register the pool memory as an AF_XDP
+ * UMEM. 'data' is the memory source data area, which holds the UMEM info.
+ * Returns the result of xsk_umem__create().
+ */
+static int sock_xdp_umem_create(uint8_t *data, pool_t *pool)
+{
+	xdp_umem_info_t *info = (xdp_umem_info_t *)data;
+	struct xsk_umem_config umem_cfg;
+
+	info->pool = pool;
+
+	umem_cfg.flags = XDP_UMEM_UNALIGNED_CHUNK_FLAG;
+	/* Packet header and pool headroom precede packet data in each frame. */
+	umem_cfg.frame_headroom = sizeof(odp_packet_hdr_t) + pool->headroom;
+	umem_cfg.frame_size = pool->block_size;
+	umem_cfg.comp_size = NUM_XDP_DESCS;
+	/* Fill queue size is recommended to be >= HW RX ring size + AF_XDP RX
+	 * ring size, so use twice the size of the AF_XDP RX ring. */
+	umem_cfg.fill_size = NUM_XDP_DESCS * 2U; /* TODO: num descs vs pool size */
+
+	return xsk_umem__create(&info->umem, pool->base_addr, pool->shm_size,
+				&info->fill_q, &info->compl_q, &umem_cfg);
+}
+
+/*
+ * Unbind hook of the pool memory source: delete the UMEM, retrying for as
+ * long as the delete call reports -EBUSY.
+ */
+static void sock_xdp_umem_delete(uint8_t *data)
+{
+	xdp_umem_info_t *info = (xdp_umem_info_t *)data;
+	int ret;
+
+	do {
+		ret = xsk_umem__delete(info->umem);
+	} while (ret == -EBUSY);
+}
+
+/* Pool memory source operations that make packet pools usable as AF_XDP UMEM. */
+const _odp_pool_mem_src_ops_t _odp_pool_sock_xdp_mem_src_ops = {
+	.name = "xdp_zc",
+	/* UMEM lifetime */
+	.bind = sock_xdp_umem_create,
+	.unbind = sock_xdp_umem_delete,
+	/* Pool sizing */
+	.adjust_size = sock_xdp_adjust_block_size,
+	/* Enable/disable control */
+	.is_active = sock_xdp_is_mem_src_active,
+	.force_disable = sock_xdp_force_mem_src_disable
+};
+
+#else
+/* Avoid warning about empty translation unit */
+typedef int _odp_dummy;
+#endif