--- libev/ev_iouring.c 2019/12/27 21:17:11 1.10 +++ libev/ev_iouring.c 2022/08/10 16:50:05 1.26 @@ -1,7 +1,7 @@ /* * libev linux io_uring fd activity backend * - * Copyright (c) 2019 Marc Alexander Lehmann + * Copyright (c) 2019-2020 Marc Alexander Lehmann * All rights reserved. * * Redistribution and use in source and binary forms, with or without modifica- @@ -46,11 +46,10 @@ * of linux aio or epoll and so on and so on. and you could do event stuff * without any syscalls. what's not to like? * d) ok, it's vastly more complex, but that's ok, really. - * e) why 3 mmaps instead of one? one would be more space-efficient, - * and I can't see what benefit three would have (other than being + * e) why two mmaps instead of one? one would be more space-efficient, + * and I can't see what benefit two would have (other than being * somehow resizable/relocatable, but that's apparently not possible). - * (FIXME: newer kernels can use 2 mmaps only, need to look into this). - * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and + * f) hmm, it's practically undebuggable (gdb can't access the memory, and * the bizarre way structure offsets are communicated makes it hard to * just print the ring buffer heads, even *iff* the memory were visible * in gdb. but then, that's also ok, really. @@ -61,25 +60,22 @@ * like a µ-optimisation by the io_uring author for his personal * applications, to the detriment of everybody else who just wants * an event loop. but, umm, ok, if that's all, it could be worse. - * (FIXME: jens mentioned timeout commands, need to investigate) - * h) there is a hardcoded limit of 4096 outstanding events. okay, - * at least there is no arbitrary low system-wide limit... - * (FIXME: apparently, this was increased to 32768 in later kernels( + * (from what I gather from the author Jens Axboe, it simply didn't + * occur to him, and he made good on it by adding an unlimited number + * of timeouts later :). 
+ * h) initially there was a hardcoded limit of 4096 outstanding events. + * later versions not only bump this to 32k, but also can handle + * an unlimited amount of events, so this only affects the batch size. * i) unlike linux aio, you *can* register more than the limit - * of fd events, and the kernel will "gracefully" signal an - * overflow, after which you could destroy and recreate the kernel - * state, a bit bigger, or fall back to e.g. poll. thats not - * totally insane, but kind of questions the point a high - * performance I/O framework when it doesn't really work - * under stress. - * (FIXME: iouring should no longer drop events, need to investigate) - * j) but, oh my! is has exactly the same bugs as the linux aio backend, - * where some undocumented poll combinations just fail. - * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, - * this is completely undocumented, have I mantioned this already? + * of fd events. while early versions of io_uring signalled an overflow + * and you ended up getting wet. 5.5+ does not do this anymore. + * j) but, oh my! it had exactly the same bugs as the linux aio backend, + * where some undocumented poll combinations just fail. fortunately, + * after finally reaching the author, he was more than willing to fix + * this, probably in 5.6+. * k) overall, the *API* itself is, I dare to say, not a total trainwreck. - * the big isuess with it are the bugs requiring epoll, which might - * or might not get fixed (do I hold my breath?). + * once the bugs are fixed (probably in 5.6+), it will be without + * competition. 
*/ /* TODO: use internal TIMEOUT */ @@ -89,6 +85,7 @@ #include #include #include +#include #define IOURING_INIT_ENTRIES 32 @@ -123,10 +120,12 @@ __u32 cancel_flags; __u32 open_flags; __u32 statx_flags; + __u32 fadvise_advice; }; __u64 user_data; union { __u16 buf_index; + __u16 personality; __u64 __pad2[3]; }; }; @@ -175,18 +174,57 @@ struct io_cqring_offsets cq_off; }; -#define IORING_OP_POLL_ADD 6 -#define IORING_OP_POLL_REMOVE 7 +#define IORING_FEAT_SINGLE_MMAP 0x00000001 +#define IORING_FEAT_NODROP 0x00000002 +#define IORING_FEAT_SUBMIT_STABLE 0x00000004 + +#define IORING_SETUP_CQSIZE 0x00000008 +#define IORING_SETUP_CLAMP 0x00000010 + +#define IORING_OP_POLL_ADD 6 +#define IORING_OP_POLL_REMOVE 7 +#define IORING_OP_TIMEOUT 11 +#define IORING_OP_TIMEOUT_REMOVE 12 + +#define IORING_REGISTER_EVENTFD 4 +#define IORING_REGISTER_EVENTFD_ASYNC 7 +#define IORING_REGISTER_PROBE 8 + +#define IO_URING_OP_SUPPORTED 1 + +struct io_uring_probe_op { + __u8 op; + __u8 resv; + __u16 flags; + __u32 resv2; +}; + +struct io_uring_probe +{ + __u8 last_op; + __u8 ops_len; + __u16 resv; + __u32 resv2[3]; + struct io_uring_probe_op ops[0]; +}; + +/* relative or absolute, reference clock is CLOCK_MONOTONIC */ +struct iouring_kernel_timespec +{ + int64_t tv_sec; + long long tv_nsec; +}; + +#define IORING_TIMEOUT_ABS 0x00000001 #define IORING_ENTER_GETEVENTS 0x01 #define IORING_OFF_SQ_RING 0x00000000ULL -#define IORING_OFF_CQ_RING 0x08000000ULL #define IORING_OFF_SQES 0x10000000ULL -#define IORING_FEAT_SINGLE_MMAP 0x1 -#define IORING_FEAT_NODROP 0x2 -#define IORING_FEAT_SUBMIT_STABLE 0x4 +#define IORING_FEAT_SINGLE_MMAP 0x00000001 +#define IORING_FEAT_NODROP 0x00000002 +#define IORING_FEAT_SUBMIT_STABLE 0x00000004 inline_size int @@ -202,40 +240,89 @@ return ev_syscall6 (SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigsz); } +inline_size +int +evsys_io_uring_register (unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args) +{ + return ev_syscall4 
(SYS_io_uring_register, fd, opcode, arg, nr_args); +} + /*****************************************************************************/ -/* actual backed implementation */ +/* actual backend implementation */ /* we hope that volatile will make the compiler access this variables only once */ -#define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_sq_ring + iouring_sq_ ## name) -#define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_cq_ring + iouring_cq_ ## name) +#define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_sq_ ## name) +#define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_cq_ ## name) /* the index array */ -#define EV_SQ_ARRAY ((unsigned *)((char *)iouring_sq_ring + iouring_sq_array)) +#define EV_SQ_ARRAY ((unsigned *)((char *)iouring_ring + iouring_sq_array)) /* the submit/completion queue entries */ #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) -#define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) +#define EV_CQES ((struct io_uring_cqe *)((char *)iouring_ring + iouring_cq_cqes)) + +inline_speed +int +iouring_enter (EV_P_ ev_tstamp timeout) +{ + int res; + + EV_RELEASE_CB; + + res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, + timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); + + assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit))); + + iouring_to_submit = 0; + + EV_ACQUIRE_CB; + + return res; +} + +/* TODO: can we move things around so we don't need this forward-reference? 
*/ +static void +iouring_poll (EV_P_ ev_tstamp timeout); static struct io_uring_sqe * iouring_sqe_get (EV_P) { - unsigned tail = EV_SQ_VAR (tail); - - if (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries)) + unsigned tail; + + for (;;) { - /* queue full, flush */ - evsys_io_uring_enter (iouring_fd, iouring_to_submit, 0, 0, 0, 0); - iouring_to_submit = 0; + tail = EV_SQ_VAR (tail); + + if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))) + break; /* whats the problem, we have free sqes */ + + /* queue full, need to flush and possibly handle some events */ + +#if EV_FEATURE_CODE + /* first we ask the kernel nicely, most often this frees up some sqes */ + int res = iouring_enter (EV_A_ EV_TS_CONST (0.)); + + ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */ + + if (res >= 0) + continue; /* yes, it worked, try again */ +#endif + + /* some problem, possibly EBUSY - do the full poll and let it handle any issues */ + + iouring_poll (EV_A_ EV_TS_CONST (0.)); + /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */ } - assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))); + /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/ return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); } inline_size -struct io_uring_sqe * +void iouring_sqe_submit (EV_P_ struct io_uring_sqe *sqe) { unsigned idx = sqe - EV_SQES; @@ -261,15 +348,14 @@ /* called for full and partial cleanup */ ecb_cold -static int +static void iouring_internal_destroy (EV_P) { close (iouring_tfd); close (iouring_fd); - if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); - if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); - if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); + if (iouring_ring != MAP_FAILED) munmap (iouring_ring, iouring_ring_size); + if (iouring_sqes != 
MAP_FAILED) munmap (iouring_sqes, iouring_sqes_size); if (ev_is_active (&iouring_tfd_w)) { @@ -283,49 +369,41 @@ iouring_internal_init (EV_P) { struct io_uring_params params = { 0 }; + uint32_t sq_size, cq_size; + + params.flags = IORING_SETUP_CLAMP; iouring_to_submit = 0; - iouring_tfd = -1; - iouring_sq_ring = MAP_FAILED; - iouring_cq_ring = MAP_FAILED; - iouring_sqes = MAP_FAILED; + iouring_tfd = -1; + iouring_ring = MAP_FAILED; + iouring_sqes = MAP_FAILED; - for (;;) - { - iouring_fd = evsys_io_uring_setup (iouring_entries, &params); + if (!have_monotonic) /* cannot really happen, but what if11 */ + return -1; - if (iouring_fd >= 0) - break; /* yippie */ + iouring_fd = evsys_io_uring_setup (iouring_entries, &params); - if (errno != EINVAL) - return -1; /* we failed */ + if (iouring_fd < 0) + return -1; - /* EINVAL: lots of possible reasons, but maybe - * it is because we hit the unqueryable hardcoded size limit - */ + if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEAT_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE)) + return -1; /* we require the above features */ - /* we hit the limit already, give up */ - if (iouring_max_entries) - return -1; - - /* first time we hit EINVAL? 
assume we hit the limit, so go back and retry */ - iouring_entries >>= 1; - iouring_max_entries = iouring_entries; - } + /* TODO: remember somehow whether our queue size has been clamped */ + + sq_size = params.sq_off.array + params.sq_entries * sizeof (unsigned); + cq_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe); - iouring_sq_ring_size = params.sq_off.array + params.sq_entries * sizeof (unsigned); - iouring_cq_ring_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe); - iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe); - - iouring_sq_ring = mmap (0, iouring_sq_ring_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING); - iouring_cq_ring = mmap (0, iouring_cq_ring_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_CQ_RING); - iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQES); + iouring_ring_size = sq_size > cq_size ? 
sq_size : cq_size; + iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe); - if (iouring_sq_ring == MAP_FAILED || iouring_cq_ring == MAP_FAILED || iouring_sqes == MAP_FAILED) + iouring_ring = mmap (0, iouring_ring_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING); + iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQES); + + if (iouring_ring == MAP_FAILED || iouring_sqes == MAP_FAILED) return -1; iouring_sq_head = params.sq_off.head; @@ -343,12 +421,12 @@ iouring_cq_overflow = params.cq_off.overflow; iouring_cq_cqes = params.cq_off.cqes; + iouring_tfd_to = EV_TSTAMP_HUGE; + iouring_tfd = timerfd_create (CLOCK_MONOTONIC, TFD_CLOEXEC); if (iouring_tfd < 0) - return iouring_tfd; - - iouring_tfd_to = EV_TSTAMP_HUGE; + return -1; return 0; } @@ -380,7 +458,13 @@ struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); sqe->opcode = IORING_OP_POLL_REMOVE; sqe->fd = fd; - sqe->user_data = -1; + /* Jens Axboe notified me that user_data is not what is documented, but is + * some kind of unique ID that has to match, otherwise the request cannot + * be removed. Since we don't *really* have that, we pass in the old + * generation counter - if that fails, too bad, it will hopefully be removed + * at close time and then be ignored. */ + sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); + sqe->user_data = (uint64_t)-1; iouring_sqe_submit (EV_A_ sqe); /* increment generation counter to avoid handling old events */ @@ -392,6 +476,7 @@ struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); sqe->opcode = IORING_OP_POLL_ADD; sqe->fd = fd; + sqe->addr = 0; sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); sqe->poll_events = (nev & EV_READ ? POLLIN : 0) @@ -431,9 +516,8 @@ uint32_t gen = cqe->user_data >> 32; int res = cqe->res; - /* ignore fd removal events, if there are any. 
TODO: verify */ - /* TODO: yes, this triggers */ - if (cqe->user_data == (__u64)-1) + /* user_data -1 is a remove that we are not atm. interested in */ + if (cqe->user_data == (uint64_t)-1) return; assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); @@ -445,14 +529,14 @@ */ /* ignore event if generation doesn't match */ + /* other than skipping removal events, */ /* this should actually be very rare */ if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) return; if (ecb_expect_false (res < 0)) { - //TODO: EINVAL handling (was something failed with this fd) - //TODO: EBUSY happens when? + /*TODO: EINVAL handling (was something failed with this fd)*/ if (res == -EBADF) { @@ -568,7 +652,11 @@ iouring_poll (EV_P_ ev_tstamp timeout) { /* if we have events, no need for extra syscalls, but we might have to queue events */ - if (iouring_handle_cq (EV_A)) + /* we also clear the timeout if there are outstanding fdchanges */ + /* the latter should only happen if both the sq and cq are full, most likely */ + /* because we have a lot of event sources that immediately complete */ + /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */ + if (iouring_handle_cq (EV_A) || fdchangecnt) timeout = EV_TS_CONST (0.); else /* no events, so maybe wait for some */ @@ -577,19 +665,13 @@ /* only enter the kernel if we have something to submit, or we need to wait */ if (timeout || iouring_to_submit) { - int res; - - EV_RELEASE_CB; - - res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, - timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); - iouring_to_submit = 0; - - EV_ACQUIRE_CB; + int res = iouring_enter (EV_A_ timeout); if (ecb_expect_false (res < 0)) if (errno == EINTR) /* ignore */; + else if (errno == EBUSY) + /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */; else ev_syserr ("(libev) iouring setup"); else