--- libev/ev_iouring.c 2019/08/17 05:30:16 1.3 +++ libev/ev_iouring.c 2019/12/28 05:53:48 1.15 @@ -49,8 +49,9 @@ * e) why 3 mmaps instead of one? one would be more space-efficient, * and I can't see what benefit three would have (other than being * somehow resizable/relocatable, but that's apparently not possible). + * (FIXME: newer kernels can use 2 mmaps only, need to look into this). * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and - the bizarre way structure offsets are commuinicated makes it hard to + * the bizarre way structure offsets are communicated makes it hard to * just print the ring buffer heads, even *iff* the memory were visible * in gdb. but then, that's also ok, really. * g) well, you cannot specify a timeout when waiting for events. no, @@ -60,8 +61,10 @@ * like a µ-optimisation by the io_uring author for his personal * applications, to the detriment of everybody else who just wants * an event loop. but, umm, ok, if that's all, it could be worse. + * (FIXME: jens mentioned timeout commands, need to investigate) * h) there is a hardcoded limit of 4096 outstanding events. okay, * at least there is no arbitrary low system-wide limit... + * (FIXME: apparently, this was increased to 32768 in later kernels( * i) unlike linux aio, you *can* register more then the limit * of fd events, and the kernel will "gracefully" signal an * overflow, after which you could destroy and recreate the kernel @@ -69,6 +72,7 @@ * totally insane, but kind of questions the point a high * performance I/O framework when it doesn't really work * under stress. + * (FIXME: iouring should no longer drop events, need to investigate) * j) but, oh my! is has exactly the same bugs as the linux aio backend, * where some undocumented poll combinations just fail. * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, @@ -78,9 +82,14 @@ * or might not get fixed (do I hold my breath?). 
*/ +/* TODO: use internal TIMEOUT */ +/* TODO: take advantage of single mmap, NODROP etc. */ +/* TODO: resize cq/sq size independently */ + #include #include #include +#include #define IOURING_INIT_ENTRIES 32 @@ -98,7 +107,10 @@ __u8 flags; __u16 ioprio; __s32 fd; - __u64 off; + union { + __u64 off; + __u64 addr2; + }; __u64 addr; __u32 len; union { @@ -107,6 +119,11 @@ __u16 poll_events; __u32 sync_range_flags; __u32 msg_flags; + __u32 timeout_flags; + __u32 accept_flags; + __u32 cancel_flags; + __u32 open_flags; + __u32 statx_flags; }; __u64 user_data; union { @@ -153,13 +170,27 @@ __u32 flags; __u32 sq_thread_cpu; __u32 sq_thread_idle; - __u32 resv[5]; + __u32 features; + __u32 resv[4]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; -#define IORING_OP_POLL_ADD 6 -#define IORING_OP_POLL_REMOVE 7 +#define IORING_SETUP_CQSIZE 0x00000008 + +#define IORING_OP_POLL_ADD 6 +#define IORING_OP_POLL_REMOVE 7 +#define IORING_OP_TIMEOUT 11 +#define IORING_OP_TIMEOUT_REMOVE 12 + +/* relative or absolute, reference clock is CLOCK_MONOTONIC */ +struct iouring_kernel_timespec +{ + int64_t tv_sec; + long long tv_nsec; +}; + +#define IORING_TIMEOUT_ABS 0x00000001 #define IORING_ENTER_GETEVENTS 0x01 @@ -167,6 +198,10 @@ #define IORING_OFF_CQ_RING 0x08000000ULL #define IORING_OFF_SQES 0x10000000ULL +#define IORING_FEAT_SINGLE_MMAP 0x00000001 +#define IORING_FEAT_NODROP 0x00000002 +#define IORING_FEAT_SUBMIT_STABLE 0x00000004 + inline_size int evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) @@ -195,20 +230,54 @@ #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) +/* TODO: this is not enough, we might have to reap events */ +/* TODO: but we can't, as that will re-arm events, causing */ +/* TODO: an endless loop in fd_reify */ +static int +iouring_enter (EV_P_ ev_tstamp timeout) +{ + int res; + + EV_RELEASE_CB; + + res = evsys_io_uring_enter 
(iouring_fd, iouring_to_submit, 1, + timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); + + assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit))); + + iouring_to_submit = 0; + + EV_ACQUIRE_CB; + + return res; +} + static struct io_uring_sqe * iouring_sqe_get (EV_P) { unsigned tail = EV_SQ_VAR (tail); - if (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries)) + while (ecb_expect_false (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries))) { - /* queue full, flush */ - evsys_io_uring_enter (iouring_fd, iouring_to_submit, 0, 0, 0, 0); - iouring_to_submit = 0; + /* queue full, need to flush */ + + int res = iouring_enter (EV_A_ EV_TS_CONST (0.)); + + /* io_uring_enter might fail with EBUSY and won't submit anything */ + /* unfortunately, we can't handle this at the moment */ + + if (res < 0 && errno == EBUSY) + /* the sane thing might be to resize, but we can't */ + //TODO + ev_syserr ("(libev) io_uring_enter could not clear sq"); + else + break; + + /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE */ } - assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))); + /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/ return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); } @@ -238,12 +307,6 @@ iouring_tfd_to = EV_TSTAMP_HUGE; } -static void -iouring_epoll_cb (EV_P_ struct ev_io *w, int revents) -{ - epoll_poll (EV_A_ 0); -} - /* called for full and partial cleanup */ ecb_cold static int @@ -256,8 +319,11 @@ if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); - if (ev_is_active (&iouring_epoll_w)) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_epoll_w); - if (ev_is_active (&iouring_tfd_w )) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_tfd_w ); + if (ev_is_active (&iouring_tfd_w)) + { + ev_ref 
(EV_A); + ev_io_stop (EV_A_ &iouring_tfd_w); + } } ecb_cold @@ -273,6 +339,9 @@ iouring_cq_ring = MAP_FAILED; iouring_sqes = MAP_FAILED; + if (!have_monotonic) /* cannot really happen, but what if11 */ + return -1; + for (;;) { iouring_fd = evsys_io_uring_setup (iouring_entries, &params); @@ -283,6 +352,11 @@ if (errno != EINVAL) return -1; /* we failed */ +#if TODO + if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP)) + return -1; /* we require the above features */ +#endif + /* EINVAL: lots of possible reasons, but maybe * it is because we hit the unqueryable hardcoded size limit */ @@ -344,14 +418,7 @@ while (iouring_internal_init (EV_A) < 0) ev_syserr ("(libev) io_uring_setup"); - /* forking epoll should also effectively unregister all fds from the backend */ - epoll_fork (EV_A); - /* epoll_fork already did this. hopefully */ - /*fd_rearm_all (EV_A);*/ - - ev_io_stop (EV_A_ &iouring_epoll_w); - ev_io_set (EV_A_ &iouring_epoll_w, backend_fd, EV_READ); - ev_io_start (EV_A_ &iouring_epoll_w); + fd_rearm_all (EV_A); ev_io_stop (EV_A_ &iouring_tfd_w); ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); @@ -363,23 +430,19 @@ static void iouring_modify (EV_P_ int fd, int oev, int nev) { - fprintf (stderr,"modify %d (%d, %d) %d\n", fd, oev,nev, anfds[fd].eflags);//D - if (ecb_expect_false (anfds [fd].eflags)) - { - /* we handed this fd over to epoll, so undo this first */ - /* we do it manually because the optimisations on epoll_modify won't do us any good */ - epoll_ctl (iouring_fd, EPOLL_CTL_DEL, fd, 0); - anfds [fd].eflags = 0; - oev = 0; - } - if (oev) { /* we assume the sqe's are all "properly" initialised */ struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); sqe->opcode = IORING_OP_POLL_REMOVE; sqe->fd = fd; - sqe->user_data = -1; + /* Jens Axboe notified me that user_data is not what is documented, but is + * some kind of unique ID that has to match, otherwise the request cannot + * be removed. 
Since we don't *really* have that, we pass in the old + * generation counter - if that fails, too bad, it will hopefully be removed + * at close time and then be ignored. */ + sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); + sqe->user_data = (uint64_t)-1; iouring_sqe_submit (EV_A_ sqe); /* increment generation counter to avoid handling old events */ @@ -430,9 +493,9 @@ uint32_t gen = cqe->user_data >> 32; int res = cqe->res; - /* ignore fd removal events, if there are any. TODO: verify */ - if (cqe->user_data == (__u64)-1) - abort ();//D + /* user_data -1 is a remove that we are not atm. interested in */ + if (cqe->user_data == (uint64_t)-1) + return; assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); @@ -443,23 +506,17 @@ */ /* ignore event if generation doesn't match */ + /* other than skipping removal events, */ /* this should actually be very rare */ if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) return; if (ecb_expect_false (res < 0)) { - if (res == -EINVAL) - { - /* we assume this error code means the fd/poll combination is buggy - * and fall back to epoll. - * this error code might also indicate a bug, but the kernel doesn't - * distinguish between those two conditions, so... sigh... - */ + /*TODO: EINVAL handling (was something failed with this fd)*/ + /*TODO: EBUSY happens when?*/ - epoll_modify (EV_A_ fd, 0, anfds [fd].events); - } - else if (res == -EBADF) + if (res == -EBADF) { assert (("libev: event loop rejected bad fd", res != -EBADF)); fd_kill (EV_A_ fd); @@ -473,8 +530,6 @@ return; } - fprintf (stderr, "fd %d event, rearm\n", fd);//D - /* feed events, we do not expect or handle POLLNVAL */ fd_event ( EV_A_ @@ -497,7 +552,7 @@ /* we have two options, resize the queue (by tearing down * everything and recreating it, or living with it * and polling. 
- * we implement this by resizing tghe queue, and, if that fails, + * we implement this by resizing the queue, and, if that fails, * we just recreate the state on every failure, which * kind of is a very inefficient poll. * one danger is, due to the bios toward lower fds, @@ -519,12 +574,12 @@ /* we hit the kernel limit, we should fall back to something else. * we can either poll() a few times and hope for the best, * poll always, or switch to epoll. - * since we use epoll anyways, go epoll. + * TODO: is this necessary with newer kernels? */ iouring_internal_destroy (EV_A); - /* this should make it so that on return, we don'T call any uring functions */ + /* this should make it so that on return, we don't call any uring functions */ iouring_to_submit = 0; for (;;) @@ -581,22 +636,16 @@ /* no events, so maybe wait for some */ iouring_tfd_update (EV_A_ timeout); - /* only enter the kernel if we have somethign to submit, or we need to wait */ + /* only enter the kernel if we have something to submit, or we need to wait */ if (timeout || iouring_to_submit) { - int res; - - EV_RELEASE_CB; - - res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, - timeout > EV_TS_CONST (0.) ? 
IORING_ENTER_GETEVENTS : 0, 0, 0); - iouring_to_submit = 0; - - EV_ACQUIRE_CB; + int res = iouring_enter (EV_A_ timeout); if (ecb_expect_false (res < 0)) if (errno == EINTR) /* ignore */; + else if (errno == EBUSY) + /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */; else ev_syserr ("(libev) iouring setup"); else @@ -608,15 +657,6 @@ int iouring_init (EV_P_ int flags) { - if (!epoll_init (EV_A_ 0)) - return 0; - - ev_io_init (EV_A_ &iouring_epoll_w, iouring_epoll_cb, backend_fd, EV_READ); - ev_set_priority (&iouring_epoll_w, EV_MAXPRI); - - ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); - ev_set_priority (&iouring_tfd_w, EV_MAXPRI); - iouring_entries = IOURING_INIT_ENTRIES; iouring_max_entries = 0; @@ -626,9 +666,8 @@ return 0; } - ev_io_start (EV_A_ &iouring_epoll_w); - ev_unref (EV_A); /* watcher should not keep loop alive */ - + ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); + ev_set_priority (&iouring_tfd_w, EV_MINPRI); ev_io_start (EV_A_ &iouring_tfd_w); ev_unref (EV_A); /* watcher should not keep loop alive */ @@ -643,6 +682,5 @@ iouring_destroy (EV_P) { iouring_internal_destroy (EV_A); - epoll_destroy (EV_A); }