--- libev/ev_iouring.c 2019/12/28 05:20:17 1.14 +++ libev/ev_iouring.c 2020/01/22 02:20:47 1.21 @@ -1,7 +1,7 @@ /* * libev linux io_uring fd activity backend * - * Copyright (c) 2019 Marc Alexander Lehmann + * Copyright (c) 2019-2020 Marc Alexander Lehmann * All rights reserved. * * Redistribution and use in source and binary forms, with or without modifica- @@ -46,11 +46,10 @@ * of linux aio or epoll and so on and so on. and you could do event stuff * without any syscalls. what's not to like? * d) ok, it's vastly more complex, but that's ok, really. - * e) why 3 mmaps instead of one? one would be more space-efficient, - * and I can't see what benefit three would have (other than being + * e) why two mmaps instead of one? one would be more space-efficient, + * and I can't see what benefit two would have (other than being * somehow resizable/relocatable, but that's apparently not possible). - * (FIXME: newer kernels can use 2 mmaps only, need to look into this). - * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and + * f) hmm, it's practically undebuggable (gdb can't access the memory, and * the bizarre way structure offsets are communicated makes it hard to * just print the ring buffer heads, even *iff* the memory were visible * in gdb. but then, that's also ok, really. @@ -61,25 +60,22 @@ * like a µ-optimisation by the io_uring author for his personal * applications, to the detriment of everybody else who just wants * an event loop. but, umm, ok, if that's all, it could be worse. - * (FIXME: jens mentioned timeout commands, need to investigate) - * h) there is a hardcoded limit of 4096 outstanding events. okay, - * at least there is no arbitrary low system-wide limit... - * (FIXME: apparently, this was increased to 32768 in later kernels( + * (from what I gather from the author Jens Axboe, it simply didn't + * occur to him, and he made good on it by adding an unlimited number + * of timeouts later :). 
+ * h) initially there was a hardcoded limit of 4096 outstanding events. + * later versions not only bump this to 32k, but also can handle + * an unlimited amount of events, so this only affects the batch size. * i) unlike linux aio, you *can* register more then the limit - * of fd events, and the kernel will "gracefully" signal an - * overflow, after which you could destroy and recreate the kernel - * state, a bit bigger, or fall back to e.g. poll. thats not - * totally insane, but kind of questions the point a high - * performance I/O framework when it doesn't really work - * under stress. - * (FIXME: iouring should no longer drop events, need to investigate) - * j) but, oh my! is has exactly the same bugs as the linux aio backend, - * where some undocumented poll combinations just fail. - * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, - * this is completely undocumented, have I mantioned this already? + * of fd events. while early versions of io_uring signalled an overflow + * and you ended up getting wet. 5.5+ does not do this anymore. + * j) but, oh my! it had exactly the same bugs as the linux aio backend, + * where some undocumented poll combinations just fail. fortunately, + * after finally reaching the author, he was more than willing to fix + * this probably in 5.6+. * k) overall, the *API* itself is, I dare to say, not a total trainwreck. - * the big isuess with it are the bugs requiring epoll, which might - * or might not get fixed (do I hold my breath?). + * once the bugs are fixed (probably in 5.6+), it will be without + * competition. 
*/ /* TODO: use internal TIMEOUT */ @@ -230,10 +226,8 @@ #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) -/* TODO: this is not enough, we might have to reap events */ -/* TODO: but we can't, as that will re-arm events, causing */ -/* TODO: an endless loop in fd_reify */ -static int +inline_speed +int iouring_enter (EV_P_ ev_tstamp timeout) { int res; @@ -252,28 +246,39 @@ return res; } +/* TODO: can we move things around so we don't need this forward-reference? */ +static void +iouring_poll (EV_P_ ev_tstamp timeout); + static struct io_uring_sqe * iouring_sqe_get (EV_P) { - unsigned tail = EV_SQ_VAR (tail); - - while (ecb_expect_false (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries))) + unsigned tail; + + for (;;) { - /* queue full, need to flush */ + tail = EV_SQ_VAR (tail); + + if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))) + break; /* whats the problem, we have free sqes */ + /* queue full, need to flush and possibly handle some events */ + +#if EV_FEATURE_CODE + /* first we ask the kernel nicely, most often this frees up some sqes */ int res = iouring_enter (EV_A_ EV_TS_CONST (0.)); - /* io_uring_enter might fail with EBUSY and won't submit anything */ - /* unfortunately, we can't handle this at the moment */ + ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */ - if (res < 0 && errno == EBUSY) - //TODO - ev_syserr ("(libev) io_uring_enter could not clear sq"); - else - break; - - /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE */ + if (res >= 0) + continue; /* yes, it worked, try again */ +#endif + + /* some problem, possibly EBUSY - do the full poll and let it handle any issues */ + + iouring_poll (EV_A_ EV_TS_CONST (0.)); + /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */ } /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/ @@ -352,7 
+357,7 @@ return -1; /* we failed */ #if TODO - if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP)) + if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE)) return -1; /* we require the above features */ #endif @@ -440,7 +445,8 @@ * be removed. Since we don't *really* have that, we pass in the old * generation counter - if that fails, too bad, it will hopefully be removed * at close time and then be ignored. */ - sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); + sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); + sqe->user_data = (uint64_t)-1; iouring_sqe_submit (EV_A_ sqe); /* increment generation counter to avoid handling old events */ @@ -452,6 +458,7 @@ struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); sqe->opcode = IORING_OP_POLL_ADD; sqe->fd = fd; + sqe->addr = 0; sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); sqe->poll_events = (nev & EV_READ ? POLLIN : 0) @@ -491,6 +498,10 @@ uint32_t gen = cqe->user_data >> 32; int res = cqe->res; + /* user_data -1 is a remove that we are not atm. interested in */ + if (cqe->user_data == (uint64_t)-1) + return; + assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); /* documentation lies, of course. 
the result value is NOT like @@ -508,7 +519,6 @@ if (ecb_expect_false (res < 0)) { /*TODO: EINVAL handling (was something failed with this fd)*/ - /*TODO: EBUSY happens when?*/ if (res == -EBADF) { @@ -624,7 +634,11 @@ iouring_poll (EV_P_ ev_tstamp timeout) { /* if we have events, no need for extra syscalls, but we might have to queue events */ - if (iouring_handle_cq (EV_A)) + /* we also clear the timeout if there are outstanding fdchanges */ + /* the latter should only happen if both the sq and cq are full, most likely */ + /* because we have a lot of event sources that immediately complete */ + /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */ + if (iouring_handle_cq (EV_A) || fdchangecnt) timeout = EV_TS_CONST (0.); else /* no events, so maybe wait for some */