… | |
… | |
44 | * b) best is not necessarily very good. |
44 | * b) best is not necessarily very good. |
45 | * c) it's better than the aio mess, doesn't suffer from the fork problems |
45 | * c) it's better than the aio mess, doesn't suffer from the fork problems |
46 | * of linux aio or epoll and so on and so on. and you could do event stuff |
46 | * of linux aio or epoll and so on and so on. and you could do event stuff |
47 | * without any syscalls. what's not to like? |
47 | * without any syscalls. what's not to like? |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
49 | * e) why 3 mmaps instead of one? one would be more space-efficient, |
49 | * e) why two mmaps instead of one? one would be more space-efficient, |
50 | * and I can't see what benefit three would have (other than being |
50 | * and I can't see what benefit two would have (other than being |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
52 | * (FIXME: newer kernels can use 2 mmaps only, need to look into this). |
|
|
53 | * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and |
52 | * f) hmm, it's practically undebuggable (gdb can't access the memory, and |
54 | * the bizarre way structure offsets are communicated makes it hard to |
53 | * the bizarre way structure offsets are communicated makes it hard to |
55 | * just print the ring buffer heads, even *iff* the memory were visible |
54 | * just print the ring buffer heads, even *iff* the memory were visible |
56 | * in gdb. but then, that's also ok, really. |
55 | * in gdb. but then, that's also ok, really. |
57 | * g) well, you cannot specify a timeout when waiting for events. no, |
56 | * g) well, you cannot specify a timeout when waiting for events. no, |
58 | * seriously, the interface doesn't support a timeout. never seen _that_ |
57 | * seriously, the interface doesn't support a timeout. never seen _that_ |
59 | * before. sure, you can use a timerfd, but that's another syscall |
58 | * before. sure, you can use a timerfd, but that's another syscall |
60 | * you could have avoided. overall, this bizarre omission smells |
59 | * you could have avoided. overall, this bizarre omission smells |
61 | * like a µ-optimisation by the io_uring author for his personal |
60 | * like a µ-optimisation by the io_uring author for his personal |
62 | * applications, to the detriment of everybody else who just wants |
61 | * applications, to the detriment of everybody else who just wants |
63 | * an event loop. but, umm, ok, if that's all, it could be worse. |
62 | * an event loop. but, umm, ok, if that's all, it could be worse. |
64 | * (FIXME: jens mentioned timeout commands, need to investigate) |
63 | * (from what I gather from Jens Axboe, it simply didn't occur to him, |
|
|
64 | * and he made good on it by adding an unlimited number of timeouts |
|
|
65 | * later :). |
65 | * h) there is a hardcoded limit of 4096 outstanding events. okay, |
66 | * h) initially there was a hardcoded limit of 4096 outstanding events. |
66 | * at least there is no arbitrary low system-wide limit... |
67 | * later versions not only bump this to 32k, but also can handle |
67 | * (FIXME: apparently, this was increased to 32768 in later kernels( |
68 | * an unlimited amount of events, so this only affects the batch size. |
68 | * i) unlike linux aio, you *can* register more than the limit |
69 | * i) unlike linux aio, you *can* register more than the limit |
69 | * of fd events, and the kernel will "gracefully" signal an |
70 | * of fd events. while early versions of io_uring signalled an overflow |
70 | * overflow, after which you could destroy and recreate the kernel |
71 | * and you ended up getting wet. 5.5+ does not do this anymore. |
71 | * state, a bit bigger, or fall back to e.g. poll. thats not |
|
|
72 | * totally insane, but kind of questions the point a high |
|
|
73 | * performance I/O framework when it doesn't really work |
|
|
74 | * under stress. |
|
|
75 | * (FIXME: iouring should no longer drop events, need to investigate) |
|
|
76 | * j) but, oh my! is has exactly the same bugs as the linux aio backend, |
72 | * j) but, oh my! it had exactly the same bugs as the linux aio backend, |
77 | * where some undocumented poll combinations just fail. |
73 | * where some undocumented poll combinations just fail. fortunately, |
78 | * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, |
74 | * after finally reaching the author, he was more than willing to fix |
79 | * this is completely undocumented, have I mantioned this already? |
75 | * this probably in 5.6+. |
80 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
76 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
81 | * the big isuess with it are the bugs requiring epoll, which might |
77 | * once the bugs are fixed (probably in 5.6+), it will be without |
82 | * or might not get fixed (do I hold my breath?). |
78 | * competition. |
83 | */ |
79 | */ |
84 | |
80 | |
85 | /* TODO: use internal TIMEOUT */ |
81 | /* TODO: use internal TIMEOUT */ |
86 | /* TODO: take advantage of single mmap, NODROP etc. */ |
82 | /* TODO: take advantage of single mmap, NODROP etc. */ |
87 | /* TODO: resize cq/sq size independently */ |
83 | /* TODO: resize cq/sq size independently */ |
88 | |
84 | |
89 | #include <sys/timerfd.h> |
85 | #include <sys/timerfd.h> |
90 | #include <sys/mman.h> |
86 | #include <sys/mman.h> |
91 | #include <poll.h> |
87 | #include <poll.h> |
|
|
88 | #include <stdint.h> |
92 | |
89 | |
93 | #define IOURING_INIT_ENTRIES 32 |
90 | #define IOURING_INIT_ENTRIES 32 |
94 | |
91 | |
95 | /*****************************************************************************/ |
92 | /*****************************************************************************/ |
96 | /* syscall wrapdadoop - this section has the raw api/abi definitions */ |
93 | /* syscall wrapdadoop - this section has the raw api/abi definitions */ |
… | |
… | |
173 | __u32 resv[4]; |
170 | __u32 resv[4]; |
174 | struct io_sqring_offsets sq_off; |
171 | struct io_sqring_offsets sq_off; |
175 | struct io_cqring_offsets cq_off; |
172 | struct io_cqring_offsets cq_off; |
176 | }; |
173 | }; |
177 | |
174 | |
|
|
175 | #define IORING_SETUP_CQSIZE 0x00000008 |
|
|
176 | |
178 | #define IORING_OP_POLL_ADD 6 |
177 | #define IORING_OP_POLL_ADD 6 |
179 | #define IORING_OP_POLL_REMOVE 7 |
178 | #define IORING_OP_POLL_REMOVE 7 |
|
|
179 | #define IORING_OP_TIMEOUT 11 |
|
|
180 | #define IORING_OP_TIMEOUT_REMOVE 12 |
|
|
181 | |
|
|
182 | /* relative or absolute, reference clock is CLOCK_MONOTONIC */ |
|
|
183 | struct iouring_kernel_timespec |
|
|
184 | { |
|
|
185 | int64_t tv_sec; |
|
|
186 | long long tv_nsec; |
|
|
187 | }; |
|
|
188 | |
|
|
189 | #define IORING_TIMEOUT_ABS 0x00000001 |
180 | |
190 | |
181 | #define IORING_ENTER_GETEVENTS 0x01 |
191 | #define IORING_ENTER_GETEVENTS 0x01 |
182 | |
192 | |
183 | #define IORING_OFF_SQ_RING 0x00000000ULL |
193 | #define IORING_OFF_SQ_RING 0x00000000ULL |
184 | #define IORING_OFF_CQ_RING 0x08000000ULL |
194 | #define IORING_OFF_CQ_RING 0x08000000ULL |
185 | #define IORING_OFF_SQES 0x10000000ULL |
195 | #define IORING_OFF_SQES 0x10000000ULL |
186 | |
196 | |
187 | #define IORING_FEAT_SINGLE_MMAP 0x1 |
197 | #define IORING_FEAT_SINGLE_MMAP 0x00000001 |
188 | #define IORING_FEAT_NODROP 0x2 |
198 | #define IORING_FEAT_NODROP 0x00000002 |
189 | #define IORING_FEAT_SUBMIT_STABLE 0x4 |
199 | #define IORING_FEAT_SUBMIT_STABLE 0x00000004 |
190 | |
200 | |
191 | inline_size |
201 | inline_size |
192 | int |
202 | int |
193 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
203 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
194 | { |
204 | { |
… | |
… | |
214 | |
224 | |
215 | /* the submit/completion queue entries */ |
225 | /* the submit/completion queue entries */ |
216 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
226 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
217 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) |
227 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) |
218 | |
228 | |
|
|
229 | inline_speed |
|
|
230 | int |
|
|
231 | iouring_enter (EV_P_ ev_tstamp timeout) |
|
|
232 | { |
|
|
233 | int res; |
|
|
234 | |
|
|
235 | EV_RELEASE_CB; |
|
|
236 | |
|
|
237 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
238 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
239 | |
|
|
240 | assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit))); |
|
|
241 | |
|
|
242 | iouring_to_submit = 0; |
|
|
243 | |
|
|
244 | EV_ACQUIRE_CB; |
|
|
245 | |
|
|
246 | return res; |
|
|
247 | } |
|
|
248 | |
|
|
249 | /* TODO: can we move things around so we don't need this forward-reference? */ |
|
|
250 | static void |
|
|
251 | iouring_poll (EV_P_ ev_tstamp timeout); |
|
|
252 | |
219 | static |
253 | static |
220 | struct io_uring_sqe * |
254 | struct io_uring_sqe * |
221 | iouring_sqe_get (EV_P) |
255 | iouring_sqe_get (EV_P) |
222 | { |
256 | { |
|
|
257 | unsigned tail; |
|
|
258 | |
|
|
259 | for (;;) |
|
|
260 | { |
223 | unsigned tail = EV_SQ_VAR (tail); |
261 | tail = EV_SQ_VAR (tail); |
224 | |
262 | |
225 | if (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries)) |
263 | if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))) |
|
|
264 | break; /* whats the problem, we have free sqes */ |
|
|
265 | |
|
|
266 | /* queue full, need to flush and possibly handle some events */ |
|
|
267 | |
|
|
268 | #if EV_FEATURE_CODE |
|
|
269 | /* first we ask the kernel nicely, most often this frees up some sqes */ |
|
|
270 | int res = iouring_enter (EV_A_ EV_TS_CONST (0.)); |
|
|
271 | |
|
|
272 | ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */ |
|
|
273 | |
|
|
274 | if (res >= 0) |
|
|
275 | continue; /* yes, it worked, try again */ |
|
|
276 | #endif |
|
|
277 | |
|
|
278 | /* some problem, possibly EBUSY - do the full poll and let it handle any issues */ |
|
|
279 | |
|
|
280 | iouring_poll (EV_A_ EV_TS_CONST (0.)); |
|
|
281 | /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */ |
226 | { |
282 | } |
227 | /* queue full, flush */ |
|
|
228 | evsys_io_uring_enter (iouring_fd, iouring_to_submit, 0, 0, 0, 0); |
|
|
229 | iouring_to_submit = 0; |
|
|
230 | } |
|
|
231 | |
283 | |
232 | assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))); |
284 | /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/ |
233 | |
285 | |
234 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
286 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
235 | } |
287 | } |
236 | |
288 | |
237 | inline_size |
289 | inline_size |
… | |
… | |
289 | iouring_tfd = -1; |
341 | iouring_tfd = -1; |
290 | iouring_sq_ring = MAP_FAILED; |
342 | iouring_sq_ring = MAP_FAILED; |
291 | iouring_cq_ring = MAP_FAILED; |
343 | iouring_cq_ring = MAP_FAILED; |
292 | iouring_sqes = MAP_FAILED; |
344 | iouring_sqes = MAP_FAILED; |
293 | |
345 | |
|
|
346 | if (!have_monotonic) /* cannot really happen, but what if11 */ |
|
|
347 | return -1; |
|
|
348 | |
294 | for (;;) |
349 | for (;;) |
295 | { |
350 | { |
296 | iouring_fd = evsys_io_uring_setup (iouring_entries, ¶ms); |
351 | iouring_fd = evsys_io_uring_setup (iouring_entries, ¶ms); |
297 | |
352 | |
298 | if (iouring_fd >= 0) |
353 | if (iouring_fd >= 0) |
299 | break; /* yippie */ |
354 | break; /* yippie */ |
300 | |
355 | |
301 | if (errno != EINVAL) |
356 | if (errno != EINVAL) |
302 | return -1; /* we failed */ |
357 | return -1; /* we failed */ |
|
|
358 | |
|
|
359 | #if TODO |
|
|
360 | if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE)) |
|
|
361 | return -1; /* we require the above features */ |
|
|
362 | #endif |
303 | |
363 | |
304 | /* EINVAL: lots of possible reasons, but maybe |
364 | /* EINVAL: lots of possible reasons, but maybe |
305 | * it is because we hit the unqueryable hardcoded size limit |
365 | * it is because we hit the unqueryable hardcoded size limit |
306 | */ |
366 | */ |
307 | |
367 | |
… | |
… | |
378 | { |
438 | { |
379 | /* we assume the sqe's are all "properly" initialised */ |
439 | /* we assume the sqe's are all "properly" initialised */ |
380 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
440 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
381 | sqe->opcode = IORING_OP_POLL_REMOVE; |
441 | sqe->opcode = IORING_OP_POLL_REMOVE; |
382 | sqe->fd = fd; |
442 | sqe->fd = fd; |
|
|
443 | /* Jens Axboe notified me that user_data is not what is documented, but is |
|
|
444 | * some kind of unique ID that has to match, otherwise the request cannot |
|
|
445 | * be removed. Since we don't *really* have that, we pass in the old |
|
|
446 | * generation counter - if that fails, too bad, it will hopefully be removed |
|
|
447 | * at close time and then be ignored. */ |
|
|
448 | sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
383 | sqe->user_data = -1; |
449 | sqe->user_data = (uint64_t)-1; |
384 | iouring_sqe_submit (EV_A_ sqe); |
450 | iouring_sqe_submit (EV_A_ sqe); |
385 | |
451 | |
386 | /* increment generation counter to avoid handling old events */ |
452 | /* increment generation counter to avoid handling old events */ |
387 | ++anfds [fd].egen; |
453 | ++anfds [fd].egen; |
388 | } |
454 | } |
… | |
… | |
429 | { |
495 | { |
430 | int fd = cqe->user_data & 0xffffffffU; |
496 | int fd = cqe->user_data & 0xffffffffU; |
431 | uint32_t gen = cqe->user_data >> 32; |
497 | uint32_t gen = cqe->user_data >> 32; |
432 | int res = cqe->res; |
498 | int res = cqe->res; |
433 | |
499 | |
434 | /* ignore fd removal events, if there are any. TODO: verify */ |
500 | /* user_data -1 is a remove that we are not atm. interested in */ |
435 | /* TODO: yes, this triggers */ |
|
|
436 | if (cqe->user_data == (__u64)-1) |
501 | if (cqe->user_data == (uint64_t)-1) |
437 | return; |
502 | return; |
438 | |
503 | |
439 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
504 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
440 | |
505 | |
441 | /* documentation lies, of course. the result value is NOT like |
506 | /* documentation lies, of course. the result value is NOT like |
… | |
… | |
443 | * error numbers. fortunate, as otherwise there would be no way |
508 | * error numbers. fortunate, as otherwise there would be no way |
444 | * to get error codes at all. still, why not document this? |
509 | * to get error codes at all. still, why not document this? |
445 | */ |
510 | */ |
446 | |
511 | |
447 | /* ignore event if generation doesn't match */ |
512 | /* ignore event if generation doesn't match */ |
|
|
513 | /* other than skipping removal events, */ |
448 | /* this should actually be very rare */ |
514 | /* this should actually be very rare */ |
449 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
515 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
450 | return; |
516 | return; |
451 | |
517 | |
452 | if (ecb_expect_false (res < 0)) |
518 | if (ecb_expect_false (res < 0)) |
453 | { |
519 | { |
454 | //TODO: EINVAL handling (was something failed with this fd) |
520 | /*TODO: EINVAL handling (was something failed with this fd)*/ |
455 | //TODO: EBUSY happens when? |
521 | /*TODO: EBUSY happens when?*/ |
456 | |
522 | |
457 | if (res == -EBADF) |
523 | if (res == -EBADF) |
458 | { |
524 | { |
459 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
525 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
460 | fd_kill (EV_A_ fd); |
526 | fd_kill (EV_A_ fd); |
… | |
… | |
566 | |
632 | |
567 | static void |
633 | static void |
568 | iouring_poll (EV_P_ ev_tstamp timeout) |
634 | iouring_poll (EV_P_ ev_tstamp timeout) |
569 | { |
635 | { |
570 | /* if we have events, no need for extra syscalls, but we might have to queue events */ |
636 | /* if we have events, no need for extra syscalls, but we might have to queue events */ |
|
|
637 | /* we also clear the timeout if there are outstanding fdchanges */ |
|
|
638 | /* the latter should only happen if both the sq and cq are full, most likely */ |
|
|
639 | /* because we have a lot of event sources that immediately complete */ |
|
|
640 | /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */ |
571 | if (iouring_handle_cq (EV_A)) |
641 | if (iouring_handle_cq (EV_A) || fdchangecnt) |
572 | timeout = EV_TS_CONST (0.); |
642 | timeout = EV_TS_CONST (0.); |
573 | else |
643 | else |
574 | /* no events, so maybe wait for some */ |
644 | /* no events, so maybe wait for some */ |
575 | iouring_tfd_update (EV_A_ timeout); |
645 | iouring_tfd_update (EV_A_ timeout); |
576 | |
646 | |
577 | /* only enter the kernel if we have something to submit, or we need to wait */ |
647 | /* only enter the kernel if we have something to submit, or we need to wait */ |
578 | if (timeout || iouring_to_submit) |
648 | if (timeout || iouring_to_submit) |
579 | { |
649 | { |
580 | int res; |
650 | int res = iouring_enter (EV_A_ timeout); |
581 | |
|
|
582 | EV_RELEASE_CB; |
|
|
583 | |
|
|
584 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
585 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
586 | iouring_to_submit = 0; |
|
|
587 | |
|
|
588 | EV_ACQUIRE_CB; |
|
|
589 | |
651 | |
590 | if (ecb_expect_false (res < 0)) |
652 | if (ecb_expect_false (res < 0)) |
591 | if (errno == EINTR) |
653 | if (errno == EINTR) |
592 | /* ignore */; |
654 | /* ignore */; |
|
|
655 | else if (errno == EBUSY) |
|
|
656 | /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */; |
593 | else |
657 | else |
594 | ev_syserr ("(libev) iouring setup"); |
658 | ev_syserr ("(libev) iouring setup"); |
595 | else |
659 | else |
596 | iouring_handle_cq (EV_A); |
660 | iouring_handle_cq (EV_A); |
597 | } |
661 | } |