1 | /* |
1 | /* |
2 | * libev linux io_uring fd activity backend |
2 | * libev linux io_uring fd activity backend |
3 | * |
3 | * |
4 | * Copyright (c) 2019 Marc Alexander Lehmann <libev@schmorp.de> |
4 | * Copyright (c) 2019-2020 Marc Alexander Lehmann <libev@schmorp.de> |
5 | * All rights reserved. |
5 | * All rights reserved. |
6 | * |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without modifica- |
7 | * Redistribution and use in source and binary forms, with or without modifica- |
8 | * tion, are permitted provided that the following conditions are met: |
8 | * tion, are permitted provided that the following conditions are met: |
9 | * |
9 | * |
… | |
… | |
44 | * b) best is not necessarily very good. |
44 | * b) best is not necessarily very good. |
45 | * c) it's better than the aio mess, doesn't suffer from the fork problems |
45 | * c) it's better than the aio mess, doesn't suffer from the fork problems |
46 | * of linux aio or epoll and so on and so on. and you could do event stuff |
46 | * of linux aio or epoll and so on and so on. and you could do event stuff |
47 | * without any syscalls. what's not to like? |
47 | * without any syscalls. what's not to like? |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
49 | * e) why 3 mmaps instead of one? one would be more space-efficient, |
49 | * e) why two mmaps instead of one? one would be more space-efficient, |
50 | * and I can't see what benefit three would have (other than being |
50 | * and I can't see what benefit two would have (other than being |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
52 | * (FIXME: newer kernels can use 2 mmaps only, need to look into this). |
|
|
53 | * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and |
52 | * f) hmm, it's practically undebuggable (gdb can't access the memory, and |
54 | * the bizarre way structure offsets are communicated makes it hard to |
53 | * the bizarre way structure offsets are communicated makes it hard to |
55 | * just print the ring buffer heads, even *iff* the memory were visible |
54 | * just print the ring buffer heads, even *iff* the memory were visible |
56 | * in gdb. but then, that's also ok, really. |
55 | * in gdb. but then, that's also ok, really. |
57 | * g) well, you cannot specify a timeout when waiting for events. no, |
56 | * g) well, you cannot specify a timeout when waiting for events. no, |
58 | * seriously, the interface doesn't support a timeout. never seen _that_ |
57 | * seriously, the interface doesn't support a timeout. never seen _that_ |
59 | * before. sure, you can use a timerfd, but that's another syscall |
58 | * before. sure, you can use a timerfd, but that's another syscall |
60 | * you could have avoided. overall, this bizarre omission smells |
59 | * you could have avoided. overall, this bizarre omission smells |
61 | * like a µ-optimisation by the io_uring author for his personal |
60 | * like a µ-optimisation by the io_uring author for his personal |
62 | * applications, to the detriment of everybody else who just wants |
61 | * applications, to the detriment of everybody else who just wants |
63 | * an event loop. but, umm, ok, if that's all, it could be worse. |
62 | * an event loop. but, umm, ok, if that's all, it could be worse. |
64 | * (FIXME: jens mentioned timeout commands, need to investigate) |
63 | * (from what I gather from the author Jens Axboe, it simply didn't |
|
|
64 | * occur to him, and he made good on it by adding an unlimited number |
|
|
65 | * of timeouts later :). |
65 | * h) there is a hardcoded limit of 4096 outstanding events. okay, |
66 | * h) initially there was a hardcoded limit of 4096 outstanding events. |
66 | * at least there is no arbitrary low system-wide limit... |
67 | * later versions not only bump this to 32k, but also can handle |
67 | * (FIXME: apparently, this was increased to 32768 in later kernels) |
68 | * an unlimited amount of events, so this only affects the batch size. |
68 | * i) unlike linux aio, you *can* register more than the limit |
69 | * i) unlike linux aio, you *can* register more than the limit |
69 | * of fd events, and the kernel will "gracefully" signal an |
70 | * of fd events. while early versions of io_uring signalled an overflow |
70 | * overflow, after which you could destroy and recreate the kernel |
71 | * and you ended up getting wet. 5.5+ does not do this anymore. |
71 | * state, a bit bigger, or fall back to e.g. poll. that's not |
|
|
72 | * totally insane, but kind of questions the point of a high |
|
|
73 | * performance I/O framework when it doesn't really work |
|
|
74 | * under stress. |
|
|
75 | * (FIXME: iouring should no longer drop events, need to investigate) |
|
|
76 | * j) but, oh my! is has exactly the same bugs as the linux aio backend, |
72 | * j) but, oh my! it had exactly the same bugs as the linux aio backend, |
77 | * where some undocumented poll combinations just fail. |
73 | * where some undocumented poll combinations just fail. fortunately, |
78 | * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, |
74 | * after finally reaching the author, he was more than willing to fix |
79 | * this is completely undocumented, have I mentioned this already? |
75 | * this probably in 5.6+. |
80 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
76 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
81 | * the big issues with it are the bugs requiring epoll, which might |
77 | * once the bugs are fixed (probably in 5.6+), it will be without |
82 | * or might not get fixed (do I hold my breath?). |
78 | * competition. |
83 | */ |
79 | */ |
84 | |
80 | |
85 | /* TODO: use internal TIMEOUT */ |
81 | /* TODO: use internal TIMEOUT */ |
86 | /* TODO: take advantage of single mmap, NODROP etc. */ |
82 | /* TODO: take advantage of single mmap, NODROP etc. */ |
87 | /* TODO: resize cq/sq size independently */ |
83 | /* TODO: resize cq/sq size independently */ |
… | |
… | |
359 | |
355 | |
360 | if (errno != EINVAL) |
356 | if (errno != EINVAL) |
361 | return -1; /* we failed */ |
357 | return -1; /* we failed */ |
362 | |
358 | |
363 | #if TODO |
359 | #if TODO |
364 | if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP)) |
360 | if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE)) |
365 | return -1; /* we require the above features */ |
361 | return -1; /* we require the above features */ |
366 | #endif |
362 | #endif |
367 | |
363 | |
368 | /* EINVAL: lots of possible reasons, but maybe |
364 | /* EINVAL: lots of possible reasons, but maybe |
369 | * it is because we hit the unqueryable hardcoded size limit |
365 | * it is because we hit the unqueryable hardcoded size limit |
… | |
… | |
460 | if (nev) |
456 | if (nev) |
461 | { |
457 | { |
462 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
458 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
463 | sqe->opcode = IORING_OP_POLL_ADD; |
459 | sqe->opcode = IORING_OP_POLL_ADD; |
464 | sqe->fd = fd; |
460 | sqe->fd = fd; |
|
|
461 | sqe->addr = 0; |
465 | sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
462 | sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
466 | sqe->poll_events = |
463 | sqe->poll_events = |
467 | (nev & EV_READ ? POLLIN : 0) |
464 | (nev & EV_READ ? POLLIN : 0) |
468 | | (nev & EV_WRITE ? POLLOUT : 0); |
465 | | (nev & EV_WRITE ? POLLOUT : 0); |
469 | iouring_sqe_submit (EV_A_ sqe); |
466 | iouring_sqe_submit (EV_A_ sqe); |
… | |
… | |
520 | return; |
517 | return; |
521 | |
518 | |
522 | if (ecb_expect_false (res < 0)) |
519 | if (ecb_expect_false (res < 0)) |
523 | { |
520 | { |
524 | /*TODO: EINVAL handling (was something failed with this fd)*/ |
521 | /*TODO: EINVAL handling (was something failed with this fd)*/ |
525 | /*TODO: EBUSY happens when?*/ |
|
|
526 | |
522 | |
527 | if (res == -EBADF) |
523 | if (res == -EBADF) |
528 | { |
524 | { |
529 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
525 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
530 | fd_kill (EV_A_ fd); |
526 | fd_kill (EV_A_ fd); |