…
 * k) overall, the *API* itself is, I dare to say, not a total trainwreck.
 *    the big issues with it are the bugs requiring epoll, which might
 *    or might not get fixed (do I hold my breath?).
 */

/* TODO: use internal TIMEOUT */
/* TODO: take advantage of single mmap, NODROP etc. */
/* TODO: resize cq/sq size independently */

#include <sys/timerfd.h>
#include <sys/mman.h>
#include <poll.h>
#include <stdint.h>

#define IOURING_INIT_ENTRIES 32

/*****************************************************************************/
/* syscall wrapdadoop - this section has the raw api/abi definitions */
…
  __u32 resv[4];
  struct io_sqring_offsets sq_off;
  struct io_cqring_offsets cq_off;
};

#define IORING_SETUP_CQSIZE 0x00000008

#define IORING_OP_POLL_ADD 6
#define IORING_OP_POLL_REMOVE 7
#define IORING_OP_TIMEOUT 11
#define IORING_OP_TIMEOUT_REMOVE 12

/* relative or absolute, reference clock is CLOCK_MONOTONIC */
struct iouring_kernel_timespec
{
  int64_t tv_sec;
  long long tv_nsec;
};

#define IORING_TIMEOUT_ABS 0x00000001
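
/* Illustrative sketch only (not used by this backend yet, see the TIMEOUT TODO
 * above): a relative timeout could be queued with IORING_OP_TIMEOUT roughly like
 * this, assuming the local struct io_uring_sqe mirrors the kernel ABI members
 * (addr, len, off, timeout_flags) and that the timespec stays valid until the
 * sqe is actually submitted via io_uring_enter:
 *
 *   struct iouring_kernel_timespec ts = { 0, 500000000 };  relative 500ms
 *   struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
 *   sqe->opcode        = IORING_OP_TIMEOUT;
 *   sqe->fd            = -1;
 *   sqe->addr          = (__u64)(uintptr_t)&ts;
 *   sqe->len           = 1;          exactly one timespec
 *   sqe->off           = 0;          no completion count, pure timeout
 *   sqe->timeout_flags = 0;          or IORING_TIMEOUT_ABS for an absolute deadline
 *   sqe->user_data     = (__u64)-1;  hypothetical marker to ignore in the cqe handler
 *   iouring_sqe_submit (EV_A_ sqe);
 */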

#define IORING_ENTER_GETEVENTS 0x01

#define IORING_OFF_SQ_RING 0x00000000ULL
#define IORING_OFF_CQ_RING 0x08000000ULL
#define IORING_OFF_SQES 0x10000000ULL
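
/* Illustrative sketch only, not the code this backend uses: the usual way these
 * offsets are consumed is to mmap the sq ring, cq ring and sqe array right after
 * io_uring_setup, with sizes derived from the sq_off/cq_off data the kernel fills
 * into struct io_uring_params (member names as in the kernel ABI):
 *
 *   struct io_uring_params params = { 0 };
 *   int fd = evsys_io_uring_setup (IOURING_INIT_ENTRIES, &params);
 *
 *   size_t sq_size = params.sq_off.array + params.sq_entries * sizeof (unsigned);
 *   char *sq_ring  = mmap (0, sq_size, PROT_READ | PROT_WRITE,
 *                          MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
 *
 *   size_t cq_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe);
 *   char *cq_ring  = mmap (0, cq_size, PROT_READ | PROT_WRITE,
 *                          MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
 *
 * kernels that report IORING_FEAT_SINGLE_MMAP allow sq and cq to share one
 * mapping (see the single-mmap TODO above).
 */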

#define IORING_FEAT_SINGLE_MMAP 0x00000001
#define IORING_FEAT_NODROP 0x00000002
#define IORING_FEAT_SUBMIT_STABLE 0x00000004

inline_size
int
evsys_io_uring_setup (unsigned entries, struct io_uring_params *params)
{
…
  iouring_tfd = -1;
  iouring_sq_ring = MAP_FAILED;
  iouring_cq_ring = MAP_FAILED;
  iouring_sqes = MAP_FAILED;

  if (!have_monotonic) /* cannot really happen, but what if11 */
    return -1;

  for (;;)
    {
      iouring_fd = evsys_io_uring_setup (iouring_entries, &params);

      if (iouring_fd >= 0)
        break; /* yippie */

      if (errno != EINVAL)
        return -1; /* we failed */

#if TODO
      if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEAT_SINGLE_MMAP))
        return -1; /* we require the above features */
#endif

      /* EINVAL: lots of possible reasons, but maybe
       * it is because we hit the unqueryable hardcoded size limit
       */

…
    {
      /* we assume the sqe's are all "properly" initialised */
      struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
      sqe->opcode = IORING_OP_POLL_REMOVE;
      sqe->fd = fd;
      /* Jens Axboe notified me that user_data is not what is documented, but is
       * some kind of unique ID that has to match, otherwise the request cannot
       * be removed. Since we don't *really* have that, we pass in the old
       * generation counter - if that fails, too bad, it will hopefully be removed
       * at close time and then be ignored. */
      sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
      iouring_sqe_submit (EV_A_ sqe);

      /* increment generation counter to avoid handling old events */
      ++anfds [fd].egen;
    }
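
  /* Illustrative sketch, not part of the backend proper: the packed user_data
   * above puts the fd in the low 32 bits and the generation counter in the high
   * 32 bits, so a completion can later be matched against the current anfds entry:
   *
   *   __u64 ud     = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
   *   int fd2      = ud & 0xffffffffU;
   *   uint32_t gen = ud >> 32;
   *
   * presumably the POLL_ADD side has to use the same packed value so that the
   * POLL_REMOVE above can name it; a cqe whose generation no longer matches
   * anfds [fd].egen is stale and simply gets dropped (see the cqe handling below).
   */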
…
{
  int fd = cqe->user_data & 0xffffffffU;
  uint32_t gen = cqe->user_data >> 32;
  int res = cqe->res;

  assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax));

  /* documentation lies, of course. the result value is NOT like
   * normal syscalls, but like linux raw syscalls, i.e. negative
   * error numbers. fortunate, as otherwise there would be no way
   * to get error codes at all. still, why not document this?
   */

  /* ignore event if generation doesn't match */
  /* other than skipping removal events, */
  /* this should actually be very rare */
  if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen))
    return;

  if (ecb_expect_false (res < 0))
    {
      /*TODO: EINVAL handling (was something failed with this fd)*/
      /*TODO: EBUSY happens when?*/

      if (res == -EBADF)
        {
          assert (("libev: event loop rejected bad fd", res != -EBADF));
          fd_kill (EV_A_ fd);
…
iouring_overflow (EV_P)
{
  /* we have two options, resize the queue (by tearing down
   * everything and recreating it), or living with it
   * and polling.
   * we implement this by resizing the queue, and, if that fails,
   * we just recreate the state on every failure, which
   * kind of is a very inefficient poll.
   * one danger is, due to the bias toward lower fds,
   * we will only really get events for those, so
   * maybe we need a poll() fallback, after all.
…
  else
    {
      /* we hit the kernel limit, we should fall back to something else.
       * we can either poll() a few times and hope for the best,
       * poll always, or switch to epoll.
       * TODO: is this necessary with newer kernels?
       */

      iouring_internal_destroy (EV_A);

      /* this should make it so that on return, we don't call any uring functions */
      iouring_to_submit = 0;

      for (;;)
        {
          backend = epoll_init (EV_A_ 0);