… | |
… | |
87 | /* TODO: resize cq/sq size independently */ |
87 | /* TODO: resize cq/sq size independently */ |
88 | |
88 | |
89 | #include <sys/timerfd.h> |
89 | #include <sys/timerfd.h> |
90 | #include <sys/mman.h> |
90 | #include <sys/mman.h> |
91 | #include <poll.h> |
91 | #include <poll.h> |
|
|
92 | #include <stdint.h> |
92 | |
93 | |
93 | #define IOURING_INIT_ENTRIES 32 |
94 | #define IOURING_INIT_ENTRIES 32 |
94 | |
95 | |
95 | /*****************************************************************************/ |
96 | /*****************************************************************************/ |
96 | /* syscall wrapdadoop - this section has the raw api/abi definitions */ |
97 | /* syscall wrapdadoop - this section has the raw api/abi definitions */ |
… | |
… | |
173 | __u32 resv[4]; |
174 | __u32 resv[4]; |
174 | struct io_sqring_offsets sq_off; |
175 | struct io_sqring_offsets sq_off; |
175 | struct io_cqring_offsets cq_off; |
176 | struct io_cqring_offsets cq_off; |
176 | }; |
177 | }; |
177 | |
178 | |
|
|
179 | #define IORING_SETUP_CQSIZE 0x00000008 |
|
|
180 | |
178 | #define IORING_OP_POLL_ADD 6 |
181 | #define IORING_OP_POLL_ADD 6 |
179 | #define IORING_OP_POLL_REMOVE 7 |
182 | #define IORING_OP_POLL_REMOVE 7 |
|
|
183 | #define IORING_OP_TIMEOUT 11 |
|
|
184 | #define IORING_OP_TIMEOUT_REMOVE 12 |
|
|
185 | |
|
|
186 | /* relative or absolute, reference clock is CLOCK_MONOTONIC */ |
|
|
187 | struct iouring_kernel_timespec |
|
|
188 | { |
|
|
189 | int64_t tv_sec; |
|
|
190 | long long tv_nsec; |
|
|
191 | }; |
|
|
192 | |
|
|
193 | #define IORING_TIMEOUT_ABS 0x00000001 |
180 | |
194 | |
181 | #define IORING_ENTER_GETEVENTS 0x01 |
195 | #define IORING_ENTER_GETEVENTS 0x01 |
182 | |
196 | |
183 | #define IORING_OFF_SQ_RING 0x00000000ULL |
197 | #define IORING_OFF_SQ_RING 0x00000000ULL |
184 | #define IORING_OFF_CQ_RING 0x08000000ULL |
198 | #define IORING_OFF_CQ_RING 0x08000000ULL |
185 | #define IORING_OFF_SQES 0x10000000ULL |
199 | #define IORING_OFF_SQES 0x10000000ULL |
186 | |
200 | |
187 | #define IORING_FEAT_SINGLE_MMAP 0x1 |
201 | #define IORING_FEAT_SINGLE_MMAP 0x00000001 |
188 | #define IORING_FEAT_NODROP 0x2 |
202 | #define IORING_FEAT_NODROP 0x00000002 |
189 | #define IORING_FEAT_SUBMIT_STABLE 0x4 |
203 | #define IORING_FEAT_SUBMIT_STABLE 0x00000004 |
190 | |
204 | |
191 | inline_size |
205 | inline_size |
192 | int |
206 | int |
193 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
207 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
194 | { |
208 | { |
… | |
… | |
214 | |
228 | |
215 | /* the submit/completion queue entries */ |
229 | /* the submit/completion queue entries */ |
216 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
230 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
217 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) |
231 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) |
218 | |
232 | |
|
|
233 | /* TODO: this is not enough, we might have to reap events */ |
|
|
234 | /* TODO: but we can't, as that will re-arm events, causing */ |
|
|
235 | /* TODO: an endless loop in fd_reify */ |
|
|
236 | static int |
|
|
237 | iouring_enter (EV_P_ ev_tstamp timeout) |
|
|
238 | { |
|
|
239 | int res; |
|
|
240 | |
|
|
241 | EV_RELEASE_CB; |
|
|
242 | |
|
|
243 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
244 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
245 | |
|
|
246 | assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit))); |
|
|
247 | |
|
|
248 | iouring_to_submit = 0; |
|
|
249 | |
|
|
250 | EV_ACQUIRE_CB; |
|
|
251 | |
|
|
252 | return res; |
|
|
253 | } |
|
|
254 | |
219 | static |
255 | static |
220 | struct io_uring_sqe * |
256 | struct io_uring_sqe * |
221 | iouring_sqe_get (EV_P) |
257 | iouring_sqe_get (EV_P) |
222 | { |
258 | { |
223 | unsigned tail = EV_SQ_VAR (tail); |
259 | unsigned tail = EV_SQ_VAR (tail); |
224 | |
260 | |
225 | if (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries)) |
261 | while (ecb_expect_false (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries))) |
226 | { |
262 | { |
227 | /* queue full, flush */ |
263 | /* queue full, need to flush */ |
228 | evsys_io_uring_enter (iouring_fd, iouring_to_submit, 0, 0, 0, 0); |
|
|
229 | iouring_to_submit = 0; |
|
|
230 | } |
|
|
231 | |
264 | |
|
|
265 | int res = iouring_enter (EV_A_ EV_TS_CONST (0.)); |
|
|
266 | |
|
|
267 | /* io_uring_enter might fail with EBUSY and won't submit anything */ |
|
|
268 | /* unfortunately, we can't handle this at the moment */ |
|
|
269 | |
|
|
270 | if (res < 0 && errno == EBUSY) |
|
|
271 | //TODO |
|
|
272 | ev_syserr ("(libev) io_uring_enter could not clear sq"); |
|
|
273 | else |
|
|
274 | break; |
|
|
275 | |
|
|
276 | /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE */ |
|
|
277 | } |
|
|
278 | |
232 | assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))); |
279 | /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/ |
233 | |
280 | |
234 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
281 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
235 | } |
282 | } |
236 | |
283 | |
237 | inline_size |
284 | inline_size |
… | |
… | |
288 | |
335 | |
289 | iouring_tfd = -1; |
336 | iouring_tfd = -1; |
290 | iouring_sq_ring = MAP_FAILED; |
337 | iouring_sq_ring = MAP_FAILED; |
291 | iouring_cq_ring = MAP_FAILED; |
338 | iouring_cq_ring = MAP_FAILED; |
292 | iouring_sqes = MAP_FAILED; |
339 | iouring_sqes = MAP_FAILED; |
|
|
340 | |
|
|
341 | if (!have_monotonic) /* cannot really happen, but what if11 */ |
|
|
342 | return -1; |
293 | |
343 | |
294 | for (;;) |
344 | for (;;) |
295 | { |
345 | { |
296 | iouring_fd = evsys_io_uring_setup (iouring_entries, ¶ms); |
346 | iouring_fd = evsys_io_uring_setup (iouring_entries, ¶ms); |
297 | |
347 | |
… | |
… | |
383 | { |
433 | { |
384 | /* we assume the sqe's are all "properly" initialised */ |
434 | /* we assume the sqe's are all "properly" initialised */ |
385 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
435 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
386 | sqe->opcode = IORING_OP_POLL_REMOVE; |
436 | sqe->opcode = IORING_OP_POLL_REMOVE; |
387 | sqe->fd = fd; |
437 | sqe->fd = fd; |
388 | sqe->user_data = -1; |
438 | /* Jens Axboe notified me that user_data is not what is documented, but is |
|
|
439 | * some kind of unique ID that has to match, otherwise the request cannot |
|
|
440 | * be removed. Since we don't *really* have that, we pass in the old |
|
|
441 | * generation counter - if that fails, too bad, it will hopefully be removed |
|
|
442 | * at close time and then be ignored. */ |
|
|
443 | sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
389 | iouring_sqe_submit (EV_A_ sqe); |
444 | iouring_sqe_submit (EV_A_ sqe); |
390 | |
445 | |
391 | /* increment generation counter to avoid handling old events */ |
446 | /* increment generation counter to avoid handling old events */ |
392 | ++anfds [fd].egen; |
447 | ++anfds [fd].egen; |
393 | } |
448 | } |
… | |
… | |
434 | { |
489 | { |
435 | int fd = cqe->user_data & 0xffffffffU; |
490 | int fd = cqe->user_data & 0xffffffffU; |
436 | uint32_t gen = cqe->user_data >> 32; |
491 | uint32_t gen = cqe->user_data >> 32; |
437 | int res = cqe->res; |
492 | int res = cqe->res; |
438 | |
493 | |
439 | /* ignore fd removal events, if there are any. TODO: verify */ |
|
|
440 | /* TODO: yes, this triggers */ |
|
|
441 | if (cqe->user_data == (__u64)-1) |
|
|
442 | return; |
|
|
443 | |
|
|
444 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
494 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
445 | |
495 | |
446 | /* documentation lies, of course. the result value is NOT like |
496 | /* documentation lies, of course. the result value is NOT like |
447 | * normal syscalls, but like linux raw syscalls, i.e. negative |
497 | * normal syscalls, but like linux raw syscalls, i.e. negative |
448 | * error numbers. fortunate, as otherwise there would be no way |
498 | * error numbers. fortunate, as otherwise there would be no way |
449 | * to get error codes at all. still, why not document this? |
499 | * to get error codes at all. still, why not document this? |
450 | */ |
500 | */ |
451 | |
501 | |
452 | /* ignore event if generation doesn't match */ |
502 | /* ignore event if generation doesn't match */ |
|
|
503 | /* other than skipping removal events, */ |
453 | /* this should actually be very rare */ |
504 | /* this should actually be very rare */ |
454 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
505 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
455 | return; |
506 | return; |
456 | |
507 | |
457 | if (ecb_expect_false (res < 0)) |
508 | if (ecb_expect_false (res < 0)) |
… | |
… | |
580 | iouring_tfd_update (EV_A_ timeout); |
631 | iouring_tfd_update (EV_A_ timeout); |
581 | |
632 | |
582 | /* only enter the kernel if we have something to submit, or we need to wait */ |
633 | /* only enter the kernel if we have something to submit, or we need to wait */ |
583 | if (timeout || iouring_to_submit) |
634 | if (timeout || iouring_to_submit) |
584 | { |
635 | { |
585 | int res; |
636 | int res = iouring_enter (EV_A_ timeout); |
586 | |
|
|
587 | EV_RELEASE_CB; |
|
|
588 | |
|
|
589 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
590 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
591 | iouring_to_submit = 0; |
|
|
592 | |
|
|
593 | EV_ACQUIRE_CB; |
|
|
594 | |
637 | |
595 | if (ecb_expect_false (res < 0)) |
638 | if (ecb_expect_false (res < 0)) |
596 | if (errno == EINTR) |
639 | if (errno == EINTR) |
597 | /* ignore */; |
640 | /* ignore */; |
|
|
641 | else if (errno == EBUSY) |
|
|
642 | /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */; |
598 | else |
643 | else |
599 | ev_syserr ("(libev) iouring setup"); |
644 | ev_syserr ("(libev) iouring setup"); |
600 | else |
645 | else |
601 | iouring_handle_cq (EV_A); |
646 | iouring_handle_cq (EV_A); |
602 | } |
647 | } |