… | |
… | |
47 | * without any syscalls. what's not to like? |
47 | * without any syscalls. what's not to like? |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
49 | * e) why 3 mmaps instead of one? one would be more space-efficient, |
49 | * e) why 3 mmaps instead of one? one would be more space-efficient, |
50 | * and I can't see what benefit three would have (other than being |
50 | * and I can't see what benefit three would have (other than being |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
|
|
52 | * (FIXME: newer kernels can use 2 mmaps only, need to look into this). |
52 | * f) hmm, it's practically undebuggable (gdb can't access the memory, and |
53 | * f) hmm, it's practically undebuggable (gdb can't access the memory, and |
53 | the bizarre way structure offsets are commuinicated makes it hard to |
54 | * the bizarre way structure offsets are communicated makes it hard to |
54 | * just print the ring buffer heads, even *iff* the memory were visible |
55 | * just print the ring buffer heads, even *iff* the memory were visible |
55 | * in gdb. but then, that's also ok, really. |
56 | * in gdb. but then, that's also ok, really. |
56 | * g) well, you cannot specify a timeout when waiting for events. no, |
57 | * g) well, you cannot specify a timeout when waiting for events. no, |
57 | * seriously, the interface doesn't support a timeout. never seen _that_ |
58 | * seriously, the interface doesn't support a timeout. never seen _that_ |
58 | * before. sure, you can use a timerfd, but that's another syscall |
59 | * before. sure, you can use a timerfd, but that's another syscall |
59 | * you could have avoided. overall, this bizarre omission smells |
60 | * you could have avoided. overall, this bizarre omission smells |
60 | * like a µ-optimisation by the io_uring author for his personal |
61 | * like a µ-optimisation by the io_uring author for his personal |
61 | * applications, to the detriment of everybody else who just wants |
62 | * applications, to the detriment of everybody else who just wants |
62 | * an event loop. but, umm, ok, if that's all, it could be worse. |
63 | * an event loop. but, umm, ok, if that's all, it could be worse. |
|
|
64 | * (FIXME: jens mentioned timeout commands, need to investigate) |
63 | * h) there is a hardcoded limit of 4096 outstanding events. okay, |
65 | * h) there is a hardcoded limit of 4096 outstanding events. okay, |
64 | * at least there is no arbitrary low system-wide limit... |
66 | * at least there is no arbitrary low system-wide limit... |
|
|
67 | * (FIXME: apparently, this was increased to 32768 in later kernels) |
65 | * i) unlike linux aio, you *can* register more than the limit |
68 | * i) unlike linux aio, you *can* register more than the limit |
66 | * of fd events, and the kernel will "gracefully" signal an |
69 | * of fd events, and the kernel will "gracefully" signal an |
67 | * overflow, after which you could destroy and recreate the kernel |
70 | * overflow, after which you could destroy and recreate the kernel |
68 | * state, a bit bigger, or fall back to e.g. poll. that's not |
71 | * state, a bit bigger, or fall back to e.g. poll. that's not |
69 | * totally insane, but kind of questions the point of a high |
72 | * totally insane, but kind of questions the point of a high |
70 | * performance I/O framework when it doesn't really work |
73 | * performance I/O framework when it doesn't really work |
71 | * under stress. |
74 | * under stress. |
|
|
75 | * (FIXME: iouring should no longer drop events, need to investigate) |
72 | * j) but, oh my! it has exactly the same bugs as the linux aio backend, |
76 | * j) but, oh my! it has exactly the same bugs as the linux aio backend, |
73 | * where some undocumented poll combinations just fail. |
77 | * where some undocumented poll combinations just fail. |
74 | * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, |
78 | * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, |
75 | * this is completely undocumented, have I mentioned this already? |
79 | * this is completely undocumented, have I mentioned this already? |
76 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
80 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
… | |
… | |
96 | { |
100 | { |
97 | __u8 opcode; |
101 | __u8 opcode; |
98 | __u8 flags; |
102 | __u8 flags; |
99 | __u16 ioprio; |
103 | __u16 ioprio; |
100 | __s32 fd; |
104 | __s32 fd; |
|
|
105 | union { |
101 | __u64 off; |
106 | __u64 off; |
|
|
107 | __u64 addr2; |
|
|
108 | }; |
102 | __u64 addr; |
109 | __u64 addr; |
103 | __u32 len; |
110 | __u32 len; |
104 | union { |
111 | union { |
105 | __kernel_rwf_t rw_flags; |
112 | __kernel_rwf_t rw_flags; |
106 | __u32 fsync_flags; |
113 | __u32 fsync_flags; |
107 | __u16 poll_events; |
114 | __u16 poll_events; |
108 | __u32 sync_range_flags; |
115 | __u32 sync_range_flags; |
109 | __u32 msg_flags; |
116 | __u32 msg_flags; |
|
|
117 | __u32 timeout_flags; |
|
|
118 | __u32 accept_flags; |
|
|
119 | __u32 cancel_flags; |
|
|
120 | __u32 open_flags; |
|
|
121 | __u32 statx_flags; |
110 | }; |
122 | }; |
111 | __u64 user_data; |
123 | __u64 user_data; |
112 | union { |
124 | union { |
113 | __u16 buf_index; |
125 | __u16 buf_index; |
114 | __u64 __pad2[3]; |
126 | __u64 __pad2[3]; |
… | |
… | |
151 | __u32 sq_entries; |
163 | __u32 sq_entries; |
152 | __u32 cq_entries; |
164 | __u32 cq_entries; |
153 | __u32 flags; |
165 | __u32 flags; |
154 | __u32 sq_thread_cpu; |
166 | __u32 sq_thread_cpu; |
155 | __u32 sq_thread_idle; |
167 | __u32 sq_thread_idle; |
|
|
168 | __u32 features; |
156 | __u32 resv[5]; |
169 | __u32 resv[4]; |
157 | struct io_sqring_offsets sq_off; |
170 | struct io_sqring_offsets sq_off; |
158 | struct io_cqring_offsets cq_off; |
171 | struct io_cqring_offsets cq_off; |
159 | }; |
172 | }; |
160 | |
173 | |
161 | #define IORING_OP_POLL_ADD 6 |
174 | #define IORING_OP_POLL_ADD 6 |
… | |
… | |
164 | #define IORING_ENTER_GETEVENTS 0x01 |
177 | #define IORING_ENTER_GETEVENTS 0x01 |
165 | |
178 | |
166 | #define IORING_OFF_SQ_RING 0x00000000ULL |
179 | #define IORING_OFF_SQ_RING 0x00000000ULL |
167 | #define IORING_OFF_CQ_RING 0x08000000ULL |
180 | #define IORING_OFF_CQ_RING 0x08000000ULL |
168 | #define IORING_OFF_SQES 0x10000000ULL |
181 | #define IORING_OFF_SQES 0x10000000ULL |
|
|
182 | |
|
|
183 | #define IORING_FEAT_SINGLE_MMAP 0x1 |
|
|
184 | #define IORING_FEAT_NODROP 0x2 |
|
|
185 | #define IORING_FEAT_SUBMIT_STABLE 0x4 |
169 | |
186 | |
170 | inline_size |
187 | inline_size |
171 | int |
188 | int |
172 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
189 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
173 | { |
190 | { |
… | |
… | |
236 | iouring_tfd_cb (EV_P_ struct ev_io *w, int revents) |
253 | iouring_tfd_cb (EV_P_ struct ev_io *w, int revents) |
237 | { |
254 | { |
238 | iouring_tfd_to = EV_TSTAMP_HUGE; |
255 | iouring_tfd_to = EV_TSTAMP_HUGE; |
239 | } |
256 | } |
240 | |
257 | |
241 | static void |
|
|
242 | iouring_epoll_cb (EV_P_ struct ev_io *w, int revents) |
|
|
243 | { |
|
|
244 | epoll_poll (EV_A_ 0); |
|
|
245 | } |
|
|
246 | |
|
|
247 | /* called for full and partial cleanup */ |
258 | /* called for full and partial cleanup */ |
248 | ecb_cold |
259 | ecb_cold |
249 | static int |
260 | static int |
250 | iouring_internal_destroy (EV_P) |
261 | iouring_internal_destroy (EV_P) |
251 | { |
262 | { |
… | |
… | |
254 | |
265 | |
255 | if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); |
266 | if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); |
256 | if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); |
267 | if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); |
257 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); |
268 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); |
258 | |
269 | |
259 | if (ev_is_active (&iouring_epoll_w)) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_epoll_w); |
270 | if (ev_is_active (&iouring_tfd_w)) |
260 | if (ev_is_active (&iouring_tfd_w )) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_tfd_w ); |
271 | { |
|
|
272 | ev_ref (EV_A); |
|
|
273 | ev_io_stop (EV_A_ &iouring_tfd_w); |
|
|
274 | } |
261 | } |
275 | } |
262 | |
276 | |
263 | ecb_cold |
277 | ecb_cold |
264 | static int |
278 | static int |
265 | iouring_internal_init (EV_P) |
279 | iouring_internal_init (EV_P) |
… | |
… | |
342 | iouring_internal_destroy (EV_A); |
356 | iouring_internal_destroy (EV_A); |
343 | |
357 | |
344 | while (iouring_internal_init (EV_A) < 0) |
358 | while (iouring_internal_init (EV_A) < 0) |
345 | ev_syserr ("(libev) io_uring_setup"); |
359 | ev_syserr ("(libev) io_uring_setup"); |
346 | |
360 | |
347 | /* forking epoll should also effectively unregister all fds from the backend */ |
|
|
348 | epoll_fork (EV_A); |
|
|
349 | /* epoll_fork already did this. hopefully */ |
|
|
350 | /*fd_rearm_all (EV_A);*/ |
361 | fd_rearm_all (EV_A); |
351 | |
|
|
352 | ev_io_stop (EV_A_ &iouring_epoll_w); |
|
|
353 | ev_io_set (EV_A_ &iouring_epoll_w, backend_fd, EV_READ); |
|
|
354 | ev_io_start (EV_A_ &iouring_epoll_w); |
|
|
355 | |
362 | |
356 | ev_io_stop (EV_A_ &iouring_tfd_w); |
363 | ev_io_stop (EV_A_ &iouring_tfd_w); |
357 | ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); |
364 | ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); |
358 | ev_io_start (EV_A_ &iouring_tfd_w); |
365 | ev_io_start (EV_A_ &iouring_tfd_w); |
359 | } |
366 | } |
… | |
… | |
361 | /*****************************************************************************/ |
368 | /*****************************************************************************/ |
362 | |
369 | |
363 | static void |
370 | static void |
364 | iouring_modify (EV_P_ int fd, int oev, int nev) |
371 | iouring_modify (EV_P_ int fd, int oev, int nev) |
365 | { |
372 | { |
366 | if (ecb_expect_false (anfds [fd].eflags)) |
|
|
367 | { |
|
|
368 | /* we handed this fd over to epoll, so undo this first */ |
|
|
369 | /* we do it manually because the optimisations on epoll_modify won't do us any good */ |
|
|
370 | epoll_ctl (iouring_fd, EPOLL_CTL_DEL, fd, 0); |
|
|
371 | anfds [fd].eflags = 0; |
|
|
372 | oev = 0; |
|
|
373 | } |
|
|
374 | |
|
|
375 | if (oev) |
373 | if (oev) |
376 | { |
374 | { |
377 | /* we assume the sqe's are all "properly" initialised */ |
375 | /* we assume the sqe's are all "properly" initialised */ |
378 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
376 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
379 | sqe->opcode = IORING_OP_POLL_REMOVE; |
377 | sqe->opcode = IORING_OP_POLL_REMOVE; |
… | |
… | |
446 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
444 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
447 | return; |
445 | return; |
448 | |
446 | |
449 | if (ecb_expect_false (res < 0)) |
447 | if (ecb_expect_false (res < 0)) |
450 | { |
448 | { |
451 | if (res == -EINVAL) |
449 | //TODO: EINVAL handling (was something failed with this fd) |
452 | { |
450 | //TODO: EBUSY happens when? |
453 | /* we assume this error code means the fd/poll combination is buggy |
|
|
454 | * and fall back to epoll. |
|
|
455 | * this error code might also indicate a bug, but the kernel doesn't |
|
|
456 | * distinguish between those two conditions, so... sigh... |
|
|
457 | */ |
|
|
458 | |
451 | |
459 | epoll_modify (EV_A_ fd, 0, anfds [fd].events); |
|
|
460 | } |
|
|
461 | else if (res == -EBADF) |
452 | if (res == -EBADF) |
462 | { |
453 | { |
463 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
454 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
464 | fd_kill (EV_A_ fd); |
455 | fd_kill (EV_A_ fd); |
465 | } |
456 | } |
466 | else |
457 | else |
… | |
… | |
603 | |
594 | |
604 | inline_size |
595 | inline_size |
605 | int |
596 | int |
606 | iouring_init (EV_P_ int flags) |
597 | iouring_init (EV_P_ int flags) |
607 | { |
598 | { |
608 | if (!epoll_init (EV_A_ 0)) |
|
|
609 | return 0; |
|
|
610 | |
|
|
611 | iouring_entries = IOURING_INIT_ENTRIES; |
599 | iouring_entries = IOURING_INIT_ENTRIES; |
612 | iouring_max_entries = 0; |
600 | iouring_max_entries = 0; |
613 | |
601 | |
614 | if (iouring_internal_init (EV_A) < 0) |
602 | if (iouring_internal_init (EV_A) < 0) |
615 | { |
603 | { |
616 | iouring_internal_destroy (EV_A); |
604 | iouring_internal_destroy (EV_A); |
617 | return 0; |
605 | return 0; |
618 | } |
606 | } |
619 | |
607 | |
620 | ev_io_init (&iouring_epoll_w, iouring_epoll_cb, backend_fd, EV_READ); |
|
|
621 | ev_set_priority (&iouring_epoll_w, EV_MAXPRI); |
|
|
622 | |
|
|
623 | ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); |
608 | ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); |
624 | ev_set_priority (&iouring_tfd_w, EV_MAXPRI); |
609 | ev_set_priority (&iouring_tfd_w, EV_MINPRI); |
625 | |
|
|
626 | ev_io_start (EV_A_ &iouring_epoll_w); |
|
|
627 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
|
|
628 | |
|
|
629 | ev_io_start (EV_A_ &iouring_tfd_w); |
610 | ev_io_start (EV_A_ &iouring_tfd_w); |
630 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
611 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
631 | |
612 | |
632 | backend_modify = iouring_modify; |
613 | backend_modify = iouring_modify; |
633 | backend_poll = iouring_poll; |
614 | backend_poll = iouring_poll; |
… | |
… | |
638 | inline_size |
619 | inline_size |
639 | void |
620 | void |
640 | iouring_destroy (EV_P) |
621 | iouring_destroy (EV_P) |
641 | { |
622 | { |
642 | iouring_internal_destroy (EV_A); |
623 | iouring_internal_destroy (EV_A); |
643 | epoll_destroy (EV_A); |
|
|
644 | } |
624 | } |
645 | |
625 | |