… | |
… | |
80 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
80 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
81 | * the big issues with it are the bugs requiring epoll, which might |
81 | * the big issues with it are the bugs requiring epoll, which might |
82 | * or might not get fixed (do I hold my breath?). |
82 | * or might not get fixed (do I hold my breath?). |
83 | */ |
83 | */ |
84 | |
84 | |
|
|
85 | /* TODO: use internal TIMEOUT */ |
|
|
86 | /* TODO: take advantage of single mmap, NODROP etc. */ |
|
|
87 | /* TODO: resize cq/sq size independently */ |
|
|
88 | |
85 | #include <sys/timerfd.h> |
89 | #include <sys/timerfd.h> |
86 | #include <sys/mman.h> |
90 | #include <sys/mman.h> |
87 | #include <poll.h> |
91 | #include <poll.h> |
88 | |
92 | |
89 | #define IOURING_INIT_ENTRIES 32 |
93 | #define IOURING_INIT_ENTRIES 32 |
… | |
… | |
100 | { |
104 | { |
101 | __u8 opcode; |
105 | __u8 opcode; |
102 | __u8 flags; |
106 | __u8 flags; |
103 | __u16 ioprio; |
107 | __u16 ioprio; |
104 | __s32 fd; |
108 | __s32 fd; |
|
|
109 | union { |
105 | __u64 off; |
110 | __u64 off; |
|
|
111 | __u64 addr2; |
|
|
112 | }; |
106 | __u64 addr; |
113 | __u64 addr; |
107 | __u32 len; |
114 | __u32 len; |
108 | union { |
115 | union { |
109 | __kernel_rwf_t rw_flags; |
116 | __kernel_rwf_t rw_flags; |
110 | __u32 fsync_flags; |
117 | __u32 fsync_flags; |
111 | __u16 poll_events; |
118 | __u16 poll_events; |
112 | __u32 sync_range_flags; |
119 | __u32 sync_range_flags; |
113 | __u32 msg_flags; |
120 | __u32 msg_flags; |
|
|
121 | __u32 timeout_flags; |
|
|
122 | __u32 accept_flags; |
|
|
123 | __u32 cancel_flags; |
|
|
124 | __u32 open_flags; |
|
|
125 | __u32 statx_flags; |
114 | }; |
126 | }; |
115 | __u64 user_data; |
127 | __u64 user_data; |
116 | union { |
128 | union { |
117 | __u16 buf_index; |
129 | __u16 buf_index; |
118 | __u64 __pad2[3]; |
130 | __u64 __pad2[3]; |
… | |
… | |
155 | __u32 sq_entries; |
167 | __u32 sq_entries; |
156 | __u32 cq_entries; |
168 | __u32 cq_entries; |
157 | __u32 flags; |
169 | __u32 flags; |
158 | __u32 sq_thread_cpu; |
170 | __u32 sq_thread_cpu; |
159 | __u32 sq_thread_idle; |
171 | __u32 sq_thread_idle; |
|
|
172 | __u32 features; |
160 | __u32 resv[5]; |
173 | __u32 resv[4]; |
161 | struct io_sqring_offsets sq_off; |
174 | struct io_sqring_offsets sq_off; |
162 | struct io_cqring_offsets cq_off; |
175 | struct io_cqring_offsets cq_off; |
163 | }; |
176 | }; |
164 | |
177 | |
165 | #define IORING_OP_POLL_ADD 6 |
178 | #define IORING_OP_POLL_ADD 6 |
… | |
… | |
168 | #define IORING_ENTER_GETEVENTS 0x01 |
181 | #define IORING_ENTER_GETEVENTS 0x01 |
169 | |
182 | |
170 | #define IORING_OFF_SQ_RING 0x00000000ULL |
183 | #define IORING_OFF_SQ_RING 0x00000000ULL |
171 | #define IORING_OFF_CQ_RING 0x08000000ULL |
184 | #define IORING_OFF_CQ_RING 0x08000000ULL |
172 | #define IORING_OFF_SQES 0x10000000ULL |
185 | #define IORING_OFF_SQES 0x10000000ULL |
|
|
186 | |
|
|
187 | #define IORING_FEAT_SINGLE_MMAP 0x1 |
|
|
188 | #define IORING_FEAT_NODROP 0x2 |
|
|
189 | #define IORING_FEAT_SUBMIT_STABLE 0x4 |
173 | |
190 | |
174 | inline_size |
191 | inline_size |
175 | int |
192 | int |
176 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
193 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
177 | { |
194 | { |
… | |
… | |
240 | iouring_tfd_cb (EV_P_ struct ev_io *w, int revents) |
257 | iouring_tfd_cb (EV_P_ struct ev_io *w, int revents) |
241 | { |
258 | { |
242 | iouring_tfd_to = EV_TSTAMP_HUGE; |
259 | iouring_tfd_to = EV_TSTAMP_HUGE; |
243 | } |
260 | } |
244 | |
261 | |
245 | static void |
|
|
246 | iouring_epoll_cb (EV_P_ struct ev_io *w, int revents) |
|
|
247 | { |
|
|
248 | epoll_poll (EV_A_ 0); |
|
|
249 | } |
|
|
250 | |
|
|
251 | /* called for full and partial cleanup */ |
262 | /* called for full and partial cleanup */ |
252 | ecb_cold |
263 | ecb_cold |
253 | static int |
264 | static int |
254 | iouring_internal_destroy (EV_P) |
265 | iouring_internal_destroy (EV_P) |
255 | { |
266 | { |
… | |
… | |
258 | |
269 | |
259 | if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); |
270 | if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); |
260 | if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); |
271 | if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); |
261 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); |
272 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); |
262 | |
273 | |
263 | if (ev_is_active (&iouring_epoll_w)) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_epoll_w); |
274 | if (ev_is_active (&iouring_tfd_w)) |
264 | if (ev_is_active (&iouring_tfd_w )) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_tfd_w ); |
275 | { |
|
|
276 | ev_ref (EV_A); |
|
|
277 | ev_io_stop (EV_A_ &iouring_tfd_w); |
|
|
278 | } |
265 | } |
279 | } |
266 | |
280 | |
267 | ecb_cold |
281 | ecb_cold |
268 | static int |
282 | static int |
269 | iouring_internal_init (EV_P) |
283 | iouring_internal_init (EV_P) |
… | |
… | |
346 | iouring_internal_destroy (EV_A); |
360 | iouring_internal_destroy (EV_A); |
347 | |
361 | |
348 | while (iouring_internal_init (EV_A) < 0) |
362 | while (iouring_internal_init (EV_A) < 0) |
349 | ev_syserr ("(libev) io_uring_setup"); |
363 | ev_syserr ("(libev) io_uring_setup"); |
350 | |
364 | |
351 | /* forking epoll should also effectively unregister all fds from the backend */ |
|
|
352 | epoll_fork (EV_A); |
|
|
353 | /* epoll_fork already did this. hopefully */ |
|
|
354 | /*fd_rearm_all (EV_A);*/ |
365 | fd_rearm_all (EV_A); |
355 | |
|
|
356 | ev_io_stop (EV_A_ &iouring_epoll_w); |
|
|
357 | ev_io_set (EV_A_ &iouring_epoll_w, backend_fd, EV_READ); |
|
|
358 | ev_io_start (EV_A_ &iouring_epoll_w); |
|
|
359 | |
366 | |
360 | ev_io_stop (EV_A_ &iouring_tfd_w); |
367 | ev_io_stop (EV_A_ &iouring_tfd_w); |
361 | ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); |
368 | ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); |
362 | ev_io_start (EV_A_ &iouring_tfd_w); |
369 | ev_io_start (EV_A_ &iouring_tfd_w); |
363 | } |
370 | } |
… | |
… | |
365 | /*****************************************************************************/ |
372 | /*****************************************************************************/ |
366 | |
373 | |
367 | static void |
374 | static void |
368 | iouring_modify (EV_P_ int fd, int oev, int nev) |
375 | iouring_modify (EV_P_ int fd, int oev, int nev) |
369 | { |
376 | { |
370 | if (ecb_expect_false (anfds [fd].eflags)) |
|
|
371 | { |
|
|
372 | /* we handed this fd over to epoll, so undo this first */ |
|
|
373 | /* we do it manually because the optimisations on epoll_modify won't do us any good */ |
|
|
374 | epoll_ctl (iouring_fd, EPOLL_CTL_DEL, fd, 0); |
|
|
375 | anfds [fd].eflags = 0; |
|
|
376 | oev = 0; |
|
|
377 | } |
|
|
378 | |
|
|
379 | if (oev) |
377 | if (oev) |
380 | { |
378 | { |
381 | /* we assume the sqe's are all "properly" initialised */ |
379 | /* we assume the sqe's are all "properly" initialised */ |
382 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
380 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
383 | sqe->opcode = IORING_OP_POLL_REMOVE; |
381 | sqe->opcode = IORING_OP_POLL_REMOVE; |
… | |
… | |
450 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
448 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
451 | return; |
449 | return; |
452 | |
450 | |
453 | if (ecb_expect_false (res < 0)) |
451 | if (ecb_expect_false (res < 0)) |
454 | { |
452 | { |
455 | if (res == -EINVAL) |
453 | //TODO: EINVAL handling (was something failed with this fd) |
456 | { |
454 | //TODO: EBUSY happens when? |
457 | /* we assume this error code means the fd/poll combination is buggy |
|
|
458 | * and fall back to epoll. |
|
|
459 | * this error code might also indicate a bug, but the kernel doesn't |
|
|
460 | * distinguish between those two conditions, so... sigh... |
|
|
461 | */ |
|
|
462 | |
455 | |
463 | epoll_modify (EV_A_ fd, 0, anfds [fd].events); |
|
|
464 | } |
|
|
465 | else if (res == -EBADF) |
456 | if (res == -EBADF) |
466 | { |
457 | { |
467 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
458 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
468 | fd_kill (EV_A_ fd); |
459 | fd_kill (EV_A_ fd); |
469 | } |
460 | } |
470 | else |
461 | else |
… | |
… | |
607 | |
598 | |
608 | inline_size |
599 | inline_size |
609 | int |
600 | int |
610 | iouring_init (EV_P_ int flags) |
601 | iouring_init (EV_P_ int flags) |
611 | { |
602 | { |
612 | if (!epoll_init (EV_A_ 0)) |
|
|
613 | return 0; |
|
|
614 | |
|
|
615 | iouring_entries = IOURING_INIT_ENTRIES; |
603 | iouring_entries = IOURING_INIT_ENTRIES; |
616 | iouring_max_entries = 0; |
604 | iouring_max_entries = 0; |
617 | |
605 | |
618 | if (iouring_internal_init (EV_A) < 0) |
606 | if (iouring_internal_init (EV_A) < 0) |
619 | { |
607 | { |
620 | iouring_internal_destroy (EV_A); |
608 | iouring_internal_destroy (EV_A); |
621 | return 0; |
609 | return 0; |
622 | } |
610 | } |
623 | |
611 | |
624 | ev_io_init (&iouring_epoll_w, iouring_epoll_cb, backend_fd, EV_READ); |
|
|
625 | ev_set_priority (&iouring_epoll_w, EV_MAXPRI); |
|
|
626 | |
|
|
627 | ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); |
612 | ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); |
628 | ev_set_priority (&iouring_tfd_w, EV_MAXPRI); |
613 | ev_set_priority (&iouring_tfd_w, EV_MINPRI); |
629 | |
|
|
630 | ev_io_start (EV_A_ &iouring_epoll_w); |
|
|
631 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
|
|
632 | |
|
|
633 | ev_io_start (EV_A_ &iouring_tfd_w); |
614 | ev_io_start (EV_A_ &iouring_tfd_w); |
634 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
615 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
635 | |
616 | |
636 | backend_modify = iouring_modify; |
617 | backend_modify = iouring_modify; |
637 | backend_poll = iouring_poll; |
618 | backend_poll = iouring_poll; |
… | |
… | |
642 | inline_size |
623 | inline_size |
643 | void |
624 | void |
644 | iouring_destroy (EV_P) |
625 | iouring_destroy (EV_P) |
645 | { |
626 | { |
646 | iouring_internal_destroy (EV_A); |
627 | iouring_internal_destroy (EV_A); |
647 | epoll_destroy (EV_A); |
|
|
648 | } |
628 | } |
649 | |
629 | |