… | |
… | |
47 | * without any syscalls. what's not to like? |
47 | * without any syscalls. what's not to like? |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
49 | * e) why 3 mmaps instead of one? one would be more space-efficient, |
49 | * e) why 3 mmaps instead of one? one would be more space-efficient, |
50 | * and I can't see what benefit three would have (other than being |
50 | * and I can't see what benefit three would have (other than being |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
|
|
52 | * (FIXME: newer kernels can use 2 mmaps only, need to look into this). |
52 | * f) hmm, it's practically undebuggable (gdb can't access the memory, and |
53 | * f) hmm, it's practically undebuggable (gdb can't access the memory, and |
53 | the bizarre way structure offsets are commuinicated makes it hard to |
54 | * the bizarre way structure offsets are communicated makes it hard to |
54 | * just print the ring buffer heads, even *iff* the memory were visible |
55 | * just print the ring buffer heads, even *iff* the memory were visible |
55 | * in gdb. but then, that's also ok, really. |
56 | * in gdb. but then, that's also ok, really. |
56 | * g) well, you cannot specify a timeout when waiting for events. no, |
57 | * g) well, you cannot specify a timeout when waiting for events. no, |
57 | * seriously, the interface doesn't support a timeout. never seen _that_ |
58 | * seriously, the interface doesn't support a timeout. never seen _that_ |
58 | * before. sure, you can use a timerfd, but that's another syscall |
59 | * before. sure, you can use a timerfd, but that's another syscall |
59 | * you could have avoided. overall, this bizarre omission smells |
60 | * you could have avoided. overall, this bizarre omission smells |
60 | * like a µ-optimisation by the io_uring author for his personal |
61 | * like a µ-optimisation by the io_uring author for his personal |
61 | * applications, to the detriment of everybody else who just wants |
62 | * applications, to the detriment of everybody else who just wants |
62 | * an event loop. but, umm, ok, if that's all, it could be worse. |
63 | * an event loop. but, umm, ok, if that's all, it could be worse. |
|
|
64 | * (FIXME: jens mentioned timeout commands, need to investigate) |
63 | * h) there is a hardcoded limit of 4096 outstanding events. okay, |
65 | * h) there is a hardcoded limit of 4096 outstanding events. okay, |
64 | * at least there is no arbitrary low system-wide limit... |
66 | * at least there is no arbitrary low system-wide limit... |
|
|
67 | * (FIXME: apparently, this was increased to 32768 in later kernels) |
65 | * i) unlike linux aio, you *can* register more than the limit |
68 | * i) unlike linux aio, you *can* register more than the limit |
66 | * of fd events, and the kernel will "gracefully" signal an |
69 | * of fd events, and the kernel will "gracefully" signal an |
67 | * overflow, after which you could destroy and recreate the kernel |
70 | * overflow, after which you could destroy and recreate the kernel |
68 | * state, a bit bigger, or fall back to e.g. poll. that's not |
71 | * state, a bit bigger, or fall back to e.g. poll. that's not |
69 | * totally insane, but kind of questions the point of a high |
72 | * totally insane, but kind of questions the point of a high |
70 | * performance I/O framework when it doesn't really work |
73 | * performance I/O framework when it doesn't really work |
71 | * under stress. |
74 | * under stress. |
|
|
75 | * (FIXME: iouring should no longer drop events, need to investigate) |
72 | * j) but, oh my! it has exactly the same bugs as the linux aio backend, |
76 | * j) but, oh my! it has exactly the same bugs as the linux aio backend, |
73 | * where some undocumented poll combinations just fail. |
77 | * where some undocumented poll combinations just fail. |
74 | * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, |
78 | * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, |
75 | * this is completely undocumented, have I mentioned this already? |
79 | * this is completely undocumented, have I mentioned this already? |
76 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
80 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
77 | * the big issues with it are the bugs requiring epoll, which might |
81 | * the big issues with it are the bugs requiring epoll, which might |
78 | * or might not get fixed (do I hold my breath?). |
82 | * or might not get fixed (do I hold my breath?). |
79 | */ |
83 | */ |
80 | |
84 | |
|
|
85 | /* TODO: use internal TIMEOUT */ |
|
|
86 | /* TODO: take advantage of single mmap, NODROP etc. */ |
|
|
87 | /* TODO: resize cq/sq size independently */ |
|
|
88 | |
81 | #include <sys/timerfd.h> |
89 | #include <sys/timerfd.h> |
82 | #include <sys/mman.h> |
90 | #include <sys/mman.h> |
83 | #include <poll.h> |
91 | #include <poll.h> |
|
|
92 | #include <stdint.h> |
84 | |
93 | |
85 | #define IOURING_INIT_ENTRIES 32 |
94 | #define IOURING_INIT_ENTRIES 32 |
86 | |
95 | |
87 | /*****************************************************************************/ |
96 | /*****************************************************************************/ |
88 | /* syscall wrapdadoop - this section has the raw api/abi definitions */ |
97 | /* syscall wrapdadoop - this section has the raw api/abi definitions */ |
… | |
… | |
96 | { |
105 | { |
97 | __u8 opcode; |
106 | __u8 opcode; |
98 | __u8 flags; |
107 | __u8 flags; |
99 | __u16 ioprio; |
108 | __u16 ioprio; |
100 | __s32 fd; |
109 | __s32 fd; |
|
|
110 | union { |
101 | __u64 off; |
111 | __u64 off; |
|
|
112 | __u64 addr2; |
|
|
113 | }; |
102 | __u64 addr; |
114 | __u64 addr; |
103 | __u32 len; |
115 | __u32 len; |
104 | union { |
116 | union { |
105 | __kernel_rwf_t rw_flags; |
117 | __kernel_rwf_t rw_flags; |
106 | __u32 fsync_flags; |
118 | __u32 fsync_flags; |
107 | __u16 poll_events; |
119 | __u16 poll_events; |
108 | __u32 sync_range_flags; |
120 | __u32 sync_range_flags; |
109 | __u32 msg_flags; |
121 | __u32 msg_flags; |
|
|
122 | __u32 timeout_flags; |
|
|
123 | __u32 accept_flags; |
|
|
124 | __u32 cancel_flags; |
|
|
125 | __u32 open_flags; |
|
|
126 | __u32 statx_flags; |
110 | }; |
127 | }; |
111 | __u64 user_data; |
128 | __u64 user_data; |
112 | union { |
129 | union { |
113 | __u16 buf_index; |
130 | __u16 buf_index; |
114 | __u64 __pad2[3]; |
131 | __u64 __pad2[3]; |
… | |
… | |
151 | __u32 sq_entries; |
168 | __u32 sq_entries; |
152 | __u32 cq_entries; |
169 | __u32 cq_entries; |
153 | __u32 flags; |
170 | __u32 flags; |
154 | __u32 sq_thread_cpu; |
171 | __u32 sq_thread_cpu; |
155 | __u32 sq_thread_idle; |
172 | __u32 sq_thread_idle; |
|
|
173 | __u32 features; |
156 | __u32 resv[5]; |
174 | __u32 resv[4]; |
157 | struct io_sqring_offsets sq_off; |
175 | struct io_sqring_offsets sq_off; |
158 | struct io_cqring_offsets cq_off; |
176 | struct io_cqring_offsets cq_off; |
159 | }; |
177 | }; |
160 | |
178 | |
|
|
179 | #define IORING_SETUP_CQSIZE 0x00000008 |
|
|
180 | |
161 | #define IORING_OP_POLL_ADD 6 |
181 | #define IORING_OP_POLL_ADD 6 |
162 | #define IORING_OP_POLL_REMOVE 7 |
182 | #define IORING_OP_POLL_REMOVE 7 |
|
|
183 | #define IORING_OP_TIMEOUT 11 |
|
|
184 | #define IORING_OP_TIMEOUT_REMOVE 12 |
|
|
185 | |
|
|
186 | /* relative or absolute, reference clock is CLOCK_MONOTONIC */ |
|
|
187 | struct iouring_kernel_timespec |
|
|
188 | { |
|
|
189 | int64_t tv_sec; |
|
|
190 | long long tv_nsec; |
|
|
191 | }; |
|
|
192 | |
|
|
193 | #define IORING_TIMEOUT_ABS 0x00000001 |
163 | |
194 | |
164 | #define IORING_ENTER_GETEVENTS 0x01 |
195 | #define IORING_ENTER_GETEVENTS 0x01 |
165 | |
196 | |
166 | #define IORING_OFF_SQ_RING 0x00000000ULL |
197 | #define IORING_OFF_SQ_RING 0x00000000ULL |
167 | #define IORING_OFF_CQ_RING 0x08000000ULL |
198 | #define IORING_OFF_CQ_RING 0x08000000ULL |
168 | #define IORING_OFF_SQES 0x10000000ULL |
199 | #define IORING_OFF_SQES 0x10000000ULL |
169 | |
200 | |
|
|
201 | #define IORING_FEAT_SINGLE_MMAP 0x00000001 |
|
|
202 | #define IORING_FEAT_NODROP 0x00000002 |
|
|
203 | #define IORING_FEAT_SUBMIT_STABLE 0x00000004 |
|
|
204 | |
170 | inline_size |
205 | inline_size |
171 | int |
206 | int |
172 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
207 | evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) |
173 | { |
208 | { |
174 | return ev_syscall2 (SYS_io_uring_setup, entries, params); |
209 | return ev_syscall2 (SYS_io_uring_setup, entries, params); |
… | |
… | |
193 | |
228 | |
194 | /* the submit/completion queue entries */ |
229 | /* the submit/completion queue entries */ |
195 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
230 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
196 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) |
231 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) |
197 | |
232 | |
|
|
233 | inline_speed |
|
|
234 | int |
|
|
235 | iouring_enter (EV_P_ ev_tstamp timeout) |
|
|
236 | { |
|
|
237 | int res; |
|
|
238 | |
|
|
239 | EV_RELEASE_CB; |
|
|
240 | |
|
|
241 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
242 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
243 | |
|
|
244 | assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit))); |
|
|
245 | |
|
|
246 | iouring_to_submit = 0; |
|
|
247 | |
|
|
248 | EV_ACQUIRE_CB; |
|
|
249 | |
|
|
250 | return res; |
|
|
251 | } |
|
|
252 | |
|
|
253 | /* TODO: can we move things around so we don't need this forward-reference? */ |
|
|
254 | static void |
|
|
255 | iouring_poll (EV_P_ ev_tstamp timeout); |
|
|
256 | |
198 | static |
257 | static |
199 | struct io_uring_sqe * |
258 | struct io_uring_sqe * |
200 | iouring_sqe_get (EV_P) |
259 | iouring_sqe_get (EV_P) |
201 | { |
260 | { |
|
|
261 | unsigned tail; |
|
|
262 | |
|
|
263 | for (;;) |
|
|
264 | { |
202 | unsigned tail = EV_SQ_VAR (tail); |
265 | tail = EV_SQ_VAR (tail); |
203 | |
266 | |
204 | if (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries)) |
267 | if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))) |
|
|
268 | break; /* what's the problem, we have free sqes */ |
|
|
269 | |
|
|
270 | /* queue full, need to flush and possibly handle some events */ |
|
|
271 | |
|
|
272 | #if EV_FEATURE_CODE |
|
|
273 | /* first we ask the kernel nicely, most often this frees up some sqes */ |
|
|
274 | int res = iouring_enter (EV_A_ EV_TS_CONST (0.)); |
|
|
275 | |
|
|
276 | ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */ |
|
|
277 | |
|
|
278 | if (res >= 0) |
|
|
279 | continue; /* yes, it worked, try again */ |
|
|
280 | #endif |
|
|
281 | |
|
|
282 | /* some problem, possibly EBUSY - do the full poll and let it handle any issues */ |
|
|
283 | |
|
|
284 | iouring_poll (EV_A_ EV_TS_CONST (0.)); |
|
|
285 | /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */ |
205 | { |
286 | } |
206 | /* queue full, flush */ |
|
|
207 | evsys_io_uring_enter (iouring_fd, iouring_to_submit, 0, 0, 0, 0); |
|
|
208 | iouring_to_submit = 0; |
|
|
209 | } |
|
|
210 | |
287 | |
211 | assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))); |
288 | /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/ |
212 | |
289 | |
213 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
290 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
214 | } |
291 | } |
215 | |
292 | |
216 | inline_size |
293 | inline_size |
… | |
… | |
236 | iouring_tfd_cb (EV_P_ struct ev_io *w, int revents) |
313 | iouring_tfd_cb (EV_P_ struct ev_io *w, int revents) |
237 | { |
314 | { |
238 | iouring_tfd_to = EV_TSTAMP_HUGE; |
315 | iouring_tfd_to = EV_TSTAMP_HUGE; |
239 | } |
316 | } |
240 | |
317 | |
241 | static void |
|
|
242 | iouring_epoll_cb (EV_P_ struct ev_io *w, int revents) |
|
|
243 | { |
|
|
244 | epoll_poll (EV_A_ 0); |
|
|
245 | } |
|
|
246 | |
|
|
247 | /* called for full and partial cleanup */ |
318 | /* called for full and partial cleanup */ |
248 | ecb_cold |
319 | ecb_cold |
249 | static int |
320 | static int |
250 | iouring_internal_destroy (EV_P) |
321 | iouring_internal_destroy (EV_P) |
251 | { |
322 | { |
… | |
… | |
254 | |
325 | |
255 | if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); |
326 | if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); |
256 | if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); |
327 | if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); |
257 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); |
328 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); |
258 | |
329 | |
259 | if (ev_is_active (&iouring_epoll_w)) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_epoll_w); |
330 | if (ev_is_active (&iouring_tfd_w)) |
260 | if (ev_is_active (&iouring_tfd_w )) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_tfd_w ); |
331 | { |
|
|
332 | ev_ref (EV_A); |
|
|
333 | ev_io_stop (EV_A_ &iouring_tfd_w); |
|
|
334 | } |
261 | } |
335 | } |
262 | |
336 | |
263 | ecb_cold |
337 | ecb_cold |
264 | static int |
338 | static int |
265 | iouring_internal_init (EV_P) |
339 | iouring_internal_init (EV_P) |
… | |
… | |
271 | iouring_tfd = -1; |
345 | iouring_tfd = -1; |
272 | iouring_sq_ring = MAP_FAILED; |
346 | iouring_sq_ring = MAP_FAILED; |
273 | iouring_cq_ring = MAP_FAILED; |
347 | iouring_cq_ring = MAP_FAILED; |
274 | iouring_sqes = MAP_FAILED; |
348 | iouring_sqes = MAP_FAILED; |
275 | |
349 | |
|
|
350 | if (!have_monotonic) /* cannot really happen, but what if11 */ |
|
|
351 | return -1; |
|
|
352 | |
276 | for (;;) |
353 | for (;;) |
277 | { |
354 | { |
278 | iouring_fd = evsys_io_uring_setup (iouring_entries, &params); |
355 | iouring_fd = evsys_io_uring_setup (iouring_entries, &params); |
279 | |
356 | |
280 | if (iouring_fd >= 0) |
357 | if (iouring_fd >= 0) |
281 | break; /* yippie */ |
358 | break; /* yippie */ |
282 | |
359 | |
283 | if (errno != EINVAL) |
360 | if (errno != EINVAL) |
284 | return -1; /* we failed */ |
361 | return -1; /* we failed */ |
|
|
362 | |
|
|
363 | #if TODO |
|
|
364 | if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP)) |
|
|
365 | return -1; /* we require the above features */ |
|
|
366 | #endif |
285 | |
367 | |
286 | /* EINVAL: lots of possible reasons, but maybe |
368 | /* EINVAL: lots of possible reasons, but maybe |
287 | * it is because we hit the unqueryable hardcoded size limit |
369 | * it is because we hit the unqueryable hardcoded size limit |
288 | */ |
370 | */ |
289 | |
371 | |
… | |
… | |
342 | iouring_internal_destroy (EV_A); |
424 | iouring_internal_destroy (EV_A); |
343 | |
425 | |
344 | while (iouring_internal_init (EV_A) < 0) |
426 | while (iouring_internal_init (EV_A) < 0) |
345 | ev_syserr ("(libev) io_uring_setup"); |
427 | ev_syserr ("(libev) io_uring_setup"); |
346 | |
428 | |
347 | /* forking epoll should also effectively unregister all fds from the backend */ |
|
|
348 | epoll_fork (EV_A); |
|
|
349 | /* epoll_fork already did this. hopefully */ |
|
|
350 | /*fd_rearm_all (EV_A);*/ |
429 | fd_rearm_all (EV_A); |
351 | |
|
|
352 | ev_io_stop (EV_A_ &iouring_epoll_w); |
|
|
353 | ev_io_set (EV_A_ &iouring_epoll_w, backend_fd, EV_READ); |
|
|
354 | ev_io_start (EV_A_ &iouring_epoll_w); |
|
|
355 | |
430 | |
356 | ev_io_stop (EV_A_ &iouring_tfd_w); |
431 | ev_io_stop (EV_A_ &iouring_tfd_w); |
357 | ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); |
432 | ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); |
358 | ev_io_start (EV_A_ &iouring_tfd_w); |
433 | ev_io_start (EV_A_ &iouring_tfd_w); |
359 | } |
434 | } |
… | |
… | |
361 | /*****************************************************************************/ |
436 | /*****************************************************************************/ |
362 | |
437 | |
363 | static void |
438 | static void |
364 | iouring_modify (EV_P_ int fd, int oev, int nev) |
439 | iouring_modify (EV_P_ int fd, int oev, int nev) |
365 | { |
440 | { |
366 | fprintf (stderr,"modify %d (%d, %d) %d\n", fd, oev,nev, anfds[fd].eflags);//D |
|
|
367 | if (ecb_expect_false (anfds [fd].eflags)) |
|
|
368 | { |
|
|
369 | /* we handed this fd over to epoll, so undo this first */ |
|
|
370 | /* we do it manually because the optimisations on epoll_modify won't do us any good */ |
|
|
371 | epoll_ctl (iouring_fd, EPOLL_CTL_DEL, fd, 0); |
|
|
372 | anfds [fd].eflags = 0; |
|
|
373 | oev = 0; |
|
|
374 | } |
|
|
375 | |
|
|
376 | if (oev) |
441 | if (oev) |
377 | { |
442 | { |
378 | /* we assume the sqe's are all "properly" initialised */ |
443 | /* we assume the sqe's are all "properly" initialised */ |
379 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
444 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
380 | sqe->opcode = IORING_OP_POLL_REMOVE; |
445 | sqe->opcode = IORING_OP_POLL_REMOVE; |
381 | sqe->fd = fd; |
446 | sqe->fd = fd; |
|
|
447 | /* Jens Axboe notified me that user_data is not what is documented, but is |
|
|
448 | * some kind of unique ID that has to match, otherwise the request cannot |
|
|
449 | * be removed. Since we don't *really* have that, we pass in the old |
|
|
450 | * generation counter - if that fails, too bad, it will hopefully be removed |
|
|
451 | * at close time and then be ignored. */ |
|
|
452 | sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
382 | sqe->user_data = -1; |
453 | sqe->user_data = (uint64_t)-1; |
383 | iouring_sqe_submit (EV_A_ sqe); |
454 | iouring_sqe_submit (EV_A_ sqe); |
384 | |
455 | |
385 | /* increment generation counter to avoid handling old events */ |
456 | /* increment generation counter to avoid handling old events */ |
386 | ++anfds [fd].egen; |
457 | ++anfds [fd].egen; |
387 | } |
458 | } |
… | |
… | |
428 | { |
499 | { |
429 | int fd = cqe->user_data & 0xffffffffU; |
500 | int fd = cqe->user_data & 0xffffffffU; |
430 | uint32_t gen = cqe->user_data >> 32; |
501 | uint32_t gen = cqe->user_data >> 32; |
431 | int res = cqe->res; |
502 | int res = cqe->res; |
432 | |
503 | |
433 | /* ignore fd removal events, if there are any. TODO: verify */ |
504 | /* user_data -1 is a remove that we are not atm. interested in */ |
434 | if (cqe->user_data == (__u64)-1) |
505 | if (cqe->user_data == (uint64_t)-1) |
435 | abort ();//D |
506 | return; |
436 | |
507 | |
437 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
508 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
438 | |
509 | |
439 | /* documentation lies, of course. the result value is NOT like |
510 | /* documentation lies, of course. the result value is NOT like |
440 | * normal syscalls, but like linux raw syscalls, i.e. negative |
511 | * normal syscalls, but like linux raw syscalls, i.e. negative |
441 | * error numbers. fortunate, as otherwise there would be no way |
512 | * error numbers. fortunate, as otherwise there would be no way |
442 | * to get error codes at all. still, why not document this? |
513 | * to get error codes at all. still, why not document this? |
443 | */ |
514 | */ |
444 | |
515 | |
445 | /* ignore event if generation doesn't match */ |
516 | /* ignore event if generation doesn't match */ |
|
|
517 | /* other than skipping removal events, */ |
446 | /* this should actually be very rare */ |
518 | /* this should actually be very rare */ |
447 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
519 | if (ecb_expect_false (gen != (uint32_t)anfds [fd].egen)) |
448 | return; |
520 | return; |
449 | |
521 | |
450 | if (ecb_expect_false (res < 0)) |
522 | if (ecb_expect_false (res < 0)) |
451 | { |
523 | { |
452 | if (res == -EINVAL) |
524 | /*TODO: EINVAL handling (was something failed with this fd)*/ |
453 | { |
525 | /*TODO: EBUSY happens when?*/ |
454 | /* we assume this error code means the fd/poll combination is buggy |
|
|
455 | * and fall back to epoll. |
|
|
456 | * this error code might also indicate a bug, but the kernel doesn't |
|
|
457 | * distinguish between those two conditions, so... sigh... |
|
|
458 | */ |
|
|
459 | |
526 | |
460 | epoll_modify (EV_A_ fd, 0, anfds [fd].events); |
|
|
461 | } |
|
|
462 | else if (res == -EBADF) |
527 | if (res == -EBADF) |
463 | { |
528 | { |
464 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
529 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
465 | fd_kill (EV_A_ fd); |
530 | fd_kill (EV_A_ fd); |
466 | } |
531 | } |
467 | else |
532 | else |
… | |
… | |
471 | } |
536 | } |
472 | |
537 | |
473 | return; |
538 | return; |
474 | } |
539 | } |
475 | |
540 | |
476 | fprintf (stderr, "fd %d event, rearm\n", fd);//D |
|
|
477 | |
|
|
478 | /* feed events, we do not expect or handle POLLNVAL */ |
541 | /* feed events, we do not expect or handle POLLNVAL */ |
479 | fd_event ( |
542 | fd_event ( |
480 | EV_A_ |
543 | EV_A_ |
481 | fd, |
544 | fd, |
482 | (res & (POLLOUT | POLLERR | POLLHUP) ? EV_WRITE : 0) |
545 | (res & (POLLOUT | POLLERR | POLLHUP) ? EV_WRITE : 0) |
… | |
… | |
495 | iouring_overflow (EV_P) |
558 | iouring_overflow (EV_P) |
496 | { |
559 | { |
497 | /* we have two options, resize the queue (by tearing down |
560 | /* we have two options, resize the queue (by tearing down |
498 | * everything and recreating it), or living with it |
561 | * everything and recreating it), or living with it |
499 | * and polling. |
562 | * and polling. |
500 | * we implement this by resizing tghe queue, and, if that fails, |
563 | * we implement this by resizing the queue, and, if that fails, |
501 | * we just recreate the state on every failure, which |
564 | * we just recreate the state on every failure, which |
502 | * kind of is a very inefficient poll. |
565 | * kind of is a very inefficient poll. |
503 | * one danger is, due to the bias toward lower fds, |
566 | * one danger is, due to the bias toward lower fds, |
504 | * we will only really get events for those, so |
567 | * we will only really get events for those, so |
505 | * maybe we need a poll() fallback, after all. |
568 | * maybe we need a poll() fallback, after all. |
… | |
… | |
517 | else |
580 | else |
518 | { |
581 | { |
519 | /* we hit the kernel limit, we should fall back to something else. |
582 | /* we hit the kernel limit, we should fall back to something else. |
520 | * we can either poll() a few times and hope for the best, |
583 | * we can either poll() a few times and hope for the best, |
521 | * poll always, or switch to epoll. |
584 | * poll always, or switch to epoll. |
522 | * since we use epoll anyways, go epoll. |
585 | * TODO: is this necessary with newer kernels? |
523 | */ |
586 | */ |
524 | |
587 | |
525 | iouring_internal_destroy (EV_A); |
588 | iouring_internal_destroy (EV_A); |
526 | |
589 | |
527 | /* this should make it so that on return, we don'T call any uring functions */ |
590 | /* this should make it so that on return, we don't call any uring functions */ |
528 | iouring_to_submit = 0; |
591 | iouring_to_submit = 0; |
529 | |
592 | |
530 | for (;;) |
593 | for (;;) |
531 | { |
594 | { |
532 | backend = epoll_init (EV_A_ 0); |
595 | backend = epoll_init (EV_A_ 0); |
… | |
… | |
573 | |
636 | |
574 | static void |
637 | static void |
575 | iouring_poll (EV_P_ ev_tstamp timeout) |
638 | iouring_poll (EV_P_ ev_tstamp timeout) |
576 | { |
639 | { |
577 | /* if we have events, no need for extra syscalls, but we might have to queue events */ |
640 | /* if we have events, no need for extra syscalls, but we might have to queue events */ |
|
|
641 | /* we also clear the timeout if there are outstanding fdchanges */ |
|
|
642 | /* the latter should only happen if both the sq and cq are full, most likely */ |
|
|
643 | /* because we have a lot of event sources that immediately complete */ |
|
|
644 | /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */ |
578 | if (iouring_handle_cq (EV_A)) |
645 | if (iouring_handle_cq (EV_A) || fdchangecnt) |
579 | timeout = EV_TS_CONST (0.); |
646 | timeout = EV_TS_CONST (0.); |
580 | else |
647 | else |
581 | /* no events, so maybe wait for some */ |
648 | /* no events, so maybe wait for some */ |
582 | iouring_tfd_update (EV_A_ timeout); |
649 | iouring_tfd_update (EV_A_ timeout); |
583 | |
650 | |
584 | /* only enter the kernel if we have somethign to submit, or we need to wait */ |
651 | /* only enter the kernel if we have something to submit, or we need to wait */ |
585 | if (timeout || iouring_to_submit) |
652 | if (timeout || iouring_to_submit) |
586 | { |
653 | { |
587 | int res; |
654 | int res = iouring_enter (EV_A_ timeout); |
588 | |
|
|
589 | EV_RELEASE_CB; |
|
|
590 | |
|
|
591 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
592 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
593 | iouring_to_submit = 0; |
|
|
594 | |
|
|
595 | EV_ACQUIRE_CB; |
|
|
596 | |
655 | |
597 | if (ecb_expect_false (res < 0)) |
656 | if (ecb_expect_false (res < 0)) |
598 | if (errno == EINTR) |
657 | if (errno == EINTR) |
599 | /* ignore */; |
658 | /* ignore */; |
|
|
659 | else if (errno == EBUSY) |
|
|
660 | /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */; |
600 | else |
661 | else |
601 | ev_syserr ("(libev) iouring setup"); |
662 | ev_syserr ("(libev) iouring setup"); |
602 | else |
663 | else |
603 | iouring_handle_cq (EV_A); |
664 | iouring_handle_cq (EV_A); |
604 | } |
665 | } |
… | |
… | |
606 | |
667 | |
607 | inline_size |
668 | inline_size |
608 | int |
669 | int |
609 | iouring_init (EV_P_ int flags) |
670 | iouring_init (EV_P_ int flags) |
610 | { |
671 | { |
611 | if (!epoll_init (EV_A_ 0)) |
|
|
612 | return 0; |
|
|
613 | |
|
|
614 | ev_io_init (&iouring_epoll_w, iouring_epoll_cb, backend_fd, EV_READ); |
|
|
615 | ev_set_priority (&iouring_epoll_w, EV_MAXPRI); |
|
|
616 | |
|
|
617 | ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); |
|
|
618 | ev_set_priority (&iouring_tfd_w, EV_MAXPRI); |
|
|
619 | |
|
|
620 | iouring_entries = IOURING_INIT_ENTRIES; |
672 | iouring_entries = IOURING_INIT_ENTRIES; |
621 | iouring_max_entries = 0; |
673 | iouring_max_entries = 0; |
622 | |
674 | |
623 | if (iouring_internal_init (EV_A) < 0) |
675 | if (iouring_internal_init (EV_A) < 0) |
624 | { |
676 | { |
625 | iouring_internal_destroy (EV_A); |
677 | iouring_internal_destroy (EV_A); |
626 | return 0; |
678 | return 0; |
627 | } |
679 | } |
628 | |
680 | |
629 | ev_io_start (EV_A_ &iouring_epoll_w); |
681 | ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); |
630 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
682 | ev_set_priority (&iouring_tfd_w, EV_MINPRI); |
631 | |
|
|
632 | ev_io_start (EV_A_ &iouring_tfd_w); |
683 | ev_io_start (EV_A_ &iouring_tfd_w); |
633 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
684 | ev_unref (EV_A); /* watcher should not keep loop alive */ |
634 | |
685 | |
635 | backend_modify = iouring_modify; |
686 | backend_modify = iouring_modify; |
636 | backend_poll = iouring_poll; |
687 | backend_poll = iouring_poll; |
… | |
… | |
641 | inline_size |
692 | inline_size |
642 | void |
693 | void |
643 | iouring_destroy (EV_P) |
694 | iouring_destroy (EV_P) |
644 | { |
695 | { |
645 | iouring_internal_destroy (EV_A); |
696 | iouring_internal_destroy (EV_A); |
646 | epoll_destroy (EV_A); |
|
|
647 | } |
697 | } |
648 | |
698 | |