1 | /* |
1 | /* |
2 | * libev linux io_uring fd activity backend |
2 | * libev linux io_uring fd activity backend |
3 | * |
3 | * |
4 | * Copyright (c) 2019 Marc Alexander Lehmann <libev@schmorp.de> |
4 | * Copyright (c) 2019-2020 Marc Alexander Lehmann <libev@schmorp.de> |
5 | * All rights reserved. |
5 | * All rights reserved. |
6 | * |
6 | * |
7 | * Redistribution and use in source and binary forms, with or without modifica- |
7 | * Redistribution and use in source and binary forms, with or without modifica- |
8 | * tion, are permitted provided that the following conditions are met: |
8 | * tion, are permitted provided that the following conditions are met: |
9 | * |
9 | * |
… | |
… | |
44 | * b) best is not necessarily very good. |
44 | * b) best is not necessarily very good. |
45 | * c) it's better than the aio mess, doesn't suffer from the fork problems |
45 | * c) it's better than the aio mess, doesn't suffer from the fork problems |
46 | * of linux aio or epoll and so on and so on. and you could do event stuff |
46 | * of linux aio or epoll and so on and so on. and you could do event stuff |
47 | * without any syscalls. what's not to like? |
47 | * without any syscalls. what's not to like? |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
48 | * d) ok, it's vastly more complex, but that's ok, really. |
49 | * e) why 3 mmaps instead of one? one would be more space-efficient, |
49 | * e) why two mmaps instead of one? one would be more space-efficient, |
50 | * and I can't see what benefit three would have (other than being |
50 | * and I can't see what benefit two would have (other than being |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
51 | * somehow resizable/relocatable, but that's apparently not possible). |
52 | * (FIXME: newer kernels can use 2 mmaps only, need to look into this). |
|
|
53 | * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and |
52 | * f) hmm, it's practically undebuggable (gdb can't access the memory, and |
54 | * the bizarre way structure offsets are communicated makes it hard to |
53 | * the bizarre way structure offsets are communicated makes it hard to |
55 | * just print the ring buffer heads, even *iff* the memory were visible |
54 | * just print the ring buffer heads, even *iff* the memory were visible |
56 | * in gdb. but then, that's also ok, really. |
55 | * in gdb. but then, that's also ok, really. |
57 | * g) well, you cannot specify a timeout when waiting for events. no, |
56 | * g) well, you cannot specify a timeout when waiting for events. no, |
58 | * seriously, the interface doesn't support a timeout. never seen _that_ |
57 | * seriously, the interface doesn't support a timeout. never seen _that_ |
59 | * before. sure, you can use a timerfd, but that's another syscall |
58 | * before. sure, you can use a timerfd, but that's another syscall |
60 | * you could have avoided. overall, this bizarre omission smells |
59 | * you could have avoided. overall, this bizarre omission smells |
61 | * like a µ-optimisation by the io_uring author for his personal |
60 | * like a µ-optimisation by the io_uring author for his personal |
62 | * applications, to the detriment of everybody else who just wants |
61 | * applications, to the detriment of everybody else who just wants |
63 | * an event loop. but, umm, ok, if that's all, it could be worse. |
62 | * an event loop. but, umm, ok, if that's all, it could be worse. |
64 | * (FIXME: jens mentioned timeout commands, need to investigate) |
63 | * (from what I gather from the author Jens Axboe, it simply didn't |
|
|
64 | * occur to him, and he made good on it by adding an unlimited number |
|
|
65 | * of timeouts later :). |
65 | * h) there is a hardcoded limit of 4096 outstanding events. okay, |
66 | * h) initially there was a hardcoded limit of 4096 outstanding events. |
66 | * at least there is no arbitrary low system-wide limit... |
67 | * later versions not only bump this to 32k, but also can handle |
67 | * (FIXME: apparently, this was increased to 32768 in later kernels( |
68 | * an unlimited amount of events, so this only affects the batch size. |
68 | * i) unlike linux aio, you *can* register more than the limit |
69 | * i) unlike linux aio, you *can* register more than the limit |
69 | * of fd events, and the kernel will "gracefully" signal an |
70 | * of fd events. while early versions of io_uring signalled an overflow |
70 | * overflow, after which you could destroy and recreate the kernel |
71 | * and you ended up getting wet. 5.5+ does not do this anymore. |
71 | * state, a bit bigger, or fall back to e.g. poll. thats not |
|
|
72 | * totally insane, but kind of questions the point a high |
|
|
73 | * performance I/O framework when it doesn't really work |
|
|
74 | * under stress. |
|
|
75 | * (FIXME: iouring should no longer drop events, need to investigate) |
|
|
76 | * j) but, oh my! is has exactly the same bugs as the linux aio backend, |
72 | * j) but, oh my! it had exactly the same bugs as the linux aio backend, |
77 | * where some undocumented poll combinations just fail. |
73 | * where some undocumented poll combinations just fail. fortunately, |
78 | * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, |
74 | * after finally reaching the author, he was more than willing to fix |
79 | * this is completely undocumented, have I mantioned this already? |
75 | * this probably in 5.6+. |
80 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
76 | * k) overall, the *API* itself is, I dare to say, not a total trainwreck. |
81 | * the big isuess with it are the bugs requiring epoll, which might |
77 | * once the bugs are fixed (probably in 5.6+), it will be without |
82 | * or might not get fixed (do I hold my breath?). |
78 | * competition. |
83 | */ |
79 | */ |
84 | |
80 | |
85 | /* TODO: use internal TIMEOUT */ |
81 | /* TODO: use internal TIMEOUT */ |
86 | /* TODO: take advantage of single mmap, NODROP etc. */ |
82 | /* TODO: take advantage of single mmap, NODROP etc. */ |
87 | /* TODO: resize cq/sq size independently */ |
83 | /* TODO: resize cq/sq size independently */ |
… | |
… | |
122 | __u32 timeout_flags; |
118 | __u32 timeout_flags; |
123 | __u32 accept_flags; |
119 | __u32 accept_flags; |
124 | __u32 cancel_flags; |
120 | __u32 cancel_flags; |
125 | __u32 open_flags; |
121 | __u32 open_flags; |
126 | __u32 statx_flags; |
122 | __u32 statx_flags; |
|
|
123 | __u32 fadvise_advice; |
127 | }; |
124 | }; |
128 | __u64 user_data; |
125 | __u64 user_data; |
129 | union { |
126 | union { |
130 | __u16 buf_index; |
127 | __u16 buf_index; |
|
|
128 | __u16 personality; |
131 | __u64 __pad2[3]; |
129 | __u64 __pad2[3]; |
132 | }; |
130 | }; |
133 | }; |
131 | }; |
134 | |
132 | |
135 | struct io_uring_cqe |
133 | struct io_uring_cqe |
… | |
… | |
174 | __u32 resv[4]; |
172 | __u32 resv[4]; |
175 | struct io_sqring_offsets sq_off; |
173 | struct io_sqring_offsets sq_off; |
176 | struct io_cqring_offsets cq_off; |
174 | struct io_cqring_offsets cq_off; |
177 | }; |
175 | }; |
178 | |
176 | |
|
|
177 | #define IORING_FEAT_SINGLE_MMAP 0x00000001 |
|
|
178 | #define IORING_FEAT_NODROP 0x00000002 |
|
|
179 | #define IORING_FEAT_SUBMIT_STABLE 0x00000004 |
|
|
180 | |
179 | #define IORING_SETUP_CQSIZE 0x00000008 |
181 | #define IORING_SETUP_CQSIZE 0x00000008 |
|
|
182 | #define IORING_SETUP_CLAMP 0x00000010 |
180 | |
183 | |
181 | #define IORING_OP_POLL_ADD 6 |
184 | #define IORING_OP_POLL_ADD 6 |
182 | #define IORING_OP_POLL_REMOVE 7 |
185 | #define IORING_OP_POLL_REMOVE 7 |
183 | #define IORING_OP_TIMEOUT 11 |
186 | #define IORING_OP_TIMEOUT 11 |
184 | #define IORING_OP_TIMEOUT_REMOVE 12 |
187 | #define IORING_OP_TIMEOUT_REMOVE 12 |
185 | |
188 | |
|
|
189 | #define IORING_REGISTER_EVENTFD 4 |
|
|
190 | #define IORING_REGISTER_EVENTFD_ASYNC 7 |
|
|
191 | #define IORING_REGISTER_PROBE 8 |
|
|
192 | |
|
|
193 | #define IO_URING_OP_SUPPORTED 1 |
|
|
194 | |
|
|
195 | struct io_uring_probe_op { |
|
|
196 | __u8 op; |
|
|
197 | __u8 resv; |
|
|
198 | __u16 flags; |
|
|
199 | __u32 resv2; |
|
|
200 | }; |
|
|
201 | |
|
|
202 | struct io_uring_probe |
|
|
203 | { |
|
|
204 | __u8 last_op; |
|
|
205 | __u8 ops_len; |
|
|
206 | __u16 resv; |
|
|
207 | __u32 resv2[3]; |
|
|
208 | struct io_uring_probe_op ops[0]; |
|
|
209 | }; |
|
|
210 | |
186 | /* relative or absolute, reference clock is CLOCK_MONOTONIC */ |
211 | /* relative or absolute, reference clock is CLOCK_MONOTONIC */ |
187 | struct iouring_kernel_timespec |
212 | struct iouring_kernel_timespec |
188 | { |
213 | { |
189 | int64_t tv_sec; |
214 | int64_t tv_sec; |
190 | long long tv_nsec; |
215 | long long tv_nsec; |
… | |
… | |
193 | #define IORING_TIMEOUT_ABS 0x00000001 |
218 | #define IORING_TIMEOUT_ABS 0x00000001 |
194 | |
219 | |
195 | #define IORING_ENTER_GETEVENTS 0x01 |
220 | #define IORING_ENTER_GETEVENTS 0x01 |
196 | |
221 | |
197 | #define IORING_OFF_SQ_RING 0x00000000ULL |
222 | #define IORING_OFF_SQ_RING 0x00000000ULL |
198 | #define IORING_OFF_CQ_RING 0x08000000ULL |
|
|
199 | #define IORING_OFF_SQES 0x10000000ULL |
223 | #define IORING_OFF_SQES 0x10000000ULL |
200 | |
224 | |
201 | #define IORING_FEAT_SINGLE_MMAP 0x00000001 |
225 | #define IORING_FEAT_SINGLE_MMAP 0x00000001 |
202 | #define IORING_FEAT_NODROP 0x00000002 |
226 | #define IORING_FEAT_NODROP 0x00000002 |
203 | #define IORING_FEAT_SUBMIT_STABLE 0x00000004 |
227 | #define IORING_FEAT_SUBMIT_STABLE 0x00000004 |
… | |
… | |
214 | evsys_io_uring_enter (int fd, unsigned to_submit, unsigned min_complete, unsigned flags, const sigset_t *sig, size_t sigsz) |
238 | evsys_io_uring_enter (int fd, unsigned to_submit, unsigned min_complete, unsigned flags, const sigset_t *sig, size_t sigsz) |
215 | { |
239 | { |
216 | return ev_syscall6 (SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigsz); |
240 | return ev_syscall6 (SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigsz); |
217 | } |
241 | } |
218 | |
242 | |
|
|
243 | inline_size |
|
|
244 | int |
|
|
245 | evsys_io_uring_register (unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args) |
|
|
246 | { |
|
|
247 | return ev_syscall4 (SYS_io_uring_register, fd, opcode, arg, nr_args); |
|
|
248 | } |
|
|
249 | |
219 | /*****************************************************************************/ |
250 | /*****************************************************************************/ |
220 | /* actual backed implementation */ |
251 | /* actual backend implementation */ |
221 | |
252 | |
222 | /* we hope that volatile will make the compiler access these variables only once */ |
253 | /* we hope that volatile will make the compiler access these variables only once */ |
223 | #define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_sq_ring + iouring_sq_ ## name) |
|
|
224 | #define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_cq_ring + iouring_cq_ ## name) |
254 | #define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_sq_ ## name) |
|
|
255 | #define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_cq_ ## name) |
225 | |
256 | |
226 | /* the index array */ |
257 | /* the index array */ |
227 | #define EV_SQ_ARRAY ((unsigned *)((char *)iouring_sq_ring + iouring_sq_array)) |
258 | #define EV_SQ_ARRAY ((unsigned *)((char *)iouring_ring + iouring_sq_array)) |
228 | |
259 | |
229 | /* the submit/completion queue entries */ |
260 | /* the submit/completion queue entries */ |
230 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
261 | #define EV_SQES ((struct io_uring_sqe *) iouring_sqes) |
231 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes)) |
262 | #define EV_CQES ((struct io_uring_cqe *)((char *)iouring_ring + iouring_cq_cqes)) |
|
|
263 | |
|
|
264 | inline_speed |
|
|
265 | int |
|
|
266 | iouring_enter (EV_P_ ev_tstamp timeout) |
|
|
267 | { |
|
|
268 | int res; |
|
|
269 | |
|
|
270 | EV_RELEASE_CB; |
|
|
271 | |
|
|
272 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
273 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
274 | |
|
|
275 | assert (("libev: io_uring_enter did not consume all sqes", (res < 0 || res == iouring_to_submit))); |
|
|
276 | |
|
|
277 | iouring_to_submit = 0; |
|
|
278 | |
|
|
279 | EV_ACQUIRE_CB; |
|
|
280 | |
|
|
281 | return res; |
|
|
282 | } |
|
|
283 | |
|
|
284 | /* TODO: can we move things around so we don't need this forward-reference? */ |
|
|
285 | static void |
|
|
286 | iouring_poll (EV_P_ ev_tstamp timeout); |
232 | |
287 | |
233 | static |
288 | static |
234 | struct io_uring_sqe * |
289 | struct io_uring_sqe * |
235 | iouring_sqe_get (EV_P) |
290 | iouring_sqe_get (EV_P) |
236 | { |
291 | { |
|
|
292 | unsigned tail; |
|
|
293 | |
|
|
294 | for (;;) |
|
|
295 | { |
237 | unsigned tail = EV_SQ_VAR (tail); |
296 | tail = EV_SQ_VAR (tail); |
238 | |
297 | |
239 | if (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries)) |
298 | if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))) |
|
|
299 | break; /* whats the problem, we have free sqes */ |
|
|
300 | |
|
|
301 | /* queue full, need to flush and possibly handle some events */ |
|
|
302 | |
|
|
303 | #if EV_FEATURE_CODE |
|
|
304 | /* first we ask the kernel nicely, most often this frees up some sqes */ |
|
|
305 | int res = iouring_enter (EV_A_ EV_TS_CONST (0.)); |
|
|
306 | |
|
|
307 | ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */ |
|
|
308 | |
|
|
309 | if (res >= 0) |
|
|
310 | continue; /* yes, it worked, try again */ |
|
|
311 | #endif |
|
|
312 | |
|
|
313 | /* some problem, possibly EBUSY - do the full poll and let it handle any issues */ |
|
|
314 | |
|
|
315 | iouring_poll (EV_A_ EV_TS_CONST (0.)); |
|
|
316 | /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */ |
240 | { |
317 | } |
241 | /* queue full, flush */ |
|
|
242 | evsys_io_uring_enter (iouring_fd, iouring_to_submit, 0, 0, 0, 0); |
|
|
243 | iouring_to_submit = 0; |
|
|
244 | } |
|
|
245 | |
318 | |
246 | assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries))); |
319 | /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/ |
247 | |
320 | |
248 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
321 | return EV_SQES + (tail & EV_SQ_VAR (ring_mask)); |
249 | } |
322 | } |
250 | |
323 | |
251 | inline_size |
324 | inline_size |
252 | struct io_uring_sqe * |
325 | struct io_uring_sqe * |
253 | iouring_sqe_submit (EV_P_ struct io_uring_sqe *sqe) |
326 | iouring_sqe_submit (EV_P_ struct io_uring_sqe *sqe) |
254 | { |
327 | { |
255 | unsigned idx = sqe - EV_SQES; |
328 | unsigned idx = sqe - EV_SQES; |
|
|
329 | |
|
|
330 | printf ("submit idx %d, op %d, fd %d, us5r %p, poll %d\n", idx, sqe->opcode, sqe->fd, sqe->user_data, sqe->poll_events); |
256 | |
331 | |
257 | EV_SQ_ARRAY [idx] = idx; |
332 | EV_SQ_ARRAY [idx] = idx; |
258 | ECB_MEMORY_FENCE_RELEASE; |
333 | ECB_MEMORY_FENCE_RELEASE; |
259 | ++EV_SQ_VAR (tail); |
334 | ++EV_SQ_VAR (tail); |
260 | /*ECB_MEMORY_FENCE_RELEASE; /* for the time being we assume this is not needed */ |
335 | /*ECB_MEMORY_FENCE_RELEASE; /* for the time being we assume this is not needed */ |
… | |
… | |
279 | iouring_internal_destroy (EV_P) |
354 | iouring_internal_destroy (EV_P) |
280 | { |
355 | { |
281 | close (iouring_tfd); |
356 | close (iouring_tfd); |
282 | close (iouring_fd); |
357 | close (iouring_fd); |
283 | |
358 | |
284 | if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size); |
359 | if (iouring_ring != MAP_FAILED) munmap (iouring_ring, iouring_ring_size); |
285 | if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); |
|
|
286 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); |
360 | if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes, iouring_sqes_size); |
287 | |
361 | |
288 | if (ev_is_active (&iouring_tfd_w)) |
362 | if (ev_is_active (&iouring_tfd_w)) |
289 | { |
363 | { |
290 | ev_ref (EV_A); |
364 | ev_ref (EV_A); |
291 | ev_io_stop (EV_A_ &iouring_tfd_w); |
365 | ev_io_stop (EV_A_ &iouring_tfd_w); |
… | |
… | |
295 | ecb_cold |
369 | ecb_cold |
296 | static int |
370 | static int |
297 | iouring_internal_init (EV_P) |
371 | iouring_internal_init (EV_P) |
298 | { |
372 | { |
299 | struct io_uring_params params = { 0 }; |
373 | struct io_uring_params params = { 0 }; |
|
|
374 | uint32_t sq_size, cq_size; |
|
|
375 | |
|
|
376 | params.flags = IORING_SETUP_CLAMP; |
300 | |
377 | |
301 | iouring_to_submit = 0; |
378 | iouring_to_submit = 0; |
302 | |
379 | |
303 | iouring_tfd = -1; |
380 | iouring_tfd = -1; |
304 | iouring_sq_ring = MAP_FAILED; |
381 | iouring_ring = MAP_FAILED; |
305 | iouring_cq_ring = MAP_FAILED; |
|
|
306 | iouring_sqes = MAP_FAILED; |
382 | iouring_sqes = MAP_FAILED; |
307 | |
383 | |
308 | if (!have_monotonic) /* cannot really happen, but what if11 */ |
384 | if (!have_monotonic) /* cannot really happen, but what if11 */ |
309 | return -1; |
385 | return -1; |
310 | |
386 | |
311 | for (;;) |
|
|
312 | { |
|
|
313 | iouring_fd = evsys_io_uring_setup (iouring_entries, ¶ms); |
387 | iouring_fd = evsys_io_uring_setup (iouring_entries, ¶ms); |
314 | |
388 | |
315 | if (iouring_fd >= 0) |
389 | if (iouring_fd < 0) |
316 | break; /* yippie */ |
390 | return -1; |
317 | |
391 | |
318 | if (errno != EINVAL) |
|
|
319 | return -1; /* we failed */ |
|
|
320 | |
|
|
321 | #if TODO |
|
|
322 | if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP)) |
392 | if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEAT_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE)) |
323 | return -1; /* we require the above features */ |
393 | return -1; /* we require the above features */ |
324 | #endif |
|
|
325 | |
394 | |
326 | /* EINVAL: lots of possible reasons, but maybe |
395 | /* TODO: remember somehow whether our queue size has been clamped */ |
327 | * it is because we hit the unqueryable hardcoded size limit |
|
|
328 | */ |
|
|
329 | |
396 | |
330 | /* we hit the limit already, give up */ |
|
|
331 | if (iouring_max_entries) |
|
|
332 | return -1; |
|
|
333 | |
|
|
334 | /* first time we hit EINVAL? assume we hit the limit, so go back and retry */ |
|
|
335 | iouring_entries >>= 1; |
|
|
336 | iouring_max_entries = iouring_entries; |
|
|
337 | } |
|
|
338 | |
|
|
339 | iouring_sq_ring_size = params.sq_off.array + params.sq_entries * sizeof (unsigned); |
397 | sq_size = params.sq_off.array + params.sq_entries * sizeof (unsigned); |
340 | iouring_cq_ring_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe); |
398 | cq_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe); |
|
|
399 | |
|
|
400 | iouring_ring_size = sq_size > cq_size ? sq_size : cq_size; |
341 | iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe); |
401 | iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe); |
342 | |
402 | |
343 | iouring_sq_ring = mmap (0, iouring_sq_ring_size, PROT_READ | PROT_WRITE, |
403 | iouring_ring = mmap (0, iouring_ring_size, PROT_READ | PROT_WRITE, |
344 | MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING); |
404 | MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING); |
345 | iouring_cq_ring = mmap (0, iouring_cq_ring_size, PROT_READ | PROT_WRITE, |
|
|
346 | MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_CQ_RING); |
|
|
347 | iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ | PROT_WRITE, |
405 | iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ | PROT_WRITE, |
348 | MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQES); |
406 | MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQES); |
349 | |
407 | |
350 | if (iouring_sq_ring == MAP_FAILED || iouring_cq_ring == MAP_FAILED || iouring_sqes == MAP_FAILED) |
408 | if (iouring_ring == MAP_FAILED || iouring_sqes == MAP_FAILED) |
351 | return -1; |
409 | return -1; |
352 | |
410 | |
353 | iouring_sq_head = params.sq_off.head; |
411 | iouring_sq_head = params.sq_off.head; |
354 | iouring_sq_tail = params.sq_off.tail; |
412 | iouring_sq_tail = params.sq_off.tail; |
355 | iouring_sq_ring_mask = params.sq_off.ring_mask; |
413 | iouring_sq_ring_mask = params.sq_off.ring_mask; |
… | |
… | |
363 | iouring_cq_ring_mask = params.cq_off.ring_mask; |
421 | iouring_cq_ring_mask = params.cq_off.ring_mask; |
364 | iouring_cq_ring_entries = params.cq_off.ring_entries; |
422 | iouring_cq_ring_entries = params.cq_off.ring_entries; |
365 | iouring_cq_overflow = params.cq_off.overflow; |
423 | iouring_cq_overflow = params.cq_off.overflow; |
366 | iouring_cq_cqes = params.cq_off.cqes; |
424 | iouring_cq_cqes = params.cq_off.cqes; |
367 | |
425 | |
|
|
426 | iouring_tfd_to = EV_TSTAMP_HUGE; |
|
|
427 | |
368 | iouring_tfd = timerfd_create (CLOCK_MONOTONIC, TFD_CLOEXEC); |
428 | iouring_tfd = timerfd_create (CLOCK_MONOTONIC, TFD_CLOEXEC); |
369 | |
429 | |
370 | if (iouring_tfd < 0) |
430 | if (iouring_tfd < 0) |
371 | return iouring_tfd; |
431 | return -1; |
372 | |
|
|
373 | iouring_tfd_to = EV_TSTAMP_HUGE; |
|
|
374 | |
432 | |
375 | return 0; |
433 | return 0; |
376 | } |
434 | } |
377 | |
435 | |
378 | ecb_cold |
436 | ecb_cold |
… | |
… | |
405 | /* Jens Axboe notified me that user_data is not what is documented, but is |
463 | /* Jens Axboe notified me that user_data is not what is documented, but is |
406 | * some kind of unique ID that has to match, otherwise the request cannot |
464 | * some kind of unique ID that has to match, otherwise the request cannot |
407 | * be removed. Since we don't *really* have that, we pass in the old |
465 | * be removed. Since we don't *really* have that, we pass in the old |
408 | * generation counter - if that fails, too bad, it will hopefully be removed |
466 | * generation counter - if that fails, too bad, it will hopefully be removed |
409 | * at close time and then be ignored. */ |
467 | * at close time and then be ignored. */ |
410 | sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
468 | sqe->addr = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
|
|
469 | sqe->user_data = (uint64_t)-1; |
411 | iouring_sqe_submit (EV_A_ sqe); |
470 | iouring_sqe_submit (EV_A_ sqe); |
412 | |
471 | |
413 | /* increment generation counter to avoid handling old events */ |
472 | /* increment generation counter to avoid handling old events */ |
414 | ++anfds [fd].egen; |
473 | ++anfds [fd].egen; |
415 | } |
474 | } |
… | |
… | |
417 | if (nev) |
476 | if (nev) |
418 | { |
477 | { |
419 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
478 | struct io_uring_sqe *sqe = iouring_sqe_get (EV_A); |
420 | sqe->opcode = IORING_OP_POLL_ADD; |
479 | sqe->opcode = IORING_OP_POLL_ADD; |
421 | sqe->fd = fd; |
480 | sqe->fd = fd; |
|
|
481 | sqe->addr = 0; |
422 | sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
482 | sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32); |
423 | sqe->poll_events = |
483 | sqe->poll_events = |
424 | (nev & EV_READ ? POLLIN : 0) |
484 | (nev & EV_READ ? POLLIN : 0) |
425 | | (nev & EV_WRITE ? POLLOUT : 0); |
485 | | (nev & EV_WRITE ? POLLOUT : 0); |
426 | iouring_sqe_submit (EV_A_ sqe); |
486 | iouring_sqe_submit (EV_A_ sqe); |
… | |
… | |
456 | { |
516 | { |
457 | int fd = cqe->user_data & 0xffffffffU; |
517 | int fd = cqe->user_data & 0xffffffffU; |
458 | uint32_t gen = cqe->user_data >> 32; |
518 | uint32_t gen = cqe->user_data >> 32; |
459 | int res = cqe->res; |
519 | int res = cqe->res; |
460 | |
520 | |
|
|
521 | /* user_data -1 is a remove that we are not atm. interested in */ |
|
|
522 | if (cqe->user_data == (uint64_t)-1) |
|
|
523 | return; |
|
|
524 | |
461 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
525 | assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); |
462 | |
526 | |
463 | /* documentation lies, of course. the result value is NOT like |
527 | /* documentation lies, of course. the result value is NOT like |
464 | * normal syscalls, but like linux raw syscalls, i.e. negative |
528 | * normal syscalls, but like linux raw syscalls, i.e. negative |
465 | * error numbers. fortunate, as otherwise there would be no way |
529 | * error numbers. fortunate, as otherwise there would be no way |
… | |
… | |
473 | return; |
537 | return; |
474 | |
538 | |
475 | if (ecb_expect_false (res < 0)) |
539 | if (ecb_expect_false (res < 0)) |
476 | { |
540 | { |
477 | /*TODO: EINVAL handling (was something failed with this fd)*/ |
541 | /*TODO: EINVAL handling (was something failed with this fd)*/ |
478 | /*TODO: EBUSY happens when?*/ |
|
|
479 | |
542 | |
480 | if (res == -EBADF) |
543 | if (res == -EBADF) |
481 | { |
544 | { |
482 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
545 | assert (("libev: event loop rejected bad fd", res != -EBADF)); |
483 | fd_kill (EV_A_ fd); |
546 | fd_kill (EV_A_ fd); |
… | |
… | |
589 | |
652 | |
590 | static void |
653 | static void |
591 | iouring_poll (EV_P_ ev_tstamp timeout) |
654 | iouring_poll (EV_P_ ev_tstamp timeout) |
592 | { |
655 | { |
593 | /* if we have events, no need for extra syscalls, but we might have to queue events */ |
656 | /* if we have events, no need for extra syscalls, but we might have to queue events */ |
|
|
657 | /* we also clear the timeout if there are outstanding fdchanges */ |
|
|
658 | /* the latter should only happen if both the sq and cq are full, most likely */ |
|
|
659 | /* because we have a lot of event sources that immediately complete */ |
|
|
660 | /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */ |
594 | if (iouring_handle_cq (EV_A)) |
661 | if (iouring_handle_cq (EV_A) || fdchangecnt) |
595 | timeout = EV_TS_CONST (0.); |
662 | timeout = EV_TS_CONST (0.); |
596 | else |
663 | else |
597 | /* no events, so maybe wait for some */ |
664 | /* no events, so maybe wait for some */ |
598 | iouring_tfd_update (EV_A_ timeout); |
665 | iouring_tfd_update (EV_A_ timeout); |
599 | |
666 | |
600 | /* only enter the kernel if we have something to submit, or we need to wait */ |
667 | /* only enter the kernel if we have something to submit, or we need to wait */ |
601 | if (timeout || iouring_to_submit) |
668 | if (timeout || iouring_to_submit) |
602 | { |
669 | { |
603 | int res; |
670 | int res = iouring_enter (EV_A_ timeout); |
604 | |
|
|
605 | EV_RELEASE_CB; |
|
|
606 | |
|
|
607 | res = evsys_io_uring_enter (iouring_fd, iouring_to_submit, 1, |
|
|
608 | timeout > EV_TS_CONST (0.) ? IORING_ENTER_GETEVENTS : 0, 0, 0); |
|
|
609 | iouring_to_submit = 0; |
|
|
610 | |
|
|
611 | EV_ACQUIRE_CB; |
|
|
612 | |
671 | |
613 | if (ecb_expect_false (res < 0)) |
672 | if (ecb_expect_false (res < 0)) |
614 | if (errno == EINTR) |
673 | if (errno == EINTR) |
615 | /* ignore */; |
674 | /* ignore */; |
|
|
675 | else if (errno == EBUSY) |
|
|
676 | /* cq full, cannot submit - should be rare because we flush the cq first, so simply ignore */; |
616 | else |
677 | else |
617 | ev_syserr ("(libev) iouring setup"); |
678 | ev_syserr ("(libev) iouring setup"); |
618 | else |
679 | else |
619 | iouring_handle_cq (EV_A); |
680 | iouring_handle_cq (EV_A); |
620 | } |
681 | } |