/cvs/libev/ev_iouring.c

Comparing libev/ev_iouring.c (file contents):
Revision 1.21 by root, Wed Jan 22 02:20:47 2020 UTC vs.
Revision 1.27 by sf-exg, Sun May 14 19:02:31 2023 UTC

@@ -59,17 +59,17 @@
  * you could have avoided. overall, this bizarre omission smells
  * like a µ-optimisation by the io_uring author for his personal
  * applications, to the detriment of everybody else who just wants
  * an event loop. but, umm, ok, if that's all, it could be worse.
  * (from what I gather from the author Jens Axboe, it simply didn't
- * occur to him, and he made good on it by adding an unlimited nuber
+ * occur to him, and he made good on it by adding an unlimited number
  * of timeouts later :).
  * h) initially there was a hardcoded limit of 4096 outstanding events.
  * later versions not only bump this to 32k, but also can handle
  * an unlimited amount of events, so this only affects the batch size.
  * i) unlike linux aio, you *can* register more than the limit
- * of fd events. while early verisons of io_uring signalled an overflow
+ * of fd events. while early versions of io_uring signalled an overflow
  * and you ended up getting wet. 5.5+ does not do this anymore.
  * j) but, oh my! it had exactly the same bugs as the linux aio backend,
  * where some undocumented poll combinations just fail. fortunately,
  * after finally reaching the author, he was more than willing to fix
  * this probably in 5.6+.
@@ -118,14 +118,16 @@
     __u32 timeout_flags;
     __u32 accept_flags;
     __u32 cancel_flags;
     __u32 open_flags;
     __u32 statx_flags;
+    __u32 fadvise_advice;
   };
   __u64 user_data;
   union {
     __u16 buf_index;
+    __u16 personality;
     __u64 __pad2[3];
   };
 };
 
 struct io_uring_cqe
@@ -170,17 +172,44 @@
   __u32 resv[4];
   struct io_sqring_offsets sq_off;
   struct io_cqring_offsets cq_off;
 };
 
+#define IORING_FEAT_SINGLE_MMAP 0x00000001
+#define IORING_FEAT_NODROP 0x00000002
+#define IORING_FEAT_SUBMIT_STABLE 0x00000004
+
 #define IORING_SETUP_CQSIZE 0x00000008
+#define IORING_SETUP_CLAMP 0x00000010
 
 #define IORING_OP_POLL_ADD 6
 #define IORING_OP_POLL_REMOVE 7
 #define IORING_OP_TIMEOUT 11
 #define IORING_OP_TIMEOUT_REMOVE 12
 
+#define IORING_REGISTER_EVENTFD 4
+#define IORING_REGISTER_EVENTFD_ASYNC 7
+#define IORING_REGISTER_PROBE 8
+
+#define IO_URING_OP_SUPPORTED 1
+
+struct io_uring_probe_op {
+  __u8 op;
+  __u8 resv;
+  __u16 flags;
+  __u32 resv2;
+};
+
+struct io_uring_probe
+{
+  __u8 last_op;
+  __u8 ops_len;
+  __u16 resv;
+  __u32 resv2[3];
+  struct io_uring_probe_op ops[0];
+};
+
 /* relative or absolute, reference clock is CLOCK_MONOTONIC */
 struct iouring_kernel_timespec
 {
   int64_t tv_sec;
   long long tv_nsec;
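The probe structures added in the hunk above let userspace ask the kernel which opcodes it actually implements, instead of submitting one and watching it fail. A minimal sketch of how they combine with the evsys_io_uring_register wrapper added further down; the helper name and the 256-entry sizing are illustrative assumptions, not libev code:

#include <string.h> /* memset */

/* hypothetical helper: nonzero if 'op' is supported by this kernel's io_uring */
static int
iouring_op_supported (int ring_fd, int op)
{
  /* 256 entries cover the kernel's whole opcode space */
  char buf [sizeof (struct io_uring_probe) + 256 * sizeof (struct io_uring_probe_op)];
  struct io_uring_probe *probe = (struct io_uring_probe *)buf;

  memset (buf, 0, sizeof (buf));

  /* pre-5.6 kernels lack IORING_REGISTER_PROBE and fail here */
  if (evsys_io_uring_register (ring_fd, IORING_REGISTER_PROBE, probe, 256) < 0)
    return 0;

  return op <= probe->last_op
         && (probe->ops [op].flags & IO_URING_OP_SUPPORTED);
}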
@@ -189,11 +218,10 @@
 #define IORING_TIMEOUT_ABS 0x00000001
 
 #define IORING_ENTER_GETEVENTS 0x01
 
 #define IORING_OFF_SQ_RING 0x00000000ULL
-#define IORING_OFF_CQ_RING 0x08000000ULL
 #define IORING_OFF_SQES 0x10000000ULL
 
 #define IORING_FEAT_SINGLE_MMAP 0x00000001
 #define IORING_FEAT_NODROP 0x00000002
 #define IORING_FEAT_SUBMIT_STABLE 0x00000004
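iouring_kernel_timespec and IORING_TIMEOUT_ABS above are what the backend uses to express loop timeouts as IORING_OP_TIMEOUT submissions (the "unlimited number of timeouts" mentioned in the comment block at the top). A hedged sketch of filling in such an SQE; the helper is hypothetical, and the opcode/fd/addr/len fields belong to the kernel's io_uring_sqe layout, of which only a fragment is shown in this diff:

/* hypothetical helper: queue a relative timeout of 'ns' nanoseconds */
static void
iouring_queue_timeout (struct io_uring_sqe *sqe, long long ns)
{
  static struct iouring_kernel_timespec ts; /* must stay valid until the CQE arrives */

  ts.tv_sec  = ns / 1000000000;
  ts.tv_nsec = ns % 1000000000;

  sqe->opcode        = IORING_OP_TIMEOUT;
  sqe->fd            = -1;                 /* timeouts are not tied to an fd */
  sqe->addr          = (unsigned long)&ts; /* points at the timespec */
  sqe->len           = 1;                  /* exactly one timespec */
  sqe->timeout_flags = 0;                  /* IORING_TIMEOUT_ABS for absolute */
  sqe->user_data     = (__u64)-1;          /* sentinel to spot the completion */
}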
@@ -210,23 +238,30 @@
 evsys_io_uring_enter (int fd, unsigned to_submit, unsigned min_complete, unsigned flags, const sigset_t *sig, size_t sigsz)
 {
   return ev_syscall6 (SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigsz);
 }
 
+inline_size
+int
+evsys_io_uring_register (unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args)
+{
+  return ev_syscall4 (SYS_io_uring_register, fd, opcode, arg, nr_args);
+}
+
 /*****************************************************************************/
-/* actual backed implementation */
+/* actual backend implementation */
 
 /* we hope that volatile will make the compiler access these variables only once */
-#define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_sq_ring + iouring_sq_ ## name)
-#define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_cq_ring + iouring_cq_ ## name)
+#define EV_SQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_sq_ ## name)
+#define EV_CQ_VAR(name) *(volatile unsigned *)((char *)iouring_ring + iouring_cq_ ## name)
 
 /* the index array */
-#define EV_SQ_ARRAY ((unsigned *)((char *)iouring_sq_ring + iouring_sq_array))
+#define EV_SQ_ARRAY ((unsigned *)((char *)iouring_ring + iouring_sq_array))
 
 /* the submit/completion queue entries */
 #define EV_SQES ((struct io_uring_sqe *) iouring_sqes)
-#define EV_CQES ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes))
+#define EV_CQES ((struct io_uring_cqe *)((char *)iouring_ring + iouring_cq_cqes))
 
 inline_speed
 int
 iouring_enter (EV_P_ ev_tstamp timeout)
 {
@@ -285,11 +320,11 @@
 
   return EV_SQES + (tail & EV_SQ_VAR (ring_mask));
 }
 
 inline_size
-struct io_uring_sqe *
+void
 iouring_sqe_submit (EV_P_ struct io_uring_sqe *sqe)
 {
   unsigned idx = sqe - EV_SQES;
 
   EV_SQ_ARRAY [idx] = idx;
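The rest of iouring_sqe_submit is elided from this diff. In outline, the submit side of the ring protocol it implements looks like the sketch below: publish the sqe's index in the array, then advance the tail so the kernel can see it, with a release fence (libecb spells it ECB_MEMORY_FENCE_RELEASE) ordering the two stores. This is a sketch of the technique, not the verbatim libev code:

static void
iouring_sqe_submit_sketch (EV_P_ struct io_uring_sqe *sqe)
{
  unsigned idx = sqe - EV_SQES;

  EV_SQ_ARRAY [idx] = idx; /* publish the sqe's index slot */

  /* the index store must become visible before the tail store */
  ECB_MEMORY_FENCE_RELEASE;

  EV_SQ_VAR (tail) = EV_SQ_VAR (tail) + 1;

  ++iouring_to_submit; /* flushed by the next io_uring_enter call */
}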
@@ -311,19 +346,18 @@
   iouring_tfd_to = EV_TSTAMP_HUGE;
 }
 
 /* called for full and partial cleanup */
 ecb_cold
-static int
+static void
 iouring_internal_destroy (EV_P)
 {
   close (iouring_tfd);
   close (iouring_fd);
 
-  if (iouring_sq_ring != MAP_FAILED) munmap (iouring_sq_ring, iouring_sq_ring_size);
-  if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size);
-  if (iouring_sqes    != MAP_FAILED) munmap (iouring_sqes   , iouring_sqes_size   );
+  if (iouring_ring != MAP_FAILED) munmap (iouring_ring, iouring_ring_size);
+  if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes, iouring_sqes_size);
 
   if (ev_is_active (&iouring_tfd_w))
     {
       ev_ref (EV_A);
       ev_io_stop (EV_A_ &iouring_tfd_w);
@@ -333,61 +367,45 @@
 ecb_cold
 static int
 iouring_internal_init (EV_P)
 {
   struct io_uring_params params = { 0 };
+  uint32_t sq_size, cq_size;
+
+  params.flags = IORING_SETUP_CLAMP;
 
   iouring_to_submit = 0;
 
   iouring_tfd = -1;
-  iouring_sq_ring = MAP_FAILED;
-  iouring_cq_ring = MAP_FAILED;
-  iouring_sqes = MAP_FAILED;
+  iouring_ring = MAP_FAILED;
+  iouring_sqes = MAP_FAILED;
 
   if (!have_monotonic) /* cannot really happen, but what if11 */
     return -1;
 
-  for (;;)
-    {
-      iouring_fd = evsys_io_uring_setup (iouring_entries, &params);
+  iouring_fd = evsys_io_uring_setup (iouring_entries, &params);
 
-      if (iouring_fd >= 0)
-        break; /* yippie */
+  if (iouring_fd < 0)
+    return -1;
 
-      if (errno != EINVAL)
-        return -1; /* we failed */
-
-#if TODO
-      if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE))
-        return -1; /* we require the above features */
-#endif
+  if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEAT_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE))
+    return -1; /* we require the above features */
 
-      /* EINVAL: lots of possible reasons, but maybe
-       * it is because we hit the unqueryable hardcoded size limit
-       */
-
-      /* we hit the limit already, give up */
-      if (iouring_max_entries)
-        return -1;
-
-      /* first time we hit EINVAL? assume we hit the limit, so go back and retry */
-      iouring_entries >>= 1;
-      iouring_max_entries = iouring_entries;
-    }
-
-  iouring_sq_ring_size = params.sq_off.array + params.sq_entries * sizeof (unsigned);
-  iouring_cq_ring_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe);
+  /* TODO: remember somehow whether our queue size has been clamped */
+
+  sq_size = params.sq_off.array + params.sq_entries * sizeof (unsigned);
+  cq_size = params.cq_off.cqes + params.cq_entries * sizeof (struct io_uring_cqe);
+
+  iouring_ring_size = sq_size > cq_size ? sq_size : cq_size;
   iouring_sqes_size = params.sq_entries * sizeof (struct io_uring_sqe);
 
-  iouring_sq_ring = mmap (0, iouring_sq_ring_size, PROT_READ | PROT_WRITE,
-                          MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING);
-  iouring_cq_ring = mmap (0, iouring_cq_ring_size, PROT_READ | PROT_WRITE,
-                          MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_CQ_RING);
+  iouring_ring = mmap (0, iouring_ring_size, PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQ_RING);
   iouring_sqes = mmap (0, iouring_sqes_size, PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_POPULATE, iouring_fd, IORING_OFF_SQES);
 
-  if (iouring_sq_ring == MAP_FAILED || iouring_cq_ring == MAP_FAILED || iouring_sqes == MAP_FAILED)
+  if (iouring_ring == MAP_FAILED || iouring_sqes == MAP_FAILED)
     return -1;
 
   iouring_sq_head = params.sq_off.head;
   iouring_sq_tail = params.sq_off.tail;
   iouring_sq_ring_mask = params.sq_off.ring_mask;
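IORING_SETUP_CLAMP, requested at the top of this function, is what makes the old halve-and-retry loop unnecessary: the kernel now clamps an oversized entry count to its limit instead of failing with EINVAL, and writes the resulting geometry back into the params struct. A small illustrative sketch; the function name, the oversized constant and the printf are just for demonstration, and evsys_io_uring_setup is the wrapper defined earlier in this file (its definition is elided from this diff):

#include <stdio.h>

/* sketch: observe the kernel clamping an absurd ring-size request */
static void
iouring_clamp_demo (void)
{
  struct io_uring_params params = { 0 };
  int fd;

  params.flags = IORING_SETUP_CLAMP;

  /* request far more entries than any kernel allows */
  fd = evsys_io_uring_setup (1 << 20, &params);

  if (fd >= 0)
    /* the kernel wrote the clamped geometry back into params */
    printf ("clamped to sq %u / cq %u entries\n",
            params.sq_entries, params.cq_entries);
}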
@@ -401,16 +419,16 @@
   iouring_cq_ring_mask = params.cq_off.ring_mask;
   iouring_cq_ring_entries = params.cq_off.ring_entries;
   iouring_cq_overflow = params.cq_off.overflow;
   iouring_cq_cqes = params.cq_off.cqes;
 
+  iouring_tfd_to = EV_TSTAMP_HUGE;
+
   iouring_tfd = timerfd_create (CLOCK_MONOTONIC, TFD_CLOEXEC);
 
   if (iouring_tfd < 0)
-    return iouring_tfd;
-
-  iouring_tfd_to = EV_TSTAMP_HUGE;
+    return -1;
 
   return 0;
 }
 
 ecb_cold
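The timerfd created near the end of init is the backend's fallback timeout mechanism; iouring_tfd_to caches the currently armed expiry (EV_TSTAMP_HUGE meaning "not armed"). A hedged sketch of arming such a timerfd for an absolute CLOCK_MONOTONIC deadline; the helper name and the double-seconds parameter are illustrative, not libev's actual interface:

#include <sys/timerfd.h>

/* hypothetical helper: make 'tfd' fire at absolute monotonic time 'to' */
static void
tfd_arm_abs (int tfd, double to)
{
  struct itimerspec its = { { 0, 0 }, { 0, 0 } };

  its.it_value.tv_sec  = (time_t)to;
  its.it_value.tv_nsec = (long)((to - its.it_value.tv_sec) * 1e9);

  /* TFD_TIMER_ABSTIME: it_value is absolute on the timerfd's clock */
  timerfd_settime (tfd, TFD_TIMER_ABSTIME, &its, 0);
}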

Diff Legend

- Removed lines
+ Added lines