--- libev/ev_iouring.c	2019/12/28 05:20:17	1.14
+++ libev/ev_iouring.c	2020/01/22 02:20:47	1.21
@@ -1,7 +1,7 @@
 /*
  * libev linux io_uring fd activity backend
  *
- * Copyright (c) 2019 Marc Alexander Lehmann <libev@schmorp.de>
+ * Copyright (c) 2019-2020 Marc Alexander Lehmann <libev@schmorp.de>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without modifica-
@@ -46,11 +46,10 @@
  *    of linux aio or epoll and so on and so on. and you could do event stuff
  *    without any syscalls. what's not to like?
  * d) ok, it's vastly more complex, but that's ok, really.
- * e) why 3 mmaps instead of one? one would be more space-efficient,
- *    and I can't see what benefit three would have (other than being
+ * e) why two mmaps instead of one? one would be more space-efficient,
+ *    and I can't see what benefit two would have (other than being
  *    somehow resizable/relocatable, but that's apparently not possible).
- *    (FIXME: newer kernels can use 2 mmaps only, need to look into this).
- * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and
+ * f) hmm, it's practically undebuggable (gdb can't access the memory, and
  *    the bizarre way structure offsets are communicated makes it hard to
  *    just print the ring buffer heads, even *iff* the memory were visible
  *    in gdb. but then, that's also ok, really.
@@ -61,25 +60,22 @@
  *    like a µ-optimisation by the io_uring author for his personal
  *    applications, to the detriment of everybody else who just wants
  *    an event loop. but, umm, ok, if that's all, it could be worse.
- *    (FIXME: jens mentioned timeout commands, need to investigate)
- * h) there is a hardcoded limit of 4096 outstanding events. okay,
- *    at least there is no arbitrary low system-wide limit...
- *    (FIXME: apparently, this was increased to 32768 in later kernels(
+ *    (from what I gather from the author Jens Axboe, it simply didn't
+ *    occur to him, and he made good on it by adding an unlimited number
+ *    of timeouts later :).
+ * h) initially there was a hardcoded limit of 4096 outstanding events.
+ *    later versions not only bump this to 32k, but also can handle
+ *    an unlimited amount of events, so this only affects the batch size.
  * i) unlike linux aio, you *can* register more then the limit
- *    of fd events, and the kernel will "gracefully" signal an
- *    overflow, after which you could destroy and recreate the kernel
- *    state, a bit bigger, or fall back to e.g. poll. thats not
- *    totally insane, but kind of questions the point a high
- *    performance I/O framework when it doesn't really work
- *    under stress.
- *    (FIXME: iouring should no longer drop events, need to investigate)
- * j) but, oh my! is has exactly the same bugs as the linux aio backend,
- *    where some undocumented poll combinations just fail.
- *    so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course,
- *    this is completely undocumented, have I mantioned this already?
+ *    of fd events. while early versions of io_uring signalled an overflow
+ *    and you ended up getting wet, 5.5+ does not do this anymore.
+ * j) but, oh my! it had exactly the same bugs as the linux aio backend,
+ *    where some undocumented poll combinations just fail. fortunately,
+ *    after finally reaching the author, he was more than willing to fix
+ *    this probably in 5.6+.
  * k) overall, the *API* itself is, I dare to say, not a total trainwreck.
- *    the big isuess with it are the bugs requiring epoll, which might
- *    or might not get fixed (do I hold my breath?).
+ *    once the bugs are fixed (probably in 5.6+), it will be without
+ *    competition.
  */
 
 /* TODO: use internal TIMEOUT */
@@ -230,10 +226,8 @@
 #define EV_SQES         ((struct io_uring_sqe *)         iouring_sqes)
 #define EV_CQES         ((struct io_uring_cqe *)((char *)iouring_cq_ring + iouring_cq_cqes))
 
-/* TODO: this is not enough, we might have to reap events */
-/* TODO: but we can't, as that will re-arm events, causing */
-/* TODO: an endless loop in fd_reify */
-static int
+inline_speed
+int
 iouring_enter (EV_P_ ev_tstamp timeout)
 {
   int res;
@@ -252,28 +246,39 @@
   return res;
 }
 
+/* TODO: can we move things around so we don't need this forward-reference? */
+static void
+iouring_poll (EV_P_ ev_tstamp timeout);
+
 static
 struct io_uring_sqe *
 iouring_sqe_get (EV_P)
 {
-  unsigned tail = EV_SQ_VAR (tail);
-
-  while (ecb_expect_false (tail + 1 - EV_SQ_VAR (head) > EV_SQ_VAR (ring_entries)))
+  unsigned tail;
+  
+  for (;;)
     {
-      /* queue full, need to flush */
+      tail = EV_SQ_VAR (tail);
+
+      if (ecb_expect_true (tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)))
+        break; /* whats the problem, we have free sqes */
 
+      /* queue full, need to flush and possibly handle some events */
+
+#if EV_FEATURE_CODE
+      /* first we ask the kernel nicely, most often this frees up some sqes */
       int res = iouring_enter (EV_A_ EV_TS_CONST (0.));
 
-      /* io_uring_enter might fail with EBUSY and won't submit anything */
-      /* unfortunately, we can't handle this at the moment */
+      ECB_MEMORY_FENCE_ACQUIRE; /* better safe than sorry */
 
-      if (res < 0 && errno == EBUSY)
-        //TODO
-        ev_syserr ("(libev) io_uring_enter could not clear sq");
-      else
-        break;
-        
-      /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE */
+      if (res >= 0)
+        continue; /* yes, it worked, try again */
+#endif
+
+      /* some problem, possibly EBUSY - do the full poll and let it handle any issues */
+
+      iouring_poll (EV_A_ EV_TS_CONST (0.));
+      /* iouring_poll should have done ECB_MEMORY_FENCE_ACQUIRE for us */
     }
 
   /*assert (("libev: io_uring queue full after flush", tail + 1 - EV_SQ_VAR (head) <= EV_SQ_VAR (ring_entries)));*/
@@ -352,7 +357,7 @@
         return -1; /* we failed */
 
 #if TODO
-      if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP))
+      if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEATURE_SINGLE_MMAP | IORING_FEAT_SUBMIT_STABLE))
         return -1; /* we require the above features */
 #endif
 
@@ -440,7 +445,8 @@
        * be removed. Since we don't *really* have that, we pass in the old
        * generation counter - if that fails, too bad, it will hopefully be removed
        * at close time and then be ignored. */
-      sqe->user_data = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
+      sqe->addr      = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
+      sqe->user_data = (uint64_t)-1;
       iouring_sqe_submit (EV_A_ sqe);
 
       /* increment generation counter to avoid handling old events */
@@ -452,6 +458,7 @@
       struct io_uring_sqe *sqe = iouring_sqe_get (EV_A);
       sqe->opcode      = IORING_OP_POLL_ADD;
       sqe->fd          = fd;
+      sqe->addr        = 0;
       sqe->user_data   = (uint32_t)fd | ((__u64)(uint32_t)anfds [fd].egen << 32);
       sqe->poll_events =
         (nev & EV_READ ? POLLIN : 0)
@@ -491,6 +498,10 @@
   uint32_t gen = cqe->user_data >> 32;
   int      res = cqe->res;
 
+  /* user_data -1 is a remove that we are not atm. interested in */
+  if (cqe->user_data == (uint64_t)-1)
+    return;
+
   assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax));
 
   /* documentation lies, of course. the result value is NOT like
@@ -508,7 +519,6 @@
   if (ecb_expect_false (res < 0))
     {
       /*TODO: EINVAL handling (was something failed with this fd)*/
-      /*TODO: EBUSY happens when?*/
 
       if (res == -EBADF)
         {
@@ -624,7 +634,11 @@
 iouring_poll (EV_P_ ev_tstamp timeout)
 {
   /* if we have events, no need for extra syscalls, but we might have to queue events */
-  if (iouring_handle_cq (EV_A))
+  /* we also clear the timeout if there are outstanding fdchanges */
+  /* the latter should only happen if both the sq and cq are full, most likely */
+  /* because we have a lot of event sources that immediately complete */
+  /* TODO: fdchangecnt is always 0 because fd_reify does not have two buffers yet */
+  if (iouring_handle_cq (EV_A) || fdchangecnt)
     timeout = EV_TS_CONST (0.);
   else
     /* no events, so maybe wait for some */