--- libev/ev_iouring.c 2019/12/20 05:20:23 1.6 +++ libev/ev_iouring.c 2019/12/27 21:56:29 1.11 @@ -49,8 +49,9 @@ * e) why 3 mmaps instead of one? one would be more space-efficient, * and I can't see what benefit three would have (other than being * somehow resizable/relocatable, but that's apparently not possible). + * (FIXME: newer kernels can use 2 mmaps only, need to look into this). * f) hmm, it's practiclaly undebuggable (gdb can't access the memory, and - the bizarre way structure offsets are commuinicated makes it hard to + * the bizarre way structure offsets are communicated makes it hard to * just print the ring buffer heads, even *iff* the memory were visible * in gdb. but then, that's also ok, really. * g) well, you cannot specify a timeout when waiting for events. no, @@ -60,8 +61,10 @@ * like a µ-optimisation by the io_uring author for his personal * applications, to the detriment of everybody else who just wants * an event loop. but, umm, ok, if that's all, it could be worse. + * (FIXME: jens mentioned timeout commands, need to investigate) * h) there is a hardcoded limit of 4096 outstanding events. okay, * at least there is no arbitrary low system-wide limit... + * (FIXME: apparently, this was increased to 32768 in later kernels) * i) unlike linux aio, you *can* register more then the limit * of fd events, and the kernel will "gracefully" signal an * overflow, after which you could destroy and recreate the kernel @@ -69,6 +72,7 @@ * totally insane, but kind of questions the point a high * performance I/O framework when it doesn't really work * under stress. + * (FIXME: iouring should no longer drop events, need to investigate) * j) but, oh my! is has exactly the same bugs as the linux aio backend, * where some undocumented poll combinations just fail. * so we need epoll AGAIN as a fallback. AGAIN! epoll!! and of course, @@ -78,6 +82,10 @@ * or might not get fixed (do I hold my breath?). 
*/ +/* TODO: use internal TIMEOUT */ +/* TODO: take advantage of single mmap, NODROP etc. */ +/* TODO: resize cq/sq size independently */ + #include #include #include @@ -98,7 +106,10 @@ __u8 flags; __u16 ioprio; __s32 fd; - __u64 off; + union { + __u64 off; + __u64 addr2; + }; __u64 addr; __u32 len; union { @@ -107,6 +118,11 @@ __u16 poll_events; __u32 sync_range_flags; __u32 msg_flags; + __u32 timeout_flags; + __u32 accept_flags; + __u32 cancel_flags; + __u32 open_flags; + __u32 statx_flags; }; __u64 user_data; union { @@ -153,7 +169,8 @@ __u32 flags; __u32 sq_thread_cpu; __u32 sq_thread_idle; - __u32 resv[5]; + __u32 features; + __u32 resv[4]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; @@ -167,6 +184,10 @@ #define IORING_OFF_CQ_RING 0x08000000ULL #define IORING_OFF_SQES 0x10000000ULL +#define IORING_FEAT_SINGLE_MMAP 0x1 +#define IORING_FEAT_NODROP 0x2 +#define IORING_FEAT_SUBMIT_STABLE 0x4 + inline_size int evsys_io_uring_setup (unsigned entries, struct io_uring_params *params) @@ -238,12 +259,6 @@ iouring_tfd_to = EV_TSTAMP_HUGE; } -static void -iouring_epoll_cb (EV_P_ struct ev_io *w, int revents) -{ - epoll_poll (EV_A_ 0); -} - /* called for full and partial cleanup */ ecb_cold static int @@ -256,8 +271,11 @@ if (iouring_cq_ring != MAP_FAILED) munmap (iouring_cq_ring, iouring_cq_ring_size); if (iouring_sqes != MAP_FAILED) munmap (iouring_sqes , iouring_sqes_size ); - if (ev_is_active (&iouring_epoll_w)) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_epoll_w); - if (ev_is_active (&iouring_tfd_w )) ev_ref (EV_A); ev_io_stop (EV_A_ &iouring_tfd_w ); + if (ev_is_active (&iouring_tfd_w)) + { + ev_ref (EV_A); + ev_io_stop (EV_A_ &iouring_tfd_w); + } } ecb_cold @@ -283,6 +301,11 @@ if (errno != EINVAL) return -1; /* we failed */ +#if TODO + if ((~params.features) & (IORING_FEAT_NODROP | IORING_FEAT_SINGLE_MMAP)) + return -1; /* we require the above features */ +#endif + /* EINVAL: lots of possible reasons, but maybe * it is because we hit the 
unqueryable hardcoded size limit */ @@ -344,14 +367,7 @@ while (iouring_internal_init (EV_A) < 0) ev_syserr ("(libev) io_uring_setup"); - /* forking epoll should also effectively unregister all fds from the backend */ - epoll_fork (EV_A); - /* epoll_fork already did this. hopefully */ - /*fd_rearm_all (EV_A);*/ - - ev_io_stop (EV_A_ &iouring_epoll_w); - ev_io_set (EV_A_ &iouring_epoll_w, backend_fd, EV_READ); - ev_io_start (EV_A_ &iouring_epoll_w); + fd_rearm_all (EV_A); ev_io_stop (EV_A_ &iouring_tfd_w); ev_io_set (EV_A_ &iouring_tfd_w, iouring_tfd, EV_READ); @@ -363,15 +379,6 @@ static void iouring_modify (EV_P_ int fd, int oev, int nev) { - if (ecb_expect_false (anfds [fd].eflags)) - { - /* we handed this fd over to epoll, so undo this first */ - /* we do it manually because the optimisations on epoll_modify won't do us any good */ - epoll_ctl (iouring_fd, EPOLL_CTL_DEL, fd, 0); - anfds [fd].eflags = 0; - oev = 0; - } - if (oev) { /* we assume the sqe's are all "properly" initialised */ @@ -430,8 +437,9 @@ int res = cqe->res; /* ignore fd removal events, if there are any. TODO: verify */ + /* TODO: yes, this triggers */ if (cqe->user_data == (__u64)-1) - abort ();//D + return; assert (("libev: io_uring fd must be in-bounds", fd >= 0 && fd < anfdmax)); @@ -448,17 +456,10 @@ if (ecb_expect_false (res < 0)) { - if (res == -EINVAL) - { - /* we assume this error code means the fd/poll combination is buggy - * and fall back to epoll. - * this error code might also indicate a bug, but the kernel doesn't - * distinguish between those two conditions, so... sigh... 
- */ + /*TODO: EINVAL handling (was something failed with this fd)*/ + /*TODO: EBUSY happens when?*/ - epoll_modify (EV_A_ fd, 0, anfds [fd].events); - } - else if (res == -EBADF) + if (res == -EBADF) { assert (("libev: event loop rejected bad fd", res != -EBADF)); fd_kill (EV_A_ fd); @@ -494,7 +495,7 @@ /* we have two options, resize the queue (by tearing down * everything and recreating it, or living with it * and polling. - * we implement this by resizing tghe queue, and, if that fails, + * we implement this by resizing the queue, and, if that fails, * we just recreate the state on every failure, which * kind of is a very inefficient poll. * one danger is, due to the bios toward lower fds, @@ -516,12 +517,12 @@ /* we hit the kernel limit, we should fall back to something else. * we can either poll() a few times and hope for the best, * poll always, or switch to epoll. - * since we use epoll anyways, go epoll. + * TODO: is this necessary with newer kernels? */ iouring_internal_destroy (EV_A); - /* this should make it so that on return, we don'T call any uring functions */ + /* this should make it so that on return, we don't call any uring functions */ iouring_to_submit = 0; for (;;) @@ -605,9 +606,6 @@ int iouring_init (EV_P_ int flags) { - if (!epoll_init (EV_A_ 0)) - return 0; - iouring_entries = IOURING_INIT_ENTRIES; iouring_max_entries = 0; @@ -617,15 +615,8 @@ return 0; } - ev_io_init (&iouring_epoll_w, iouring_epoll_cb, backend_fd, EV_READ); - ev_set_priority (&iouring_epoll_w, EV_MAXPRI); - ev_io_init (&iouring_tfd_w, iouring_tfd_cb, iouring_tfd, EV_READ); - ev_set_priority (&iouring_tfd_w, EV_MAXPRI); - - ev_io_start (EV_A_ &iouring_epoll_w); - ev_unref (EV_A); /* watcher should not keep loop alive */ - + ev_set_priority (&iouring_tfd_w, EV_MINPRI); ev_io_start (EV_A_ &iouring_tfd_w); ev_unref (EV_A); /* watcher should not keep loop alive */ @@ -640,6 +631,5 @@ iouring_destroy (EV_P) { iouring_internal_destroy (EV_A); - epoll_destroy (EV_A); }