--- libev/ev_epoll.c 2007/12/25 07:05:45 1.33 +++ libev/ev_epoll.c 2009/07/25 10:14:35 1.48 @@ -1,7 +1,7 @@ /* * libev epoll fd activity backend * - * Copyright (c) 2007 Marc Alexander Lehmann + * Copyright (c) 2007,2008,2009 Marc Alexander Lehmann * All rights reserved. * * Redistribution and use in source and binary forms, with or without modifica- @@ -52,7 +52,9 @@ * * lots of "weird code" and complication handling in this file is due * to these design problems with epoll, as we try very hard to avoid - * epoll_ctl syscalls for common usage patterns. + * epoll_ctl syscalls for common usage patterns and handle the breakage + * ensuing from receiving events for closed and otherwise long gone + * file descriptors. */ #include <sys/epoll.h> @@ -61,17 +63,25 @@ epoll_modify (EV_P_ int fd, int oev, int nev) { struct epoll_event ev; + unsigned char oldmask; /* * we handle EPOLL_CTL_DEL by ignoring it here * on the assumption that the fd is gone anyways * if that is wrong, we have to handle the spurious * event in epoll_poll. + * if the fd is added again, we try to ADD it, and, if that + * fails, we assume it still has the same eventmask. */ if (!nev) return; - ev.data.u64 = fd; /* use u64 to fully initialise the struct, for nicer strace etc. */ + oldmask = anfds [fd].emask; + anfds [fd].emask = nev; + + /* store the generation counter in the upper 32 bits, the fd in the lower 32 bits */ + ev.data.u64 = (uint64_t)(uint32_t)fd + | ((uint64_t)(uint32_t)++anfds [fd].egen << 32); ev.events = (nev & EV_READ ? EPOLLIN : 0) | (nev & EV_WRITE ? 
EPOLLOUT : 0); @@ -80,33 +90,47 @@ if (expect_true (errno == ENOENT)) { - /* on ENOENT the fd went away, so try to do the right thing */ + /* if ENOENT then the fd went away, so try to do the right thing */ if (!nev) - return; + goto dec_egen; if (!epoll_ctl (backend_fd, EPOLL_CTL_ADD, fd, &ev)) return; } else if (expect_true (errno == EEXIST)) { - /* on EEXIST we ignored a previous DEL */ + /* EEXIST means we ignored a previous DEL, but the fd is still active */ + /* if the kernel mask is the same as the new mask, we assume it hasn't changed */ + if (oldmask == nev) + goto dec_egen; + if (!epoll_ctl (backend_fd, EPOLL_CTL_MOD, fd, &ev)) return; } fd_kill (EV_A_ fd); + +dec_egen: + /* we didn't successfully call epoll_ctl, so decrement the generation counter again */ + --anfds [fd].egen; } static void epoll_poll (EV_P_ ev_tstamp timeout) { int i; - int eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, (int)ceil (timeout * 1000.)); + int eventcnt; + + /* epoll wait times cannot be larger than (LONG_MAX - 999UL) / HZ msecs, which is below */ + /* the default libev max wait time, however. */ + EV_RELEASE_CB; + eventcnt = epoll_wait (backend_fd, epoll_events, epoll_eventmax, (int)ceil (timeout * 1000.)); + EV_ACQUIRE_CB; if (expect_false (eventcnt < 0)) { if (errno != EINTR) - syserr ("(libev) epoll_wait"); + ev_syserr ("(libev) epoll_wait"); return; } @@ -115,18 +139,33 @@ { struct epoll_event *ev = epoll_events + i; - int fd = ev->data.u64; + int fd = (uint32_t)ev->data.u64; /* mask out the lower 32 bits */ + int want = anfds [fd].events; int got = (ev->events & (EPOLLOUT | EPOLLERR | EPOLLHUP) ? EV_WRITE : 0) | (ev->events & (EPOLLIN | EPOLLERR | EPOLLHUP) ? 
EV_READ : 0); - int want = anfds [fd].events; + + /* check for spurious notification */ + if (expect_false ((uint32_t)anfds [fd].egen != (uint32_t)(ev->data.u64 >> 32))) + { + /* recreate kernel state */ + postfork = 1; + continue; + } if (expect_false (got & ~want)) { + anfds [fd].emask = want; + /* we received an event but are not interested in it, try mod or del */ + /* I don't think we ever need MOD, but let's handle it anyways */ ev->events = (want & EV_READ ? EPOLLIN : 0) | (want & EV_WRITE ? EPOLLOUT : 0); - epoll_ctl (backend_fd, want ? EPOLL_CTL_MOD : EPOLL_CTL_DEL, fd, ev); + if (epoll_ctl (backend_fd, want ? EPOLL_CTL_MOD : EPOLL_CTL_DEL, fd, ev)) + { + postfork = 1; /* an error occurred, recreate kernel state */ + continue; + } } fd_event (EV_A_ fd, got); @@ -144,7 +183,12 @@ int inline_size epoll_init (EV_P_ int flags) { - backend_fd = epoll_create (256); +#ifdef EPOLL_CLOEXEC + backend_fd = epoll_create1 (EPOLL_CLOEXEC); + + if (backend_fd <= 0) +#endif + backend_fd = epoll_create (256); if (backend_fd < 0) return 0; @@ -155,7 +199,7 @@ backend_modify = epoll_modify; backend_poll = epoll_poll; - epoll_eventmax = 64; /* intiial number of events receivable per poll */ + epoll_eventmax = 64; /* initial number of events receivable per poll */ epoll_events = (struct epoll_event *)ev_malloc (sizeof (struct epoll_event) * epoll_eventmax); return EVBACKEND_EPOLL; @@ -173,7 +217,7 @@ close (backend_fd); while ((backend_fd = epoll_create (256)) < 0) - syserr ("(libev) epoll_create"); + ev_syserr ("(libev) epoll_create"); fcntl (backend_fd, F_SETFD, FD_CLOEXEC);