--- libeio/eio.c 2008/06/03 05:12:51 1.17 +++ libeio/eio.c 2009/06/12 20:01:42 1.38 @@ -1,7 +1,7 @@ /* * libeio implementation * - * Copyright (c) 2007,2008 Marc Alexander Lehmann + * Copyright (c) 2007,2008,2009 Marc Alexander Lehmann * All rights reserved. * * Redistribution and use in source and binary forms, with or without modifica- @@ -38,6 +38,10 @@ */ #include "eio.h" + +#ifdef EIO_STACKSIZE +# define XTHREAD_STACKSIZE EIO_STACKSIZE +#endif #include "xthread.h" #include @@ -66,17 +70,35 @@ #ifdef _WIN32 /*doh*/ - #else # include "config.h" # include # include +# include # include # include # include # include +/* POSIX_SOURCE is useless on bsd's, and XOPEN_SOURCE is unreliable there, too */ +# if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) +# define _DIRENT_HAVE_D_TYPE /* sigh */ +# define D_INO(de) (de)->d_fileno +# define D_NAMLEN(de) (de)->d_namlen +# elif defined(__linux) || defined(d_ino) || _XOPEN_SOURCE >= 600 +# define D_INO(de) (de)->d_ino +# endif + +#ifdef _D_EXACT_NAMLEN +# undef D_NAMLEN +# define D_NAMLEN(de) _D_EXACT_NAMLEN (de) +#endif + +# ifdef _DIRENT_HAVE_D_TYPE +# define D_TYPE(de) (de)->d_type +# endif + # ifndef EIO_STRUCT_DIRENT # define EIO_STRUCT_DIRENT struct dirent # endif @@ -98,6 +120,16 @@ # endif #endif +#ifndef D_TYPE +# define D_TYPE(de) 0 +#endif +#ifndef D_INO +# define D_INO(de) 0 +#endif +#ifndef D_NAMLEN +# define D_NAMLEN(de) strlen ((de)->d_name) +#endif + /* number of seconds after which an idle threads exit */ #define IDLE_TIMEOUT 10 @@ -122,6 +154,17 @@ /*****************************************************************************/ +#if __GNUC__ >= 3 +# define expect(expr,value) __builtin_expect ((expr),(value)) +#else +# define expect(expr,value) (expr) +#endif + +#define expect_false(expr) expect ((expr) != 0, 0) +#define expect_true(expr) expect ((expr) != 0, 1) + +/*****************************************************************************/ + #define ETP_PRI_MIN EIO_PRI_MIN #define ETP_PRI_MAX EIO_PRI_MAX @@ -146,6 +189,7 @@ closedir (wrk->dirp); \ wrk->dirp = 0; \ } + #define ETP_WORKER_COMMON \ void *dbuf; \ DIR *dirp; @@ -179,6 +223,15 @@ static mutex_t reqlock = X_MUTEX_INIT; static cond_t reqwait = X_COND_INIT; +#if !HAVE_PREADWRITE +/* + * make our pread/pwrite emulation safe against themselves, but not against + * normal read/write by using a mutex. slows down execution a lot, + * but that's your problem, not mine. + */ +static mutex_t preadwritelock = X_MUTEX_INIT; +#endif + typedef struct etp_worker { /* locked by wrklock */ @@ -332,10 +385,10 @@ { ETP_REQ *prv; - while (prv = reqq_shift (&req_queue)) + while ((prv = reqq_shift (&req_queue))) ETP_DESTROY (prv); - while (prv = reqq_shift (&res_queue)) + while ((prv = reqq_shift (&res_queue))) ETP_DESTROY (prv); while (wrk_first.next != &wrk_first) @@ -373,6 +426,8 @@ want_poll_cb = want_poll; done_poll_cb = done_poll; + + return 0; } X_THREAD_PROC (etp_proc); @@ -402,11 +457,11 @@ static void etp_maybe_start_thread (void) { - if (etp_nthreads () >= wanted) + if (expect_true (etp_nthreads () >= wanted)) return; /* todo: maybe use idle here, but might be less exact */ - if (0 <= (int)etp_nthreads () + (int)etp_npending () - (int)etp_nreqs ()) + if (expect_true (0 <= (int)etp_nthreads () + (int)etp_npending () - (int)etp_nreqs ())) return; etp_start_thread (); @@ -469,7 +524,7 @@ --nreqs; X_UNLOCK (reqlock); - if (req->type == EIO_GROUP && req->size) + if (expect_false (req->type == EIO_GROUP && req->size)) { req->int1 = 1; /* mark request as delayed */ continue; @@ -477,11 +532,11 @@ else { int res = ETP_FINISH (req); - if (res) + if (expect_false (res)) return res; } - if (maxreqs && !--maxreqs) + if (expect_false (maxreqs && !--maxreqs)) break; if (maxtime) @@ -510,17 +565,36 @@ { req->pri -= ETP_PRI_MIN; - if (req->pri < ETP_PRI_MIN - ETP_PRI_MIN) req->pri = ETP_PRI_MIN - ETP_PRI_MIN; - if (req->pri > ETP_PRI_MAX - ETP_PRI_MIN) req->pri = ETP_PRI_MAX - ETP_PRI_MIN; + if (expect_false (req->pri < ETP_PRI_MIN - ETP_PRI_MIN)) req->pri = ETP_PRI_MIN - ETP_PRI_MIN; + if (expect_false (req->pri > ETP_PRI_MAX - ETP_PRI_MIN)) req->pri = ETP_PRI_MAX - ETP_PRI_MIN; - X_LOCK (reqlock); - ++nreqs; - ++nready; - reqq_push (&req_queue, req); - X_COND_SIGNAL (reqwait); - X_UNLOCK (reqlock); + if (expect_false (req->type == EIO_GROUP)) + { + /* I hope this is worth it :/ */ + X_LOCK (reqlock); + ++nreqs; + X_UNLOCK (reqlock); + + X_LOCK (reslock); + + ++npending; - etp_maybe_start_thread (); + if (!reqq_push (&res_queue, req) && want_poll_cb) + want_poll_cb (); + + X_UNLOCK (reslock); + } + else + { + X_LOCK (reqlock); + ++nreqs; + ++nready; + reqq_push (&req_queue, req); + X_COND_SIGNAL (reqwait); + X_UNLOCK (reqlock); + + etp_maybe_start_thread (); + } } static void etp_set_max_poll_time (double nseconds) @@ -565,12 +639,12 @@ { while (grp->size < grp->int2 && !EIO_CANCELLED (grp)) { - int old_len = grp->size; + grp->flags &= ~EIO_FLAG_GROUPADD; EIO_FEED (grp); /* stop if no progress has been made */ - if (old_len == grp->size) + if (!(grp->flags & EIO_FLAG_GROUPADD)) { grp->feed = 0; break; @@ -697,16 +771,11 @@ /* work around various missing functions */ #if !HAVE_PREADWRITE +# undef pread +# undef pwrite # define pread eio__pread # define pwrite eio__pwrite -/* - * make our pread/pwrite safe against themselves, but not against - * normal read/write by using a mutex. slows down execution a lot, - * but that's your problem, not mine. - */ -static mutex_t preadwritelock = X_MUTEX_INIT; - static ssize_t eio__pread (int fd, void *buf, size_t count, off_t offset) { @@ -733,7 +802,7 @@ ooffset = lseek (fd, 0, SEEK_CUR); lseek (fd, offset, SEEK_SET); res = write (fd, buf, count); - lseek (fd, offset, SEEK_SET); + lseek (fd, ooffset, SEEK_SET); X_UNLOCK (preadwritelock); return res; @@ -742,6 +811,8 @@ #ifndef HAVE_FUTIMES +# undef utimes +# undef futimes # define utimes(path,times) eio__utimes (path, times) # define futimes(fd,times) eio__futimes (fd, times) @@ -770,10 +841,40 @@ #endif #if !HAVE_FDATASYNC -# define fdatasync fsync +# undef fdatasync +# define fdatasync(fd) fsync (fd) #endif +/* sync_file_range always needs emulation */ +int +eio__sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags) +{ +#if HAVE_SYNC_FILE_RANGE + int res; + + if (EIO_SYNC_FILE_RANGE_WAIT_BEFORE != SYNC_FILE_RANGE_WAIT_BEFORE + || EIO_SYNC_FILE_RANGE_WRITE != SYNC_FILE_RANGE_WRITE + || EIO_SYNC_FILE_RANGE_WAIT_AFTER != SYNC_FILE_RANGE_WAIT_AFTER) + { + flags = 0 + | (flags & EIO_SYNC_FILE_RANGE_WAIT_BEFORE ? SYNC_FILE_RANGE_WAIT_BEFORE : 0) + | (flags & EIO_SYNC_FILE_RANGE_WRITE ? SYNC_FILE_RANGE_WRITE : 0) + | (flags & EIO_SYNC_FILE_RANGE_WAIT_AFTER ? SYNC_FILE_RANGE_WAIT_AFTER : 0); + } + + res = sync_file_range (fd, offset, nbytes, flags); + + if (!res || errno != ENOSYS) + return res; +#endif + + /* even though we could play tricks with the flags, it's better to always + * call fdatasync, as thta matches the expectation of it's users best */ + return fdatasync (fd); +} + #if !HAVE_READAHEAD +# undef readahead # define readahead(fd,offset,count) eio__readahead (fd, offset, count, self) static ssize_t @@ -891,6 +992,85 @@ return res; } +static signed char +eio_dent_cmp (const eio_dirent *a, const eio_dirent *b) +{ + return b->score - a->score ? b->score - a->score /* works because our signed char is always 0..100 */ + : a->inode < b->inode ? -1 : a->inode > b->inode ? 1 : 0; +} + +#define EIO_QSORT_CUTOFF 20 /* quite high, but performs well on many filesystems */ + +static void +eio_dent_sort (eio_dirent *dents, int size) +{ + int i, j; + + if (size > EIO_QSORT_CUTOFF * 3) /* skip quicksort for small directories */ + { + /* first, use quicksort */ + /* should be good for 2**31 entries */ + struct rng { int l, r; } rng [32]; + + i = 0; + rng[0].l = 0; + rng[0].r = size; + + while (expect_true (i >= 0)) + { + int L = rng [i].l; + int R = rng [i].r - 1; + + if (expect_false (L + EIO_QSORT_CUTOFF < R)) + { + eio_dirent piv = dents [L]; + + while (L < R) + { + while (eio_dent_cmp (&dents [R], &piv) >= 0 && L < R) + --R; + + if (L < R) + dents [L++] = dents [R]; + + while (eio_dent_cmp (&dents [L], &piv) <= 0 && L < R) + ++L; + + if (L < R) + dents [R--] = dents [L]; + } + + dents [L] = piv; + + ++i; + rng [i].l = L + 1; + rng [i].r = rng [i - 1].r; + rng [i - 1].r = L; + + if (rng [i].r - rng [i].l > rng [i - 1].r - rng [i - 1].l) + { + struct rng t; + + t = rng [i]; rng [i] = rng [i - 1]; rng [i - 1] = t; + } + } + else + --i; + } + } + + /* use a simple insertion sort at the end */ + for (i = 1; i < size; ++i) + { + eio_dirent value = dents [i]; + + for (j = i - 1; j >= 0 && eio_dent_cmp (&dents [j], &value) > 0; --j) + dents [j + 1] = dents [j]; + + dents [j + 1] = value; + } +} + /* read a full directory */ static void eio__scandir (eio_req *req, etp_worker *self) @@ -898,53 +1078,245 @@ DIR *dirp; EIO_STRUCT_DIRENT *entp; char *name, *names; - int memlen = 4096; - int memofs = 0; - int res = 0; + int namesalloc = 4096; + int namesoffs = 0; + int flags = req->int1; + eio_dirent *dents = 0; + int dentalloc = 128; + int dentoffs = 0; + + req->result = -1; + + if (!(flags & EIO_READDIR_DENTS)) + flags &= ~(EIO_READDIR_DIRS_FIRST | EIO_READDIR_STAT_ORDER); X_LOCK (wrklock); + /* the corresponding closedir is in ETP_WORKER_CLEAR */ self->dirp = dirp = opendir (req->ptr1); - req->flags |= EIO_FLAG_PTR2_FREE; - req->ptr2 = names = malloc (memlen); + req->flags |= EIO_FLAG_PTR1_FREE | EIO_FLAG_PTR2_FREE; + req->ptr1 = names = malloc (namesalloc); + req->ptr2 = dents = flags ? malloc (dentalloc * sizeof (eio_dirent)) : 0; X_UNLOCK (wrklock); - if (dirp && names) + if (dirp && names && (!flags || dents)) for (;;) { errno = 0; entp = readdir (dirp); if (!entp) - break; + { + if (errno) + break; + + /* sort etc. */ + req->int1 = flags; + req->result = dentoffs; + + if (dents) + { + eio_dirent *ent = dents + dentoffs; + + while (ent > dents) + (--ent)->name = names + (size_t)ent->name; + } + + if (flags & EIO_READDIR_STAT_ORDER + || !(~flags & (EIO_READDIR_DIRS_FIRST | EIO_READDIR_FOUND_UNKNOWN))) + eio_dent_sort (dents, dentoffs); /* score depends of DIRS_FIRST */ + else if (flags & EIO_READDIR_DIRS_FIRST) + { + /* in this case, all is known, and we just put dirs first and sort them */ + eio_dirent *ent = dents + dentoffs; + eio_dirent *dir = dents; + + /* now move dirs to the front, and non-dirs to the back */ + /* by walking from both sides and swapping if necessary */ + while (ent > dir) + { + if (dir->type == DT_DIR) + ++dir; + else + { + --ent; + + if (ent->type == DT_DIR) + { + eio_dirent tmp = *dir; + *dir = *ent; + *ent = tmp; + + ++dir; + } + } + } + /* now sort the dirs only */ + eio_dent_sort (dents, dir - dents); + } + + /* only provide the names array unless DENTS is specified */ + if (!(flags & EIO_READDIR_DENTS)) + { + X_LOCK (wrklock); + assert (!dents); + req->ptr1 = 0; + req->ptr2 = names; + X_UNLOCK (wrklock); + } + + break; + } + + /* now add the entry to our list(s) */ name = entp->d_name; + /* skip . and .. entries */ if (name [0] != '.' || (name [1] && (name [1] != '.' || name [2]))) { - int len = strlen (name) + 1; + int len = D_NAMLEN (entp) + 1; - res++; - - while (memofs + len > memlen) + while (expect_false (namesoffs + len > namesalloc)) { - memlen *= 2; + namesalloc *= 2; X_LOCK (wrklock); - req->ptr2 = names = realloc (names, memlen); + req->ptr1 = names = realloc (names, namesalloc); X_UNLOCK (wrklock); if (!names) break; } - memcpy (names + memofs, name, len); - memofs += len; + memcpy (names + namesoffs, name, len); + + if (dents) + { + struct eio_dirent *ent; + + if (expect_false (dentoffs == dentalloc)) + { + dentalloc *= 2; + X_LOCK (wrklock); + req->ptr2 = dents = realloc (dents, dentalloc * sizeof (eio_dirent)); + X_UNLOCK (wrklock); + + if (!dents) + break; + } + + ent = dents + dentoffs; + + ent->name = (char *)(size_t)namesoffs; /* rather dirtily we store the offset in the pointer */ + ent->namelen = len - 1; + ent->inode = D_INO (entp); + + switch (D_TYPE (entp)) + { + default: + ent->type = EIO_DT_UNKNOWN; + flags |= EIO_READDIR_FOUND_UNKNOWN; + break; + + #ifdef DT_FIFO + case DT_FIFO: ent->type = EIO_DT_FIFO; break; + #endif + #ifdef DT_CHR + case DT_CHR: ent->type = EIO_DT_CHR; break; + #endif + #ifdef DT_MPC + case DT_MPC: ent->type = EIO_DT_MPC; break; + #endif + #ifdef DT_DIR + case DT_DIR: ent->type = EIO_DT_DIR; break; + #endif + #ifdef DT_NAM + case DT_NAM: ent->type = EIO_DT_NAM; break; + #endif + #ifdef DT_BLK + case DT_BLK: ent->type = EIO_DT_BLK; break; + #endif + #ifdef DT_MPB + case DT_MPB: ent->type = EIO_DT_MPB; break; + #endif + #ifdef DT_REG + case DT_REG: ent->type = EIO_DT_REG; break; + #endif + #ifdef DT_NWK + case DT_NWK: ent->type = EIO_DT_NWK; break; + #endif + #ifdef DT_CMP + case DT_CMP: ent->type = EIO_DT_CMP; break; + #endif + #ifdef DT_LNK + case DT_LNK: ent->type = EIO_DT_LNK; break; + #endif + #ifdef DT_SOCK + case DT_SOCK: ent->type = EIO_DT_SOCK; break; + #endif + #ifdef DT_DOOR + case DT_DOOR: ent->type = EIO_DT_DOOR; break; + #endif + #ifdef DT_WHT + case DT_WHT: ent->type = EIO_DT_WHT; break; + #endif + } + + ent->score = 0; + + if (flags & EIO_READDIR_DIRS_FIRST) + { + if (ent->type == EIO_DT_UNKNOWN) + { + if (*name == '.') /* leading dots are likely directories, and, in any case, rare */ + ent->score = 98; + else if (!strchr (name, '.')) /* absense of dots indicate likely dirs */ + ent->score = len <= 2 ? len + 6 : len <= 4 ? 5 : len <= 7 ? 4 : 1; /* shorter == more likely dir, but avoid too many classes */ + } + else if (ent->type == EIO_DT_DIR) + ent->score = 100; + } + } + + namesoffs += len; + ++dentoffs; + } + + if (EIO_CANCELLED (req)) + { + errno = ECANCELED; + break; } } +} - if (errno) - res = -1; - - req->result = res; +#if !(_POSIX_MAPPED_FILES && _POSIX_SYNCHRONIZED_IO) +# undef msync +# define msync(a,b,c) ((errno = ENOSYS), -1) +#endif + +int +eio__mtouch (void *mem, size_t len, int flags) +{ + intptr_t addr = (intptr_t)mem; + intptr_t end = addr + len; +#ifdef PAGESIZE + const intptr_t page = PAGESIZE; +#else + static intptr_t page; + + if (!page) + page = sysconf (_SC_PAGESIZE); +#endif + + addr &= ~(page - 1); /* assume page size is always a power of two */ + + if (addr < end) + if (flags) /* modify */ + do { *((volatile sig_atomic_t *)addr) |= 0; } while ((addr += page) < len); + else + do { *((volatile sig_atomic_t *)addr) ; } while ((addr += page) < len); + + return 0; } /*****************************************************************************/ @@ -1041,7 +1413,7 @@ int eio_init (void (*want_poll)(void), void (*done_poll)(void)) { - etp_init (want_poll, done_poll); + return etp_init (want_poll, done_poll); } static void eio_api_destroy (eio_req *req) @@ -1113,7 +1485,7 @@ case EIO_RENAME: req->result = rename (req->ptr1, req->ptr2); break; case EIO_LINK: req->result = link (req->ptr1, req->ptr2); break; case EIO_SYMLINK: req->result = symlink (req->ptr1, req->ptr2); break; - case EIO_MKNOD: req->result = mknod (req->ptr1, (mode_t)req->int2, (dev_t)req->offs); break; + case EIO_MKNOD: req->result = mknod (req->ptr1, (mode_t)req->int2, (dev_t)req->int3); break; case EIO_READLINK: ALLOC (NAME_MAX); req->result = readlink (req->ptr1, req->ptr2, NAME_MAX); break; @@ -1121,6 +1493,9 @@ case EIO_SYNC: req->result = 0; sync (); break; case EIO_FSYNC: req->result = fsync (req->int1); break; case EIO_FDATASYNC: req->result = fdatasync (req->int1); break; + case EIO_MSYNC: req->result = msync (req->ptr2, req->size, req->int1); break; + case EIO_MTOUCH: req->result = eio__mtouch (req->ptr2, req->size, req->int1); break; + case EIO_SYNC_FILE_RANGE: req->result = eio__sync_file_range (req->int1, req->offs, req->size, req->int2); break; case EIO_READDIR: eio__scandir (req, self); break; @@ -1162,14 +1537,17 @@ ? futimes (req->int1, times) : utimes (req->ptr1, times); } + break; case EIO_GROUP: + abort (); /* handled in eio_request */ + case EIO_NOP: req->result = 0; break; case EIO_CUSTOM: - req->feed (req); + ((void (*)(eio_req *))req->feed) (req); break; default: @@ -1202,6 +1580,21 @@ REQ (EIO_FSYNC); req->int1 = fd; SEND; } +eio_req *eio_msync (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data) +{ + REQ (EIO_MSYNC); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND; +} + +eio_req *eio_mtouch (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data) +{ + REQ (EIO_MTOUCH); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND; +} + +eio_req *eio_sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags, int pri, eio_cb cb, void *data) +{ + REQ (EIO_SYNC_FILE_RANGE); req->int1 = fd; req->offs = offset; req->size = nbytes; req->int2 = flags; SEND; +} + eio_req *eio_fdatasync (int fd, int pri, eio_cb cb, void *data) { REQ (EIO_FDATASYNC); req->int1 = fd; SEND; @@ -1323,14 +1716,14 @@ return eio__1path (EIO_RMDIR, path, pri, cb, data); } -eio_req *eio_readdir (const char *path, int pri, eio_cb cb, void *data) +eio_req *eio_readdir (const char *path, int flags, int pri, eio_cb cb, void *data) { - return eio__1path (EIO_READDIR, path, pri, cb, data); + REQ (EIO_READDIR); PATH; req->int1 = flags; SEND; } eio_req *eio_mknod (const char *path, mode_t mode, dev_t dev, int pri, eio_cb cb, void *data) { - REQ (EIO_MKNOD); PATH; req->int2 = (long)mode; req->int2 = (long)dev; SEND; + REQ (EIO_MKNOD); PATH; req->int2 = (long)mode; req->int3 = (long)dev; SEND; } static eio_req * @@ -1366,7 +1759,7 @@ eio_req *eio_custom (eio_cb execute, int pri, eio_cb cb, void *data) { - REQ (EIO_CUSTOM); req->feed = execute; SEND; + REQ (EIO_CUSTOM); req->feed = (void (*)(eio_req *))execute; SEND; } #endif @@ -1404,6 +1797,8 @@ { assert (("cannot add requests to IO::AIO::GRP after the group finished", grp->int1 != 2)); + grp->flags |= EIO_FLAG_GROUPADD; + ++grp->size; req->grp = grp; @@ -1422,12 +1817,15 @@ ssize_t eio_sendfile_sync (int ofd, int ifd, off_t offset, size_t count) { etp_worker wrk; + ssize_t ret; wrk.dbuf = 0; - eio__sendfile (ofd, ifd, offset, count, &wrk); + ret = eio__sendfile (ofd, ifd, offset, count, &wrk); if (wrk.dbuf) free (wrk.dbuf); + + return ret; }