--- libeio/eio.c 2008/06/16 12:19:55 1.18 +++ libeio/eio.c 2009/06/06 17:25:13 1.31 @@ -1,7 +1,7 @@ /* * libeio implementation * - * Copyright (c) 2007,2008 Marc Alexander Lehmann + * Copyright (c) 2007,2008,2009 Marc Alexander Lehmann * All rights reserved. * * Redistribution and use in source and binary forms, with or without modifica- @@ -38,6 +38,10 @@ */ #include "eio.h" + +#ifdef EIO_STACKSIZE +# define XTHREAD_STACKSIZE EIO_STACKSIZE +#endif #include "xthread.h" #include @@ -66,7 +70,6 @@ #ifdef _WIN32 /*doh*/ - #else # include "config.h" @@ -77,6 +80,18 @@ # include # include +/* POSIX_SOURCE is useless on bsd's, and XOPEN_SOURCE is unreliable there, too */ +# if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) +# define D_INO(de) (de)->d_fileno +# define _DIRENT_HAVE_D_TYPE /* sigh */ +# elif defined(__linux) || defined(d_ino) || _XOPEN_SOURCE >= 600 +# define D_INO(de) (de)->d_ino +# endif + +# ifdef _DIRENT_HAVE_D_TYPE +# define D_TYPE(de) (de)->d_type +# endif + # ifndef EIO_STRUCT_DIRENT # define EIO_STRUCT_DIRENT struct dirent # endif @@ -98,6 +113,14 @@ # endif #endif +#ifndef D_TYPE +# define D_TYPE(de) 0 +#endif + +#ifndef D_INO +# define D_INO(de) 0 +#endif + /* number of seconds after which an idle threads exit */ #define IDLE_TIMEOUT 10 @@ -122,6 +145,17 @@ /*****************************************************************************/ +#if __GNUC__ >= 3 +# define expect(expr,value) __builtin_expect ((expr),(value)) +#else +# define expect(expr,value) (expr) +#endif + +#define expect_false(expr) expect ((expr) != 0, 0) +#define expect_true(expr) expect ((expr) != 0, 1) + +/*****************************************************************************/ + #define ETP_PRI_MIN EIO_PRI_MIN #define ETP_PRI_MAX EIO_PRI_MAX @@ -179,6 +213,15 @@ static mutex_t reqlock = X_MUTEX_INIT; static cond_t reqwait = X_COND_INIT; +#if !HAVE_PREADWRITE +/* + * make our pread/pwrite emulation safe against themselves, but not against + * normal 
read/write by using a mutex. slows down execution a lot, + * but that's your problem, not mine. + */ +static mutex_t preadwritelock = X_MUTEX_INIT; +#endif + typedef struct etp_worker { /* locked by wrklock */ @@ -332,10 +375,10 @@ { ETP_REQ *prv; - while (prv = reqq_shift (&req_queue)) + while ((prv = reqq_shift (&req_queue))) ETP_DESTROY (prv); - while (prv = reqq_shift (&res_queue)) + while ((prv = reqq_shift (&res_queue))) ETP_DESTROY (prv); while (wrk_first.next != &wrk_first) @@ -373,6 +416,8 @@ want_poll_cb = want_poll; done_poll_cb = done_poll; + + return 0; } X_THREAD_PROC (etp_proc); @@ -402,11 +447,11 @@ static void etp_maybe_start_thread (void) { - if (etp_nthreads () >= wanted) + if (expect_true (etp_nthreads () >= wanted)) return; /* todo: maybe use idle here, but might be less exact */ - if (0 <= (int)etp_nthreads () + (int)etp_npending () - (int)etp_nreqs ()) + if (expect_true (0 <= (int)etp_nthreads () + (int)etp_npending () - (int)etp_nreqs ())) return; etp_start_thread (); @@ -469,7 +514,7 @@ --nreqs; X_UNLOCK (reqlock); - if (req->type == EIO_GROUP && req->size) + if (expect_false (req->type == EIO_GROUP && req->size)) { req->int1 = 1; /* mark request as delayed */ continue; @@ -477,11 +522,11 @@ else { int res = ETP_FINISH (req); - if (res) + if (expect_false (res)) return res; } - if (maxreqs && !--maxreqs) + if (expect_false (maxreqs && !--maxreqs)) break; if (maxtime) @@ -510,17 +555,36 @@ { req->pri -= ETP_PRI_MIN; - if (req->pri < ETP_PRI_MIN - ETP_PRI_MIN) req->pri = ETP_PRI_MIN - ETP_PRI_MIN; - if (req->pri > ETP_PRI_MAX - ETP_PRI_MIN) req->pri = ETP_PRI_MAX - ETP_PRI_MIN; + if (expect_false (req->pri < ETP_PRI_MIN - ETP_PRI_MIN)) req->pri = ETP_PRI_MIN - ETP_PRI_MIN; + if (expect_false (req->pri > ETP_PRI_MAX - ETP_PRI_MIN)) req->pri = ETP_PRI_MAX - ETP_PRI_MIN; - X_LOCK (reqlock); - ++nreqs; - ++nready; - reqq_push (&req_queue, req); - X_COND_SIGNAL (reqwait); - X_UNLOCK (reqlock); + if (expect_false (req->type == EIO_GROUP)) + { + /* 
I hope this is worth it :/ */ + X_LOCK (reqlock); + ++nreqs; + X_UNLOCK (reqlock); + + X_LOCK (reslock); + + ++npending; + + if (!reqq_push (&res_queue, req) && want_poll_cb) + want_poll_cb (); - etp_maybe_start_thread (); + X_UNLOCK (reslock); + } + else + { + X_LOCK (reqlock); + ++nreqs; + ++nready; + reqq_push (&req_queue, req); + X_COND_SIGNAL (reqwait); + X_UNLOCK (reqlock); + + etp_maybe_start_thread (); + } } static void etp_set_max_poll_time (double nseconds) @@ -565,12 +629,12 @@ { while (grp->size < grp->int2 && !EIO_CANCELLED (grp)) { - int old_len = grp->size; + grp->flags &= ~EIO_FLAG_GROUPADD; EIO_FEED (grp); /* stop if no progress has been made */ - if (old_len == grp->size) + if (!(grp->flags & EIO_FLAG_GROUPADD)) { grp->feed = 0; break; @@ -697,16 +761,11 @@ /* work around various missing functions */ #if !HAVE_PREADWRITE +# undef pread +# undef pwrite # define pread eio__pread # define pwrite eio__pwrite -/* - * make our pread/pwrite safe against themselves, but not against - * normal read/write by using a mutex. slows down execution a lot, - * but that's your problem, not mine. 
*/ -static mutex_t preadwritelock = X_MUTEX_INIT; - static ssize_t eio__pread (int fd, void *buf, size_t count, off_t offset) { @@ -733,7 +792,7 @@ ooffset = lseek (fd, 0, SEEK_CUR); lseek (fd, offset, SEEK_SET); res = write (fd, buf, count); - lseek (fd, offset, SEEK_SET); + lseek (fd, ooffset, SEEK_SET); X_UNLOCK (preadwritelock); return res; @@ -742,6 +801,8 @@ #ifndef HAVE_FUTIMES + +# undef utimes +# undef futimes # define utimes(path,times) eio__utimes (path, times) # define futimes(fd,times) eio__futimes (fd, times) @@ -770,10 +831,40 @@ #endif #if !HAVE_FDATASYNC -# define fdatasync fsync +# undef fdatasync +# define fdatasync(fd) fsync (fd) +#endif + +/* sync_file_range always needs emulation */ +int +eio__sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags) +{ +#if HAVE_SYNC_FILE_RANGE + int res; + + if (EIO_SYNC_FILE_RANGE_WAIT_BEFORE != SYNC_FILE_RANGE_WAIT_BEFORE + || EIO_SYNC_FILE_RANGE_WRITE != SYNC_FILE_RANGE_WRITE + || EIO_SYNC_FILE_RANGE_WAIT_AFTER != SYNC_FILE_RANGE_WAIT_AFTER) + { + flags = 0 + | (flags & EIO_SYNC_FILE_RANGE_WAIT_BEFORE ? SYNC_FILE_RANGE_WAIT_BEFORE : 0) + | (flags & EIO_SYNC_FILE_RANGE_WRITE ? SYNC_FILE_RANGE_WRITE : 0) + | (flags & EIO_SYNC_FILE_RANGE_WAIT_AFTER ? SYNC_FILE_RANGE_WAIT_AFTER : 0); + } + + res = sync_file_range (fd, offset, nbytes, flags); + + if (!res || errno != ENOSYS) + return res; #endif + /* even though we could play tricks with the flags, it's better to always + * call fdatasync, as that matches the expectation of its users best */ + return fdatasync (fd); +} + #if !HAVE_READAHEAD +# undef readahead # define readahead(fd,offset,count) eio__readahead (fd, offset, count, self) static ssize_t @@ -891,31 +982,98 @@ return res; } +static int +eio_dent_cmp (const void *a_, const void *b_) +{ + const eio_dirent *a = (const eio_dirent *)a_; + const eio_dirent *b = (const eio_dirent *)b_; + + return (int)b->score - (int)a->score ? (int)b->score - (int)a->score + : a->inode < b->inode ?
-1 : a->inode > b->inode ? 1 : 0; /* int might be < ino_t */ +} + /* read a full directory */ static void eio__scandir (eio_req *req, etp_worker *self) { DIR *dirp; EIO_STRUCT_DIRENT *entp; - char *name, *names; - int memlen = 4096; - int memofs = 0; - int res = 0; + unsigned char *name, *names; + int namesalloc = 4096; + int namesoffs = 0; + int flags = req->int1; + eio_dirent *dents = 0; + int dentalloc = 128; + int dentoffs = 0; + + req->result = -1; + + if (!(flags & EIO_READDIR_DENTS)) + flags &= ~(EIO_READDIR_DIRS_FIRST | EIO_READDIR_STAT_ORDER); X_LOCK (wrklock); + + /* the corresponding closedir is in ETP_WORKER_CLEAR */ self->dirp = dirp = opendir (req->ptr1); - req->flags |= EIO_FLAG_PTR2_FREE; - req->ptr2 = names = malloc (memlen); + req->flags |= EIO_FLAG_PTR1_FREE | EIO_FLAG_PTR2_FREE; + req->ptr1 = names = malloc (namesalloc); + req->ptr2 = dents = flags ? malloc (dentalloc * sizeof (eio_dirent)) : 0; + X_UNLOCK (wrklock); - if (dirp && names) + if (dirp && names && (!flags || dents)) for (;;) { errno = 0; entp = readdir (dirp); if (!entp) - break; + { + if (errno) + break; + + /* sort etc. 
*/ + req->int1 = flags; + req->result = dentoffs; + + if (flags & EIO_READDIR_STAT_ORDER || !(~flags & (EIO_READDIR_DIRS_FIRST | EIO_READDIR_FOUND_UNKNOWN))) + { + /* pray your qsort doesn't use quicksort */ + qsort (dents, dentoffs, sizeof (*dents), eio_dent_cmp); /* score depends on DIRS_FIRST */ + } + else if (flags & EIO_READDIR_DIRS_FIRST && !(flags & EIO_READDIR_FOUND_UNKNOWN)) + { + /* in this case, all is known, and we just put dirs first and sort them */ + eio_dirent *ent = dents + dentoffs; + eio_dirent *dir = dents; + + while (ent > dir) + { + if (dir->type == DT_DIR) + ++dir; + else + { + --ent; + + if (ent->type == DT_DIR) + { + eio_dirent tmp = *dir; + *dir = *ent; + *ent = tmp; + + ++dir; + } + } + } + + /* now sort the dirs only */ + qsort (dents, dir - dents, sizeof (*dents), eio_dent_cmp); + } + + {int i; for(i=0;i<dentoffs;i++){eio_dirent *e=dents+i;printf("%ld %d %s\n",(long)e->inode,e->score,e->name);}}//D + + break; + } name = entp->d_name; @@ -923,28 +1081,135 @@ { int len = strlen (name) + 1; - res++; - - while (memofs + len > memlen) + while (expect_false (namesoffs + len > namesalloc)) { - memlen *= 2; + namesalloc *= 2; X_LOCK (wrklock); - req->ptr2 = names = realloc (names, memlen); + req->ptr1 = names = realloc (names, namesalloc); X_UNLOCK (wrklock); if (!names) break; } - memcpy (names + memofs, name, len); - memofs += len; + memcpy (names + namesoffs, name, len); + + if (dents) + { + struct eio_dirent *ent; + + if (expect_false (dentoffs == dentalloc)) + { + dentalloc *= 2; + X_LOCK (wrklock); + req->ptr2 = dents = realloc (dents, dentalloc * sizeof (eio_dirent)); + X_UNLOCK (wrklock); + + if (!dents) + break; + } + + ent = dents + dentoffs; + + ent->name = names + namesoffs; + ent->namelen = len - 1; + ent->inode = D_INO (entp); + + switch (D_TYPE (entp)) + { + default: + ent->type = EIO_DT_UNKNOWN; + flags |= EIO_READDIR_FOUND_UNKNOWN; + break; + + #ifdef DT_FIFO + case DT_FIFO: ent->type = EIO_DT_FIFO; break; + #endif + #ifdef DT_CHR + case DT_CHR: ent->type = EIO_DT_CHR; break; + #endif + #ifdef
DT_DIR + case DT_DIR: ent->type = EIO_DT_DIR; break; + #endif + #ifdef DT_BLK + case DT_BLK: ent->type = EIO_DT_BLK; break; + #endif + #ifdef DT_REG + case DT_REG: ent->type = EIO_DT_REG; break; + #endif + #ifdef DT_LNK + case DT_LNK: ent->type = EIO_DT_LNK; break; + #endif + #ifdef DT_SOCK + case DT_SOCK: ent->type = EIO_DT_SOCK; break; + #endif + #ifdef DT_WHT + case DT_WHT: ent->type = EIO_DT_WHT; break; + #endif + } + + ent->score = 0; + + if (flags & EIO_READDIR_DIRS_FIRST) + { + if (ent->type == EIO_DT_UNKNOWN) + { + if (*name == '.') /* leading dots are likely directories, and, in any case, rare */ + ent->score = 98; + else if (!strchr (name, '.')) /* absence of dots indicates likely dirs */ + ent->score = len <= 4 ? 5 : len <= 7 ? 4 : 1; /* shorter == more likely dir, but avoid too many classes */ + } + else if (ent->type == DT_DIR) + ent->score = 100; + } + } + + namesoffs += len; + ++dentoffs; } } + else + req->result = -1; - if (errno) - res = -1; - - req->result = res; + /* if user doesn't want the dents, do not provide it */ + if (!(flags & EIO_READDIR_DENTS)) + { + X_LOCK (wrklock); + free (dents); + req->ptr2 = req->ptr1; + req->ptr1 = 0; + X_UNLOCK (wrklock); + } +} + +#if !(_POSIX_MAPPED_FILES && _POSIX_SYNCHRONIZED_IO) +# undef msync +# define msync(a,b,c) ((errno = ENOSYS), -1) +#endif + +int +eio__mtouch (void *mem, size_t len, int flags) +{ + intptr_t addr = (intptr_t)mem; + intptr_t end = addr + len; +#ifdef PAGESIZE + const intptr_t page = PAGESIZE; +#else + static intptr_t page; + + if (!page) + page = sysconf (_SC_PAGESIZE); +#endif + + addr &= ~(page - 1); /* assume page size is always a power of two */ + + if (addr < end) + if (flags) /* modify */ + do { *((volatile sig_atomic_t *)addr) |= 0; } while ((addr += page) < len); + else + do { *((volatile sig_atomic_t *)addr) ; } while ((addr += page) < len); + + return 0; } /*****************************************************************************/ @@ -1041,7 +1306,7 @@ int eio_init (void
(*want_poll)(void), void (*done_poll)(void)) { - etp_init (want_poll, done_poll); + return etp_init (want_poll, done_poll); } static void eio_api_destroy (eio_req *req) @@ -1121,6 +1386,9 @@ case EIO_SYNC: req->result = 0; sync (); break; case EIO_FSYNC: req->result = fsync (req->int1); break; case EIO_FDATASYNC: req->result = fdatasync (req->int1); break; + case EIO_MSYNC: req->result = msync (req->ptr2, req->size, req->int1); break; + case EIO_MTOUCH: req->result = eio__mtouch (req->ptr2, req->size, req->int1); break; + case EIO_SYNC_FILE_RANGE: req->result = eio__sync_file_range (req->int1, req->offs, req->size, req->int2); break; case EIO_READDIR: eio__scandir (req, self); break; @@ -1162,14 +1430,17 @@ ? futimes (req->int1, times) : utimes (req->ptr1, times); } + break; case EIO_GROUP: + abort (); /* handled in eio_request */ + case EIO_NOP: req->result = 0; break; case EIO_CUSTOM: - req->feed (req); + ((void (*)(eio_req *))req->feed) (req); break; default: @@ -1202,6 +1473,21 @@ REQ (EIO_FSYNC); req->int1 = fd; SEND; } +eio_req *eio_msync (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data) +{ + REQ (EIO_MSYNC); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND; +} + +eio_req *eio_mtouch (void *addr, size_t length, int flags, int pri, eio_cb cb, void *data) +{ + REQ (EIO_MTOUCH); req->ptr2 = addr; req->size = length; req->int1 = flags; SEND; +} + +eio_req *eio_sync_file_range (int fd, off_t offset, size_t nbytes, unsigned int flags, int pri, eio_cb cb, void *data) +{ + REQ (EIO_SYNC_FILE_RANGE); req->int1 = fd; req->offs = offset; req->size = nbytes; req->int2 = flags; SEND; +} + eio_req *eio_fdatasync (int fd, int pri, eio_cb cb, void *data) { REQ (EIO_FDATASYNC); req->int1 = fd; SEND; @@ -1323,9 +1609,9 @@ return eio__1path (EIO_RMDIR, path, pri, cb, data); } -eio_req *eio_readdir (const char *path, int pri, eio_cb cb, void *data) +eio_req *eio_readdir (const char *path, int flags, int pri, eio_cb cb, void *data) { - return 
eio__1path (EIO_READDIR, path, pri, cb, data); + REQ (EIO_READDIR); PATH; req->int1 = flags; SEND; } eio_req *eio_mknod (const char *path, mode_t mode, dev_t dev, int pri, eio_cb cb, void *data) @@ -1366,7 +1652,7 @@ eio_req *eio_custom (eio_cb execute, int pri, eio_cb cb, void *data) { - REQ (EIO_CUSTOM); req->feed = execute; SEND; + REQ (EIO_CUSTOM); req->feed = (void (*)(eio_req *))execute; SEND; } #endif @@ -1404,6 +1690,8 @@ { assert (("cannot add requests to IO::AIO::GRP after the group finished", grp->int1 != 2)); + grp->flags |= EIO_FLAG_GROUPADD; + ++grp->size; req->grp = grp;