--- BDB/BDB.pm 2007/08/13 12:07:55 1.13 +++ BDB/BDB.pm 2008/09/25 13:30:14 1.49 @@ -6,6 +6,43 @@ use BDB; + my $env = db_env_create; + + mkdir "bdtest", 0700; + db_env_open + $env, + "bdtest", + BDB::INIT_LOCK | BDB::INIT_LOG | BDB::INIT_MPOOL + | BDB::INIT_TXN | BDB::RECOVER | BDB::USE_ENVIRON | BDB::CREATE, + 0600; + + $env->set_flags (BDB::AUTO_COMMIT | BDB::TXN_NOSYNC, 1); + + my $db = db_create $env; + db_open $db, undef, "table", undef, BDB::BTREE, BDB::AUTO_COMMIT | BDB::CREATE + | BDB::READ_UNCOMMITTED, 0600; + db_put $db, undef, "key", "data", 0, sub { + db_del $db, undef, "key"; + }; + db_sync $db; + + # when you also use Coro, management is easy: + use Coro::BDB; + + # automatic event loop integration with AnyEvent: + use AnyEvent::BDB; + + # automatic result processing with EV: + my $WATCHER = EV::io BDB::poll_fileno, EV::READ, \&BDB::poll_cb; + + # with Glib: + add_watch Glib::IO BDB::poll_fileno, + in => sub { BDB::poll_cb; 1 }; + + # or simply flush manually + BDB::flush; + + =head1 DESCRIPTION See the BerkeleyDB documentation (L).
@@ -73,14 +110,17 @@ use base 'Exporter'; +our $VERSION; + BEGIN { - our $VERSION = '1.0'; + $VERSION = '1.8'; our @BDB_REQ = qw( db_env_open db_env_close db_env_txn_checkpoint db_env_lock_detect - db_env_memp_sync db_env_memp_trickle - db_open db_close db_compact db_sync db_put db_get db_pget db_del db_key_range - db_txn_commit db_txn_abort + db_env_memp_sync db_env_memp_trickle db_env_dbrename db_env_dbremove + db_open db_close db_compact db_sync db_upgrade + db_put db_exists db_get db_pget db_del db_key_range + db_txn_commit db_txn_abort db_txn_finish db_c_close db_c_count db_c_put db_c_get db_c_pget db_c_del db_sequence_open db_sequence_close db_sequence_get db_sequence_remove @@ -97,70 +137,159 @@ XSLoader::load ("BDB", $VERSION); } +=head2 WIN32 FILENAMES/DATABASE NAME MESS + +Perl on Win32 supports only ASCII filenames (the reason is that it abuses +an internal flag to store whether a filename is Unicode or ANSI, but that +flag is used for something else in the perl core, so there is no way to +detect whether a filename is ANSI or Unicode-encoded). The BDB module +tries to work around this issue by assuming that the filename is an ANSI +filename and BDB was built for unicode support. + =head2 BERKELEYDB FUNCTIONS All of these are functions. The create functions simply return a new -object and never block. All the remaining functions all take an optional -callback as last argument. If it is missing, then the fucntion will be -executed synchronously. +object and never block. All the remaining functions take an optional +callback as last argument. If it is missing, then the function will be +executed synchronously. In both cases, C<$!> will reflect the return value +of the function.
BDB functions that cannot block (mostly functions that manipulate settings) are method calls on the relevant objects, so the rule of thumb -is: if its a method, its not blocking, if its a function, it takes a +is: if it's a method, it's not blocking, if it's a function, it takes a callback as last argument. In the following, C<$int> signifies an integer return value, -C<octetstring> is a "binary string" (i.e. a perl string with no character -indices >255), C<U32> is an unsigned 32 bit integer, C<int> is some -integer, C<NV> is a floating point value. - -The C<SV *> types are generic perl scalars (for input and output of data -values), and the C<SV *callback> is the optional callback function to call -when the request is completed. +C<bdb_filename> is a "filename" (octets on unix, madness on windows), +C<U32> is an unsigned 32 bit integer, C<int> is some integer, C<NV> is a +floating point value. + +Most C<SV *> types are generic perl scalars (for input and output of data +values). The various C<DB_ENV> etc. arguments are handles returned by C<db_env_create>, C<db_create>, C<txn_begin> and so on. If they have an appended C<_ornull> this means they are optional and you can pass C<undef> for them, resulting in a NULL pointer on the C level. +The C<SV *callback> is the optional callback function to call when the +request is completed. This last callback argument is special: the callback +is simply the last argument passed. If there are "optional" arguments +before the callback they can be left out. The callback itself can be left +out or specified as C<undef>, in which case the function will be executed +synchronously. + +For example, C<db_env_txn_checkpoint> usually is called with all integer +arguments zero.
These can be left out, so all of these specify a call +to C<< DB_ENV->txn_checkpoint >>, to be executed asynchronously with a +callback to be called: + + db_env_txn_checkpoint $db_env, 0, 0, 0, sub { }; + db_env_txn_checkpoint $db_env, 0, 0, sub { }; + db_env_txn_checkpoint $db_env, sub { }; + +While these all specify a call to C<< DB_ENV->txn_checkpoint >> to be +executed synchronously: + + db_env_txn_checkpoint $db_env, 0, 0, 0, undef; + db_env_txn_checkpoint $db_env, 0, 0, 0; + db_env_txn_checkpoint $db_env, 0; + =head3 BDB functions Functions in the BDB namespace, exported by default: $env = db_env_create (U32 env_flags = 0) + flags: RPCCLIENT - db_env_open (DB_ENV *env, octetstring db_home, U32 open_flags, int mode, SV *callback = &PL_sv_undef) + db_env_open (DB_ENV *env, bdb_filename db_home, U32 open_flags, int mode, SV *callback = &PL_sv_undef) + open_flags: INIT_CDB INIT_LOCK INIT_LOG INIT_MPOOL INIT_REP INIT_TXN RECOVER RECOVER_FATAL USE_ENVIRON USE_ENVIRON_ROOT CREATE LOCKDOWN PRIVATE REGISTER SYSTEM_MEM db_env_close (DB_ENV *env, U32 flags = 0, SV *callback = &PL_sv_undef) db_env_txn_checkpoint (DB_ENV *env, U32 kbyte = 0, U32 min = 0, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: FORCE db_env_lock_detect (DB_ENV *env, U32 flags = 0, U32 atype = DB_LOCK_DEFAULT, SV *dummy = 0, SV *callback = &PL_sv_undef) + atype: LOCK_DEFAULT LOCK_EXPIRE LOCK_MAXLOCKS LOCK_MAXWRITE LOCK_MINLOCKS LOCK_MINWRITE LOCK_OLDEST LOCK_RANDOM LOCK_YOUNGEST db_env_memp_sync (DB_ENV *env, SV *dummy = 0, SV *callback = &PL_sv_undef) db_env_memp_trickle (DB_ENV *env, int percent, SV *dummy = 0, SV *callback = &PL_sv_undef) + db_env_dbremove (DB_ENV *env, DB_TXN_ornull *txnid, bdb_filename file, bdb_filename database, U32 flags = 0, SV *callback = &PL_sv_undef) + db_env_dbrename (DB_ENV *env, DB_TXN_ornull *txnid, bdb_filename file, bdb_filename database, bdb_filename newname, U32 flags = 0, SV *callback = &PL_sv_undef) $db = db_create (DB_ENV *env = 0, U32 flags = 0) + flags: 
XA_CREATE - db_open (DB *db, DB_TXN_ornull *txnid, octetstring file, octetstring database, int type, U32 flags, int mode, SV *callback = &PL_sv_undef) + db_open (DB *db, DB_TXN_ornull *txnid, bdb_filename file, bdb_filename database, int type, U32 flags, int mode, SV *callback = &PL_sv_undef) + flags: AUTO_COMMIT CREATE EXCL MULTIVERSION NOMMAP RDONLY READ_UNCOMMITTED THREAD TRUNCATE db_close (DB *db, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: DB_NOSYNC + db_upgrade (DB *db, bdb_filename file, U32 flags = 0, SV *callback = &PL_sv_undef) db_compact (DB *db, DB_TXN_ornull *txn = 0, SV *start = 0, SV *stop = 0, SV *unused1 = 0, U32 flags = DB_FREE_SPACE, SV *unused2 = 0, SV *callback = &PL_sv_undef) + flags: FREELIST_ONLY FREE_SPACE db_sync (DB *db, U32 flags = 0, SV *callback = &PL_sv_undef) db_key_range (DB *db, DB_TXN_ornull *txn, SV *key, SV *key_range, U32 flags = 0, SV *callback = &PL_sv_undef) db_put (DB *db, DB_TXN_ornull *txn, SV *key, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: APPEND NODUPDATA NOOVERWRITE + db_exists (DB *db, DB_TXN_ornull *txn, SV *key, U32 flags = 0, SV *callback = 0) (v4.6) db_get (DB *db, DB_TXN_ornull *txn, SV *key, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CONSUME CONSUME_WAIT GET_BOTH SET_RECNO MULTIPLE READ_COMMITTED READ_UNCOMMITTED RMW db_pget (DB *db, DB_TXN_ornull *txn, SV *key, SV *pkey, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CONSUME CONSUME_WAIT GET_BOTH SET_RECNO MULTIPLE READ_COMMITTED READ_UNCOMMITTED RMW db_del (DB *db, DB_TXN_ornull *txn, SV *key, U32 flags = 0, SV *callback = &PL_sv_undef) db_txn_commit (DB_TXN *txn, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: TXN_NOSYNC TXN_SYNC db_txn_abort (DB_TXN *txn, SV *callback = &PL_sv_undef) + db_c_close (DBC *dbc, SV *callback = &PL_sv_undef) db_c_count (DBC *dbc, SV *count, U32 flags = 0, SV *callback = &PL_sv_undef) db_c_put (DBC *dbc, SV *key, SV *data, U32 flags = 0, SV *callback = 
&PL_sv_undef) + flags: AFTER BEFORE CURRENT KEYFIRST KEYLAST NODUPDATA db_c_get (DBC *dbc, SV *key, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CURRENT FIRST GET_BOTH GET_BOTH_RANGE GET_RECNO JOIN_ITEM LAST NEXT NEXT_DUP NEXT_NODUP PREV PREV_DUP PREV_NODUP SET SET_RANGE SET_RECNO READ_UNCOMMITTED MULTIPLE MULTIPLE_KEY RMW db_c_pget (DBC *dbc, SV *key, SV *pkey, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) db_c_del (DBC *dbc, U32 flags = 0, SV *callback = &PL_sv_undef) db_sequence_open (DB_SEQUENCE *seq, DB_TXN_ornull *txnid, SV *key, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CREATE EXCL db_sequence_close (DB_SEQUENCE *seq, U32 flags = 0, SV *callback = &PL_sv_undef) db_sequence_get (DB_SEQUENCE *seq, DB_TXN_ornull *txnid, int delta, SV *seq_value, U32 flags = DB_TXN_NOSYNC, SV *callback = &PL_sv_undef) + flags: TXN_NOSYNC db_sequence_remove (DB_SEQUENCE *seq, DB_TXN_ornull *txnid = 0, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: TXN_NOSYNC +=head4 db_txn_finish (DB_TXN *txn, U32 flags = 0, SV *callback = &PL_sv_undef) + +This is not actually a Berkeley DB function but a BDB module +extension. The background for this extension is: It is very annoying to +have to check every single BDB function for error returns and provide a +codepath out of your transaction. While the BDB module still makes this +possible, it contains the following extensions: + +When a transaction-protected function returns any operating system +error (errno > 0), BDB will set the C<TXN_DEADLOCK> flag on the +transaction. This flag is also set by Berkeley DB functions themselves +when an operation fails with LOCK_DEADLOCK, and it causes all further +operations on that transaction (including C<db_txn_commit>) to fail. + +The C<db_txn_finish> request will look at this flag, and, if it is set, +will automatically call C<db_txn_abort> (setting errno to C<LOCK_DEADLOCK> +if it isn't set to something else yet). If it isn't set, it will call +C<db_txn_commit> and return the error normally. + +How to use this?
Easy: just write your transaction normally: + + my $txn = $db_env->txn_begin; + db_get $db, $txn, "key", my $data; + db_put $db, $txn, "key", $data + 1 unless $! == BDB::NOTFOUND; + db_txn_finish $txn; + die "transaction failed" if $!; + +That is, handle only the expected errors. If something unexpected happens +(EIO, LOCK_NOTGRANTED or a deadlock in either db_get or db_put), then the remaining +requests (db_put in this case) will simply be skipped (they will fail with +LOCK_DEADLOCK) and the transaction will be aborted. + +You can use the C<< $txn->failed >> method to check whether a transaction +has failed in this way and abort further processing (excluding +C<db_txn_finish>). =head3 DB_ENV/database environment methods @@ -176,12 +305,14 @@ $int = $env->set_lg_dir (const char *dir) $int = $env->set_shm_key (long shm_key) $int = $env->set_cachesize (U32 gbytes, U32 bytes, int ncache = 0) - $int = $env->set_flags (U32 flags, int onoff) + $int = $env->set_flags (U32 flags, int onoff = 1) + $int = $env->log_set_config (U32 flags, int onoff = 1) (v4.7) + $int = $env->set_intermediate_dir_mode (const char *modestring) (v4.7) $env->set_errfile (FILE *errfile = 0) $env->set_msgfile (FILE *msgfile = 0) $int = $env->set_verbose (U32 which, int onoff = 1) $int = $env->set_encrypt (const char *password, U32 flags = 0) - $int = $env->set_timeout (NV timeout, U32 flags) + $int = $env->set_timeout (NV timeout_seconds, U32 flags = SET_TXN_TIMEOUT) $int = $env->set_mp_max_openfd (int maxopenfd); $int = $env->set_mp_max_write (int maxwrite, int maxwrite_sleep); $int = $env->set_mp_mmapsize (int mmapsize_mb) @@ -191,8 +322,14 @@ $int = $env->set_lk_max_objects (U32 max) $int = $env->set_lg_bsize (U32 max) $int = $env->set_lg_max (U32 max) + $int = $env->mutex_set_increment (U32 increment) + $int = $env->mutex_set_tas_spins (U32 tas_spins) + $int = $env->mutex_set_max (U32 max) + $int = $env->mutex_set_align (U32 align) $txn = $env->txn_begin (DB_TXN_ornull *parent = 0, U32 flags = 0) + flags:
READ_COMMITTED READ_UNCOMMITTED TXN_NOSYNC TXN_NOWAIT TXN_SNAPSHOT TXN_SYNC TXN_WAIT TXN_WRITE_NOSYNC + $txn = $env->cdsgroup_begin; (v4.5) =head4 Example: @@ -231,6 +368,12 @@ $int = $db->set_cachesize (U32 gbytes, U32 bytes, int ncache = 0) $int = $db->set_flags (U32 flags) + flags: CHKSUM ENCRYPT TXN_NOT_DURABLE + Btree: DUP DUPSORT RECNUM REVSPLITOFF + Hash: DUP DUPSORT + Queue: INORDER + Recno: RENUMBER SNAPSHOT + $int = $db->set_encrypt (const char *password, U32 flags) $int = $db->set_lorder (int lorder) $int = $db->set_bt_minkey (U32 minkey) @@ -243,6 +386,7 @@ $int = $db->set_q_extentsize (U32 extentsize) $dbc = $db->cursor (DB_TXN_ornull *txn = 0, U32 flags = 0) + flags: READ_COMMITTED READ_UNCOMMITTED WRITECURSOR TXN_SNAPSHOT $seq = $db->sequence (U32 flags = 0) =head4 Example: @@ -271,7 +415,11 @@ if (txn) txn->abort (txn); - $int = $txn->set_timeout (NV timeout, U32 flags) + $int = $txn->set_timeout (NV timeout_seconds, U32 flags = SET_TXN_TIMEOUT) + flags: SET_LOCK_TIMEOUT SET_TXN_TIMEOUT + + $bool = $txn->failed + # see db_txn_finish documentation, above =head3 DBC/cursor methods @@ -283,6 +431,8 @@ if (dbc) dbc->c_close (dbc); + $int = $cursor->set_priority ($priority = PRIORITY_*) (v4.6) + =head4 Example: my $c = $db->cursor; @@ -308,6 +458,7 @@ $int = $seq->initial_value (db_seq_t value) $int = $seq->set_cachesize (U32 size) $int = $seq->set_flags (U32 flags) + flags: SEQ_DEC SEQ_INC SEQ_WRAP $int = $seq->set_range (db_seq_t min, db_seq_t max) =head4 Example: @@ -324,6 +475,14 @@ =over 4 +=item $msg = BDB::strerror [$errno] + +Returns the string corresponding to the given errno value. If no argument +is given, use C<$!>. + +Note that the BDB module also patches the C<$!> variable directly, so you +should be able to get a bdb error string by simply stringifying C<$!>. + =item $fileno = BDB::poll_fileno Return the I. This filehandle must be @@ -371,17 +530,14 @@ For interactive programs, values such as C<0.01> to C<0.1> should be fine. 
-Example: Install an Event watcher that automatically calls +Example: Install an EV watcher that automatically calls BDB::poll_cb with low priority, to ensure that other parts of the -program get the CPU sometimes even under high AIO load. +program get the CPU sometimes even under high load. # try not to spend much more than 0.1s in poll_cb BDB::max_poll_time 0.1; - # use a low priority so other tasks have priority - Event->io (fd => BDB::poll_fileno, - poll => 'r', nice => 1, - cb => &BDB::poll_cb); + my $bdb_poll = EV::io BDB::poll_fileno, EV::READ, \&BDB::poll_cb; =item BDB::poll_wait @@ -403,7 +559,7 @@ =item BDB::flush -Wait till all outstanding AIO requests have been handled. +Wait till all outstanding BDB requests have been handled. Strictly equivalent to: @@ -412,18 +568,84 @@ =back +=head3 VERSION CHECKING + +BerkeleyDB comes in various versions, many of them have minor +incompatibilities. This means that traditional "at least version x.x" +checks are often not sufficient. + +Example: set the log_autoremove option in a way compatible with versions +before v4.7 and with v4.7 or later: + + $DB_ENV->set_flags (&BDB::LOG_AUTOREMOVE ) if BDB::VERSION v0, v4.7; + $DB_ENV->log_set_config (&BDB::LOG_AUTO_REMOVE) if BDB::VERSION v4.7; + +=over 4 + +=item BDB::VERSION + +The C<BDB::VERSION> function, when called without arguments, returns the +Berkeley DB version as a v-string (usually with 3 components). You should +use C<lt> and C<ge> operators exclusively to make comparisons. + +Example: check for at least version 4.7. + + BDB::VERSION ge v4.7 or die; + +=item BDB::VERSION min-version + +Returns true if the BDB version is at least the given version (specified +as a v-string), false otherwise. + +Example: check for at least version 4.7. + + BDB::VERSION v4.7 or die; + +=item BDB::VERSION min-version, max-version + +Returns true if the BDB version is at least version C<min-version> (specify C<undef> or C<v0> for any minimum version) +and less than C<max-version>. + +Example: check whether version is strictly less than v4.7.
+ + BDB::VERSION v0, v4.7 + or die "version 4.7 is not yet supported"; + +=back + +=cut + +sub VERSION { + # I was dumb enough to override the VERSION method here, so let's try + # to fix it up. + + if ($_[0] eq __PACKAGE__) { + $VERSION + } else { + if (@_ > 0) { + return undef if VERSION_v lt $_[0]; + if (@_ > 1) { + return undef if VERSION_v ge $_[1]; + } + } + + VERSION_v + } +} + =head3 CONTROLLING THE NUMBER OF THREADS =over 4 =item BDB::min_parallel $nthreads -Set the minimum number of AIO threads to C<$nthreads>. The current +Set the minimum number of BDB threads to C<$nthreads>. The current default is C<8>, which means eight asynchronous operations can execute concurrently at any one time (the number of outstanding requests, however, is unlimited). -BDB starts threads only on demand, when an AIO request is queued and +BDB starts threads only on demand, when a BDB request is queued and no free thread exists. Please note that queueing up a hundred requests can create demand for a hundred threads, even if it turns out that everything is in the cache and could have been processed faster by a single thread. @@ -438,7 +660,7 @@ =item BDB::max_parallel $nthreads -Sets the maximum number of AIO threads to C<$nthreads>. If more than the +Sets the maximum number of BDB threads to C<$nthreads>. If more than the specified number of threads are currently running, this function kills them. This function blocks until the limit is reached. @@ -487,8 +709,14 @@ Sets a callback that is called whenever a request is created without an explicit callback. It has to return two code references. The first is used -as the request callback, and the second is called to wait until the first -callback has been called. The default implementation works like this: +as the request callback (it should save the return status), and the second +is called to wait until the first callback has been called (it must set +C<$!> to the return status).
+ +This mechanism can be used to include BDB into other event mechanisms, +such as L<AnyEvent::BDB> or L<Coro::BDB>. + +The default implementation works like this: sub { my $status; @@ -498,6 +726,10 @@ ) } +It simply blocks the process till the request has finished and then sets +C<$!> to the return value. This means that if you don't use a callback, +BDB will simply fall back to synchronous operations. + =back =head3 STATISTICAL INFORMATION @@ -551,7 +783,7 @@ This module should do "the right thing" when the process using it forks: -Before the fork, IO::AIO enters a quiescent state where no requests +Before the fork, BDB enters a quiescent state where no requests can be added in other threads and no results will be processed. After the fork the parent simply leaves the quiescent state and continues request/result processing, while the child frees the request/result queue @@ -560,9 +792,13 @@ parent process has been reached again. In short: the parent will, after a short pause, continue as if fork had -not been called, while the child will act as if IO::AIO has not been used +not been called, while the child will act as if BDB has not been used yet. +Win32 note: there is no fork on win32, and perl's emulation of it is too +broken to be supported, so do not use BDB in a windows pseudo-fork, better +yet, switch to a more capable platform. + =head2 MEMORY USAGE Per-request usage: @@ -584,11 +820,17 @@ =head1 KNOWN BUGS -Known bugs will be fixed in the next release. +Known bugs will be fixed in the next release, except: + + If you use a transaction in any request, and the request returns + with an operating system error or DB_LOCK_NOTGRANTED, the internal + TXN_DEADLOCK flag will be set on the transaction. See C<db_txn_finish>, + above. =head1 SEE ALSO -L. +L<AnyEvent::BDB> (event loop integration), L<Coro::BDB> (more natural +syntax), L<IO::AIO> (nice to have). =head1 AUTHOR