--- BDB/BDB.pm 2007/05/09 06:42:23 1.8 +++ BDB/BDB.pm 2007/12/04 11:07:39 1.18 @@ -8,11 +8,16 @@ =head1 DESCRIPTION -See the eg/ directory in the distribution and the berkeleydb C -documentation. This is inadequate, but the only sources of documentation -known for this module so far. +See the BerkeleyDB documentation (L). +The BDB API is very similar to the C API (the translation has been very faithful). + +See also the example sections in the document below and possibly the eg/ +subdirectory of the BDB distribution. Last but not least, see the IO::AIO +documentation, as that module uses almost the same asynchronous request +model as this module. + +I know this is woefully inadequate documentation. Send a patch! -=head2 EXAMPLE =head1 REQUEST ANATOMY AND LIFETIME @@ -69,13 +74,13 @@ use base 'Exporter'; BEGIN { - our $VERSION = '0.1'; + our $VERSION = '1.2'; our @BDB_REQ = qw( db_env_open db_env_close db_env_txn_checkpoint db_env_lock_detect db_env_memp_sync db_env_memp_trickle db_open db_close db_compact db_sync db_put db_get db_pget db_del db_key_range - db_txn_commit db_txn_abort + db_txn_commit db_txn_abort db_txn_finish db_c_close db_c_count db_c_put db_c_get db_c_pget db_c_del db_sequence_open db_sequence_close db_sequence_get db_sequence_remove @@ -92,6 +97,293 @@ XSLoader::load ("BDB", $VERSION); } +=head2 BERKELEYDB FUNCTIONS + +All of these are functions. The create functions simply return a new +object and never block. The remaining functions all take an optional +callback as last argument. If it is missing, then the function will be +executed synchronously. + +BDB functions that cannot block (mostly functions that manipulate +settings) are method calls on the relevant objects, so the rule of thumb +is: if it's a method, it's not blocking, if it's a function, it takes a +callback as last argument. + +In the following, C<$int> signifies an integer return value, +C<octetstring> is a "binary string" (i.e.
a perl string with no character +indices >255), C<U32> is an unsigned 32 bit integer, C<int> is some +integer, C<NV> is a floating point value. + +The C<SV *> types are generic perl scalars (for input and output of data +values), and the C<SV *callback> is the optional callback function to call +when the request is completed. + +The various C<DB_ENV> etc. arguments are handles returned by +C<db_env_create>, C<db_create>, C<txn_begin> and so on. If they have an +appended C<_ornull> this means they are optional and you can pass C<undef> +for them, resulting in a NULL pointer on the C level. + +=head3 BDB functions + +Functions in the BDB namespace, exported by default: + + $env = db_env_create (U32 env_flags = 0) + flags: RPCCLIENT + + db_env_open (DB_ENV *env, octetstring db_home, U32 open_flags, int mode, SV *callback = &PL_sv_undef) + open_flags: INIT_CDB INIT_LOCK INIT_LOG INIT_MPOOL INIT_REP INIT_TXN RECOVER RECOVER_FATAL USE_ENVIRON USE_ENVIRON_ROOT CREATE LOCKDOWN PRIVATE REGISTER SYSTEM_MEM + db_env_close (DB_ENV *env, U32 flags = 0, SV *callback = &PL_sv_undef) + db_env_txn_checkpoint (DB_ENV *env, U32 kbyte = 0, U32 min = 0, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: FORCE + db_env_lock_detect (DB_ENV *env, U32 flags = 0, U32 atype = DB_LOCK_DEFAULT, SV *dummy = 0, SV *callback = &PL_sv_undef) + atype: LOCK_DEFAULT LOCK_EXPIRE LOCK_MAXLOCKS LOCK_MAXWRITE LOCK_MINLOCKS LOCK_MINWRITE LOCK_OLDEST LOCK_RANDOM LOCK_YOUNGEST + db_env_memp_sync (DB_ENV *env, SV *dummy = 0, SV *callback = &PL_sv_undef) + db_env_memp_trickle (DB_ENV *env, int percent, SV *dummy = 0, SV *callback = &PL_sv_undef) + + $db = db_create (DB_ENV *env = 0, U32 flags = 0) + flags: XA_CREATE + + db_open (DB *db, DB_TXN_ornull *txnid, octetstring file, octetstring database, int type, U32 flags, int mode, SV *callback = &PL_sv_undef) + flags: AUTO_COMMIT CREATE EXCL MULTIVERSION NOMMAP RDONLY READ_UNCOMMITTED THREAD TRUNCATE + db_close (DB *db, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: DB_NOSYNC + db_compact (DB *db, DB_TXN_ornull *txn = 0, SV *start = 0, SV
*stop = 0, SV *unused1 = 0, U32 flags = DB_FREE_SPACE, SV *unused2 = 0, SV *callback = &PL_sv_undef) + flags: FREELIST_ONLY FREE_SPACE + db_sync (DB *db, U32 flags = 0, SV *callback = &PL_sv_undef) + db_key_range (DB *db, DB_TXN_ornull *txn, SV *key, SV *key_range, U32 flags = 0, SV *callback = &PL_sv_undef) + db_put (DB *db, DB_TXN_ornull *txn, SV *key, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: APPEND NODUPDATA NOOVERWRITE + db_get (DB *db, DB_TXN_ornull *txn, SV *key, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CONSUME CONSUME_WAIT GET_BOTH SET_RECNO MULTIPLE READ_COMMITTED READ_UNCOMMITTED RMW + db_pget (DB *db, DB_TXN_ornull *txn, SV *key, SV *pkey, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CONSUME CONSUME_WAIT GET_BOTH SET_RECNO MULTIPLE READ_COMMITTED READ_UNCOMMITTED RMW + db_del (DB *db, DB_TXN_ornull *txn, SV *key, U32 flags = 0, SV *callback = &PL_sv_undef) + db_txn_commit (DB_TXN *txn, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: TXN_NOSYNC TXN_SYNC + db_txn_abort (DB_TXN *txn, SV *callback = &PL_sv_undef) + + db_c_close (DBC *dbc, SV *callback = &PL_sv_undef) + db_c_count (DBC *dbc, SV *count, U32 flags = 0, SV *callback = &PL_sv_undef) + db_c_put (DBC *dbc, SV *key, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: AFTER BEFORE CURRENT KEYFIRST KEYLAST NODUPDATA + db_c_get (DBC *dbc, SV *key, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CURRENT FIRST GET_BOTH GET_BOTH_RANGE GET_RECNO JOIN_ITEM LAST NEXT NEXT_DUP NEXT_NODUP PREV PREV_DUP PREV_NODUP SET SET_RANGE SET_RECNO READ_UNCOMMITTED MULTIPLE MULTIPLE_KEY RMW + db_c_pget (DBC *dbc, SV *key, SV *pkey, SV *data, U32 flags = 0, SV *callback = &PL_sv_undef) + db_c_del (DBC *dbc, U32 flags = 0, SV *callback = &PL_sv_undef) + + db_sequence_open (DB_SEQUENCE *seq, DB_TXN_ornull *txnid, SV *key, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: CREATE EXCL + db_sequence_close (DB_SEQUENCE *seq, U32 flags 
= 0, SV *callback = &PL_sv_undef) + db_sequence_get (DB_SEQUENCE *seq, DB_TXN_ornull *txnid, int delta, SV *seq_value, U32 flags = DB_TXN_NOSYNC, SV *callback = &PL_sv_undef) + flags: TXN_NOSYNC + db_sequence_remove (DB_SEQUENCE *seq, DB_TXN_ornull *txnid = 0, U32 flags = 0, SV *callback = &PL_sv_undef) + flags: TXN_NOSYNC + +=head4 db_txn_finish (DB_TXN *txn, U32 flags = 0, SV *callback = &PL_sv_undef) + +This is not actually a Berkeley DB function but a BDB module +extension. The background for this extension is: It is very annoying to +have to check every single BDB function for error returns and provide a +codepath out of your transaction. While the BDB module still makes this +possible, it contains the following extensions: + +When a transaction-protected function returns any operating system +error (errno > 0), BDB will set the C<TXN_DEADLOCK> flag on the +transaction. This flag is also set by Berkeley DB functions themselves +when an operation fails with LOCK_DEADLOCK, and it causes all further +operations on that transaction (including C<db_txn_commit>) to fail. + +The C<db_txn_finish> request will look at this flag, and, if it is set, +will automatically call C<db_txn_abort> (setting errno to C<LOCK_DEADLOCK> +if it isn't set to something else yet). If the flag isn't set, it will call +C<db_txn_commit> and return the error normally. + +How to use this? Easy: just write your transaction normally: + + my $txn = $db_env->txn_begin; + db_get $db, $txn, "key", my $data; + db_put $db, $txn, "key", $data + 1 unless $! == BDB::NOTFOUND; + db_txn_finish $txn; + die "transaction failed" if $!; + +That is, handle only the expected errors. If something unexpected happens +(EIO, LOCK_NOTGRANTED or a deadlock in either db_get or db_put), then the remaining +requests (db_put in this case) will simply be skipped (they will fail with +LOCK_DEADLOCK) and the transaction will be aborted. + +You can use the C<< $txn->failed >> method to check whether a transaction +has failed in this way and abort further processing (excluding +C<db_txn_finish>).
+ +=head3 DB_ENV/database environment methods + +Methods available on DB_ENV/$env handles: + + DESTROY (DB_ENV_ornull *env) + CODE: + if (env) + env->close (env, 0); + + $int = $env->set_data_dir (const char *dir) + $int = $env->set_tmp_dir (const char *dir) + $int = $env->set_lg_dir (const char *dir) + $int = $env->set_shm_key (long shm_key) + $int = $env->set_cachesize (U32 gbytes, U32 bytes, int ncache = 0) + $int = $env->set_flags (U32 flags, int onoff) + $env->set_errfile (FILE *errfile = 0) + $env->set_msgfile (FILE *msgfile = 0) + $int = $env->set_verbose (U32 which, int onoff = 1) + $int = $env->set_encrypt (const char *password, U32 flags = 0) + $int = $env->set_timeout (NV timeout_seconds, U32 flags = SET_TXN_TIMEOUT) + $int = $env->set_mp_max_openfd (int maxopenfd); + $int = $env->set_mp_max_write (int maxwrite, int maxwrite_sleep); + $int = $env->set_mp_mmapsize (int mmapsize_mb) + $int = $env->set_lk_detect (U32 detect = DB_LOCK_DEFAULT) + $int = $env->set_lk_max_lockers (U32 max) + $int = $env->set_lk_max_locks (U32 max) + $int = $env->set_lk_max_objects (U32 max) + $int = $env->set_lg_bsize (U32 max) + $int = $env->set_lg_max (U32 max) + + $txn = $env->txn_begin (DB_TXN_ornull *parent = 0, U32 flags = 0) + flags: READ_COMMITTED READ_UNCOMMITTED TXN_NOSYNC TXN_NOWAIT TXN_SNAPSHOT TXN_SYNC TXN_WAIT TXN_WRITE_NOSYNC + +=head4 Example: + + use AnyEvent; + use BDB; + + our $FH; open $FH, "<&=" . 
BDB::poll_fileno; + our $WATCHER = AnyEvent->io (fh => $FH, poll => 'r', cb => \&BDB::poll_cb); + + BDB::min_parallel 8; + + my $env = db_env_create; + + mkdir "bdtest", 0700; + db_env_open + $env, + "bdtest", + BDB::INIT_LOCK | BDB::INIT_LOG | BDB::INIT_MPOOL | BDB::INIT_TXN | BDB::RECOVER | BDB::USE_ENVIRON | BDB::CREATE, + 0600; + + $env->set_flags (BDB::AUTO_COMMIT | BDB::TXN_NOSYNC, 1); + + +=head3 DB/database methods + +Methods available on DB/$db handles: + + DESTROY (DB_ornull *db) + CODE: + if (db) + { + SV *env = (SV *)db->app_private; + db->close (db, 0); + SvREFCNT_dec (env); + } + + $int = $db->set_cachesize (U32 gbytes, U32 bytes, int ncache = 0) + $int = $db->set_flags (U32 flags) + flags: CHKSUM ENCRYPT TXN_NOT_DURABLE + Btree: DUP DUPSORT RECNUM REVSPLITOFF + Hash: DUP DUPSORT + Queue: INORDER + Recno: RENUMBER SNAPSHOT + + $int = $db->set_encrypt (const char *password, U32 flags) + $int = $db->set_lorder (int lorder) + $int = $db->set_bt_minkey (U32 minkey) + $int = $db->set_re_delim (int delim) + $int = $db->set_re_pad (int re_pad) + $int = $db->set_re_source (char *source) + $int = $db->set_re_len (U32 re_len) + $int = $db->set_h_ffactor (U32 h_ffactor) + $int = $db->set_h_nelem (U32 h_nelem) + $int = $db->set_q_extentsize (U32 extentsize) + + $dbc = $db->cursor (DB_TXN_ornull *txn = 0, U32 flags = 0) + flags: READ_COMMITTED READ_UNCOMMITTED WRITECURSOR TXN_SNAPSHOT + $seq = $db->sequence (U32 flags = 0) + +=head4 Example: + + my $db = db_create $env; + db_open $db, undef, "table", undef, BDB::BTREE, BDB::AUTO_COMMIT | BDB::CREATE | BDB::READ_UNCOMMITTED, 0600; + + for (1..1000) { + db_put $db, undef, "key $_", "data $_"; + + db_key_range $db, undef, "key $_", my $keyrange; + my ($lt, $eq, $gt) = @$keyrange; + } + + db_del $db, undef, "key $_" for 1..1000; + + db_sync $db; + + +=head3 DB_TXN/transaction methods + +Methods available on DB_TXN/$txn handles: + + DESTROY (DB_TXN_ornull *txn) + CODE: + if (txn) + txn->abort (txn); + + $int = 
$txn->set_timeout (NV timeout_seconds, U32 flags = SET_TXN_TIMEOUT) + flags: SET_LOCK_TIMEOUT SET_TXN_TIMEOUT + + $bool = $txn->failed + # see db_txn_finish documentation, above + + +=head3 DBC/cursor methods + +Methods available on DBC/$dbc handles: + + DESTROY (DBC_ornull *dbc) + CODE: + if (dbc) + dbc->c_close (dbc); + +=head4 Example: + + my $c = $db->cursor; + + for (;;) { + db_c_get $c, my $key, my $data, BDB::NEXT; + warn "<$!,$key,$data>"; + last if $!; + } + + db_c_close $c; + + +=head3 DB_SEQUENCE/sequence methods + +Methods available on DB_SEQUENCE/$seq handles: + + DESTROY (DB_SEQUENCE_ornull *seq) + CODE: + if (seq) + seq->close (seq, 0); + + $int = $seq->initial_value (db_seq_t value) + $int = $seq->set_cachesize (U32 size) + $int = $seq->set_flags (U32 flags) + flags: SEQ_DEC SEQ_INC SEQ_WRAP + $int = $seq->set_range (db_seq_t min, db_seq_t max) + +=head4 Example: + + my $seq = $db->sequence; + + db_sequence_open $seq, undef, "seq", BDB::CREATE; + db_sequence_get $seq, undef, 1, my $value; + + =head2 SUPPORT FUNCTIONS =head3 EVENT PROCESSING AND EVENT LOOP INTEGRATION @@ -145,17 +437,14 @@ For interactive programs, values such as C<0.01> to C<0.1> should be fine. -Example: Install an Event watcher that automatically calls +Example: Install an EV watcher that automatically calls BDB::poll_cb with low priority, to ensure that other parts of the -program get the CPU sometimes even under high AIO load. +program get the CPU sometimes even under high load. # try not to spend much more than 0.1s in poll_cb BDB::max_poll_time 0.1; - # use a low priority so other tasks have priority - Event->io (fd => BDB::poll_fileno, - poll => 'r', nice => 1, - cb => &BDB::poll_cb); + my $bdb_poll = EV::io BDB::poll_fileno, EV::READ, \&BDB::poll_cb; =item BDB::poll_wait @@ -177,7 +466,7 @@ =item BDB::flush -Wait till all outstanding AIO requests have been handled. +Wait till all outstanding BDB requests have been handled.
Strictly equivalent to: @@ -192,12 +481,12 @@ =item BDB::min_parallel $nthreads -Set the minimum number of AIO threads to C<$nthreads>. The current +Set the minimum number of BDB threads to C<$nthreads>. The current default is C<8>, which means eight asynchronous operations can execute concurrently at any one time (the number of outstanding requests, however, is unlimited). -BDB starts threads only on demand, when an AIO request is queued and +BDB starts threads only on demand, when a BDB request is queued and no free thread exists. Please note that queueing up a hundred requests can create demand for a hundred threads, even if it turns out that everything is in the cache and could have been processed faster by a single thread. @@ -212,7 +501,7 @@ =item BDB::max_parallel $nthreads -Sets the maximum number of AIO threads to C<$nthreads>. If more than the +Sets the maximum number of BDB threads to C<$nthreads>. If more than the specified number of threads are currently running, this function kills them. This function blocks until the limit is reached. @@ -325,7 +614,7 @@ This module should do "the right thing" when the process using it forks: -Before the fork, IO::AIO enters a quiescent state where no requests +Before the fork, BDB enters a quiescent state where no requests can be added in other threads and no results will be processed. After the fork the parent simply leaves the quiescent state and continues request/result processing, while the child frees the request/result queue @@ -334,7 +623,7 @@ parent process has been reached again. In short: the parent will, after a short pause, continue as if fork had -not been called, while the child will act as if IO::AIO has not been used +not been called, while the child will act as if BDB has not been used yet. =head2 MEMORY USAGE @@ -347,7 +636,7 @@ scalars and other data passed into aio requests will also be locked and will consume memory till the request has entered the done state.
-This is now awfully much, so queuing lots of requests is not usually a +This is not awfully much, so queuing lots of requests is not usually a problem. Per-thread usage: @@ -358,11 +647,16 @@ =head1 KNOWN BUGS -Known bugs will be fixed in the next release. +Known bugs will be fixed in the next release, except: + + If you use a transaction in any request, and the request returns + with an operating system error or DB_LOCK_NOTGRANTED, the internal + TXN_DEADLOCK flag will be set on the transaction. See C<db_txn_finish>, + above. =head1 SEE ALSO -L. +L, L. =head1 AUTHOR