| 1 |
package Tree::M; |
| 2 |
|
| 3 |
use Carp; |
| 4 |
use DynaLoader; |
| 5 |
|
| 6 |
BEGIN { |
| 7 |
$VERSION = 0.02; |
| 8 |
@ISA = qw(DynaLoader); |
| 9 |
bootstrap Tree::M, $VERSION; |
| 10 |
} |
| 11 |
|
| 12 |
=head1 NAME |
| 13 |
|
| 14 |
Tree::M - implement M-trees for efficient "metric/multimedia-searches" |
| 15 |
|
| 16 |
=head1 SYNOPSIS |
| 17 |
|
| 18 |
use Tree::M; |
| 19 |
|
| 20 |
$M = new Tree::M |
| 21 |
|
| 22 |
=head1 DESCRIPTION |
| 23 |
|
| 24 |
(not yet) |
| 25 |
|
| 26 |
Ever had the problem of managing multi-dimensional (spatial) data but your |
| 27 |
database only had one-dimensional indices (b-tree etc.)? Queries like |
| 28 |
|
| 29 |
select data from table where latitude > 40 and latitude < 50 |
| 30 |
and longitude> 50 and longitude< 60; |
| 31 |
|
| 32 |
are quite inefficient, unless longitude and latitude are part of the same |
| 33 |
spatial index (e.g. an R-tree). |
| 34 |
|
| 35 |
An M-tree is an index tree that does not directly look at the stored keys |
| 36 |
but rather requires a I<distance> (a metric, e.g. a vector norm) function |
| 37 |
to be defined that sorts keys according to their distance. In the example |
| 38 |
above the distance function could be the maximum norm (C<max(x1-x2, |
| 39 |
y1-y2)>). The lookup above would then be something like this: |
| 40 |
|
| 41 |
my $res = $M->range([45,55], 5); |
| 42 |
|
| 43 |
This module implements an M-tree. Although the data structure and the |
| 44 |
distance function is arbitrary, the current version only implements |
| 45 |
n-dimensional discrete vectors and hardwires the distance function to the |
| 46 |
suared euclidean metric (i.e. C<(x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2 + |
| 47 |
...>). Evolution towards more freedom is expected ;) |
| 48 |
|
| 49 |
=head2 THE Tree::M CLASS |
| 50 |
|
| 51 |
=over 4 |
| 52 |
|
| 53 |
=item $M = new Tree::M arg => value, ... |
| 54 |
|
| 55 |
Creates a new M-Tree. Before it can be used you have to call one of the |
| 56 |
C<create> or C<open> methods below. |
| 57 |
|
| 58 |
ndims => integer |
| 59 |
the number of dimensions each vector has |
| 60 |
|
| 61 |
range => [min, max, steps] |
| 62 |
min the lowest allowable scalar value in each dimension |
| 63 |
max the maximum allowable number |
| 64 |
steps the number of discrete steps (used when stored externally) |
| 65 |
|
| 66 |
pagesize => integer |
| 67 |
the size of one page on underlying storage. usually 4096, but |
| 68 |
large objects (ndims > 20 or so) might want to increase this |
| 69 |
|
| 70 |
Example: create an M-Tree that stores 8-bit rgb-values: |
| 71 |
|
| 72 |
$M = new Tree::M ndims => 3, range => [0, 255, 256]; |
| 73 |
|
| 74 |
Example: create an M-Tree that stores coordinates from -1..1 with 100 different steps: |
| 75 |
|
| 76 |
$M = new Tree::M ndims => 2, range => [-1, 1, 100]; |
| 77 |
|
| 78 |
=item $M->open(path) |
| 79 |
|
| 80 |
=item $M->create($path) |
| 81 |
|
| 82 |
Open or create the external storage file C<$path> and associate it with the tree. |
| 83 |
|
| 84 |
[this braindamaged API will go away ;)] |
| 85 |
|
| 86 |
=item $M->insert(\@v, $data) |
| 87 |
|
| 88 |
Insert a vector (given by an array reference) into the index and associate |
| 89 |
it with the value C<$data> (a 32-bit integer). |
| 90 |
|
| 91 |
=item $M->sync |
| 92 |
|
| 93 |
Synchronize the data file with memory. Useful after calling C<insert> to |
| 94 |
ensure the data actually reaches stable storage. |
| 95 |
|
| 96 |
=item $res = $M->range(\@v, $radius) |
| 97 |
|
| 98 |
Search all entries not farther away from C<@v> then C<$radius> and return |
| 99 |
an arrayref containing the searchresults. |
| 100 |
|
| 101 |
Each result is again anarrayref composed like this: |
| 102 |
|
| 103 |
[\@v, $data] |
| 104 |
|
| 105 |
e.g. the same as given to the C<insert> method. |
| 106 |
|
| 107 |
=item $res = $M->top(\@v, $n) |
| 108 |
|
| 109 |
Return the C<$n> "nearest neighbours". The results arrayref (see C<range>) |
| 110 |
contains the C<$n> index values nearest to C<@v>, sorted for distance. |
| 111 |
|
| 112 |
=item $distance = $M->distance(\@v1, \@v2) |
| 113 |
|
| 114 |
Calculcate the distance between two vectors, just as they databse engine |
| 115 |
would do it. |
| 116 |
|
| 117 |
=item $depth = $M->maxlevel |
| 118 |
|
| 119 |
Return the maximum height of the tree (usually a small integer specifying |
| 120 |
the length of the path from the root to the farthest leaf) |
| 121 |
|
| 122 |
=cut |
| 123 |
|
| 124 |
sub new { |
| 125 |
my $class = shift; |
| 126 |
my %a = @_; |
| 127 |
$class->_new( |
| 128 |
$a{ndims}, |
| 129 |
$a{range}[0], |
| 130 |
$a{range}[1], |
| 131 |
$a{range}[2], |
| 132 |
$a{pagesize}, |
| 133 |
); |
| 134 |
} |
| 135 |
|
| 136 |
=back |
| 137 |
|
| 138 |
=head1 BUGS |
| 139 |
|
| 140 |
Inserting too many duplicate keys into the tree cause the C++ library to |
| 141 |
die, so don't do that. |
| 142 |
|
| 143 |
=head1 AUTHOR |
| 144 |
|
| 145 |
Marc Lehmann <pcg@goof.com>. |
| 146 |
|
| 147 |
=head1 SEE ALSO |
| 148 |
|
| 149 |
perl(1), L<DBIx::SpatialKeys>. |
| 150 |
|
| 151 |
=cut |
| 152 |
|
| 153 |
1; |
| 154 |
|