1 |
package Tree::M; |
2 |
|
3 |
use Carp; |
4 |
use DynaLoader; |
5 |
|
6 |
BEGIN { |
7 |
$VERSION = 0.02; |
8 |
@ISA = qw(DynaLoader); |
9 |
bootstrap Tree::M, $VERSION; |
10 |
} |
11 |
|
12 |
=head1 NAME |
13 |
|
14 |
Tree::M - implement M-trees for efficient "metric/multimedia-searches" |
15 |
|
16 |
=head1 SYNOPSIS |
17 |
|
18 |
use Tree::M; |
19 |
|
20 |
$M = new Tree::M |
21 |
|
22 |
=head1 DESCRIPTION |
23 |
|
24 |
(not yet) |
25 |
|
26 |
Ever had the problem of managing multi-dimensional (spatial) data but your |
27 |
database only had one-dimensional indices (b-tree etc.)? Queries like |
28 |
|
29 |
select data from table where latitude > 40 and latitude < 50 |
30 |
and longitude> 50 and longitude< 60; |
31 |
|
32 |
are quite inefficient, unless longitude and latitude are part of the same |
33 |
spatial index (e.g. an R-tree). |
34 |
|
35 |
An M-tree is an index tree that does not directly look at the stored keys |
36 |
but rather requires a I<distance> (a metric, e.g. a vector norm) function |
37 |
to be defined that sorts keys according to their distance. In the example |
38 |
above the distance function could be the maximum norm (C<max(x1-x2, |
39 |
y1-y2)>). The lookup above would then be something like this: |
40 |
|
41 |
my $res = $M->range([45,55], 5); |
42 |
|
43 |
This module implements an M-tree. Although the data structure and the |
44 |
distance function is arbitrary, the current version only implements |
45 |
n-dimensional discrete vectors and hardwires the distance function to the |
46 |
suared euclidean metric (i.e. C<(x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2 + |
47 |
...>). Evolution towards more freedom is expected ;) |
48 |
|
49 |
=head2 THE Tree::M CLASS |
50 |
|
51 |
=over 4 |
52 |
|
53 |
=item $M = new Tree::M arg => value, ... |
54 |
|
55 |
Creates a new M-Tree. Before it can be used you have to call one of the |
56 |
C<create> or C<open> methods below. |
57 |
|
58 |
ndims => integer |
59 |
the number of dimensions each vector has |
60 |
|
61 |
range => [min, max, steps] |
62 |
min the lowest allowable scalar value in each dimension |
63 |
max the maximum allowable number |
64 |
steps the number of discrete steps (used when stored externally) |
65 |
|
66 |
pagesize => integer |
67 |
the size of one page on underlying storage. usually 4096, but |
68 |
large objects (ndims > 20 or so) might want to increase this |
69 |
|
70 |
Example: create an M-Tree that stores 8-bit rgb-values: |
71 |
|
72 |
$M = new Tree::M ndims => 3, range => [0, 255, 256]; |
73 |
|
74 |
Example: create an M-Tree that stores coordinates from -1..1 with 100 different steps: |
75 |
|
76 |
$M = new Tree::M ndims => 2, range => [-1, 1, 100]; |
77 |
|
78 |
=item $M->open(path) |
79 |
|
80 |
=item $M->create($path) |
81 |
|
82 |
Open or create the external storage file C<$path> and associate it with the tree. |
83 |
|
84 |
[this braindamaged API will go away ;)] |
85 |
|
86 |
=item $M->insert(\@v, $data) |
87 |
|
88 |
Insert a vector (given by an array reference) into the index and associate |
89 |
it with the value C<$data> (a 32-bit integer). |
90 |
|
91 |
=item $M->sync |
92 |
|
93 |
Synchronize the data file with memory. Useful after calling C<insert> to |
94 |
ensure the data actually reaches stable storage. |
95 |
|
96 |
=item $res = $M->range(\@v, $radius) |
97 |
|
98 |
Search all entries not farther away from C<@v> then C<$radius> and return |
99 |
an arrayref containing the searchresults. |
100 |
|
101 |
Each result is again anarrayref composed like this: |
102 |
|
103 |
[\@v, $data] |
104 |
|
105 |
e.g. the same as given to the C<insert> method. |
106 |
|
107 |
=item $res = $M->top(\@v, $n) |
108 |
|
109 |
Return the C<$n> "nearest neighbours". The results arrayref (see C<range>) |
110 |
contains the C<$n> index values nearest to C<@v>, sorted for distance. |
111 |
|
112 |
=item $distance = $M->distance(\@v1, \@v2) |
113 |
|
114 |
Calculcate the distance between two vectors, just as they databse engine |
115 |
would do it. |
116 |
|
117 |
=item $depth = $M->maxlevel |
118 |
|
119 |
Return the maximum height of the tree (usually a small integer specifying |
120 |
the length of the path from the root to the farthest leaf) |
121 |
|
122 |
=cut |
123 |
|
124 |
sub new { |
125 |
my $class = shift; |
126 |
my %a = @_; |
127 |
$class->_new( |
128 |
$a{ndims}, |
129 |
$a{range}[0], |
130 |
$a{range}[1], |
131 |
$a{range}[2], |
132 |
$a{pagesize}, |
133 |
); |
134 |
} |
135 |
|
136 |
=back |
137 |
|
138 |
=head1 BUGS |
139 |
|
140 |
Inserting too many duplicate keys into the tree cause the C++ library to |
141 |
die, so don't do that. |
142 |
|
143 |
=head1 AUTHOR |
144 |
|
145 |
Marc Lehmann <pcg@goof.com>. |
146 |
|
147 |
=head1 SEE ALSO |
148 |
|
149 |
perl(1), L<DBIx::SpatialKeys>. |
150 |
|
151 |
=cut |
152 |
|
153 |
1; |
154 |
|