… | |
… | |
2 | |
2 | |
3 | use Carp; |
3 | use Carp; |
4 | use DynaLoader; |
4 | use DynaLoader; |
5 | |
5 | |
6 | BEGIN { |
6 | BEGIN { |
7 | $VERSION = 0.01; |
7 | $VERSION = 0.02; |
8 | @ISA = qw(DynaLoader); |
8 | @ISA = qw(DynaLoader); |
9 | bootstrap Tree::M, $VERSION; |
9 | bootstrap Tree::M, $VERSION; |
10 | } |
10 | } |
11 | |
11 | |
12 | =head1 NAME |
12 | =head1 NAME |
13 | |
13 | |
14 | Tree::M - implement M-trees for efficient "multimedia-searched" |
14 | Tree::M - implement M-trees for efficient "metric/multimedia-searches" |
15 | |
15 | |
16 | =head1 SYNOPSIS |
16 | =head1 SYNOPSIS |
17 | |
17 | |
18 | use Tree::M; |
18 | use Tree::M; |
|
|
19 | |
|
|
20 | $M = new Tree::M |
19 | |
21 | |
20 | =head1 DESCRIPTION |
22 | =head1 DESCRIPTION |
21 | |
23 | |
22 | (not yet) |
24 | (not yet) |
23 | |
25 | |
24 | Ever had the problem of managing multi-dimensional (spatial) data but your |
26 | Ever had the problem of managing multi-dimensional (spatial) data but your |
25 | database only had one-dimensional indices (b-tree etc.)? Queries like |
27 | database only had one-dimensional indices (b-tree etc.)? Queries like |
26 | |
28 | |
27 | select data from table where latitude > 40 and latitude < 50 |
29 | select data from table where latitude > 40 and latitude < 50 |
28 | and longitude> 50 and longitude< 70; |
30 | and longitude> 50 and longitude< 60; |
29 | |
31 | |
30 | are quite inefficient, unless longitude and latitude are part of the same |
32 | are quite inefficient, unless longitude and latitude are part of the same |
31 | spatial index (e.g. a r-tree). |
33 | spatial index (e.g. an R-tree). |
32 | |
34 | |
33 | An M-tree etc.. etc... |
35 | An M-tree is an index tree that does not directly look at the stored keys |
|
|
36 | but rather requires a I<distance> (a metric, e.g. a vector norm) function |
|
|
37 | to be defined that sorts keys according to their distance. In the example |
|
|
38 | above the distance function could be the maximum norm (C<max(x1-x2, |
|
|
39 | y1-y2)>). The lookup above would then be something like this: |
|
|
40 | |
|
|
41 | my $res = $M->range([45,55], 5); |
|
|
42 | |
|
|
43 | This module implements an M-tree. Although the data structure and the |
|
|
44 | distance function is arbitrary, the current version only implements |
|
|
45 | n-dimensional discrete vectors and hardwires the distance function to the |
|
|
46 | suared euclidean metric (i.e. C<(x1-x2)**2 + (y1-y2)**2 + (z1-z2)**2 + |
|
|
47 | ...>). Evolution towards more freedom is expected ;) |
34 | |
48 | |
35 | =head2 THE Tree::M CLASS |
49 | =head2 THE Tree::M CLASS |
36 | |
50 | |
37 | =over 4 |
51 | =over 4 |
38 | |
52 | |
39 | =item $M = new Tree::M arg => value, ... |
53 | =item $M = new Tree::M arg => value, ... |
40 | |
54 | |
41 | ... |
55 | Creates a new M-Tree. Before it can be used you have to call one of the |
|
|
56 | C<create> or C<open> methods below. |
42 | |
57 | |
43 | distance => specify a distance function. the default distance |
58 | ndims => integer |
44 | method uses this function in some way. |
59 | the number of dimensions each vector has |
45 | keyof => the keyof method uses this argument to determine |
60 | |
46 | the key of an object |
61 | range => [min, max, steps] |
47 | min => the minimum number of objects stored in a node |
62 | min the lowest allowable scalar value in each dimension |
48 | max => the maximum number of objects stored in a node |
63 | max the maximum allowable number |
|
|
64 | steps the number of discrete steps (used when stored externally) |
|
|
65 | |
|
|
66 | pagesize => integer |
|
|
67 | the size of one page on underlying storage. usually 4096, but |
|
|
68 | large objects (ndims > 20 or so) might want to increase this |
|
|
69 | |
|
|
70 | Example: create an M-Tree that stores 8-bit rgb-values: |
|
|
71 | |
|
|
72 | $M = new Tree::M ndims => 3, range => [0, 255, 256]; |
|
|
73 | |
|
|
74 | Example: create an M-Tree that stores coordinates from -1..1 with 100 different steps: |
|
|
75 | |
|
|
76 | $M = new Tree::M ndims => 2, range => [-1, 1, 100]; |
|
|
77 | |
|
|
78 | =item $M->open(path) |
|
|
79 | |
|
|
80 | =item $M->create($path) |
|
|
81 | |
|
|
82 | Open or create the external storage file C<$path> and associate it with the tree. |
|
|
83 | |
|
|
84 | [this braindamaged API will go away ;)] |
|
|
85 | |
|
|
86 | =item $M->insert(\@v, $data) |
|
|
87 | |
|
|
88 | Insert a vector (given by an array reference) into the index and associate |
|
|
89 | it with the value C<$data> (a 32-bit integer). |
|
|
90 | |
|
|
91 | =item $M->sync |
|
|
92 | |
|
|
93 | Synchronize the data file with memory. Useful after calling C<insert> to |
|
|
94 | ensure the data actually reaches stable storage. |
|
|
95 | |
|
|
96 | =item $res = $M->range(\@v, $radius) |
|
|
97 | |
|
|
98 | Search all entries not farther away from C<@v> then C<$radius> and return |
|
|
99 | an arrayref containing the searchresults. |
|
|
100 | |
|
|
101 | Each result is again anarrayref composed like this: |
|
|
102 | |
|
|
103 | [\@v, $data] |
|
|
104 | |
|
|
105 | e.g. the same as given to the C<insert> method. |
|
|
106 | |
|
|
107 | =item $res = $M->top(\@v, $n) |
|
|
108 | |
|
|
109 | Return the C<$n> "nearest neighbours". The results arrayref (see C<range>) |
|
|
110 | contains the C<$n> index values nearest to C<@v>, sorted for distance. |
|
|
111 | |
|
|
112 | =item $distance = $M->distance(\@v1, \@v2) |
|
|
113 | |
|
|
114 | Calculcate the distance between two vectors, just as they databse engine |
|
|
115 | would do it. |
|
|
116 | |
|
|
117 | =item $depth = $M->maxlevel |
|
|
118 | |
|
|
119 | Return the maximum height of the tree (usually a small integer specifying |
|
|
120 | the length of the path from the root to the farthest leaf) |
49 | |
121 | |
50 | =cut |
122 | =cut |
51 | |
123 | |
|
|
124 | sub new { |
|
|
125 | my $class = shift; |
|
|
126 | my %a = @_; |
|
|
127 | $class->_new( |
|
|
128 | $a{ndims}, |
|
|
129 | $a{range}[0], |
|
|
130 | $a{range}[1], |
|
|
131 | $a{range}[2], |
|
|
132 | $a{pagesize}, |
|
|
133 | ); |
|
|
134 | } |
|
|
135 | |
52 | =back |
136 | =back |
|
|
137 | |
|
|
138 | =head1 BUGS |
|
|
139 | |
|
|
140 | Inserting too many duplicate keys into the tree cause the C++ library to |
|
|
141 | die, so don't do that. |
53 | |
142 | |
54 | =head1 AUTHOR |
143 | =head1 AUTHOR |
55 | |
144 | |
56 | Marc Lehmann <pcg@goof.com>. |
145 | Marc Lehmann <pcg@goof.com>. |
57 | |
146 | |