ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/Geo-LatLon2Place/bin/geo-latlon2place-makedb
Revision: 1.1
Committed: Mon Mar 14 02:41:52 2022 UTC (2 years, 2 months ago) by root
Branch: MAIN
CVS Tags: rel-0_01
Log Message:
*** empty log message ***

File Contents

# Content
1 #!/opt/bin/perl
2
3 # usage: geonames2cdb geonames.txt database.cdb
4 # you can use allCountries.zp, or cities500.zip and so on
5 # from https://download.geonames.org/export/dump/
6
7 =head1 NAME
8
9 geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place
10
11 =head1 SYNOPSIS
12
13 geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb
14
15 =head1 DESCRIPTION
16
17 abc
18
19 =head1 OPTIONS AND ARGUMENTS
20
21 geo-latlon2place-makedb requires two arguments: a text file with geo data
22 and the name of the database file to be written.
23
24 By default, the input file is considered to be in geonames gazetteer
25 format, but this can be customized using B<--extract>.
26
27 =over
28
29 =item B<--cellsize> km (default C<20>)
30
31 The (minimum) size of a single grid cell in km - the data is binned into
32 cells of at least this size. It should not be larger than the search
33 radius.
34
35 =item B<--extract> C<geonames>|C<geonames-postalcodes>|perl...
36
37 The extraction method: the default is C<geonames>, which expects a
38 geonames database (L<https://download.geonames.org/export/dump/>, for
39 example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts
40 I<placename, countrycode> strings from it.
41
42 The method C<geonames-postalcodes> (not yet implemented)
43 does the same, but for a geonames postal code database
44 L<https://download.geonames.org/export/zip>.
45
46 Lastly, you can specify a perl fragment that implements your own filtering
47 and extraction.
48
49 =back
50
51 =head1 FILTERING AND EXTRACTION
52
53 Record selection and the returned data are not fixed - you can filter your records
54 yourself and associate any (reasonably short, the maximum is a bit above 200 octets)
55 binary blob with a coordinate pair.
56
57 To do this, you have to provide a perl fragment that extracts latitude,
58 longitude, a weight and the associated data blob from an input line stored
59 in C<$_>. The file is opened using the C<:perlio> layer, so if your input
60 file is in UTF-8, so will be C<$_>.
61
62 For example, the following would expect an input file with space separated
63 latitude, longitude, weight and name, where name can contain spaces, which
64 is useful when you wat to provide your own input data:
65
66 geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output
67
68 A slighly more verbose example expecting only latitude, longitude and a
69 name would be:
70
71 geo-latlon2place-makedb --extract '
72 chomp;
73 my ($lat, $lon, $name) = split / /, 4;
74 ($lat, $lon, 1, $name)
75 ' input output
76
77 If you want to skip certain lines without adding anything to the database,
78 you can return nothing:
79
80 geo-latlon2place-makedb --extract '
81 chomp;
82 my ($lat, $lon, $name) = split / /;
83 return unless $lat < 0; # only add southern hemisphere points
84 ($lat, $lon, 1, $name)
85 ' input output
86
87 In general, the fragment should return either an empty list, or a
88 four-tuple with decimal latitude, longitude, a weight (integer 0..255)
89 and the binary data to be associated with the coordinate. Other than the
90 weight, these should be self-explaining. The weight is used during search
91 and will be multiplied to the square of the distance, and is used to make
92 larger cities win over small ones when the coordinate is somewhere between
93 them.
94
95 The C<geonames> filter looks similar to this fragment, which shows off
96 more possibilities:
97
98 my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
99
100 return if $t1 ne "P"; # only places
101
102 # minimum population 200, or it is a populated place with no known population
103 $pop => 200
104 or ($pop eq "" and $t2 eq "PPL")
105 or return;
106
107 # skip certain places we are not interested in
108 return if $t2 eq "PPLX"; # section of populated place
109 return if $t2 eq "PPLW"; # destroyed populated place
110 return if $t2 eq "PPLH"; # historical populated place
111 return if $t2 eq "PPLQ"; # abandoned populated place
112
113 # geonames has a lot of very long place names which aren't
114 # actually place names, so ignore very long names
115 60 > length $name
116 or return;
117
118 # we estimate a weight by dividing 25 by the radius of the place,
119 # which we get by assuming a fixed population density of 5000 people/kmĀ²,
120 # which is almost always a considerable over-estimate.
121 # 25 and 5000 are pretty much made-up, feel free to improve and
122 # send me the results.
123 my $w = 25 / (1 + sqrt $pop / 5000);
124
125 # administrative centers get a fixed low weight
126 if ($t2 =~ /^PPLA(\d*)/) {
127 $w = $1 || 1;
128 }
129
130 ($lat, $lon, $w, "$name, $cc")
131
132 =head1 AUTHOR
133
134 Marc Lehmann <schmorp@schmorp.de>
135
136 =cut
137
138 use common::sense;
139 use Getopt::Long;
140 use Geo::LatLon2Place ();
141 use Pod::Usage ();
142
143 GetOptions
144 "help|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 },
145 "extract=s" => \my $extract,
146 "cellsize=i" => \my $km,
147 or Pod::Usage::pod2usage
148 -exitval => 1,
149 ;
150
151 @ARGV == 2
152 or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n";
153
154 $km ||= 20;
155 $extract ||= "geonames";
156
157 if ($extract eq "geonames") {
158 $extract = sub {
159 my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
160
161 return if $t1 ne "P"; # only places
162
163 $pop => 200
164 or ($pop eq "" and $t2 eq "PPL")
165 or return;
166
167 return if $t2 eq "PPLX"; # section of populated place
168 return if $t2 eq "PPLW"; # destroyed populated place
169 return if $t2 eq "PPLH"; # historical populated place
170 return if $t2 eq "PPLQ"; # abandoned populated place
171
172 # geonames has a lot of very long place names which aren't
173 # actually place names, so ignore very long names
174 60 > length $name
175 or return;
176
177 my $w = 25 / (1 + sqrt $pop / 5000);
178
179 if ($t2 =~ /^PPLA(\d+)/) {
180 $w = $1 || 1;
181 }
182
183 ($lat, $lon, $w, "$name, $cc")
184 };
185 } elsif ($extract eq "geonames-postalcodes") {
186 $extract = sub {
187 my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/;
188
189 ($lat, $lon, 1, "$zip $name, $cc")
190 };
191 } else {
192 $extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }";
193 die "$@" if $@;
194 }
195
196 my $torad = (atan2 1,0) / 180;
197
198 my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size
199
200 open my $fh, "<:perlio", $ARGV[0];
201
202 my @grid;
203
204 while (<$fh>) {
205 my ($lat, $lon, $w, $payload) = $extract->()
206 or next;
207
208 unless (255 - (2+2+1) >= length $payload) {
209 $payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge;
210 warn "payload too long, skipping: $payload\n";
211 next;
212 }
213
214 # higher latitudes get a (much) lower resolution, so we always get at least $km cells
215 my $blat = int $boxes * cos abs $lat * $torad;
216 my $x = int (($lon + 180) * $blat / 360);
217 my $y = int (($lat + 90) * $boxes / 180);
218
219 # we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC
220 push @{ $grid[$y][$x] }, pack "C/a*", pack "s< s< C a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload;
221 }
222
223 my ($max, $sum, $cnt);
224
225 open my $cdb, ">", $ARGV[1]
226 or die;
227 Geo::LatLon2Place::cdb_make_start fileno $cdb
228 and die "cdb_make_start failure";
229
230 Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 1, $km, $boxes;
231
232 for my $y (0 .. $#grid) {
233 my $r = $grid[$y];
234 for my $x (0 .. $#$r) {
235 my $c = $r->[$x]
236 or next;
237
238 # statistics
239 $sum += scalar @$c;
240 $cnt++;
241 $max = [(scalar @$c), $x, $y] if @$c > $max->[0];
242
243 Geo::LatLon2Place::cdb_make_add
244 +(pack "s< s<", $x, $y),
245 +(join "", @$c)
246 and "cdb_make_add failure";
247 }
248 }
249
250 Geo::LatLon2Place::cdb_make_finish
251 and die "cdb_make_finish failed";
252 close $cdb;
253
254 $max->[2] = $max->[2] * 180 / $boxes - 90;
255 my $blat = int $boxes * cos abs $max->[2] * $torad;
256 $max->[1] = $max->[1] * 360 / $blat - 180;
257 print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n";
258