1 | #!/opt/bin/perl |
1 | #!/opt/bin/perl |
2 | |
|
|
3 | # usage: geonames2cdb geonames.txt database.cdb |
|
|
4 | # you can use allCountries.zp, or cities500.zip and so on |
|
|
5 | # from https://download.geonames.org/export/dump/ |
|
|
6 | |
2 | |
7 | =head1 NAME |
3 | =head1 NAME |
8 | |
4 | |
9 | geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place |
5 | geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place |
10 | |
6 | |
… | |
… | |
24 | By default, the input file is considered to be in geonames gazetteer |
20 | By default, the input file is considered to be in geonames gazetteer |
25 | format, but this can be customized using B<--extract>. |
21 | format, but this can be customized using B<--extract>. |
26 | |
22 | |
27 | =over |
23 | =over |
28 | |
24 | |
29 | =item B<--cellsize> km (default C<20>) |
25 | =item B<--cellsize> km (default C<20>, or C<10> for geonames-postalcodes) |
30 | |
26 | |
31 | The (minimum) size of a single grid cell in km - the data is binned into |
27 | The (minimum) size of a single grid cell in km - the data is binned into |
32 | cells of at least this size. It should not be larger than the search |
28 | cells of at least this size. It should not be larger than the search |
33 | radius. |
29 | radius. |
34 | |
30 | |
… | |
… | |
162 | ; |
158 | ; |
163 | |
159 | |
164 | @ARGV == 2 |
160 | @ARGV == 2 |
165 | or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n"; |
161 | or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n"; |
166 | |
162 | |
167 | $km ||= 20; |
|
|
168 | $extract ||= "geonames"; |
163 | $extract ||= "geonames"; |
169 | |
164 | |
170 | if ($extract eq "geonames") { |
165 | if ($extract eq "geonames") { |
171 | $extract = sub { |
166 | $extract = sub { |
172 | my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
167 | my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
… | |
… | |
194 | } |
189 | } |
195 | |
190 | |
196 | ($lat, $lon, $w, "$name, $cc") |
191 | ($lat, $lon, $w, "$name, $cc") |
197 | }; |
192 | }; |
198 | } elsif ($extract eq "geonames-postalcodes") { |
193 | } elsif ($extract eq "geonames-postalcodes") { |
|
|
194 | $km ||= 10; |
199 | $extract = sub { |
195 | $extract = sub { |
200 | my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/; |
196 | my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/; |
201 | |
197 | |
202 | ($lat, $lon, 1, "$zip $name, $cc") |
198 | ($lat, $lon, 1, "$zip $name, $cc") |
203 | }; |
199 | }; |
204 | } else { |
200 | } else { |
205 | $extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }"; |
201 | $extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }"; |
206 | die "$@" if $@; |
202 | die "$@" if $@; |
207 | } |
203 | } |
208 | |
204 | |
|
|
205 | $km ||= 20; |
|
|
206 | |
209 | my $torad = (atan2 1,0) / 180; |
207 | my $torad = (atan2 1,0) / 90; |
210 | |
208 | |
211 | my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size |
209 | my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size |
212 | |
210 | |
213 | open my $fh, "<:perlio", $ARGV[0]; |
211 | open my $fh, "<:perlio", $ARGV[0]; |
214 | |
212 | |
… | |
… | |
216 | |
214 | |
217 | while (<$fh>) { |
215 | while (<$fh>) { |
218 | my ($lat, $lon, $w, $payload) = $extract->() |
216 | my ($lat, $lon, $w, $payload) = $extract->() |
219 | or next; |
217 | or next; |
220 | |
218 | |
221 | unless (255 - (2+2+1) >= length $payload) { |
219 | unless (255 >= length $payload) { |
222 | $payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge; |
220 | $payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge; |
223 | warn "payload too long, skipping: $payload\n"; |
221 | warn "payload too long, skipping: $payload\n"; |
224 | next; |
222 | next; |
225 | } |
223 | } |
226 | |
224 | |
227 | # higher latitudes get a (much) lower resolution, so we always get at least $km cells |
225 | my $blat = int $boxes * cos $lat * $torad; # can be 0, but does not matter |
228 | my $blat = int $boxes * cos abs $lat * $torad; |
|
|
229 | my $x = int (($lon + 180) * $blat / 360); |
226 | my $x = int (($lon + 180) * $blat / 360); |
230 | my $y = int (($lat + 90) * $boxes / 180); |
227 | my $y = int (($lat + 90) * $boxes / 180); |
231 | |
228 | |
232 | # we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC |
229 | # we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC |
233 | push @{ $grid[$y][$x] }, pack "C/a*", pack "s< s< C a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload; |
230 | push @{ $grid[$y][$x] }, pack "s< s< C C/a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload; |
234 | } |
231 | } |
235 | |
232 | |
236 | my ($max, $sum, $cnt); |
233 | my ($max, $sum, $cnt); |
237 | |
234 | |
238 | open my $cdb, ">", $ARGV[1] |
235 | open my $cdb, ">", $ARGV[1] |
239 | or die; |
236 | or die; |
240 | Geo::LatLon2Place::cdb_make_start fileno $cdb |
237 | Geo::LatLon2Place::cdb_make_start fileno $cdb |
241 | and die "cdb_make_start failure"; |
238 | and die "cdb_make_start failure"; |
242 | |
239 | |
243 | Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 1, $km, $boxes; |
240 | Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 2, $km, $boxes; |
244 | |
241 | |
245 | for my $y (0 .. $#grid) { |
242 | for my $y (0 .. $#grid) { |
246 | my $r = $grid[$y]; |
243 | my $r = $grid[$y]; |
247 | for my $x (0 .. $#$r) { |
244 | for my $x (0 .. $#$r) { |
248 | my $c = $r->[$x] |
245 | my $c = $r->[$x] |
… | |
… | |
263 | Geo::LatLon2Place::cdb_make_finish |
260 | Geo::LatLon2Place::cdb_make_finish |
264 | and die "cdb_make_finish failed"; |
261 | and die "cdb_make_finish failed"; |
265 | close $cdb; |
262 | close $cdb; |
266 | |
263 | |
267 | $max->[2] = $max->[2] * 180 / $boxes - 90; |
264 | $max->[2] = $max->[2] * 180 / $boxes - 90; |
268 | my $blat = int $boxes * cos abs $max->[2] * $torad; |
265 | my $blat = int $boxes * cos $max->[2] * $torad; |
269 | $max->[1] = $max->[1] * 360 / $blat - 180; |
266 | $max->[1] = $max->[1] * 360 / $blat - 180; |
270 | print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n"; |
267 | print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n"; |
271 | |
268 | |