1 |
#!/opt/bin/perl |
2 |
|
3 |
=head1 NAME |
4 |
|
5 |
geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place |
6 |
|
7 |
=head1 SYNOPSIS |
8 |
|
9 |
geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb |
10 |
|
11 |
=head1 DESCRIPTION |
12 |
|
13 |
abc |
14 |
|
15 |
=head1 OPTIONS AND ARGUMENTS |
16 |
|
17 |
geo-latlon2place-makedb requires two arguments: a text file with geo data |
18 |
and the name of the database file to be written. |
19 |
|
20 |
By default, the input file is considered to be in geonames gazetteer |
21 |
format, but this can be customized using B<--extract>. |
22 |
|
23 |
=over |
24 |
|
25 |
=item B<--cellsize> km (default C<20>, or C<10> for geonames-postalcodes) |
26 |
|
27 |
The (minimum) size of a single grid cell in km - the data is binned into |
28 |
cells of at least this size. It should not be larger than the search |
29 |
radius. |
30 |
|
31 |
=item B<--extract> C<geonames>|C<geonames-postalcodes>|perl... |
32 |
|
33 |
The extraction method: the default is C<geonames>, which expects a |
34 |
geonames database (L<https://download.geonames.org/export/dump/>, for |
35 |
example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts |
36 |
I<placename, countrycode> strings from it. |
37 |
|
38 |
The method C<geonames-postalcodes> does the same, but for a geonames |
39 |
postal code database L<https://download.geonames.org/export/zip>, and |
40 |
extracts C<zip name, countrycopde> strings. |
41 |
|
42 |
Lastly, you can specify a perl fragment that implements your own filtering |
43 |
and extraction. |
44 |
|
45 |
=back |
46 |
|
47 |
=head1 FILTERING AND EXTRACTION |
48 |
|
49 |
Record selection and the returned data are not fixed - you can filter your records |
50 |
yourself and associate any (reasonably short, the maximum is a bit above 200 octets) |
51 |
binary blob with a coordinate pair. |
52 |
|
53 |
To do this, you have to provide a perl fragment that extracts latitude, |
54 |
longitude, a weight and the associated data blob from an input line stored |
55 |
in C<$_>. The file is opened using the C<:perlio> layer, so if your input |
56 |
file is in UTF-8, so will be C<$_>. |
57 |
|
58 |
For example, the following would expect an input file with space separated |
59 |
latitude, longitude, weight and name, where name can contain spaces, which |
60 |
is useful when you want to provide your own input data: |
61 |
|
62 |
geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output |
63 |
|
64 |
A slighly more verbose example expecting only latitude, longitude and a |
65 |
name would be: |
66 |
|
67 |
geo-latlon2place-makedb --extract ' |
68 |
chomp; |
69 |
my ($lat, $lon, $name) = split / /, 4; |
70 |
($lat, $lon, 1, $name) |
71 |
' input output |
72 |
|
73 |
If you want to skip certain lines without adding anything to the database, |
74 |
you can return nothing: |
75 |
|
76 |
geo-latlon2place-makedb --extract ' |
77 |
chomp; |
78 |
my ($lat, $lon, $name) = split / /; |
79 |
return unless $lat < 0; # only add southern hemisphere points |
80 |
($lat, $lon, 1, $name) |
81 |
' input output |
82 |
|
83 |
In general, the fragment should return either an empty list, or a |
84 |
four-tuple with decimal latitude, longitude, a weight (integer 0..255) |
85 |
and the binary data to be associated with the coordinate. Other than the |
86 |
weight, these should be self-explaining. The weight is used during search |
87 |
and will be multiplied to the square of the distance, and is used to make |
88 |
larger cities win over small ones when the coordinate is somewhere between |
89 |
them. |
90 |
|
91 |
The standard extractors (C<geonames> and C<geonames-postalcodes>) provide |
92 |
a UTF-8-encoded string as blob, but any binary data will do, for example, |
93 |
if you want to associate your coordinate pairs with some short-ish |
94 |
integer codes, you could do this: |
95 |
|
96 |
geo-latlon2place-makedb --extract ' |
97 |
chomp; |
98 |
my ($lat, $lon, $id) = split / /, 4; |
99 |
($lat, $lon, 1, pack "w", $id) |
100 |
' input output |
101 |
|
102 |
And later use C<unpack "w"> on the data returned by C<lookup>. |
103 |
|
104 |
The C<geonames> filter looks similar to the following fragment, which |
105 |
shows off some more filtering possibilities: |
106 |
|
107 |
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
108 |
|
109 |
return if $t1 ne "P"; # only places |
110 |
|
111 |
# minimum population 200, or it is a populated place with no known population |
112 |
$pop => 200 |
113 |
or ($pop eq "" and $t2 eq "PPL") |
114 |
or return; |
115 |
|
116 |
# skip certain places we are not interested in |
117 |
return if $t2 eq "PPLX"; # section of populated place |
118 |
return if $t2 eq "PPLW"; # destroyed populated place |
119 |
return if $t2 eq "PPLH"; # historical populated place |
120 |
return if $t2 eq "PPLQ"; # abandoned populated place |
121 |
|
122 |
# geonames has a lot of very long place names which aren't |
123 |
# actually place names, so ignore very long names |
124 |
60 > length $name |
125 |
or return; |
126 |
|
127 |
# we estimate a weight by dividing 25 by the radius of the place, |
128 |
# which we get by assuming a fixed population density of 5000 # people |
129 |
# per square km, # which is almost always a considerable over-estimate. |
130 |
# 25 and 5000 are pretty much made-up, feel free to improve and |
131 |
# send me the results. |
132 |
my $w = 25 / (1 + sqrt $pop / 5000); |
133 |
|
134 |
# administrative centers get a fixed low weight |
135 |
if ($t2 =~ /^PPLA(\d*)/) { |
136 |
$w = $1 || 1; |
137 |
} |
138 |
|
139 |
($lat, $lon, $w, "$name, $cc") |
140 |
|
141 |
=head1 AUTHOR |
142 |
|
143 |
Marc Lehmann <schmorp@schmorp.de> |
144 |
|
145 |
=cut |
146 |
|
147 |
use common::sense; |
148 |
use Getopt::Long; |
149 |
use Geo::LatLon2Place (); |
150 |
use Pod::Usage (); |
151 |
|
152 |
GetOptions |
153 |
"help|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 }, |
154 |
"extract=s" => \my $extract, |
155 |
"cellsize=i" => \my $km, |
156 |
or Pod::Usage::pod2usage |
157 |
-exitval => 1, |
158 |
; |
159 |
|
160 |
@ARGV == 2 |
161 |
or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n"; |
162 |
|
163 |
$extract ||= "geonames"; |
164 |
|
165 |
if ($extract eq "geonames") { |
166 |
$extract = sub { |
167 |
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
168 |
|
169 |
return if $t1 ne "P"; # only places |
170 |
|
171 |
$pop => 200 |
172 |
or ($pop eq "" and $t2 eq "PPL") |
173 |
or return; |
174 |
|
175 |
return if $t2 eq "PPLX"; # section of populated place |
176 |
return if $t2 eq "PPLW"; # destroyed populated place |
177 |
return if $t2 eq "PPLH"; # historical populated place |
178 |
return if $t2 eq "PPLQ"; # abandoned populated place |
179 |
|
180 |
# geonames has a lot of very long place names which aren't |
181 |
# actually place names, so ignore very long names |
182 |
60 > length $name |
183 |
or return; |
184 |
|
185 |
my $w = 25 / (1 + sqrt $pop / 5000); |
186 |
|
187 |
if ($t2 =~ /^PPLA(\d+)/) { |
188 |
$w = $1 || 1; |
189 |
} |
190 |
|
191 |
($lat, $lon, $w, "$name, $cc") |
192 |
}; |
193 |
} elsif ($extract eq "geonames-postalcodes") { |
194 |
$km ||= 10; |
195 |
$extract = sub { |
196 |
my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/; |
197 |
|
198 |
# https://buttermaidbakery.com/pages/what-is-an-apo-fpo-dpo-address |
199 |
return if $name =~ /^[AFD]PO [A-Z][A-Z]\z/ and $cc eq "US"; |
200 |
|
201 |
($lat, $lon, 1, "$zip $name, $cc") |
202 |
}; |
203 |
} else { |
204 |
$extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }"; |
205 |
die "$@" if $@; |
206 |
} |
207 |
|
208 |
$km ||= 20; |
209 |
|
210 |
my $torad = (atan2 1,0) / 90; |
211 |
|
212 |
my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size |
213 |
|
214 |
open my $fh, "<:perlio", $ARGV[0] |
215 |
or die "$ARGV[0]: $!\n"; |
216 |
|
217 |
my @grid; |
218 |
|
219 |
while (<$fh>) { |
220 |
my ($lat, $lon, $w, $payload) = $extract->() |
221 |
or next; |
222 |
|
223 |
unless (255 >= length $payload) { |
224 |
$payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge; |
225 |
warn "payload too long, skipping: $payload\n"; |
226 |
next; |
227 |
} |
228 |
|
229 |
my $blat = int $boxes * cos $lat * $torad; # can be 0, but does not matter |
230 |
my $x = int (($lon + 180) * $blat / 360); |
231 |
my $y = int (($lat + 90) * $boxes / 180); |
232 |
|
233 |
# we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC |
234 |
push @{ $grid[$y][$x] }, pack "s< s< C C/a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload; |
235 |
} |
236 |
|
237 |
############################################################################# |
238 |
# write gridded data out |
239 |
|
240 |
open my $cdb, ">", $ARGV[1] |
241 |
or die "$ARGV[1]: $!\n"; |
242 |
Geo::LatLon2Place::cdb_make_start fileno $cdb |
243 |
and die "cdb_make_start failure"; |
244 |
|
245 |
Geo::LatLon2Place::cdb_make_add "", pack "a4VVVV", "SRGL", 2, $km, $boxes, time; |
246 |
|
247 |
############################################################################# |
248 |
# now we walk the grid using a hilbert curve to increase locality when querying |
249 |
|
250 |
my ($max, $sum, $cnt); # statistics |
251 |
|
252 |
my ($x, $y) = (-1, 0); |
253 |
|
254 |
sub step { # move one step into $_[0] direction: 0 x++, 1 y++, 2 x--, 3 y-- |
255 |
($_[0] & 1 ? $y : $x) += 1 - ($_[0] & 2); |
256 |
|
257 |
# write cell at $x,$y, if any |
258 |
my $c = $grid[$y][$x] |
259 |
or return; |
260 |
|
261 |
undef $grid[$y][$x]; # for paranoia check |
262 |
|
263 |
# statistics |
264 |
$sum += scalar @$c; |
265 |
$cnt++; |
266 |
$max = [(scalar @$c), $x, $y] if @$c > $max->[0]; |
267 |
|
268 |
Geo::LatLon2Place::cdb_make_add |
269 |
+(pack "s< s<", $x, $y), |
270 |
+(join "", @$c) |
271 |
and "cdb_make_add failure"; |
272 |
} |
273 |
|
274 |
sub hilbert; |
275 |
sub hilbert { # order dir rot |
276 |
my $order = $_[0] >> 1 |
277 |
or return; |
278 |
|
279 |
hilbert $order, $_[1] + $_[2], -$_[2]; step $_[1] + $_[2]; |
280 |
hilbert $order, $_[1] , $_[2]; step $_[1] ; |
281 |
hilbert $order, $_[1] , $_[2]; step $_[1] - $_[2]; |
282 |
hilbert $order, $_[1] - $_[2], -$_[2]; |
283 |
} |
284 |
|
285 |
step 0; # move to 0,0 |
286 |
hilbert @grid * 2, 0, 1; |
287 |
|
288 |
# paranoia-check, make sure we wrote out all cells |
289 |
for (@grid) { |
290 |
grep $_, @$_ |
291 |
and die; |
292 |
} |
293 |
|
294 |
############################################################################# |
295 |
|
296 |
Geo::LatLon2Place::cdb_make_finish |
297 |
and die "cdb_make_finish failed"; |
298 |
close $cdb; |
299 |
|
300 |
$max->[2] = $max->[2] * 180 / $boxes - 90; |
301 |
my $blat = int $boxes * cos $max->[2] * $torad; |
302 |
$max->[1] = $max->[1] * 360 / $blat - 180; |
303 |
print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n"; |
304 |
|