1 |
#!/opt/bin/perl |
2 |
|
3 |
# usage: geonames2cdb geonames.txt database.cdb |
4 |
# you can use allCountries.zp, or cities500.zip and so on |
5 |
# from https://download.geonames.org/export/dump/ |
6 |
|
7 |
=head1 NAME |
8 |
|
9 |
geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place |
10 |
|
11 |
=head1 SYNOPSIS |
12 |
|
13 |
geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb |
14 |
|
15 |
=head1 DESCRIPTION |
16 |
|
17 |
abc |
18 |
|
19 |
=head1 OPTIONS AND ARGUMENTS |
20 |
|
21 |
geo-latlon2place-makedb requires two arguments: a text file with geo data |
22 |
and the name of the database file to be written. |
23 |
|
24 |
By default, the input file is considered to be in geonames gazetteer |
25 |
format, but this can be customized using B<--extract>. |
26 |
|
27 |
=over |
28 |
|
29 |
=item B<--cellsize> km (default C<20>) |
30 |
|
31 |
The (minimum) size of a single grid cell in km - the data is binned into |
32 |
cells of at least this size. It should not be larger than the search |
33 |
radius. |
34 |
|
35 |
=item B<--extract> C<geonames>|C<geonames-postalcodes>|perl... |
36 |
|
37 |
The extraction method: the default is C<geonames>, which expects a |
38 |
geonames database (L<https://download.geonames.org/export/dump/>, for |
39 |
example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts |
40 |
I<placename, countrycode> strings from it. |
41 |
|
42 |
The method C<geonames-postalcodes> does the same, but for a geonames |
43 |
postal code database L<https://download.geonames.org/export/zip>, and |
44 |
extracts C<zip name, countrycopde> strings. |
45 |
|
46 |
Lastly, you can specify a perl fragment that implements your own filtering |
47 |
and extraction. |
48 |
|
49 |
=back |
50 |
|
51 |
=head1 FILTERING AND EXTRACTION |
52 |
|
53 |
Record selection and the returned data are not fixed - you can filter your records |
54 |
yourself and associate any (reasonably short, the maximum is a bit above 200 octets) |
55 |
binary blob with a coordinate pair. |
56 |
|
57 |
To do this, you have to provide a perl fragment that extracts latitude, |
58 |
longitude, a weight and the associated data blob from an input line stored |
59 |
in C<$_>. The file is opened using the C<:perlio> layer, so if your input |
60 |
file is in UTF-8, so will be C<$_>. |
61 |
|
62 |
For example, the following would expect an input file with space separated |
63 |
latitude, longitude, weight and name, where name can contain spaces, which |
64 |
is useful when you want to provide your own input data: |
65 |
|
66 |
geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output |
67 |
|
68 |
A slighly more verbose example expecting only latitude, longitude and a |
69 |
name would be: |
70 |
|
71 |
geo-latlon2place-makedb --extract ' |
72 |
chomp; |
73 |
my ($lat, $lon, $name) = split / /, 4; |
74 |
($lat, $lon, 1, $name) |
75 |
' input output |
76 |
|
77 |
If you want to skip certain lines without adding anything to the database, |
78 |
you can return nothing: |
79 |
|
80 |
geo-latlon2place-makedb --extract ' |
81 |
chomp; |
82 |
my ($lat, $lon, $name) = split / /; |
83 |
return unless $lat < 0; # only add southern hemisphere points |
84 |
($lat, $lon, 1, $name) |
85 |
' input output |
86 |
|
87 |
In general, the fragment should return either an empty list, or a |
88 |
four-tuple with decimal latitude, longitude, a weight (integer 0..255) |
89 |
and the binary data to be associated with the coordinate. Other than the |
90 |
weight, these should be self-explaining. The weight is used during search |
91 |
and will be multiplied to the square of the distance, and is used to make |
92 |
larger cities win over small ones when the coordinate is somewhere between |
93 |
them. |
94 |
|
95 |
The standard extractors (C<geonames> and C<geonames-postalcodes>) provide |
96 |
a UTF-8-encoded string as blob, but any binary data will do, for example, |
97 |
if you want to associate your coordinate pairs with some short-ish |
98 |
integer codes, you could do this: |
99 |
|
100 |
geo-latlon2place-makedb --extract ' |
101 |
chomp; |
102 |
my ($lat, $lon, $id) = split / /, 4; |
103 |
($lat, $lon, 1, pack "w", $id) |
104 |
' input output |
105 |
|
106 |
And later use C<unpack "w"> on the data returned by C<lookup>. |
107 |
|
108 |
The C<geonames> filter looks similar to the following fragment, which |
109 |
shows off some more filtering possibilities: |
110 |
|
111 |
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
112 |
|
113 |
return if $t1 ne "P"; # only places |
114 |
|
115 |
# minimum population 200, or it is a populated place with no known population |
116 |
$pop => 200 |
117 |
or ($pop eq "" and $t2 eq "PPL") |
118 |
or return; |
119 |
|
120 |
# skip certain places we are not interested in |
121 |
return if $t2 eq "PPLX"; # section of populated place |
122 |
return if $t2 eq "PPLW"; # destroyed populated place |
123 |
return if $t2 eq "PPLH"; # historical populated place |
124 |
return if $t2 eq "PPLQ"; # abandoned populated place |
125 |
|
126 |
# geonames has a lot of very long place names which aren't |
127 |
# actually place names, so ignore very long names |
128 |
60 > length $name |
129 |
or return; |
130 |
|
131 |
# we estimate a weight by dividing 25 by the radius of the place, |
132 |
# which we get by assuming a fixed population density of 5000 # people |
133 |
# per square km, # which is almost always a considerable over-estimate. |
134 |
# 25 and 5000 are pretty much made-up, feel free to improve and |
135 |
# send me the results. |
136 |
my $w = 25 / (1 + sqrt $pop / 5000); |
137 |
|
138 |
# administrative centers get a fixed low weight |
139 |
if ($t2 =~ /^PPLA(\d*)/) { |
140 |
$w = $1 || 1; |
141 |
} |
142 |
|
143 |
($lat, $lon, $w, "$name, $cc") |
144 |
|
145 |
=head1 AUTHOR |
146 |
|
147 |
Marc Lehmann <schmorp@schmorp.de> |
148 |
|
149 |
=cut |
150 |
|
151 |
use common::sense; |
152 |
use Getopt::Long; |
153 |
use Geo::LatLon2Place (); |
154 |
use Pod::Usage (); |
155 |
|
156 |
GetOptions |
157 |
"help|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 }, |
158 |
"extract=s" => \my $extract, |
159 |
"cellsize=i" => \my $km, |
160 |
or Pod::Usage::pod2usage |
161 |
-exitval => 1, |
162 |
; |
163 |
|
164 |
@ARGV == 2 |
165 |
or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n"; |
166 |
|
167 |
$km ||= 20; |
168 |
$extract ||= "geonames"; |
169 |
|
170 |
if ($extract eq "geonames") { |
171 |
$extract = sub { |
172 |
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
173 |
|
174 |
return if $t1 ne "P"; # only places |
175 |
|
176 |
$pop => 200 |
177 |
or ($pop eq "" and $t2 eq "PPL") |
178 |
or return; |
179 |
|
180 |
return if $t2 eq "PPLX"; # section of populated place |
181 |
return if $t2 eq "PPLW"; # destroyed populated place |
182 |
return if $t2 eq "PPLH"; # historical populated place |
183 |
return if $t2 eq "PPLQ"; # abandoned populated place |
184 |
|
185 |
# geonames has a lot of very long place names which aren't |
186 |
# actually place names, so ignore very long names |
187 |
60 > length $name |
188 |
or return; |
189 |
|
190 |
my $w = 25 / (1 + sqrt $pop / 5000); |
191 |
|
192 |
if ($t2 =~ /^PPLA(\d+)/) { |
193 |
$w = $1 || 1; |
194 |
} |
195 |
|
196 |
($lat, $lon, $w, "$name, $cc") |
197 |
}; |
198 |
} elsif ($extract eq "geonames-postalcodes") { |
199 |
$extract = sub { |
200 |
my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/; |
201 |
|
202 |
($lat, $lon, 1, "$zip $name, $cc") |
203 |
}; |
204 |
} else { |
205 |
$extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }"; |
206 |
die "$@" if $@; |
207 |
} |
208 |
|
209 |
my $torad = (atan2 1,0) / 180; |
210 |
|
211 |
my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size |
212 |
|
213 |
open my $fh, "<:perlio", $ARGV[0]; |
214 |
|
215 |
my @grid; |
216 |
|
217 |
while (<$fh>) { |
218 |
my ($lat, $lon, $w, $payload) = $extract->() |
219 |
or next; |
220 |
|
221 |
unless (255 - (2+2+1) >= length $payload) { |
222 |
$payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge; |
223 |
warn "payload too long, skipping: $payload\n"; |
224 |
next; |
225 |
} |
226 |
|
227 |
# higher latitudes get a (much) lower resolution, so we always get at least $km cells |
228 |
my $blat = int $boxes * cos abs $lat * $torad; |
229 |
my $x = int (($lon + 180) * $blat / 360); |
230 |
my $y = int (($lat + 90) * $boxes / 180); |
231 |
|
232 |
# we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC |
233 |
push @{ $grid[$y][$x] }, pack "C/a*", pack "s< s< C a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload; |
234 |
} |
235 |
|
236 |
my ($max, $sum, $cnt); |
237 |
|
238 |
open my $cdb, ">", $ARGV[1] |
239 |
or die; |
240 |
Geo::LatLon2Place::cdb_make_start fileno $cdb |
241 |
and die "cdb_make_start failure"; |
242 |
|
243 |
Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 1, $km, $boxes; |
244 |
|
245 |
for my $y (0 .. $#grid) { |
246 |
my $r = $grid[$y]; |
247 |
for my $x (0 .. $#$r) { |
248 |
my $c = $r->[$x] |
249 |
or next; |
250 |
|
251 |
# statistics |
252 |
$sum += scalar @$c; |
253 |
$cnt++; |
254 |
$max = [(scalar @$c), $x, $y] if @$c > $max->[0]; |
255 |
|
256 |
Geo::LatLon2Place::cdb_make_add |
257 |
+(pack "s< s<", $x, $y), |
258 |
+(join "", @$c) |
259 |
and "cdb_make_add failure"; |
260 |
} |
261 |
} |
262 |
|
263 |
Geo::LatLon2Place::cdb_make_finish |
264 |
and die "cdb_make_finish failed"; |
265 |
close $cdb; |
266 |
|
267 |
$max->[2] = $max->[2] * 180 / $boxes - 90; |
268 |
my $blat = int $boxes * cos abs $max->[2] * $torad; |
269 |
$max->[1] = $max->[1] * 360 / $blat - 180; |
270 |
print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n"; |
271 |
|