1 |
#!/opt/bin/perl |
2 |
|
3 |
# usage: geonames2cdb geonames.txt database.cdb |
4 |
# you can use allCountries.zp, or cities500.zip and so on |
5 |
# from https://download.geonames.org/export/dump/ |
6 |
|
7 |
=head1 NAME |
8 |
|
9 |
geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place |
10 |
|
11 |
=head1 SYNOPSIS |
12 |
|
13 |
geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb |
14 |
|
15 |
=head1 DESCRIPTION |
16 |
|
17 |
abc |
18 |
|
19 |
=head1 OPTIONS AND ARGUMENTS |
20 |
|
21 |
geo-latlon2place-makedb requires two arguments: a text file with geo data |
22 |
and the name of the database file to be written. |
23 |
|
24 |
By default, the input file is considered to be in geonames gazetteer |
25 |
format, but this can be customized using B<--extract>. |
26 |
|
27 |
=over |
28 |
|
29 |
=item B<--cellsize> km (default C<20>) |
30 |
|
31 |
The (minimum) size of a single grid cell in km - the data is binned into |
32 |
cells of at least this size. It should not be larger than the search |
33 |
radius. |
34 |
|
35 |
=item B<--extract> C<geonames>|C<geonames-postalcodes>|perl... |
36 |
|
37 |
The extraction method: the default is C<geonames>, which expects a |
38 |
geonames database (L<https://download.geonames.org/export/dump/>, for |
39 |
example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts |
40 |
I<placename, countrycode> strings from it. |
41 |
|
42 |
The method C<geonames-postalcodes> (not yet implemented) |
43 |
does the same, but for a geonames postal code database |
44 |
L<https://download.geonames.org/export/zip>. |
45 |
|
46 |
Lastly, you can specify a perl fragment that implements your own filtering |
47 |
and extraction. |
48 |
|
49 |
=back |
50 |
|
51 |
=head1 FILTERING AND EXTRACTION |
52 |
|
53 |
Record selection and the returned data are not fixed - you can filter your records |
54 |
yourself and associate any (reasonably short, the maximum is a bit above 200 octets) |
55 |
binary blob with a coordinate pair. |
56 |
|
57 |
To do this, you have to provide a perl fragment that extracts latitude, |
58 |
longitude, a weight and the associated data blob from an input line stored |
59 |
in C<$_>. The file is opened using the C<:perlio> layer, so if your input |
60 |
file is in UTF-8, so will be C<$_>. |
61 |
|
62 |
For example, the following would expect an input file with space separated |
63 |
latitude, longitude, weight and name, where name can contain spaces, which |
64 |
is useful when you wat to provide your own input data: |
65 |
|
66 |
geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output |
67 |
|
68 |
A slighly more verbose example expecting only latitude, longitude and a |
69 |
name would be: |
70 |
|
71 |
geo-latlon2place-makedb --extract ' |
72 |
chomp; |
73 |
my ($lat, $lon, $name) = split / /, 4; |
74 |
($lat, $lon, 1, $name) |
75 |
' input output |
76 |
|
77 |
If you want to skip certain lines without adding anything to the database, |
78 |
you can return nothing: |
79 |
|
80 |
geo-latlon2place-makedb --extract ' |
81 |
chomp; |
82 |
my ($lat, $lon, $name) = split / /; |
83 |
return unless $lat < 0; # only add southern hemisphere points |
84 |
($lat, $lon, 1, $name) |
85 |
' input output |
86 |
|
87 |
In general, the fragment should return either an empty list, or a |
88 |
four-tuple with decimal latitude, longitude, a weight (integer 0..255) |
89 |
and the binary data to be associated with the coordinate. Other than the |
90 |
weight, these should be self-explaining. The weight is used during search |
91 |
and will be multiplied to the square of the distance, and is used to make |
92 |
larger cities win over small ones when the coordinate is somewhere between |
93 |
them. |
94 |
|
95 |
The C<geonames> filter looks similar to this fragment, which shows off |
96 |
more possibilities: |
97 |
|
98 |
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
99 |
|
100 |
return if $t1 ne "P"; # only places |
101 |
|
102 |
# minimum population 200, or it is a populated place with no known population |
103 |
$pop => 200 |
104 |
or ($pop eq "" and $t2 eq "PPL") |
105 |
or return; |
106 |
|
107 |
# skip certain places we are not interested in |
108 |
return if $t2 eq "PPLX"; # section of populated place |
109 |
return if $t2 eq "PPLW"; # destroyed populated place |
110 |
return if $t2 eq "PPLH"; # historical populated place |
111 |
return if $t2 eq "PPLQ"; # abandoned populated place |
112 |
|
113 |
# geonames has a lot of very long place names which aren't |
114 |
# actually place names, so ignore very long names |
115 |
60 > length $name |
116 |
or return; |
117 |
|
118 |
# we estimate a weight by dividing 25 by the radius of the place, |
119 |
# which we get by assuming a fixed population density of 5000 people/kmĀ², |
120 |
# which is almost always a considerable over-estimate. |
121 |
# 25 and 5000 are pretty much made-up, feel free to improve and |
122 |
# send me the results. |
123 |
my $w = 25 / (1 + sqrt $pop / 5000); |
124 |
|
125 |
# administrative centers get a fixed low weight |
126 |
if ($t2 =~ /^PPLA(\d*)/) { |
127 |
$w = $1 || 1; |
128 |
} |
129 |
|
130 |
($lat, $lon, $w, "$name, $cc") |
131 |
|
132 |
=head1 AUTHOR |
133 |
|
134 |
Marc Lehmann <schmorp@schmorp.de> |
135 |
|
136 |
=cut |
137 |
|
138 |
use common::sense; |
139 |
use Getopt::Long; |
140 |
use Geo::LatLon2Place (); |
141 |
use Pod::Usage (); |
142 |
|
143 |
GetOptions |
144 |
"help|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 }, |
145 |
"extract=s" => \my $extract, |
146 |
"cellsize=i" => \my $km, |
147 |
or Pod::Usage::pod2usage |
148 |
-exitval => 1, |
149 |
; |
150 |
|
151 |
@ARGV == 2 |
152 |
or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n"; |
153 |
|
154 |
$km ||= 20; |
155 |
$extract ||= "geonames"; |
156 |
|
157 |
if ($extract eq "geonames") { |
158 |
$extract = sub { |
159 |
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
160 |
|
161 |
return if $t1 ne "P"; # only places |
162 |
|
163 |
$pop => 200 |
164 |
or ($pop eq "" and $t2 eq "PPL") |
165 |
or return; |
166 |
|
167 |
return if $t2 eq "PPLX"; # section of populated place |
168 |
return if $t2 eq "PPLW"; # destroyed populated place |
169 |
return if $t2 eq "PPLH"; # historical populated place |
170 |
return if $t2 eq "PPLQ"; # abandoned populated place |
171 |
|
172 |
# geonames has a lot of very long place names which aren't |
173 |
# actually place names, so ignore very long names |
174 |
60 > length $name |
175 |
or return; |
176 |
|
177 |
my $w = 25 / (1 + sqrt $pop / 5000); |
178 |
|
179 |
if ($t2 =~ /^PPLA(\d+)/) { |
180 |
$w = $1 || 1; |
181 |
} |
182 |
|
183 |
($lat, $lon, $w, "$name, $cc") |
184 |
}; |
185 |
} elsif ($extract eq "geonames-postalcodes") { |
186 |
$extract = sub { |
187 |
my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/; |
188 |
|
189 |
($lat, $lon, 1, "$zip $name, $cc") |
190 |
}; |
191 |
} else { |
192 |
$extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }"; |
193 |
die "$@" if $@; |
194 |
} |
195 |
|
196 |
my $torad = (atan2 1,0) / 180; |
197 |
|
198 |
my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size |
199 |
|
200 |
open my $fh, "<:perlio", $ARGV[0]; |
201 |
|
202 |
my @grid; |
203 |
|
204 |
while (<$fh>) { |
205 |
my ($lat, $lon, $w, $payload) = $extract->() |
206 |
or next; |
207 |
|
208 |
unless (255 - (2+2+1) >= length $payload) { |
209 |
$payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge; |
210 |
warn "payload too long, skipping: $payload\n"; |
211 |
next; |
212 |
} |
213 |
|
214 |
# higher latitudes get a (much) lower resolution, so we always get at least $km cells |
215 |
my $blat = int $boxes * cos abs $lat * $torad; |
216 |
my $x = int (($lon + 180) * $blat / 360); |
217 |
my $y = int (($lat + 90) * $boxes / 180); |
218 |
|
219 |
# we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC |
220 |
push @{ $grid[$y][$x] }, pack "C/a*", pack "s< s< C a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload; |
221 |
} |
222 |
|
223 |
my ($max, $sum, $cnt); |
224 |
|
225 |
open my $cdb, ">", $ARGV[1] |
226 |
or die; |
227 |
Geo::LatLon2Place::cdb_make_start fileno $cdb |
228 |
and die "cdb_make_start failure"; |
229 |
|
230 |
Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 1, $km, $boxes; |
231 |
|
232 |
for my $y (0 .. $#grid) { |
233 |
my $r = $grid[$y]; |
234 |
for my $x (0 .. $#$r) { |
235 |
my $c = $r->[$x] |
236 |
or next; |
237 |
|
238 |
# statistics |
239 |
$sum += scalar @$c; |
240 |
$cnt++; |
241 |
$max = [(scalar @$c), $x, $y] if @$c > $max->[0]; |
242 |
|
243 |
Geo::LatLon2Place::cdb_make_add |
244 |
+(pack "s< s<", $x, $y), |
245 |
+(join "", @$c) |
246 |
and "cdb_make_add failure"; |
247 |
} |
248 |
} |
249 |
|
250 |
Geo::LatLon2Place::cdb_make_finish |
251 |
and die "cdb_make_finish failed"; |
252 |
close $cdb; |
253 |
|
254 |
$max->[2] = $max->[2] * 180 / $boxes - 90; |
255 |
my $blat = int $boxes * cos abs $max->[2] * $torad; |
256 |
$max->[1] = $max->[1] * 360 / $blat - 180; |
257 |
print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n"; |
258 |
|