1 |
root |
1.1 |
#!/opt/bin/perl |
2 |
|
|
|
3 |
|
|
=head1 NAME |
4 |
|
|
|
5 |
|
|
geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place |
6 |
|
|
|
7 |
|
|
=head1 SYNOPSIS |
8 |
|
|
|
9 |
|
|
geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb |
10 |
|
|
|
11 |
|
|
=head1 DESCRIPTION |
12 |
|
|
|
13 |
|
|
abc |
14 |
|
|
|
15 |
|
|
=head1 OPTIONS AND ARGUMENTS |
16 |
|
|
|
17 |
|
|
geo-latlon2place-makedb requires two arguments: a text file with geo data |
18 |
|
|
and the name of the database file to be written. |
19 |
|
|
|
20 |
|
|
By default, the input file is considered to be in geonames gazetteer |
21 |
|
|
format, but this can be customized using B<--extract>. |
22 |
|
|
|
23 |
|
|
=over |
24 |
|
|
|
25 |
root |
1.3 |
=item B<--cellsize> km (default C<20>, or C<10> for geonames-postalcodes) |
26 |
root |
1.1 |
|
27 |
|
|
The (minimum) size of a single grid cell in km - the data is binned into |
28 |
|
|
cells of at least this size. It should not be larger than the search |
29 |
|
|
radius. |
30 |
|
|
|
31 |
|
|
=item B<--extract> C<geonames>|C<geonames-postalcodes>|perl... |
32 |
|
|
|
33 |
|
|
The extraction method: the default is C<geonames>, which expects a |
34 |
|
|
geonames database (L<https://download.geonames.org/export/dump/>, for |
35 |
|
|
example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts |
36 |
|
|
I<placename, countrycode> strings from it. |
37 |
|
|
|
38 |
root |
1.2 |
The method C<geonames-postalcodes> does the same, but for a geonames |
39 |
|
|
postal code database L<https://download.geonames.org/export/zip>, and |
40 |
|
|
extracts C<zip name, countrycopde> strings. |
41 |
root |
1.1 |
|
42 |
|
|
Lastly, you can specify a perl fragment that implements your own filtering |
43 |
|
|
and extraction. |
44 |
|
|
|
45 |
|
|
=back |
46 |
|
|
|
47 |
|
|
=head1 FILTERING AND EXTRACTION |
48 |
|
|
|
49 |
|
|
Record selection and the returned data are not fixed - you can filter your records |
50 |
|
|
yourself and associate any (reasonably short, the maximum is a bit above 200 octets) |
51 |
|
|
binary blob with a coordinate pair. |
52 |
|
|
|
53 |
|
|
To do this, you have to provide a perl fragment that extracts latitude, |
54 |
|
|
longitude, a weight and the associated data blob from an input line stored |
55 |
|
|
in C<$_>. The file is opened using the C<:perlio> layer, so if your input |
56 |
|
|
file is in UTF-8, so will be C<$_>. |
57 |
|
|
|
58 |
|
|
For example, the following would expect an input file with space separated |
59 |
|
|
latitude, longitude, weight and name, where name can contain spaces, which |
60 |
root |
1.2 |
is useful when you want to provide your own input data: |
61 |
root |
1.1 |
|
62 |
|
|
geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output |
63 |
|
|
|
64 |
|
|
A slighly more verbose example expecting only latitude, longitude and a |
65 |
|
|
name would be: |
66 |
|
|
|
67 |
|
|
geo-latlon2place-makedb --extract ' |
68 |
|
|
chomp; |
69 |
|
|
my ($lat, $lon, $name) = split / /, 4; |
70 |
|
|
($lat, $lon, 1, $name) |
71 |
|
|
' input output |
72 |
|
|
|
73 |
|
|
If you want to skip certain lines without adding anything to the database, |
74 |
|
|
you can return nothing: |
75 |
|
|
|
76 |
|
|
geo-latlon2place-makedb --extract ' |
77 |
|
|
chomp; |
78 |
|
|
my ($lat, $lon, $name) = split / /; |
79 |
|
|
return unless $lat < 0; # only add southern hemisphere points |
80 |
|
|
($lat, $lon, 1, $name) |
81 |
|
|
' input output |
82 |
|
|
|
83 |
|
|
In general, the fragment should return either an empty list, or a |
84 |
|
|
four-tuple with decimal latitude, longitude, a weight (integer 0..255) |
85 |
|
|
and the binary data to be associated with the coordinate. Other than the |
86 |
|
|
weight, these should be self-explaining. The weight is used during search |
87 |
|
|
and will be multiplied to the square of the distance, and is used to make |
88 |
|
|
larger cities win over small ones when the coordinate is somewhere between |
89 |
|
|
them. |
90 |
|
|
|
91 |
root |
1.2 |
The standard extractors (C<geonames> and C<geonames-postalcodes>) provide |
92 |
|
|
a UTF-8-encoded string as blob, but any binary data will do, for example, |
93 |
|
|
if you want to associate your coordinate pairs with some short-ish |
94 |
|
|
integer codes, you could do this: |
95 |
|
|
|
96 |
|
|
geo-latlon2place-makedb --extract ' |
97 |
|
|
chomp; |
98 |
|
|
my ($lat, $lon, $id) = split / /, 4; |
99 |
|
|
($lat, $lon, 1, pack "w", $id) |
100 |
|
|
' input output |
101 |
|
|
|
102 |
|
|
And later use C<unpack "w"> on the data returned by C<lookup>. |
103 |
|
|
|
104 |
|
|
The C<geonames> filter looks similar to the following fragment, which |
105 |
|
|
shows off some more filtering possibilities: |
106 |
root |
1.1 |
|
107 |
|
|
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
108 |
|
|
|
109 |
|
|
return if $t1 ne "P"; # only places |
110 |
|
|
|
111 |
|
|
# minimum population 200, or it is a populated place with no known population |
112 |
|
|
$pop => 200 |
113 |
|
|
or ($pop eq "" and $t2 eq "PPL") |
114 |
|
|
or return; |
115 |
|
|
|
116 |
|
|
# skip certain places we are not interested in |
117 |
|
|
return if $t2 eq "PPLX"; # section of populated place |
118 |
|
|
return if $t2 eq "PPLW"; # destroyed populated place |
119 |
|
|
return if $t2 eq "PPLH"; # historical populated place |
120 |
|
|
return if $t2 eq "PPLQ"; # abandoned populated place |
121 |
|
|
|
122 |
|
|
# geonames has a lot of very long place names which aren't |
123 |
|
|
# actually place names, so ignore very long names |
124 |
|
|
60 > length $name |
125 |
|
|
or return; |
126 |
|
|
|
127 |
|
|
# we estimate a weight by dividing 25 by the radius of the place, |
128 |
root |
1.2 |
# which we get by assuming a fixed population density of 5000 # people |
129 |
|
|
# per square km, # which is almost always a considerable over-estimate. |
130 |
root |
1.1 |
# 25 and 5000 are pretty much made-up, feel free to improve and |
131 |
|
|
# send me the results. |
132 |
|
|
my $w = 25 / (1 + sqrt $pop / 5000); |
133 |
|
|
|
134 |
|
|
# administrative centers get a fixed low weight |
135 |
|
|
if ($t2 =~ /^PPLA(\d*)/) { |
136 |
|
|
$w = $1 || 1; |
137 |
|
|
} |
138 |
|
|
|
139 |
|
|
($lat, $lon, $w, "$name, $cc") |
140 |
|
|
|
141 |
|
|
=head1 AUTHOR |
142 |
|
|
|
143 |
|
|
Marc Lehmann <schmorp@schmorp.de> |
144 |
|
|
|
145 |
|
|
=cut |
146 |
|
|
|
147 |
|
|
use common::sense; |
148 |
|
|
use Getopt::Long; |
149 |
|
|
use Geo::LatLon2Place (); |
150 |
|
|
use Pod::Usage (); |
151 |
|
|
|
152 |
|
|
GetOptions |
153 |
|
|
"help|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 }, |
154 |
|
|
"extract=s" => \my $extract, |
155 |
|
|
"cellsize=i" => \my $km, |
156 |
|
|
or Pod::Usage::pod2usage |
157 |
|
|
-exitval => 1, |
158 |
|
|
; |
159 |
|
|
|
160 |
|
|
@ARGV == 2 |
161 |
|
|
or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n"; |
162 |
|
|
|
163 |
|
|
$extract ||= "geonames"; |
164 |
|
|
|
165 |
|
|
if ($extract eq "geonames") { |
166 |
|
|
$extract = sub { |
167 |
|
|
my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/; |
168 |
|
|
|
169 |
|
|
return if $t1 ne "P"; # only places |
170 |
|
|
|
171 |
|
|
$pop => 200 |
172 |
|
|
or ($pop eq "" and $t2 eq "PPL") |
173 |
|
|
or return; |
174 |
|
|
|
175 |
|
|
return if $t2 eq "PPLX"; # section of populated place |
176 |
|
|
return if $t2 eq "PPLW"; # destroyed populated place |
177 |
|
|
return if $t2 eq "PPLH"; # historical populated place |
178 |
|
|
return if $t2 eq "PPLQ"; # abandoned populated place |
179 |
|
|
|
180 |
|
|
# geonames has a lot of very long place names which aren't |
181 |
|
|
# actually place names, so ignore very long names |
182 |
|
|
60 > length $name |
183 |
|
|
or return; |
184 |
|
|
|
185 |
|
|
my $w = 25 / (1 + sqrt $pop / 5000); |
186 |
|
|
|
187 |
|
|
if ($t2 =~ /^PPLA(\d+)/) { |
188 |
|
|
$w = $1 || 1; |
189 |
|
|
} |
190 |
|
|
|
191 |
|
|
($lat, $lon, $w, "$name, $cc") |
192 |
|
|
}; |
193 |
|
|
} elsif ($extract eq "geonames-postalcodes") { |
194 |
root |
1.3 |
$km ||= 10; |
195 |
root |
1.1 |
$extract = sub { |
196 |
|
|
my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/; |
197 |
|
|
|
198 |
|
|
($lat, $lon, 1, "$zip $name, $cc") |
199 |
|
|
}; |
200 |
|
|
} else { |
201 |
|
|
$extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }"; |
202 |
|
|
die "$@" if $@; |
203 |
|
|
} |
204 |
|
|
|
205 |
root |
1.3 |
$km ||= 20; |
206 |
|
|
|
207 |
|
|
my $torad = (atan2 1,0) / 90; |
208 |
root |
1.1 |
|
209 |
|
|
my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size |
210 |
|
|
|
211 |
|
|
open my $fh, "<:perlio", $ARGV[0]; |
212 |
|
|
|
213 |
|
|
my @grid; |
214 |
|
|
|
215 |
|
|
while (<$fh>) { |
216 |
|
|
my ($lat, $lon, $w, $payload) = $extract->() |
217 |
|
|
or next; |
218 |
|
|
|
219 |
root |
1.3 |
unless (255 >= length $payload) { |
220 |
root |
1.1 |
$payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge; |
221 |
|
|
warn "payload too long, skipping: $payload\n"; |
222 |
|
|
next; |
223 |
|
|
} |
224 |
|
|
|
225 |
root |
1.3 |
my $blat = int $boxes * cos $lat * $torad; # can be 0, but does not matter |
226 |
root |
1.1 |
my $x = int (($lon + 180) * $blat / 360); |
227 |
|
|
my $y = int (($lat + 90) * $boxes / 180); |
228 |
|
|
|
229 |
|
|
# we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC |
230 |
root |
1.3 |
push @{ $grid[$y][$x] }, pack "s< s< C C/a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload; |
231 |
root |
1.1 |
} |
232 |
|
|
|
233 |
|
|
my ($max, $sum, $cnt); |
234 |
|
|
|
235 |
|
|
open my $cdb, ">", $ARGV[1] |
236 |
|
|
or die; |
237 |
|
|
Geo::LatLon2Place::cdb_make_start fileno $cdb |
238 |
|
|
and die "cdb_make_start failure"; |
239 |
|
|
|
240 |
root |
1.3 |
Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 2, $km, $boxes; |
241 |
root |
1.1 |
|
242 |
|
|
for my $y (0 .. $#grid) { |
243 |
|
|
my $r = $grid[$y]; |
244 |
|
|
for my $x (0 .. $#$r) { |
245 |
|
|
my $c = $r->[$x] |
246 |
|
|
or next; |
247 |
|
|
|
248 |
|
|
# statistics |
249 |
|
|
$sum += scalar @$c; |
250 |
|
|
$cnt++; |
251 |
|
|
$max = [(scalar @$c), $x, $y] if @$c > $max->[0]; |
252 |
|
|
|
253 |
|
|
Geo::LatLon2Place::cdb_make_add |
254 |
|
|
+(pack "s< s<", $x, $y), |
255 |
|
|
+(join "", @$c) |
256 |
|
|
and "cdb_make_add failure"; |
257 |
|
|
} |
258 |
|
|
} |
259 |
|
|
|
260 |
|
|
Geo::LatLon2Place::cdb_make_finish |
261 |
|
|
and die "cdb_make_finish failed"; |
262 |
|
|
close $cdb; |
263 |
|
|
|
264 |
|
|
$max->[2] = $max->[2] * 180 / $boxes - 90; |
265 |
root |
1.3 |
my $blat = int $boxes * cos $max->[2] * $torad; |
266 |
root |
1.1 |
$max->[1] = $max->[1] * 360 / $blat - 180; |
267 |
|
|
print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n"; |
268 |
|
|
|