ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/Geo-LatLon2Place/bin/geo-latlon2place-makedb
Revision: 1.5
Committed: Tue Jun 28 16:39:27 2022 UTC (23 months, 1 week ago) by root
Branch: MAIN
CVS Tags: HEAD
Changes since 1.4: +3 -0 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 #!/opt/bin/perl
2    
3     =head1 NAME
4    
5     geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place
6    
7     =head1 SYNOPSIS
8    
9     geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb
10    
11     =head1 DESCRIPTION
12    
13     abc
14    
15     =head1 OPTIONS AND ARGUMENTS
16    
17     geo-latlon2place-makedb requires two arguments: a text file with geo data
18     and the name of the database file to be written.
19    
20     By default, the input file is considered to be in geonames gazetteer
21     format, but this can be customized using B<--extract>.
22    
23     =over
24    
25 root 1.3 =item B<--cellsize> km (default C<20>, or C<10> for geonames-postalcodes)
26 root 1.1
27     The (minimum) size of a single grid cell in km - the data is binned into
28     cells of at least this size. It should not be larger than the search
29     radius.
30    
31     =item B<--extract> C<geonames>|C<geonames-postalcodes>|perl...
32    
33     The extraction method: the default is C<geonames>, which expects a
34     geonames database (L<https://download.geonames.org/export/dump/>, for
35     example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts
36     I<placename, countrycode> strings from it.
37    
38 root 1.2 The method C<geonames-postalcodes> does the same, but for a geonames
39     postal code database L<https://download.geonames.org/export/zip>, and
40     extracts C<zip name, countrycopde> strings.
41 root 1.1
42     Lastly, you can specify a perl fragment that implements your own filtering
43     and extraction.
44    
45     =back
46    
47     =head1 FILTERING AND EXTRACTION
48    
49     Record selection and the returned data are not fixed - you can filter your records
50     yourself and associate any (reasonably short, the maximum is a bit above 200 octets)
51     binary blob with a coordinate pair.
52    
53     To do this, you have to provide a perl fragment that extracts latitude,
54     longitude, a weight and the associated data blob from an input line stored
55     in C<$_>. The file is opened using the C<:perlio> layer, so if your input
56     file is in UTF-8, so will be C<$_>.
57    
58     For example, the following would expect an input file with space separated
59     latitude, longitude, weight and name, where name can contain spaces, which
60 root 1.2 is useful when you want to provide your own input data:
61 root 1.1
62     geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output
63    
64     A slighly more verbose example expecting only latitude, longitude and a
65     name would be:
66    
67     geo-latlon2place-makedb --extract '
68     chomp;
69     my ($lat, $lon, $name) = split / /, 4;
70     ($lat, $lon, 1, $name)
71     ' input output
72    
73     If you want to skip certain lines without adding anything to the database,
74     you can return nothing:
75    
76     geo-latlon2place-makedb --extract '
77     chomp;
78     my ($lat, $lon, $name) = split / /;
79     return unless $lat < 0; # only add southern hemisphere points
80     ($lat, $lon, 1, $name)
81     ' input output
82    
83     In general, the fragment should return either an empty list, or a
84     four-tuple with decimal latitude, longitude, a weight (integer 0..255)
85     and the binary data to be associated with the coordinate. Other than the
86     weight, these should be self-explaining. The weight is used during search
87     and will be multiplied to the square of the distance, and is used to make
88     larger cities win over small ones when the coordinate is somewhere between
89     them.
90    
91 root 1.2 The standard extractors (C<geonames> and C<geonames-postalcodes>) provide
92     a UTF-8-encoded string as blob, but any binary data will do, for example,
93     if you want to associate your coordinate pairs with some short-ish
94     integer codes, you could do this:
95    
96     geo-latlon2place-makedb --extract '
97     chomp;
98     my ($lat, $lon, $id) = split / /, 4;
99     ($lat, $lon, 1, pack "w", $id)
100     ' input output
101    
102     And later use C<unpack "w"> on the data returned by C<lookup>.
103    
104     The C<geonames> filter looks similar to the following fragment, which
105     shows off some more filtering possibilities:
106 root 1.1
107     my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
108    
109     return if $t1 ne "P"; # only places
110    
111     # minimum population 200, or it is a populated place with no known population
112     $pop => 200
113     or ($pop eq "" and $t2 eq "PPL")
114     or return;
115    
116     # skip certain places we are not interested in
117     return if $t2 eq "PPLX"; # section of populated place
118     return if $t2 eq "PPLW"; # destroyed populated place
119     return if $t2 eq "PPLH"; # historical populated place
120     return if $t2 eq "PPLQ"; # abandoned populated place
121    
122     # geonames has a lot of very long place names which aren't
123     # actually place names, so ignore very long names
124     60 > length $name
125     or return;
126    
127     # we estimate a weight by dividing 25 by the radius of the place,
128 root 1.2 # which we get by assuming a fixed population density of 5000 # people
129     # per square km, # which is almost always a considerable over-estimate.
130 root 1.1 # 25 and 5000 are pretty much made-up, feel free to improve and
131     # send me the results.
132     my $w = 25 / (1 + sqrt $pop / 5000);
133    
134     # administrative centers get a fixed low weight
135     if ($t2 =~ /^PPLA(\d*)/) {
136     $w = $1 || 1;
137     }
138    
139     ($lat, $lon, $w, "$name, $cc")
140    
141     =head1 AUTHOR
142    
143     Marc Lehmann <schmorp@schmorp.de>
144    
145     =cut
146    
147     use common::sense;
148     use Getopt::Long;
149     use Geo::LatLon2Place ();
150     use Pod::Usage ();
151    
152     GetOptions
153     "help|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 },
154     "extract=s" => \my $extract,
155     "cellsize=i" => \my $km,
156     or Pod::Usage::pod2usage
157     -exitval => 1,
158     ;
159    
160     @ARGV == 2
161     or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n";
162    
163     $extract ||= "geonames";
164    
165     if ($extract eq "geonames") {
166     $extract = sub {
167     my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
168    
169     return if $t1 ne "P"; # only places
170    
171     $pop => 200
172     or ($pop eq "" and $t2 eq "PPL")
173     or return;
174    
175     return if $t2 eq "PPLX"; # section of populated place
176     return if $t2 eq "PPLW"; # destroyed populated place
177     return if $t2 eq "PPLH"; # historical populated place
178     return if $t2 eq "PPLQ"; # abandoned populated place
179    
180     # geonames has a lot of very long place names which aren't
181     # actually place names, so ignore very long names
182     60 > length $name
183     or return;
184    
185     my $w = 25 / (1 + sqrt $pop / 5000);
186    
187     if ($t2 =~ /^PPLA(\d+)/) {
188     $w = $1 || 1;
189     }
190    
191     ($lat, $lon, $w, "$name, $cc")
192     };
193     } elsif ($extract eq "geonames-postalcodes") {
194 root 1.3 $km ||= 10;
195 root 1.1 $extract = sub {
196     my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/;
197    
198 root 1.5 # https://buttermaidbakery.com/pages/what-is-an-apo-fpo-dpo-address
199     return if $name =~ /^[AFD]PO [A-Z][A-Z]\z/ and $cc eq "US";
200    
201 root 1.1 ($lat, $lon, 1, "$zip $name, $cc")
202     };
203     } else {
204     $extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }";
205     die "$@" if $@;
206     }
207    
208 root 1.3 $km ||= 20;
209    
210     my $torad = (atan2 1,0) / 90;
211 root 1.1
212     my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size
213    
214 root 1.4 open my $fh, "<:perlio", $ARGV[0]
215     or die "$ARGV[0]: $!\n";
216 root 1.1
217     my @grid;
218    
219     while (<$fh>) {
220     my ($lat, $lon, $w, $payload) = $extract->()
221     or next;
222    
223 root 1.3 unless (255 >= length $payload) {
224 root 1.1 $payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge;
225     warn "payload too long, skipping: $payload\n";
226     next;
227     }
228    
229 root 1.3 my $blat = int $boxes * cos $lat * $torad; # can be 0, but does not matter
230 root 1.1 my $x = int (($lon + 180) * $blat / 360);
231     my $y = int (($lat + 90) * $boxes / 180);
232    
233     # we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC
234 root 1.3 push @{ $grid[$y][$x] }, pack "s< s< C C/a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload;
235 root 1.1 }
236    
237 root 1.4 #############################################################################
238     # write gridded data out
239 root 1.1
240     open my $cdb, ">", $ARGV[1]
241 root 1.4 or die "$ARGV[1]: $!\n";
242 root 1.1 Geo::LatLon2Place::cdb_make_start fileno $cdb
243     and die "cdb_make_start failure";
244    
245 root 1.4 Geo::LatLon2Place::cdb_make_add "", pack "a4VVVV", "SRGL", 2, $km, $boxes, time;
246 root 1.1
247 root 1.4 #############################################################################
248     # now we walk the grid using a hilbert curve to increase locality when querying
249    
250     my ($max, $sum, $cnt); # statistics
251    
252     my ($x, $y) = (-1, 0);
253    
254     sub step { # move one step into $_[0] direction: 0 x++, 1 y++, 2 x--, 3 y--
255     ($_[0] & 1 ? $y : $x) += 1 - ($_[0] & 2);
256    
257     # write cell at $x,$y, if any
258     my $c = $grid[$y][$x]
259     or return;
260    
261     undef $grid[$y][$x]; # for paranoia check
262    
263     # statistics
264     $sum += scalar @$c;
265     $cnt++;
266     $max = [(scalar @$c), $x, $y] if @$c > $max->[0];
267    
268     Geo::LatLon2Place::cdb_make_add
269     +(pack "s< s<", $x, $y),
270     +(join "", @$c)
271     and "cdb_make_add failure";
272     }
273    
274     sub hilbert;
275     sub hilbert { # order dir rot
276     my $order = $_[0] >> 1
277     or return;
278    
279     hilbert $order, $_[1] + $_[2], -$_[2]; step $_[1] + $_[2];
280     hilbert $order, $_[1] , $_[2]; step $_[1] ;
281     hilbert $order, $_[1] , $_[2]; step $_[1] - $_[2];
282     hilbert $order, $_[1] - $_[2], -$_[2];
283     }
284    
285     step 0; # move to 0,0
286     hilbert @grid * 2, 0, 1;
287    
288     # paranoia-check, make sure we wrote out all cells
289     for (@grid) {
290     grep $_, @$_
291     and die;
292 root 1.1 }
293    
294 root 1.4 #############################################################################
295    
296 root 1.1 Geo::LatLon2Place::cdb_make_finish
297     and die "cdb_make_finish failed";
298     close $cdb;
299    
300     $max->[2] = $max->[2] * 180 / $boxes - 90;
301 root 1.3 my $blat = int $boxes * cos $max->[2] * $torad;
302 root 1.1 $max->[1] = $max->[1] * 360 / $blat - 180;
303     print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n";
304