ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/Geo-LatLon2Place/bin/geo-latlon2place-makedb
Revision: 1.1
Committed: Mon Mar 14 02:41:52 2022 UTC (2 years, 2 months ago) by root
Branch: MAIN
CVS Tags: rel-0_01
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 #!/opt/bin/perl
2    
3     # usage: geonames2cdb geonames.txt database.cdb
4     # you can use allCountries.zp, or cities500.zip and so on
5     # from https://download.geonames.org/export/dump/
6    
7     =head1 NAME
8    
9     geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place
10    
11     =head1 SYNOPSIS
12    
13     geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb
14    
15     =head1 DESCRIPTION
16    
17     abc
18    
19     =head1 OPTIONS AND ARGUMENTS
20    
21     geo-latlon2place-makedb requires two arguments: a text file with geo data
22     and the name of the database file to be written.
23    
24     By default, the input file is considered to be in geonames gazetteer
25     format, but this can be customized using B<--extract>.
26    
27     =over
28    
29     =item B<--cellsize> km (default C<20>)
30    
31     The (minimum) size of a single grid cell in km - the data is binned into
32     cells of at least this size. It should not be larger than the search
33     radius.
34    
35     =item B<--extract> C<geonames>|C<geonames-postalcodes>|perl...
36    
37     The extraction method: the default is C<geonames>, which expects a
38     geonames database (L<https://download.geonames.org/export/dump/>, for
39     example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts
40     I<placename, countrycode> strings from it.
41    
42     The method C<geonames-postalcodes> (not yet implemented)
43     does the same, but for a geonames postal code database
44     L<https://download.geonames.org/export/zip>.
45    
46     Lastly, you can specify a perl fragment that implements your own filtering
47     and extraction.
48    
49     =back
50    
51     =head1 FILTERING AND EXTRACTION
52    
53     Record selection and the returned data are not fixed - you can filter your records
54     yourself and associate any (reasonably short, the maximum is a bit above 200 octets)
55     binary blob with a coordinate pair.
56    
57     To do this, you have to provide a perl fragment that extracts latitude,
58     longitude, a weight and the associated data blob from an input line stored
59     in C<$_>. The file is opened using the C<:perlio> layer, so if your input
60     file is in UTF-8, so will be C<$_>.
61    
62     For example, the following would expect an input file with space separated
63     latitude, longitude, weight and name, where name can contain spaces, which
64     is useful when you wat to provide your own input data:
65    
66     geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output
67    
68     A slighly more verbose example expecting only latitude, longitude and a
69     name would be:
70    
71     geo-latlon2place-makedb --extract '
72     chomp;
73     my ($lat, $lon, $name) = split / /, 4;
74     ($lat, $lon, 1, $name)
75     ' input output
76    
77     If you want to skip certain lines without adding anything to the database,
78     you can return nothing:
79    
80     geo-latlon2place-makedb --extract '
81     chomp;
82     my ($lat, $lon, $name) = split / /;
83     return unless $lat < 0; # only add southern hemisphere points
84     ($lat, $lon, 1, $name)
85     ' input output
86    
87     In general, the fragment should return either an empty list, or a
88     four-tuple with decimal latitude, longitude, a weight (integer 0..255)
89     and the binary data to be associated with the coordinate. Other than the
90     weight, these should be self-explaining. The weight is used during search
91     and will be multiplied to the square of the distance, and is used to make
92     larger cities win over small ones when the coordinate is somewhere between
93     them.
94    
95     The C<geonames> filter looks similar to this fragment, which shows off
96     more possibilities:
97    
98     my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
99    
100     return if $t1 ne "P"; # only places
101    
102     # minimum population 200, or it is a populated place with no known population
103     $pop => 200
104     or ($pop eq "" and $t2 eq "PPL")
105     or return;
106    
107     # skip certain places we are not interested in
108     return if $t2 eq "PPLX"; # section of populated place
109     return if $t2 eq "PPLW"; # destroyed populated place
110     return if $t2 eq "PPLH"; # historical populated place
111     return if $t2 eq "PPLQ"; # abandoned populated place
112    
113     # geonames has a lot of very long place names which aren't
114     # actually place names, so ignore very long names
115     60 > length $name
116     or return;
117    
118     # we estimate a weight by dividing 25 by the radius of the place,
119     # which we get by assuming a fixed population density of 5000 people/kmĀ²,
120     # which is almost always a considerable over-estimate.
121     # 25 and 5000 are pretty much made-up, feel free to improve and
122     # send me the results.
123     my $w = 25 / (1 + sqrt $pop / 5000);
124    
125     # administrative centers get a fixed low weight
126     if ($t2 =~ /^PPLA(\d*)/) {
127     $w = $1 || 1;
128     }
129    
130     ($lat, $lon, $w, "$name, $cc")
131    
132     =head1 AUTHOR
133    
134     Marc Lehmann <schmorp@schmorp.de>
135    
136     =cut
137    
138     use common::sense;
139     use Getopt::Long;
140     use Geo::LatLon2Place ();
141     use Pod::Usage ();
142    
143     GetOptions
144     "help|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 },
145     "extract=s" => \my $extract,
146     "cellsize=i" => \my $km,
147     or Pod::Usage::pod2usage
148     -exitval => 1,
149     ;
150    
151     @ARGV == 2
152     or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n";
153    
154     $km ||= 20;
155     $extract ||= "geonames";
156    
157     if ($extract eq "geonames") {
158     $extract = sub {
159     my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
160    
161     return if $t1 ne "P"; # only places
162    
163     $pop => 200
164     or ($pop eq "" and $t2 eq "PPL")
165     or return;
166    
167     return if $t2 eq "PPLX"; # section of populated place
168     return if $t2 eq "PPLW"; # destroyed populated place
169     return if $t2 eq "PPLH"; # historical populated place
170     return if $t2 eq "PPLQ"; # abandoned populated place
171    
172     # geonames has a lot of very long place names which aren't
173     # actually place names, so ignore very long names
174     60 > length $name
175     or return;
176    
177     my $w = 25 / (1 + sqrt $pop / 5000);
178    
179     if ($t2 =~ /^PPLA(\d+)/) {
180     $w = $1 || 1;
181     }
182    
183     ($lat, $lon, $w, "$name, $cc")
184     };
185     } elsif ($extract eq "geonames-postalcodes") {
186     $extract = sub {
187     my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/;
188    
189     ($lat, $lon, 1, "$zip $name, $cc")
190     };
191     } else {
192     $extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }";
193     die "$@" if $@;
194     }
195    
196     my $torad = (atan2 1,0) / 180;
197    
198     my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size
199    
200     open my $fh, "<:perlio", $ARGV[0];
201    
202     my @grid;
203    
204     while (<$fh>) {
205     my ($lat, $lon, $w, $payload) = $extract->()
206     or next;
207    
208     unless (255 - (2+2+1) >= length $payload) {
209     $payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge;
210     warn "payload too long, skipping: $payload\n";
211     next;
212     }
213    
214     # higher latitudes get a (much) lower resolution, so we always get at least $km cells
215     my $blat = int $boxes * cos abs $lat * $torad;
216     my $x = int (($lon + 180) * $blat / 360);
217     my $y = int (($lat + 90) * $boxes / 180);
218    
219     # we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC
220     push @{ $grid[$y][$x] }, pack "C/a*", pack "s< s< C a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload;
221     }
222    
223     my ($max, $sum, $cnt);
224    
225     open my $cdb, ">", $ARGV[1]
226     or die;
227     Geo::LatLon2Place::cdb_make_start fileno $cdb
228     and die "cdb_make_start failure";
229    
230     Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 1, $km, $boxes;
231    
232     for my $y (0 .. $#grid) {
233     my $r = $grid[$y];
234     for my $x (0 .. $#$r) {
235     my $c = $r->[$x]
236     or next;
237    
238     # statistics
239     $sum += scalar @$c;
240     $cnt++;
241     $max = [(scalar @$c), $x, $y] if @$c > $max->[0];
242    
243     Geo::LatLon2Place::cdb_make_add
244     +(pack "s< s<", $x, $y),
245     +(join "", @$c)
246     and "cdb_make_add failure";
247     }
248     }
249    
250     Geo::LatLon2Place::cdb_make_finish
251     and die "cdb_make_finish failed";
252     close $cdb;
253    
254     $max->[2] = $max->[2] * 180 / $boxes - 90;
255     my $blat = int $boxes * cos abs $max->[2] * $torad;
256     $max->[1] = $max->[1] * 360 / $blat - 180;
257     print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n";
258