[ViewVC] Annotation of: cvs/Geo-LatLon2Place/bin/geo-latlon2place-makedb

#!/opt/bin/perl

# usage: geonames2cdb geonames.txt database.cdb
# you can use allCountries.zp, or cities500.zip and so on
# from https://download.geonames.org/export/dump/

=head1 NAME

geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place

=head1 SYNOPSIS

geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb

=head1 DESCRIPTION

abc

=head1 OPTIONS AND ARGUMENTS

geo-latlon2place-makedb requires two arguments: a text file with geo data
and the name of the database file to be written.

By default, the input file is considered to be in geonames gazetteer
format, but this can be customized using B<--extract>.

=over

=item B<--cellsize> km (default C<20>)

The (minimum) size of a single grid cell in km - the data is binned into
cells of at least this size. It should not be larger than the search
radius.

=item B<--extract> C<geonames>|C<geonames-postalcodes>|perl...

The extraction method: the default is C<geonames>, which expects a
geonames database (L<https://download.geonames.org/export/dump/>, for
example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts
I<placename, countrycode> strings from it.

The method C<geonames-postalcodes> (not yet implemented)
does the same, but for a geonames postal code database
L<https://download.geonames.org/export/zip>.

Lastly, you can specify a perl fragment that implements your own filtering
and extraction.

=back

=head1 FILTERING AND EXTRACTION

Record selection and the returned data are not fixed - you can filter your records
yourself and associate any (reasonably short, the maximum is a bit above 200 octets)
binary blob with a coordinate pair.

To do this, you have to provide a perl fragment that extracts latitude,
longitude, a weight and the associated data blob from an input line stored
in C<$_>. The file is opened using the C<:perlio> layer, so if your input
file is in UTF-8, so will be C<$_>.

For example, the following would expect an input file with space separated
latitude, longitude, weight and name, where name can contain spaces, which
is useful when you wat to provide your own input data:

   geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output

A slighly more verbose example expecting only latitude, longitude and a
name would be:

   geo-latlon2place-makedb --extract '
      chomp;
      my ($lat, $lon, $name) = split / /, 4;
      ($lat, $lon, 1, $name)
   ' input output

If you want to skip certain lines without adding anything to the database,
you can return nothing:

   geo-latlon2place-makedb --extract '
      chomp;
      my ($lat, $lon, $name) = split / /;
      return unless $lat < 0; # only add southern hemisphere points
      ($lat, $lon, 1, $name)
   ' input output

In general, the fragment should return either an empty list, or a
four-tuple with decimal latitude, longitude, a weight (integer 0..255)
and the binary data to be associated with the coordinate. Other than the
weight, these should be self-explaining. The weight is used during search
and will be multiplied to the square of the distance, and is used to make
larger cities win over small ones when the coordinate is somewhere between
them.

The C<geonames> filter looks similar to this fragment, which shows off
more possibilities:

   my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;

   return if $t1 ne "P"; # only places

   # minimum population 200, or it is a populated place with no known population
   $pop => 200
      or ($pop eq "" and $t2 eq "PPL")
      or return;

   # skip certain places we are not interested in
   return if $t2 eq "PPLX"; # section of populated place
   return if $t2 eq "PPLW"; # destroyed populated place
   return if $t2 eq "PPLH"; # historical populated place
   return if $t2 eq "PPLQ"; # abandoned populated place

   # geonames has a lot of very long place names which aren't
   # actually place names, so ignore very long names
   60 > length $name
      or return;

   # we estimate a weight by dividing 25 by the radius of the place,
   # which we get by assuming a fixed population density of 5000 people/km²,
   # which is almost always a considerable over-estimate.
   # 25 and 5000 are pretty much made-up, feel free to improve and
   # send me the results.
   my $w = 25 / (1 + sqrt $pop / 5000);

   # administrative centers get a fixed low weight
   if ($t2 =~ /^PPLA(\d*)/) {
      $w = $1 || 1;
   }

   ($lat, $lon, $w, "$name, $cc")

=head1 AUTHOR

Marc Lehmann <schmorp@schmorp.de>

=cut

use common::sense;
use Getopt::Long;
use Geo::LatLon2Place ();
use Pod::Usage ();

GetOptions
   "help|h"     => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 },
   "extract=s"  => \my $extract,
   "cellsize=i" => \my $km,
   or Pod::Usage::pod2usage
      -exitval => 1,
;

@ARGV == 2
   or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n";

$km ||= 20;
$extract ||= "geonames";

if ($extract eq "geonames") {
   $extract = sub {
      my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;

      return if $t1 ne "P"; # only places

      $pop => 200
         or ($pop eq "" and $t2 eq "PPL")
         or return;

      return if $t2 eq "PPLX"; # section of populated place
      return if $t2 eq "PPLW"; # destroyed populated place
      return if $t2 eq "PPLH"; # historical populated place
      return if $t2 eq "PPLQ"; # abandoned populated place

      # geonames has a lot of very long place names which aren't
      # actually place names, so ignore very long names
      60 > length $name
         or return;

      my $w = 25 / (1 + sqrt $pop / 5000);

      if ($t2 =~ /^PPLA(\d+)/) {
         $w = $1 || 1;
      }

      ($lat, $lon, $w, "$name, $cc")
   };
} elsif ($extract eq "geonames-postalcodes") {
   $extract = sub {
      my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/;

      ($lat, $lon, 1, "$zip $name, $cc")
   };
} else {
   $extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }";
   die "$@" if $@;
}

my $torad = (atan2 1,0) / 180;

my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size

open my $fh, "<:perlio", $ARGV[0];

my @grid;

while (<$fh>) {
   my ($lat, $lon, $w, $payload) = $extract->()
      or next;

   unless (255 - (2+2+1) >= length $payload) {
      $payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge;
      warn "payload too long, skipping: $payload\n";
      next;
   }

   # higher latitudes get a (much) lower resolution, so we always get at least $km cells
   my $blat = int $boxes * cos abs $lat * $torad;
   my $x = int (($lon + 180) * $blat  / 360);
   my $y = int (($lat +  90) * $boxes / 180);

   # we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC
   push @{ $grid[$y][$x] }, pack "C/a*", pack "s< s< C a*", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload;
}

my ($max, $sum, $cnt);

open my $cdb, ">", $ARGV[1]
   or die;
Geo::LatLon2Place::cdb_make_start fileno $cdb
   and die "cdb_make_start failure";

Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 1, $km, $boxes;

for my $y (0 .. $#grid) {
   my $r = $grid[$y];
   for my $x (0 .. $#$r) {
      my $c = $r->[$x]
         or next;

      # statistics
      $sum += scalar @$c;
      $cnt++;
      $max = [(scalar @$c), $x, $y] if @$c > $max->[0];

      Geo::LatLon2Place::cdb_make_add
            +(pack "s< s<", $x, $y),
            +(join "", @$c)
         and "cdb_make_add failure";
   }
}

Geo::LatLon2Place::cdb_make_finish
   and die "cdb_make_finish failed";
close $cdb;

$max->[2] = $max->[2] * 180 / $boxes - 90;
my $blat = int $boxes * cos abs $max->[2] * $torad;
$max->[1] = $max->[1] * 360 / $blat - 180;
print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n";

Revision:	1.1
Committed:	Mon Mar 14 02:41:52 2022 UTC (2 years, 2 months ago) by root
Branch:	MAIN
CVS Tags:	rel-0_01
Log Message:	* empty log message *
#	User	Rev	Content
1	root	1.1	#!/opt/bin/perl
2
3			# usage: geonames2cdb geonames.txt database.cdb
4			# you can use allCountries.zp, or cities500.zip and so on
5			# from https://download.geonames.org/export/dump/
6
7			=head1 NAME
8
9			geo-latlon2place-makedb - generate database for use with Geo::LatLon2Place
10
11			=head1 SYNOPSIS
12
13			geo-latlon2place-makedb [OPTION]... inputfile.txt outputfile.cdb
14
15			=head1 DESCRIPTION
16
17			abc
18
19			=head1 OPTIONS AND ARGUMENTS
20
21			geo-latlon2place-makedb requires two arguments: a text file with geo data
22			and the name of the database file to be written.
23
24			By default, the input file is considered to be in geonames gazetteer
25			format, but this can be customized using B<--extract>.
26
27			=over
28
29			=item B<--cellsize> km (default C<20>)
30
31			The (minimum) size of a single grid cell in km - the data is binned into
32			cells of at least this size. It should not be larger than the search
33			radius.
34
35			=item B<--extract> C<geonames>\|C<geonames-postalcodes>\|perl...
36
37			The extraction method: the default is C<geonames>, which expects a
38			geonames database (L<https://download.geonames.org/export/dump/>, for
39			example F<DE.txt>, F<cities500.txt> or F<allCountries.txt>) and extracts
40			I<placename, countrycode> strings from it.
41
42			The method C<geonames-postalcodes> (not yet implemented)
43			does the same, but for a geonames postal code database
44			L<https://download.geonames.org/export/zip>.
45
46			Lastly, you can specify a perl fragment that implements your own filtering
47			and extraction.
48
49			=back
50
51			=head1 FILTERING AND EXTRACTION
52
53			Record selection and the returned data are not fixed - you can filter your records
54			yourself and associate any (reasonably short, the maximum is a bit above 200 octets)
55			binary blob with a coordinate pair.
56
57			To do this, you have to provide a perl fragment that extracts latitude,
58			longitude, a weight and the associated data blob from an input line stored
59			in C<$_>. The file is opened using the C<:perlio> layer, so if your input
60			file is in UTF-8, so will be C<$_>.
61
62			For example, the following would expect an input file with space separated
63			latitude, longitude, weight and name, where name can contain spaces, which
64			is useful when you wat to provide your own input data:
65
66			geo-latlon2place-makedb --extract 'chomp; split / /, 4' input output
67
68			A slighly more verbose example expecting only latitude, longitude and a
69			name would be:
70
71			geo-latlon2place-makedb --extract '
72			chomp;
73			my ($lat, $lon, $name) = split / /, 4;
74			($lat, $lon, 1, $name)
75			' input output
76
77			If you want to skip certain lines without adding anything to the database,
78			you can return nothing:
79
80			geo-latlon2place-makedb --extract '
81			chomp;
82			my ($lat, $lon, $name) = split / /;
83			return unless $lat < 0; # only add southern hemisphere points
84			($lat, $lon, 1, $name)
85			' input output
86
87			In general, the fragment should return either an empty list, or a
88			four-tuple with decimal latitude, longitude, a weight (integer 0..255)
89			and the binary data to be associated with the coordinate. Other than the
90			weight, these should be self-explaining. The weight is used during search
91			and will be multiplied to the square of the distance, and is used to make
92			larger cities win over small ones when the coordinate is somewhere between
93			them.
94
95			The C<geonames> filter looks similar to this fragment, which shows off
96			more possibilities:
97
98			my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
99
100			return if $t1 ne "P"; # only places
101
102			# minimum population 200, or it is a populated place with no known population
103			$pop => 200
104			or ($pop eq "" and $t2 eq "PPL")
105			or return;
106
107			# skip certain places we are not interested in
108			return if $t2 eq "PPLX"; # section of populated place
109			return if $t2 eq "PPLW"; # destroyed populated place
110			return if $t2 eq "PPLH"; # historical populated place
111			return if $t2 eq "PPLQ"; # abandoned populated place
112
113			# geonames has a lot of very long place names which aren't
114			# actually place names, so ignore very long names
115			60 > length $name
116			or return;
117
118			# we estimate a weight by dividing 25 by the radius of the place,
119			# which we get by assuming a fixed population density of 5000 people/km²,
120			# which is almost always a considerable over-estimate.
121			# 25 and 5000 are pretty much made-up, feel free to improve and
122			# send me the results.
123			my $w = 25 / (1 + sqrt $pop / 5000);
124
125			# administrative centers get a fixed low weight
126			if ($t2 =~ /^PPLA(\d*)/) {
127			$w = $1 \|\| 1;
128			}
129
130			($lat, $lon, $w, "$name, $cc")
131
132			=head1 AUTHOR
133
134			Marc Lehmann <schmorp@schmorp.de>
135
136			=cut
137
138			use common::sense;
139			use Getopt::Long;
140			use Geo::LatLon2Place ();
141			use Pod::Usage ();
142
143			GetOptions
144			"help\|h" => sub { Pod::Usage::pod2usage -exittval => 0, verbose => 1 },
145			"extract=s" => \my $extract,
146			"cellsize=i" => \my $km,
147			or Pod::Usage::pod2usage
148			-exitval => 1,
149			;
150
151			@ARGV == 2
152			or die "need exactly two paths: inputfile.txt outputdatabase.cdb\n";
153
154			$km \|\|= 20;
155			$extract \|\|= "geonames";
156
157			if ($extract eq "geonames") {
158			$extract = sub {
159			my ($id, $name, undef, undef, $lat, $lon, $t1, $t2, $cc, undef, $a1, $s2, $a3, $a4, $pop, undef) = split /\t/;
160
161			return if $t1 ne "P"; # only places
162
163			$pop => 200
164			or ($pop eq "" and $t2 eq "PPL")
165			or return;
166
167			return if $t2 eq "PPLX"; # section of populated place
168			return if $t2 eq "PPLW"; # destroyed populated place
169			return if $t2 eq "PPLH"; # historical populated place
170			return if $t2 eq "PPLQ"; # abandoned populated place
171
172			# geonames has a lot of very long place names which aren't
173			# actually place names, so ignore very long names
174			60 > length $name
175			or return;
176
177			my $w = 25 / (1 + sqrt $pop / 5000);
178
179			if ($t2 =~ /^PPLA(\d+)/) {
180			$w = $1 \|\| 1;
181			}
182
183			($lat, $lon, $w, "$name, $cc")
184			};
185			} elsif ($extract eq "geonames-postalcodes") {
186			$extract = sub {
187			my ($cc, $zip, $name, undef, undef, undef, undef, undef, undef, $lat, $lon, undef) = split /\t/;
188
189			($lat, $lon, 1, "$zip $name, $cc")
190			};
191			} else {
192			$extract = eval "#line 1 \"extract fragment\"\nsub { $extract; }";
193			die "$@" if $@;
194			}
195
196			my $torad = (atan2 1,0) / 180;
197
198			my $boxes = int 6378 * 2 * 2 * (atan2 1,0) / $km; # equator radius / cell size
199
200			open my $fh, "<:perlio", $ARGV[0];
201
202			my @grid;
203
204			while (<$fh>) {
205			my ($lat, $lon, $w, $payload) = $extract->()
206			or next;
207
208			unless (255 - (2+2+1) >= length $payload) {
209			$payload =~ s/([^(\x20-\x7e])/sprintf "\\x%02x", ord $1/ge;
210			warn "payload too long, skipping: $payload\n";
211			next;
212			}
213
214			# higher latitudes get a (much) lower resolution, so we always get at least $km cells
215			my $blat = int $boxes * cos abs $lat * $torad;
216			my $x = int (($lon + 180) * $blat / 360);
217			my $y = int (($lat + 90) * $boxes / 180);
218
219			# we use 16 bit for lat/lon, 8 bi8t for the weight, BER-id, counted name and CC
220			push @{ $grid[$y][$x] }, pack "C/a", pack "s< s< C a", $lat * 32767 / 90, $lon * 32767 / 180, $w, $payload;
221			}
222
223			my ($max, $sum, $cnt);
224
225			open my $cdb, ">", $ARGV[1]
226			or die;
227			Geo::LatLon2Place::cdb_make_start fileno $cdb
228			and die "cdb_make_start failure";
229
230			Geo::LatLon2Place::cdb_make_add "", pack "a4VVV", "SRGL", 1, $km, $boxes;
231
232			for my $y (0 .. $#grid) {
233			my $r = $grid[$y];
234			for my $x (0 .. $#$r) {
235			my $c = $r->[$x]
236			or next;
237
238			# statistics
239			$sum += scalar @$c;
240			$cnt++;
241			$max = [(scalar @$c), $x, $y] if @$c > $max->[0];
242
243			Geo::LatLon2Place::cdb_make_add
244			+(pack "s< s<", $x, $y),
245			+(join "", @$c)
246			and "cdb_make_add failure";
247			}
248			}
249
250			Geo::LatLon2Place::cdb_make_finish
251			and die "cdb_make_finish failed";
252			close $cdb;
253
254			$max->[2] = $max->[2] * 180 / $boxes - 90;
255			my $blat = int $boxes * cos abs $max->[2] * $torad;
256			$max->[1] = $max->[1] * 360 / $blat - 180;
257			print "cell size: $km km, grid size: $boxes, non-empty cell count: $cnt\naverage cell size: ", int $sum/$cnt, ", maximum cell size: $max->[0] (at $max->[2] $max->[1])\n";
258