| 1 |
#!/usr/bin/perl |
| 2 |
|
| 3 |
use Carp; |
| 4 |
|
| 5 |
use PApp::SQL; |
| 6 |
use BerkeleyDB; |
| 7 |
|
| 8 |
use strict;
use warnings;

# Output directory for TSV files.  In the visible part of the file it is only
# referenced by the retired code kept after __END__.
our $DEST = "data/tsv";

# One dump file per column is expected under data/dump/.
my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);

# Explicit parentheses so the call parses even when the sub is not predeclared.
local $PApp::SQL::DBH = PApp::SQL::connect_cached("dinfo::", "DBI:mysql:dinfo");

#sql_exec "truncate $_"
#   for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

# For every column:
#   ANALYZE   count how often each distinct line occurs in data/dump/<col>
#   SORT      order the distinct lines by descending frequency
#   RENUMBER  write "<id>\t<line>" to data/col/<col>; id 0 is the most frequent
#   WRITE     re-read the dump and write one id per input line to data/row/<col>
for my $col (@cols) {
    my $dump_path = "data/dump/$col";
    my $col_path  = "data/col/$col";
    my $row_path  = "data/row/$col";

    print "$col ANALYZE\n";

    # Maps a raw input line to its occurrence count and, after RENUMBER, to
    # its id.  Lines are not chomped, so each key keeps its line terminator.
    my %col;

    open my $dump_fh, "<:raw", $dump_path or die "$dump_path: $!";
    $col{$_}++ while <$dump_fh>;
    close $dump_fh;

    print "$col SORT\n";
    # The string tie-break makes the id assignment reproducible between runs;
    # without it, equally frequent values come out in hash order.
    my @sort = sort { $col{$b} <=> $col{$a} || $a cmp $b } keys %col;

    print "$col RENUMBER\n";
    open my $col_fh, ">:raw", $col_path or die "$col_path: $!";
    for my $id (0 .. $#sort) {
        # NOTE(review): the key still ends in its line terminator, so every
        # entry is followed by an empty line in data/col.  Kept as is because
        # consumers of that file are not visible here -- confirm before changing.
        print {$col_fh} "$id\t$sort[$id]\n";
        $col{ $sort[$id] } = $id;
    }
    # Buffered write errors only surface at close time.
    close $col_fh or die "$col_path: $!";

    print "$col CLEAR\n";
    @sort = ();    # release the sorted key list before the second pass

    print "$col WRITE\n";

    open $dump_fh, "<:raw", $dump_path or die "$dump_path: $!";
    open my $row_fh, ">:raw", $row_path or die "$row_path: $!";

    print {$row_fh} "$col{$_}\n" while <$dump_fh>;

    close $row_fh or die "$row_path: $!";
    close $dump_fh;
}
| 50 |
|
| 51 |
__END__ |
| 52 |
# Retired converter.  It sits below __END__, so perl never compiles or runs it.
# It reads tab-separated records from STDIN, normalises the phone number in
# column 6, replaces every column listed in @cvtid by a numeric id (writing one
# "$DEST/<name>.txt" lookup file per column) and prints the reduced record to
# STDOUT.  @cvtid, %cvtid, %cache, %cache2, %fh, @seq and $count are not
# defined anywhere in the visible part of the file.
RECORD:
while (<STDIN>) {
    chomp;
    my @data = split /\t/;

    $data[3] =~ s/^, //;
    $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;

    # Split the number field into an optional alphanumeric tag and the
    # digits (spaces allowed) that follow it.
    my ($tag, $digits) =
        $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x
        or do {
            warn "ERR: unparseable telnr. '$data[6]'";
            next RECORD;
        };

    $data[6]  = $digits;
    $data[13] = $tag;

    # Drop a leading copy of column 5 ("vorwahl" per the column list below)
    # from the number.
    my $vorwahl_len = length $data[5];
    if ($data[5] eq substr($data[6], 0, $vorwahl_len)) {
        substr($data[6], 0, $vorwahl_len, "");
    }

    # The length is checked before the spaces are removed further down.
    if (length($data[6]) > 12) {
        warn "ERR: number too long '$data[6]'";
        next RECORD;
    }

    # Replace each listed column by its id; a value seen for the first time
    # is appended to that column's lookup file.
    for my $idx (@cvtid) {
        $data[$idx] = ($cache{"$idx,$data[$idx]"} ||= do {
            my $value = $data[$idx];

            # One output handle per column, opened on first use.
            my $out = $fh{$idx} ||= do {
                open my $new_fh, ">", "$DEST/$cvtid{$idx}.txt"
                    or die "$DEST/$cvtid{$idx}.txt: $!";
                $new_fh;
            };

            # The empty string always gets id 1; other values count up from 2.
            my $id = $value eq "" ? 1 : 1 + ++$seq[$idx];

            # Backslash-escape backslash, LF and TAB; write NUL as "\0".
            $value =~ s/([\\\x0a\x09])/\\$1/g;
            $value =~ s/\x00/\\0/g;

            print {$out} "$id\t$value\n";

            $id;
        });
    }

    die "ERR: too many typ's" unless $data[13] < 16;

    # Pack the number into 14 hex digits: the digits right-padded with zeros
    # to 12, then the amount of padding, then the value of column 13.
    $data[6] =~ s/ //g;
    my $padded = substr "$data[6]000000000000", 0, 12;
    my $suffix = sprintf "%x%x", 12 - length($data[6]), $data[13];
    $data[6] = pack "H*", $padded . $suffix;
    #warn unpack "H*", $data[6];
    $data[6] =~ s/([\\\x0a\x09])/\\$1/g;
    $data[6] =~ s/\x00/\\0/g;

    #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche

    # Columns 11 and 13 are not part of the output record.
    print +(join "\t", @data[0 .. 10, 12]), "\n";

    # Progress message once every 4096 records.
    warn time . " $count" unless $count++ & 4095;

    if ($count == 20_000_000) {
        # Move every cached id into %cache2, then alias %cache to it.
        # Presumably %cache2 is backed by a database (the messages say "db");
        # how it is set up is not visible here.
        while (%cache) {
            warn "copying cache to db...\n";
            while (my ($key) = each %cache) {
                $cache2{$key} = delete $cache{$key};
            }
        }
        warn "nuking mem cache...\n";
        undef %cache;
        warn "assigning db cache...\n";
        *cache = \%cache2;
    }
}
| 122 |
|