1 |
#!/usr/bin/perl |
2 |
|
3 |
use Carp; |
4 |
|
5 |
use PApp::SQL; |
6 |
use BerkeleyDB; |
7 |
|
8 |
# Directory prefix used by the retired converter kept after __END__ (it opens
# "$DEST/<name>.txt" there); the live loop below does not reference it.
our $DEST = "data/tsv";

# Columns to process. Each name is also the file name used under
# data/dump/, data/col/ and data/row/.
my @cols = qw(
    hausnr name vorname zusatz1 zusatz2 zusatz3
    vorwahl strasse plz ort branche typ
);

# Install a cached connection as PApp::SQL's default database handle for the
# rest of the script. Only the commented-out "truncate" statements below would
# use it; the visible live code issues no SQL itself.
local $PApp::SQL::DBH = PApp::SQL::connect_cached("dinfo::", "DBI:mysql:dinfo");
13 |
|
14 |
#sql_exec "truncate $_" |
15 |
# for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3); |
16 |
|
17 |
# renumber_column($col [, $base])
#
# Replaces every value of one dumped column by a small integer id, giving the
# most frequent values the smallest ids.
#
#   reads   $base/dump/$col   one value per line, treated as raw bytes
#   writes  $base/col/$col    "id<TAB>value" for every distinct value,
#                             in id order (id 0 = most frequent value)
#   writes  $base/row/$col    one id per input line, in input order
#
# $base defaults to "data". Progress lines are printed to STDOUT.
# Dies with the offending path and the OS error if a file cannot be opened
# or an output file cannot be completely written.
sub renumber_column {
    my ($col, $base) = @_;
    $base = "data" unless defined $base;

    my $dump_path = "$base/dump/$col";
    my $col_path  = "$base/col/$col";
    my $row_path  = "$base/row/$col";

    print "$col ANALYZE\n";

    # One hash serves both passes to keep peak memory down: it first holds the
    # occurrence count of each value and is then overwritten with the ids.
    my %num_of;

    open my $dump_fh, "<:raw", $dump_path or die "$dump_path: $!";
    while (my $value = <$dump_fh>) {
        # Strip the line terminator so it neither ends up inside the value
        # written to the col file (which used to produce a blank line after
        # every record) nor makes an unterminated last line a distinct value.
        chomp $value;
        $num_of{$value}++;
    }
    close $dump_fh;

    print "$col SORT\n";
    # Most frequent first; ties are broken by the value itself so that the
    # numbering is reproducible instead of depending on hash order.
    my @by_freq = sort { $num_of{$b} <=> $num_of{$a} or $a cmp $b } keys %num_of;

    print "$col RENUMBER\n";
    open my $col_fh, ">:raw", $col_path or die "$col_path: $!";
    for my $id (0 .. $#by_freq) {
        print {$col_fh} "$id\t$by_freq[$id]\n";
        $num_of{ $by_freq[$id] } = $id;
    }
    # Buffered write errors (e.g. disk full) only surface at close.
    close $col_fh or die "$col_path: $!";

    print "$col CLEAR\n";
    @by_freq = ();    # release the sorted list before the second pass

    print "$col WRITE\n";

    open $dump_fh, "<:raw", $dump_path or die "$dump_path: $!";
    open my $row_fh, ">:raw", $row_path or die "$row_path: $!";

    while (my $value = <$dump_fh>) {
        chomp $value;
        print {$row_fh} "$num_of{$value}\n";
    }

    close $row_fh or die "$row_path: $!";
    close $dump_fh;

    return;
}

for my $col (@cols) {
    renumber_column($col);
}
50 |
|
51 |
__END__ |
52 |
# Retired converter, kept for reference only: it sits after __END__, so perl
# never compiles or runs it. It uses @cvtid, %cvtid, %cache, %cache2, %fh,
# @seq and $count, none of which is defined in the visible part of this file.
#
# Per line of STDIN (tab-separated fields) it normalises the phone number,
# replaces selected text fields by numeric ids (writing the id tables to
# "$DEST/<name>.txt") and prints one tab-separated record to STDOUT.
while (<STDIN>) {
    chomp;
    my @data = split /\t/;

    # Drop a leading ", " from field 3.
    $data[3] =~ s/^, //;
    # If field 7 ended in "." (which is removed), also turn a trailing " Geb"
    # in field 6 into "Geb.".
    # NOTE(review): presumably repairs an abbreviation split across the two
    # fields -- confirm against the input data.
    $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;

    # Field 6 must be: an optional tag (a letter followed by letters, digits,
    # "-", "." or "/"), an optional "+", then digits and spaces only.
    # $1 = tag (may be empty), $2 = the number.
    $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
        warn "ERR: unparseable telnr. '$data[6]'";
        next;
    };

    $data[6] = $2;
    $data[13] = $1;

    # If the number starts with the text of field 5 ("vorwahl" in the column
    # list further down), cut that prefix off (4-argument substr).
    ($data[5] eq substr $data[6], 0, length $data[5])
        and substr $data[6], 0, length $data[5], "";

    # NOTE(review): this runs before the spaces are removed below, so embedded
    # spaces count towards the 12-character limit.
    if (length $data[6] > 12) {
        warn "ERR: number too long '$data[6]'";
        next;
    }

    # For every field index listed in @cvtid, replace the field's text by a
    # numeric id. Ids are memoised in %cache under the key "index,text".
    for (@cvtid) {
        $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
            # First time this text is seen for this field.
            my $data = $data[$_];
            # Open the field's id table on first use; the handle is kept in %fh.
            my $fh = $fh{$_} ||= do {
                open my $fh, ">", "$DEST/$cvtid{$_}.txt"
                    or die "$DEST/$cvtid{$_}.txt: $!";
                $fh;
            };
            # The empty string always gets id 1; other texts get 2, 3, ...
            # from the per-field counter in @seq.
            my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
            # Put a backslash in front of backslash, LF and TAB, and write
            # NUL as the two characters "\0".
            $data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g;

            print $fh "$id\t$data\n";

            $id;
        });
    }

    # Field 13 is formatted as a single hex digit by the sprintf below.
    $data[13] < 16 or die "ERR: too many typ's";

    # Pack the number into 7 bytes (14 hex digits): the digits right-padded
    # with "0" to 12 digits, one digit holding the amount of padding, and one
    # digit holding field 13.
    $data[6] =~ s/ //g;
    $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
    #warn unpack "H*", $data[6];
    # The packed bytes are arbitrary binary, so apply the same escaping as for
    # the id tables above.
    $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;

    # Output column order:
    # name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
    # Fields 11 and 13 are not printed (field 13 is already encoded into the
    # packed number in field 6).

    print +(join "\t",
        $data[0], $data[1], $data[2], $data[3], $data[4],
        $data[5], $data[6],
        $data[7], $data[8], $data[9], $data[10], $data[12],
    ), "\n";

    # Progress report whenever the count before the increment is a multiple
    # of 4096.
    $count++ & 4095 or warn time . " $count";

    # After exactly 20,000,000 printed records: move every entry from %cache
    # into %cache2, then make %cache an alias for %cache2.
    # NOTE(review): the messages suggest %cache2 is tied to an on-disk
    # database (BerkeleyDB is imported above) -- the tie is not visible here.
    if ($count == 20_000_000) {
        while (%cache) {
            warn "copying cache to db...\n";
            while (my ($k, $v) = each %cache) {
                $cache2{$k} = delete $cache{$k};
            }
        }
        warn "nuking mem cache...\n";
        undef %cache;
        warn "assigning db cache...\n";
        *cache = \%cache2;
    }
}
122 |
|