1 |
root |
1.1 |
#!/usr/bin/perl |
2 |
|
|
|
3 |
|
|
use Carp; |
4 |
|
|
|
5 |
|
|
use PApp::SQL; |
6 |
root |
1.2 |
use BerkeleyDB; |
7 |
root |
1.1 |
|
8 |
root |
1.4 |
# Directory that receives the generated tab-separated dump files.
$DEST = "data/tsv";

# 2031616: 0:435514 1:44163 2:107528 3:127671 4:8311 5:2028604 6:117468 7:8210 8:11754 9:24860 14:4344
# 4128768: 0:650846 1:92556 2:183027 3:170894 4:8605 5:4122242 6:147178 7:8238 8:11802 9:31453 14:6460

# Input column index => base name of the per-column dump file
# ("$DEST/<name>.txt") that the import loop writes for that column.
my %cvtid = (
     0 => "name",
     1 => "vorname",
     2 => "zusatz1",
     3 => "zusatz2",
     4 => "zusatz3",
     5 => "vorwahl",
    #6 is "nummer"; it has no entry here
     7 => "strasse",
     8 => "haus",
     9 => "plz",
    10 => "ort",
    #[11 => "ort", # Ortsteil
    12 => "branche",
    13 => "typ",
);

# The column indices that have an entry in %cvtid (in hash order).
my @cvtid = keys %cvtid;
31 |
|
|
|
32 |
root |
1.1 |
# Install the "dinfo" MySQL connection as the PApp::SQL default handle for
# the rest of this scope.
local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";

# Empty every target table (including "row") before the import.
sql_exec "truncate $_"
    for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

# Take write locks on the per-column tables ("row" is not in this list).
sql_exec "lock tables " . join ", ", map "$_ write",
    qw(branche haus name ort plz strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

# special treatment for name, due to its size :(
# %cache2 is a BerkeleyDB hash backed by a per-process file in /tmp.
# DB_EXCL makes creation fail if the file already exists, so the result of
# tie() must be checked; otherwise a failed tie goes unnoticed.
tie(%cache2,
    'BerkeleyDB::Hash',
    -Filename  => "/tmp/dinfo-import1-cache-$$",
    -Flags     => DB_CREATE|DB_EXCL,
    -Cachesize => 380*1024*1024,
) or die "/tmp/dinfo-import1-cache-$$: $! $BerkeleyDB::Error";

# Remove the directory entry right away; the tied hash keeps using the
# already opened file.
unlink "/tmp/dinfo-import1-cache-$$";

# All plain print() calls from here on go to the row dump file.
open OUT, ">", "$DEST/row.txt"
    or die "$DEST/row.txt: $!";
select OUT;
50 |
|
|
|
51 |
root |
1.2 |
# Main import loop: reads one tab-separated record per line from STDIN,
# replaces every column listed in @cvtid by a numeric id (writing each new
# "id<TAB>value" pair to that column's dump file), packs the phone number
# into a binary field and prints the resulting row to the selected handle.
while (<STDIN>) {
    chomp;
    my @data = split /\t/;

    # Strip a leading ", " from column 3.
    $data[3] =~ s/^, //;

    # If column 7 ends in a dot, drop the dot and turn a trailing " Geb"
    # in column 6 into "Geb.".
    if ($data[7] =~ s/\.$//) {
        $data[6] =~ s/ Geb$/Geb./;
    }

    # Column 6 must look like: optional alphanumeric prefix, optional "+",
    # then digits and blanks. Anything else is reported and skipped.
    unless ($data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x) {
        warn "ERR: unparseable telnr. '$data[6]'";
        next;
    }

    # Copy the captures before anything else can clobber them: the prefix
    # becomes column 13, the digits replace column 6.
    my ($prefix, $digits) = ($1, $2);
    $data[6]  = $digits;
    $data[13] = $prefix;

    # Remove the area code (column 5) from the front of the number if the
    # number starts with it.
    my $vorwahl_len = length $data[5];
    if ($data[5] eq substr($data[6], 0, $vorwahl_len)) {
        substr($data[6], 0, $vorwahl_len, "");
    }

    # NOTE(review): this limit is checked before blanks are removed below,
    # so a number with embedded blanks can be rejected although it has at
    # most 12 digits - confirm whether that is intended.
    if (length($data[6]) > 12) {
        warn "ERR: number too long '$data[6]'";
        next;
    }

    # Replace each mapped column by its id. Id 1 is used for the empty
    # string; every other new value gets the next per-column sequence
    # number, starting at 2.
    for my $col (@cvtid) {
        my $key = "$col,$data[$col]";
        my $id  = $cache{$key};

        unless ($id) {
            my $value = $data[$col];

            # Open the per-column dump file on first use.
            my $out = $fh{$col};
            unless ($out) {
                open $out, ">", "$DEST/$cvtid{$col}.txt"
                    or die "$DEST/$cvtid{$col}.txt: $!";
                $fh{$col} = $out;
            }

            $id = $value eq "" ? 1 : 1 + ++$seq[$col];

            # Backslash-escape backslash, newline and tab; write NUL as "\0".
            $value =~ s/([\\\x0a\x09])/\\$1/g;
            $value =~ s/\x00/\\0/g;

            print {$out} "$id\t$value\n";

            $cache{$key} = $id;
        }

        $data[$col] = $id;
    }

    # The typ id has to fit into a single hex digit of the packed field.
    die "ERR: too many typ's" unless $data[13] < 16;

    # Pack the number as 12 hex digits (right-padded with zeros) followed
    # by one hex digit holding the pad count and one holding the typ id.
    # Twelve padding zeros are appended so that numbers shorter than three
    # digits still yield the full 12-digit field; with only nine zeros the
    # two trailing nibbles would shift forward and the field would shrink.
    $data[6] =~ s/ //g;
    my $padded  = substr($data[6] . ("0" x 12), 0, 12);
    my $trailer = sprintf "%x%x", 12 - length($data[6]), $data[13];
    $data[6] = pack "H*", $padded . $trailer;
    #warn unpack "H*", $data[6];

    # Same escaping as for the dump files, applied to the binary field.
    $data[6] =~ s/([\\\x0a\x09])/\\$1/g;
    $data[6] =~ s/\x00/\\0/g;

    #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
    my @row = @data[0 .. 10, 12];
    print join("\t", @row), "\n";

    # Progress report whenever the pre-increment count is a multiple of 4096.
    warn time . " $count" unless $count++ & 4095;

    # After 20,000,000 rows, copy the in-memory cache into the tied
    # %cache2 and make %cache an alias for it.
    if ($count == 20_000_000) {
        warn "copying...\n";
        %cache2 = %cache;
        undef %cache;
        *cache = \%cache2;
    }
}
115 |
|
|
|