ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/dinfo/dinfo-import1
Revision: 1.4
Committed: Mon Aug 25 16:29:46 2003 UTC (20 years, 8 months ago) by root
Branch: MAIN
Changes since 1.3: +44 -21 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
#!/usr/bin/perl

# dinfo-import1 - first import stage.
#
# Reads tab-separated records from STDIN and writes, below $DEST:
#   * row.txt          - one line per accepted input record, in which the
#                        columns listed in %cvtid are replaced by integer ids
#                        and the phone number is stored as a packed value;
#   * <column>.txt     - one "id<TAB>value" line per distinct value of each
#                        column listed in %cvtid.
#
# Records whose phone number cannot be parsed or is too long are reported on
# STDERR and skipped.

use Carp;

use PApp::SQL;
use BerkeleyDB;

# Directory that receives row.txt and the per-column lookup files.
$DEST = "data/tsv";

# NOTE(review): the two lines below look like per-column counters recorded
# after 2031616 and 4128768 input rows - confirm before relying on them.
# 2031616: 0:435514 1:44163 2:107528 3:127671 4:8311 5:2028604 6:117468 7:8210 8:11754 9:24860 14:4344
# 4128768: 0:650846 1:92556 2:183027 3:170894 4:8605 5:4122242 6:147178 7:8210 8:11802 9:31453 14:6460

# Input column index => name of the lookup file (without ".txt") that the
# column's distinct values are written to.
my %cvtid = (
    0 => "name",
    1 => "vorname",
    2 => "zusatz1",
    3 => "zusatz2",
    4 => "zusatz3",
    5 => "vorwahl",
   # 6 is the number itself; it is not looked up but packed further below
    7 => "strasse",
    8 => "haus",
    9 => "plz",
   10 => "ort",
   # 11 => "ort",   # district (Ortsteil), disabled
   12 => "branche",
   13 => "typ",
);

# Column indices that get converted to ids (order is irrelevant).
my @cvtid = keys %cvtid;

local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";

sql_exec "truncate $_"
   for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

sql_exec "lock tables " . join ", ", map "$_ write",
   qw(branche haus name ort plz strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

# Special treatment for the value cache, due to its size :(
# %cache2 is an on-disk hash that the in-memory %cache is moved into once
# 20_000_000 rows have been processed (see the end of the main loop).
my $cache_file = "/tmp/dinfo-import1-cache-$$";
tie %cache2,
   'BerkeleyDB::Hash',
   -Filename  => $cache_file,
   -Flags     => DB_CREATE|DB_EXCL,
   -Cachesize => 380*1024*1024
   or die "$cache_file: $! $BerkeleyDB::Error";
# The database stays usable through the open handle; removing the name right
# away means nothing is left behind in /tmp when the script exits.
unlink $cache_file;

# All plain "print" calls below go to row.txt.
open my $row_fh, ">", "$DEST/row.txt"
   or die "$DEST/row.txt: $!";
select $row_fh;

while (<STDIN>) {
   chomp;
   my @data = split /\t/;

   $data[3] =~ s/^, //;
   # NOTE(review): only when column 7 loses a trailing dot is a trailing
   # " Geb" in column 6 rewritten to "Geb." - intent is not visible here.
   $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;

   # Column 6: optional alphabetic tag, optional "+", then digits and spaces.
   $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
      warn "ERR: unparseable telnr. '$data[6]'";
      next;
   };

   $data[6]  = $2;   # digits (may still contain spaces)
   $data[13] = $1;   # tag, converted to a "typ" id below

   # Drop the area code (column 5) when the number starts with it.
   ($data[5] eq substr $data[6], 0, length $data[5])
      and substr $data[6], 0, length $data[5], "";

   if (length $data[6] > 12) {
      warn "ERR: number too long '$data[6]'";
      next;
   }

   # Replace every converted column by its id, allocating a new id (and
   # writing it to the column's lookup file) the first time a value is seen.
   for my $col (@cvtid) {
      $data[$col] = ($cache{"$col,$data[$col]"} ||= do {
         my $value = $data[$col];
         my $fh = $fh{$col} ||= do {
            open my $fh, ">", "$DEST/$cvtid{$col}.txt"
               or die "$DEST/$cvtid{$col}.txt: $!";
            $fh;
         };
         # The empty string always gets id 1; other values count up from 2.
         my $id = $value eq "" ? 1 : 1 + ++$seq[$col];
         # Escape backslash, newline, tab and NUL for the TSV file.
         $value =~ s/([\\\x0a\x09])/\\$1/g; $value =~ s/\x00/\\0/g;

         print $fh "$id\t$value\n";

         $id;
      });
   }

   # The typ id is stored in a single hex digit below.
   $data[13] < 16 or die "ERR: too many typ's";

   # Pack the number as 14 hex nibbles (7 bytes): 12 digit nibbles, right-padded
   # with zeros, then the pad count, then the typ id. Padding with twelve
   # zeros keeps the layout intact even for numbers shorter than three digits.
   $data[6] =~ s/ //g;
   my $digits = length $data[6];
   $data[6] = pack "H*",
      substr($data[6] . "0" x 12, 0, 12) . sprintf("%x%x", 12 - $digits, $data[13]);
   #warn unpack "H*", $data[6];
   $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;

   # name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
   print +(join "\t", @data[0 .. 10, 12]), "\n";

   # Progress report every 4096 rows.
   $count++ & 4095 or warn time . " $count";

   # Once the in-memory cache has grown this far, move it into the on-disk
   # hash and make %cache an alias for it from now on.
   if ($count == 20_000_000) {
      warn "copying...\n";
      %cache2 = %cache;
      undef %cache;
      *cache = \%cache2;
   }
}

# Flush everything and surface buffered write errors (e.g. disk full).
select STDOUT;
close $row_fh or die "$DEST/row.txt: $!";
for my $col (keys %fh) {
   close $fh{$col} or die "$DEST/$cvtid{$col}.txt: $!";
}
115