ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/dinfo/dinfo-import1
Revision: 1.8
Committed: Tue Aug 26 23:26:37 2003 UTC (20 years, 8 months ago) by root
Branch: MAIN
Changes since 1.7: +1 -1 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 #!/usr/bin/perl
2    
3     use Carp;
4    
5     use PApp::SQL;
6 root 1.2 use BerkeleyDB;
7 root 1.1
# Output directory: row.txt and one <column>.txt file per entry of %cvtid
# are written below this path.
8 root 1.4 $DEST = "data/tsv";
9    
# NOTE(review): the two comment lines below look like recorded statistics,
# presumably "<rows read>: <column index>:<distinct values> ..." - the code
# does not say, so confirm before relying on them.
10 root 1.1 # 2031616: 0:435514 1:44163 2:107528 3:127671 4:8311 5:2028604 6:117468 7:8210 8:11754 9:24860 14:4344
11     # 4128768: 0:650846 1:92556 2:183027 3:170894 4:8605 5:4122242 6:147178 7:8238 8:11802 9:31453 14:6460
12    
# Maps an input column index (position in the tab-split line) to a column
# name. Every column listed here is replaced by a numeric id in the main
# loop, and the name is used to build the per-column file "$DEST/<name>.txt".
13 root 1.4 my %cvtid = (
14     0 => "name",
15     1 => "vorname",
16     2 => "zusatz1",
17     3 => "zusatz2",
18     4 => "zusatz3",
19     5 => "vorwahl",
20 root 1.1 # index 6 is "nummer" (the phone number); it has no entry in this table
21 root 1.4 7 => "strasse",
22     8 => "haus",
23     9 => "plz",
24     10 => "ort",
25     #[11 => "ort", # "Ortsteil" (district); disabled, index 11 has no entry
26     12 => "branche",
27     13 => "typ",
28 root 1.1 );
29    
# The column indices to convert. keys() returns them in no particular order.
30 root 1.4 my @cvtid = keys %cvtid;
31    
# Connect to the "dinfo" MySQL database and install the handle in
# $PApp::SQL::DBH for the rest of the script.
# NOTE(review): presumably sql_exec picks up $PApp::SQL::DBH as its default
# handle - confirm against the PApp::SQL documentation.
32 root 1.1 local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";
33    
# Empty every table named in the list below (one "truncate <table>" each).
34     sql_exec "truncate $_"
35     for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);
36    
37 root 1.2 # special treatment for name, due to its size :(
# %cache2 is a disk-backed hash (BerkeleyDB::Hash with a 300 MB cache) kept
# in a per-process file under /tmp. DB_CREATE|DB_EXCL means the file must
# not exist yet. A failed tie is fatal: continuing with an untied %cache2
# would silently defeat the purpose of the on-disk cache.
38 root 1.4 tie %cache2,
39 root 1.3 'BerkeleyDB::Hash',
40 root 1.2 -Filename => "/tmp/dinfo-import1-cache-$$",
41 root 1.3 -Flags => DB_CREATE|DB_EXCL,
42 root 1.6 -Cachesize => 300*1024*1024 or die "/tmp/dinfo-import1-cache-$$: $! $BerkeleyDB::Error";
# Remove the file name right away; the tied hash keeps using the open
# database. The result of unlink is deliberately not checked (best effort).
43 root 1.3 unlink "/tmp/dinfo-import1-cache-$$";
44 root 1.1
# Open the main output file and make it the default handle for print.
# Dies on failure, in the same "<path>: $!" style used for the per-column
# files, instead of silently printing to an unopened handle.
45 root 1.4 open OUT, ">", "$DEST/row.txt" or die "$DEST/row.txt: $!";
46     select OUT;
47    
# Main loop: read tab-separated records from STDIN, replace the columns
# listed in %cvtid by numeric ids (writing each new id/value pair to
# "$DEST/<column>.txt"), pack the phone number, and print one output row per
# record to the currently selected handle.
48 root 1.2 while (<STDIN>) {
49 root 1.1 chomp;
50     my @data = split /\t/;
51    
# Field 3: drop a leading ", ".
52     $data[3] =~ s/^, //;
# If field 7 ends in "." remove it, and only then rewrite a trailing " Geb"
# in field 6 to "Geb.".
# NOTE(review): field 6 is parsed as the phone number just below, and a value
# ending in "Geb." cannot match that pattern - confirm the intended indices.
53     $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;
54    
# Split field 6 into an optional prefix that starts with a letter ($1), an
# optional "+", and a run of digits and spaces ($2). Records that do not
# match are reported and skipped.
55     $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
56     warn "ERR: unparseable telnr. '$data[6]'";
57     next;
58     };
59    
# Field 6 keeps the digits; the prefix becomes field 13 (the "typ" column).
60     $data[6] = $2;
61     $data[13] = $1;
62    
# If the number starts with the contents of field 5 ("vorwahl"), strip
# that prefix from the number.
63     ($data[5] eq substr $data[6], 0, length $data[5])
64     and substr $data[6], 0, length $data[5], "";
65    
# Only 12 digits fit into the packed form built further down.
# NOTE(review): spaces are still present here (they are removed later), so
# they count towards this limit - confirm that is intended.
66     if (length $data[6] > 12) {
67     warn "ERR: number too long '$data[6]'";
68     next;
69     }
70    
# Replace each column listed in @cvtid by a numeric id. %cache maps
# "<column index>,<value>" to the id already handed out for that value.
71     for (@cvtid) {
72 root 1.4 $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
# First time this value is seen for this column.
73     my $data = $data[$_];
# Open the per-column output file on first use and remember the handle.
74     my $fh = $fh{$_} ||= do {
75     open my $fh, ">", "$DEST/$cvtid{$_}.txt"
76     or die "$DEST/$cvtid{$_}.txt: $!";
77     $fh;
78     };
# The empty string always gets id 1; other values are numbered from 2
# upwards, with a separate counter per column.
79     my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
# Backslash-escape backslash, newline and tab; write NUL as "\0".
80     $data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g;
81    
82     print $fh "$id\t$data\n";
83    
84     $id;
85     });
86 root 1.1 }
87    
# The typ id is stored as a single hex digit below, so it must be below 16.
88     $data[13] < 16 or die "ERR: too many typ's";
89    
# Pack the number into 7 bytes: the digits padded on the right with "0" to
# 12 characters, then one hex digit holding the amount of padding
# (12 - length) and one hex digit holding the typ id.
90     $data[6] =~ s/ //g;
91 root 1.8 $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
92 root 1.1 #warn unpack "H*", $data[6];
# Same escaping as for the per-column files, applied to the packed bytes.
93 root 1.4 $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;
94 root 1.1
95     #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
96    
# Emit the row: fields 0-10 and 12. Field 11 is not written, and field 13
# is already contained in the packed field 6.
97     print +(join "\t",
98     $data[0], $data[1], $data[2], $data[3], $data[4],
99     $data[5], $data[6],
100 root 1.2 $data[7], $data[8], $data[9], $data[10], $data[12],
101 root 1.1 ), "\n";
102    
# Progress message (timestamp and row count) once every 4096 rows.
103     $count++ & 4095 or warn time . " $count";
104 root 1.4
# After exactly 20,000,000 rows, move the in-memory cache into the tied
# hash %cache2 and make %cache an alias of it for the rest of the run.
105     if ($count == 20_000_000) {
# Repeat the copy pass until %cache is empty.
106 root 1.7 while (%cache) {
107     warn "copying cache to db...\n";
108     while (my ($k, $v) = each %cache) {
109     $cache2{$k} = delete $cache{$k};
110     }
111     }
112 root 1.5 warn "nuking mem cache...\n";
113 root 1.4 undef %cache;
114 root 1.5 warn "assigning db cache...\n";
# Glob assignment: from here on %cache and %cache2 are the same hash.
115 root 1.4 *cache = \%cache2;
116     }
117 root 1.1 }
118