ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/dinfo/dinfo-import1
Revision: 1.9
Committed: Fri Aug 29 04:15:01 2003 UTC (20 years, 9 months ago) by root
Branch: MAIN
CVS Tags: HEAD
Changes since 1.8: +35 -31 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
#!/usr/bin/perl

# dinfo-import1 - build a frequency-ranked dictionary for each dump column.
#
# For every column named in @cols the script reads data/dump/<col> line by
# line and then writes two files:
#
#   data/col/<col>   "<rank>\t<value>" for every distinct line, where rank 0
#                    is the most frequent value (ties come out in whatever
#                    order the hash yields them)
#   data/row/<col>   the rank of every input line, one per line, in input
#                    order
#
# Progress is reported on STDOUT as "<col> <PHASE>" lines.
#
# Any failure to open a file, or to flush one of the output files on close,
# is fatal and reports the path together with the system error.

use strict;
use warnings;

use Carp;

use PApp::SQL;
use BerkeleyDB;

# Only referenced by the disabled code after __END__; kept as a package
# variable so that code still sees the same name if it is ever revived.
our $DEST = "data/tsv";

my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);

# The connection is still established, although nothing in the active part
# of the script issues a query (the truncate below is commented out).
local $PApp::SQL::DBH = PApp::SQL::connect_cached("dinfo::", "DBI:mysql:dinfo");

#sql_exec "truncate $_"
#   for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

for my $col (@cols) {
    my $dump_path = "data/dump/$col";
    my $col_path  = "data/col/$col";
    my $row_path  = "data/row/$col";

    print "$col ANALYZE\n";

    # Pass 1: count how often each distinct line occurs.  Lines are used
    # as keys exactly as read (not chomped).
    my %col;

    open my $dump_fh, "<:raw", $dump_path
        or die "$dump_path: $!";
    while (my $line = <$dump_fh>) {
        $col{$line}++;
    }
    close $dump_fh;

    print "$col SORT\n";
    my @sort = sort { $col{$b} <=> $col{$a} } keys %col;

    print "$col RENUMBER\n";
    open my $col_fh, ">:raw", $col_path
        or die "$col_path: $!";
    for my $rank (0 .. $#sort) {
        # NOTE(review): the value still carries its own line terminator, so
        # each record is followed by an extra "\n" - confirm that readers of
        # data/col/* expect this before changing it.
        print {$col_fh} "$rank\t$sort[$rank]\n";

        # Reuse %col as the value => rank map instead of building a second
        # hash; the counts are no longer needed once @sort exists.
        $col{ $sort[$rank] } = $rank;
    }
    # Buffered write errors only surface here, so the close must be checked.
    close $col_fh
        or die "$col_path: $!";

    print "$col CLEAR\n";
    @sort = ();    # release the sorted key list before the second pass

    print "$col WRITE\n";

    # Pass 2: re-read the dump and emit the rank of every line.
    open $dump_fh, "<:raw", $dump_path
        or die "$dump_path: $!";
    open my $row_fh, ">:raw", $row_path
        or die "$row_path: $!";

    while (my $line = <$dump_fh>) {
        print {$row_fh} "$col{$line}\n";
    }

    close $row_fh
        or die "$row_path: $!";
    close $dump_fh;
}

__END__
# ---------------------------------------------------------------------------
# Disabled legacy importer.  Everything after __END__ is never compiled.
# It refers to @cvtid, %cvtid, %cache, %cache2, @seq and %fh, none of which
# are declared in the visible part of this file.
#
# Per input line (tab-separated, read from STDIN) it normalises the phone
# number in field 6, replaces selected fields by numeric dictionary ids,
# packs the number into a binary field and prints one tab-separated record
# to STDOUT.
# ---------------------------------------------------------------------------
52 root 1.2 while (<STDIN>) {
53 root 1.1 chomp;
54     my @data = split /\t/;
55    
# Drop a leading ", " from field 3.
56     $data[3] =~ s/^, //;
# If field 7 ends in "." remove it and, only when it did, turn a trailing
# " Geb" in field 6 into "Geb.".
57     $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;
58    
# Field 6 must be: an optional prefix that starts with a letter, an optional
# "+", then digits and spaces.  Anything else is reported and skipped.
59     $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
60     warn "ERR: unparseable telnr. '$data[6]'";
61     next;
62     };
63    
# Keep the digit part in field 6 and store the prefix in field 13.
64     $data[6] = $2;
65     $data[13] = $1;
66    
# If field 6 starts with the contents of field 5, cut that leading part off
# (presumably the dialling code, going by the column list comment below).
67     ($data[5] eq substr $data[6], 0, length $data[5])
68     and substr $data[6], 0, length $data[5], "";
69    
# The length is checked before spaces are removed further down.
70     if (length $data[6] > 12) {
71     warn "ERR: number too long '$data[6]'";
72     next;
73     }
74    
# For every field index in @cvtid, replace the field value by a numeric id.
# Ids are remembered in %cache under "index,value".  The first time a value
# is seen, an id is allocated (the empty string always gets 1, other values
# count up from 2 per index) and "id<TAB>value" is appended to
# "$DEST/$cvtid{index}.txt", with backslash, LF and TAB backslash-escaped and
# NUL written as "\0".
75     for (@cvtid) {
76 root 1.4 $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
77     my $data = $data[$_];
78     my $fh = $fh{$_} ||= do {
79     open my $fh, ">", "$DEST/$cvtid{$_}.txt"
80     or die "$DEST/$cvtid{$_}.txt: $!";
81     $fh;
82     };
83     my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
84     $data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g;
85    
86     print $fh "$id\t$data\n";
87    
88     $id;
89     });
90 root 1.1 }
91    
# Field 13 is formatted with a single "%x" below, so it must stay below 16 to
# occupy exactly one hex digit.
92     $data[13] < 16 or die "ERR: too many typ's";
93    
# Pack field 6 into 7 bytes: the digits right-padded with "0" to 12 hex
# digits, then one hex digit for the number of padding digits and one hex
# digit for field 13.
94     $data[6] =~ s/ //g;
95 root 1.8 $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
96 root 1.1 #warn unpack "H*", $data[6];
# Apply the same escaping as for the dictionary files to the packed bytes.
97 root 1.4 $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;
98 root 1.1
99     #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
100    
# Emit fields 0-10 and 12; field 11 is dropped and field 13 already lives
# inside the packed field 6.
101     print +(join "\t",
102     $data[0], $data[1], $data[2], $data[3], $data[4],
103     $data[5], $data[6],
104 root 1.2 $data[7], $data[8], $data[9], $data[10], $data[12],
105 root 1.1 ), "\n";
106    
# Progress: warn with the current time whenever the record count before the
# increment is a multiple of 4096.
107     $count++ & 4095 or warn time . " $count";
108 root 1.4
# After exactly 20,000,000 records, move every entry from %cache into
# %cache2 and then alias %cache to %cache2.  The warn texts suggest %cache2
# is a database-backed hash - its setup is not visible here, TODO confirm.
# The outer loop repeats the each/delete pass until %cache is empty.
109     if ($count == 20_000_000) {
110 root 1.7 while (%cache) {
111     warn "copying cache to db...\n";
112     while (my ($k, $v) = each %cache) {
113     $cache2{$k} = delete $cache{$k};
114     }
115     }
116 root 1.5 warn "nuking mem cache...\n";
117 root 1.4 undef %cache;
118 root 1.5 warn "assigning db cache...\n";
119 root 1.4 *cache = \%cache2;
120     }
121 root 1.1 }
122