#!/usr/bin/perl

# Converts tab-separated directory rows (read from the files named on the
# command line, or from STDIN) into normalised tab-separated dump files
# below $DEST:
#
#   * one "<column>.txt" file per id-mapped column, holding "id<TAB>value"
#     lines for every distinct value seen in that column, and
#   * "row.txt", holding one line per accepted input row in which the
#     id-mapped columns are replaced by those ids and the phone number is
#     stored in packed form.
#
# Before converting anything, the corresponding database tables are truncated.

use strict;
use warnings;
# Rows with fewer columns than expected leave fields undefined; the script has
# always treated those as empty strings, so do not warn about them.
no warnings 'uninitialized';

use Carp;
use PApp::SQL;
use BerkeleyDB;

our $DEST = "data/tsv";

# NOTE(review): the two lines below look like per-column distinct-value counts
# after the given number of input rows -- confirm before relying on them.
# 2031616: 0:435514 1:44163 2:107528 3:127671 4:8311 5:2028604 6:117468 7:8210 8:11754 9:24860 14:4344
# 4128768: 0:650846 1:92556 2:183027 3:170894 4:8605 5:4122242 6:147178 7:8238 8:11802 9:31453 14:6460

# Input column index => name of the dump file (and truncated table) that
# receives the distinct values of that column.
my %cvtid = (
    0  => "name",
    1  => "vorname",
    2  => "zusatz1",
    3  => "zusatz2",
    4  => "zusatz3",
    5  => "vorwahl",
    # 6 is the phone number; it is packed, not id-mapped
    7  => "strasse",
    8  => "haus",
    9  => "plz",
    10 => "ort",
    #[11 => "ort", # district
    12 => "branche",
    13 => "typ",
);

# Columns are independent of each other (each has its own id sequence), so
# the processing order does not matter; sort only to make runs reproducible.
my @cvtid = sort { $a <=> $b } keys %cvtid;

local $PApp::SQL::DBH = PApp::SQL::connect_cached("dinfo::", "DBI:mysql:dinfo");

sql_exec("truncate $_")
    for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

# %cache maps "column,value" => id.  It must stay a package variable because
# it is later re-aliased to %cache2 through a glob assignment.
our (%cache, %cache2);
my %fh;          # column index => lazily opened dump file handle
my @seq;         # column index => last sequence number handed out
my $count = 0;   # number of rows written so far

# special treatment for name, due to its size :(
my $cache_file = "/tmp/dinfo-import1-cache-$$";
tie %cache2, 'BerkeleyDB::Hash',
    -Filename  => $cache_file,
    -Flags     => DB_CREATE|DB_EXCL,
    -Cachesize => 380*1024*1024
    or die "cannot create cache db $cache_file: $! $BerkeleyDB::Error";
# The database stays usable through the open handle; unlinking it right away
# means no file is left behind when the process ends.
unlink $cache_file;

open my $out, ">", "$DEST/row.txt"
    or die "$DEST/row.txt: $!";

# Escape a value for the tab-separated dump: backslash, newline and tab get a
# leading backslash, NUL bytes become the two characters "\0".
sub _escape_field {
    my ($value) = @_;
    $value =~ s/([\\\x0a\x09])/\\$1/g;
    $value =~ s/\x00/\\0/g;
    return $value;
}

ROW: while (<>) {
    chomp;
    my @data = split /\t/;

    $data[3] =~ s/^, //;

    # When column 7 ends in a dot, drop the dot and turn a trailing " Geb"
    # in column 6 into "Geb.".
    if ($data[7] =~ s/\.$//) {
        $data[6] =~ s/ Geb$/Geb./;
    }

    # Column 6 is "<optional type word> <optional +> <digits and blanks>".
    my ($typ, $number) =
        $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x;
    unless (defined $number) {
        warn "ERR: unparseable telnr. '$data[6]'";
        next ROW;
    }
    $data[6]  = $number;
    $data[13] = $typ;

    # Strip the dialling prefix (column 5) when the number repeats it.  This
    # has to happen before column 5 is replaced by its id below.
    my $prefix_length = length $data[5];
    if ($data[5] eq substr($data[6], 0, $prefix_length)) {
        substr($data[6], 0, $prefix_length, "");
    }

    # Note: blanks inside the number still count towards this limit.
    if (length($data[6]) > 12) {
        warn "ERR: number too long '$data[6]'";
        next ROW;
    }

    # Replace every id-mapped column by its id, assigning a new id (and
    # writing it to the column's dump file) on first sight of a value.
    for my $col (@cvtid) {
        my $key = "$col,$data[$col]";
        my $id  = $cache{$key};

        unless ($id) {
            my $value = $data[$col];
            my $fh = $fh{$col} ||= do {
                open my $fh, ">", "$DEST/$cvtid{$col}.txt"
                    or die "$DEST/$cvtid{$col}.txt: $!";
                $fh;
            };

            # The empty string always gets id 1; real values start at 2.
            $id = $value eq "" ? 1 : 1 + ++$seq[$col];
            print {$fh} "$id\t" . _escape_field($value) . "\n";
            $cache{$key} = $id;
        }

        $data[$col] = $id;
    }

    # The type id has to fit into a single hex digit of the packed number.
    $data[13] < 16 or die "ERR: too many typ's";

    # Pack the number into 7 bytes: 12 hex digits holding the decimal digits
    # (right-padded with zeros), one hex digit holding the count of padding
    # digits and one hex digit holding the type id.  Padding with a full
    # twelve zeros keeps the two trailing digits in place even for numbers
    # shorter than three digits.
    $data[6] =~ s/ //g;
    my $digits = length $data[6];
    $data[6] = pack "H*",
        substr($data[6] . "0" x 12, 0, 12) . sprintf("%x%x", 12 - $digits, $data[13]);
    #warn unpack "H*", $data[6];
    $data[6] = _escape_field($data[6]);

    #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
    print {$out} join("\t", @data[0 .. 10, 12]), "\n";

    # Progress report on the first row and every 4096 rows after it.
    warn time . " $count" unless $count++ & 4095;

    # Once the in-memory cache has grown this far, move it into the on-disk
    # BerkeleyDB hash and keep using that one under the same name.
    if ($count == 20_000_000) {
        warn "copying cache to db...\n";
        %cache2 = %cache;
        warn "nuking mem cache...\n";
        undef %cache;
        warn "assigning db cache...\n";
        *cache = \%cache2;
    }
}

# Buffered write errors only surface at close, so check every write handle.
close $out or die "$DEST/row.txt: $!";
for my $col (sort { $a <=> $b } keys %fh) {
    close $fh{$col} or die "$DEST/$cvtid{$col}.txt: $!";
}