/*
 * Column dictionary builder.
 *
 * Usage: <program> NAME
 *
 * Reads data/dump/NAME, which is treated as a sequence of NUL-terminated
 * strings, and:
 *
 *   1. counts how often each distinct string occurs,
 *   2. sorts the distinct strings by descending count and numbers them
 *      0, 1, 2, ... in that order,
 *   3. writes "id<TAB>string<NEWLINE>" lines to data/col/NAME.txt,
 *   4. writes, for every string of the dump in original order, its id as a
 *      native `int` to data/row/NAME.
 *
 * Bytes after the last NUL of the dump (an unterminated trailing string) are
 * ignored by both passes.
 *
 * NOTE(review): in the reviewed text the header names after `#include`, the
 * template arguments of std::map / std::vector and the Perl readline
 * operators in the disabled block at the end were missing (angle-bracket
 * contents lost).  They are reconstructed here from how the names are used;
 * please confirm against the original file.
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

#include <assert.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <map>
#include <vector>

/*
 * Size in bytes of the file behind `fd`.
 * Returns 0 (after printing a diagnostic) when fstat fails, instead of
 * returning an uninitialised value.
 */
unsigned long
sze (int fd)
{
  struct stat st;

  if (fstat (fd, &st) != 0)
    {
      perror ("fstat");
      return 0;
    }

  return st.st_size;
}

/* An int that default-constructs to 0, so map[key].val starts at zero. */
struct my_int
{
  int val;

  my_int (int i = 0) : val (i) { }
};

/* Orders C strings by content (strcmp) rather than by address. */
struct lt_string
{
  /* const: the standard library requires a const-callable comparator. */
  bool operator () (char * const &a, char * const &b) const
  {
    return strcmp (a, b) < 0;
  }
};

/* string (pointing into the mapped dump) -> occurrence count, later id */
typedef std::map<char *, my_int, lt_string> col_map;

/* Orders map entries by descending value (highest count first). */
struct
{
  bool operator () (col_map::value_type *a, col_map::value_type *b) const
  {
    return a->second.val > b->second.val;
  }
} lt_pair;

/* Number of strings processed so far, summed over both passes. */
static int count = 0;

/* Prints the running counter every 0x100000 processed strings. */
static inline void
progress (void)
{
  if (!(++count & 0xfffff))
    {
      printf ("\r%d ", count);
      fflush (stdout);
    }
}

/*
 * Writes all `len` bytes of `data` to `fd`, retrying after short writes.
 * Returns false when write() reports an error or makes no progress.
 */
static bool
write_all (int fd, const void *data, size_t len)
{
  const char *p = (const char *)data;

  while (len > 0)
    {
      ssize_t n = write (fd, p, len);

      if (n <= 0)
        return false;

      p   += n;
      len -= (size_t)n;
    }

  return true;
}

/*
 * Entry point; argv[1] names the column to process.
 * Returns 0 on success and 1 on usage or I/O errors.  Error paths return
 * without releasing the mapping or descriptors; process exit reclaims them.
 */
int
main (int argc, char *argv[])
{
  if (argc < 2)
    {
      fprintf (stderr, "usage: %s <name>\n", argc > 0 ? argv[0] : "?");
      return 1;
    }

  /* Build the three file names, refusing names that do not fit. */
  char dmp_n[200];
  char row_n[200];
  char col_n[200];

  int dmp_l = snprintf (dmp_n, sizeof dmp_n, "data/dump/%s", argv[1]);
  int row_l = snprintf (row_n, sizeof row_n, "data/row/%s", argv[1]);
  int col_l = snprintf (col_n, sizeof col_n, "data/col/%s.txt", argv[1]);

  if (dmp_l < 0 || dmp_l >= (int)sizeof dmp_n
      || row_l < 0 || row_l >= (int)sizeof row_n
      || col_l < 0 || col_l >= (int)sizeof col_n)
    {
      fprintf (stderr, "%s: name too long\n", argv[1]);
      return 1;
    }

  /* The dump is only ever read, so open and map it read-only. */
  int dmp_fd = open (dmp_n, O_RDONLY);

  if (dmp_fd < 0)
    {
      perror (dmp_n);
      return 1;
    }

  size_t dmp_sze = sze (dmp_fd);
  char *dmp = 0;

  /* mmap rejects a zero length; an empty dump simply yields empty outputs. */
  if (dmp_sze > 0)
    {
      void *map = mmap (0, dmp_sze, PROT_READ, MAP_PRIVATE, dmp_fd, 0);

      if (map == MAP_FAILED)
        {
          perror ("mmap");
          close (dmp_fd);
          return 1;
        }

      dmp = (char *)map;
    }

  char *dmp_end = dmp + dmp_sze;

  /* Pass 1: count the occurrences of every distinct string. */
  col_map col;

  printf (" count\n");

  for (char *cur = dmp, *beg = dmp; cur < dmp_end; cur++)
    if (!*cur)
      {
        ++col[beg].val;
        beg = cur + 1;
        progress ();
      }

  printf (" copy (%d)\n", count);

  /* Pointers to the map entries, so they can be sorted by count. */
  std::vector<col_map::value_type *> srt;
  srt.reserve (col.size ());

  for (col_map::iterator i = col.begin (); i != col.end (); ++i)
    srt.push_back (&*i);

  printf (" sort\n");

  std::sort (srt.begin (), srt.end (), lt_pair);

  /* Number the strings in sorted order and write the dictionary file. */
  FILE *col_f = fopen (col_n, "w");

  if (!col_f)
    {
      perror (col_n);
      return 1;
    }

  int id = 0;

  for (std::vector<col_map::value_type *>::iterator i = srt.begin ();
       i != srt.end ();
       ++i, ++id)
    {
      col_map::value_type *ent = *i;

      if (id < 100)
        {
          /* Difference to the next entry's count.  The last entry has no
             successor (dereferencing end () was undefined behaviour), so 0
             is printed for it.  The successor still holds its count here
             because it is renumbered only in its own iteration. */
          int delta = 0;

          if (i + 1 != srt.end ())
            delta = (*(i + 1))->second.val - ent->second.val;

          printf ("%3d %8d %8d %s\n", id, ent->second.val, delta, ent->first);
        }

      /* From here on the map value is the string's id, not its count. */
      ent->second.val = id;
      fprintf (col_f, "%d\t%s\n", ent->second.val, ent->first);
    }

  bool col_ok = !ferror (col_f);

  if (fclose (col_f) != 0)
    col_ok = false;

  if (!col_ok)
    {
      perror (col_n);
      return 1;
    }

  /* Pass 2: replace every string of the dump by its id. */
  printf (" xfrm\n");

  int buf[1024];
  int *bufptr = buf;
  int * const bufend = buf + sizeof buf / sizeof buf[0];

  int row_fd = open (row_n, O_RDWR | O_CREAT | O_TRUNC, 0666);

  if (row_fd < 0)
    {
      perror (row_n);
      return 1;
    }

  for (char *cur = dmp, *beg = dmp; cur < dmp_end; cur++)
    if (!*cur)
      {
        /* Every string was inserted during pass 1, so the lookup succeeds. */
        *bufptr++ = col.find (beg)->second.val;

        if (bufptr == bufend)
          {
            if (!write_all (row_fd, buf, (char *)bufptr - (char *)buf))
              {
                perror (row_n);
                return 1;
              }

            bufptr = buf;
          }

        beg = cur + 1;
        progress ();
      }

  /* Flush the partially filled buffer. */
  if (!write_all (row_fd, buf, (char *)bufptr - (char *)buf))
    {
      perror (row_n);
      return 1;
    }

  if (close (row_fd) != 0)
    {
      perror (row_n);
      return 1;
    }

  if (dmp)
    munmap (dmp, dmp_sze);

  close (dmp_fd);

  return 0;
}

/*
 * Disabled reference code: an earlier Perl implementation of the same
 * per-column renumbering, followed (after __END__) by an older import
 * script.  It is never compiled.  Its logic is untouched; only line breaks
 * and the lost readline operators are restored.  `while (<>)` after __END__
 * is a guess (the handle could also have been STDIN).
 */
#if 0
#!/usr/bin/perl

use Carp;
use PApp::SQL;
use BerkeleyDB;

$DEST = "data/tsv";

my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);

local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";

#sql_exec "truncate $_"
#   for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

for my $col (@cols) {
   print "$col ANALYZE\n";
   my %col;
   open DMP, "<:raw", "data/dump/$col" or die;
   $col{$_}++ while <DMP>;
   close DMP;

   print "$col SORT\n";
   my @sort = sort { $col{$b} <=> $col{$a} } keys %col;

   print "$col RENUMBER\n";
   open COL, ">:raw", "data/col/$col" or die;
   for (0 .. $#sort) {
      print COL "$_\t$sort[$_]\n";
      $col{$sort[$_]} = $_;
   }
   close COL;

   print "$col CLEAR\n";
   @sort = ();

   print "$col WRITE\n";
   open DMP, "<:raw", "data/dump/$col" or die;
   open ROW, ">:raw", "data/row/$col" or die;
   print ROW "$col{$_}\n" while <DMP>;
   close ROW;
   close DMP;
}

__END__

while (<>) {
   chomp;
   my @data = split /\t/;

   $data[3] =~ s/^, //;
   $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;

   $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x
      or do {
         warn "ERR: unparseable telnr. '$data[6]'";
         next;
      };

   $data[6] = $2;
   $data[13] = $1;

   ($data[5] eq substr $data[6], 0, length $data[5])
      and substr $data[6], 0, length $data[5], "";

   if (length $data[6] > 12) {
      warn "ERR: number too long '$data[6]'";
      next;
   }

   for (@cvtid) {
      $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
         my $data = $data[$_];
         my $fh = $fh{$_} ||= do {
            open my $fh, ">", "$DEST/$cvtid{$_}.txt"
               or die "$DEST/$cvtid{$_}.txt: $!";
            $fh;
         };
         my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
         $data =~ s/([\\\x0a\x09])/\\$1/g;
         $data =~ s/\x00/\\0/g;
         print $fh "$id\t$data\n";
         $id;
      });
   }

   $data[13] < 16 or die "ERR: too many typ's";

   $data[6] =~ s/ //g;
   $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
   #warn unpack "H*", $data[6];
   $data[6] =~ s/([\\\x0a\x09])/\\$1/g;
   $data[6] =~ s/\x00/\\0/g;

   #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
   print +(join "\t",
      $data[0],
      $data[1],
      $data[2],
      $data[3],
      $data[4],
      $data[5],
      $data[6],
      $data[7],
      $data[8],
      $data[9],
      $data[10],
      $data[12],
   ), "\n";

   $count++ & 4095 or warn time . " $count";

   if ($count == 20_000_000) {
      while (%cache) {
         warn "copying cache to db...\n";
         while (my ($k, $v) = each %cache) {
            $cache2{$k} = delete $cache{$k};
         }
      }
      warn "nuking mem cache...\n";
      undef %cache;
      warn "assigning db cache...\n";
      *cache = \%cache2;
   }
}
#endif