| 1 |
#include <sys/types.h> |
| 2 |
#include <sys/stat.h> |
| 3 |
#include <fcntl.h> |
| 4 |
#include <unistd.h> |
| 5 |
#include <sys/mman.h> |
| 6 |
|
| 7 |
#include <cstdio> |
| 8 |
#include <cstring> |
| 9 |
#include <cassert> |
| 10 |
#include <map> |
| 11 |
#include <vector> |
| 12 |
#include <algorithm> |
| 13 |
|
| 14 |
/* Return the size in bytes of the file open on descriptor fd, as reported
   by fstat().

   Returns 0 when fstat() fails (for example when fd is not a valid
   descriptor) rather than reading an uninitialised struct stat. */
unsigned long sze (int fd)
{
  struct stat st;

  if (fstat (fd, &st) != 0)
    return 0;

  return st.st_size;
}
| 22 |
|
| 23 |
/* An int wrapper whose default value is 0, so that a map entry created on
   first access starts counting from zero.  The constructor is deliberately
   not explicit: a plain int converts to my_int implicitly. */
struct my_int {
  int val;

  my_int (int initial = 0)
  {
    val = initial;
  }
};
| 28 |
|
| 29 |
/* Strict weak ordering on C strings: a sorts before b when strcmp() says
   so.  Used as the key comparator of the map, so keys are compared by the
   characters they point at, not by pointer value.

   operator() is const because an associative container may invoke its
   comparator through a const object. */
struct lt_string {
  bool operator ()(char * const &a, char * const &b) const
  {
    return strcmp (a, b) < 0;
  }
};
| 35 |
|
| 36 |
/* Maps a C string (compared by content via lt_string) to a my_int. */
typedef std::map<char *, my_int, lt_string> col_map;
| 37 |
|
| 38 |
struct { |
| 39 |
bool operator () (col_map::value_type *a, col_map::value_type *b) |
| 40 |
{ |
| 41 |
return a->second.val > b->second.val; |
| 42 |
} |
| 43 |
} lt_pair; |
| 44 |
|
| 45 |
/* Number of progress() calls made so far. */
static int count = 0;

/* Bump the call counter and, each time its low 20 bits are all zero (that
   is, every 0x100000 calls), rewrite the running total at the start of the
   current output line and flush stdout so it shows up immediately. */
static inline void progress (void)
{
  ++count;

  if ((count & 0xfffff) != 0)
    return;

  printf ("\r%d ", count);
  fflush (stdout);
}
| 55 |
|
| 56 |
int main(int argc, char *argv[]) |
| 57 |
{ |
| 58 |
char dmp_n[200]; sprintf (dmp_n, "data/dump/%s", argv[1]); |
| 59 |
char row_n[200]; sprintf (row_n, "data/row/%s", argv[1]); |
| 60 |
char col_n[200]; sprintf (col_n, "data/col/%s.txt", argv[1]); |
| 61 |
|
| 62 |
int dmp_fd = open (dmp_n, O_RDWR); |
| 63 |
assert (dmp_fd >= 0); |
| 64 |
|
| 65 |
int dmp_sze = sze (dmp_fd); |
| 66 |
char *dmp = (char *)mmap (0, dmp_sze, PROT_READ|PROT_WRITE, MAP_SHARED, dmp_fd, 0); |
| 67 |
|
| 68 |
col_map col; |
| 69 |
|
| 70 |
printf (" count\n"); |
| 71 |
for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++) |
| 72 |
{ |
| 73 |
if (!*c) |
| 74 |
{ |
| 75 |
++col[b].val; |
| 76 |
b = c + 1; |
| 77 |
progress (); |
| 78 |
} |
| 79 |
} |
| 80 |
|
| 81 |
printf (" copy (%d)\n", count); |
| 82 |
std::vector<col_map::value_type *> srt; |
| 83 |
|
| 84 |
for (col_map::iterator i = col.begin (); i != col.end (); ++i) |
| 85 |
srt.push_back (&*i); |
| 86 |
|
| 87 |
printf (" sort\n"); |
| 88 |
sort (srt.begin (), srt.end (), lt_pair); |
| 89 |
|
| 90 |
FILE *col_f = fopen (col_n, "w"); |
| 91 |
int c = 0; |
| 92 |
for (std::vector<col_map::value_type *>::iterator i = srt.begin (); i != srt.end(); ++i, ++c) |
| 93 |
{ |
| 94 |
if (c < 100) |
| 95 |
printf ("%3d %8d %8d %s\n", c, (*i)->second.val, (*(i+1))->second.val - (*i)->second.val, (*i)->first); |
| 96 |
|
| 97 |
(*i)->second.val = c; |
| 98 |
|
| 99 |
fprintf (col_f, "%d\t%s\n", (*i)->second.val, (*i)->first); |
| 100 |
} |
| 101 |
|
| 102 |
fclose (col_f); |
| 103 |
|
| 104 |
printf (" xfrm\n"); |
| 105 |
int buf[1024]; |
| 106 |
int *bufptr = buf; |
| 107 |
int row_fd = open (row_n, O_RDWR|O_CREAT|O_TRUNC, 0666); |
| 108 |
for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++) |
| 109 |
{ |
| 110 |
if (!*c) |
| 111 |
{ |
| 112 |
*bufptr++ = col[b].val; |
| 113 |
if (bufptr > buf + 1023) |
| 114 |
{ |
| 115 |
write (row_fd, buf, (char *)bufptr - (char *)buf); |
| 116 |
bufptr = buf; |
| 117 |
} |
| 118 |
|
| 119 |
b = c + 1; |
| 120 |
progress (); |
| 121 |
} |
| 122 |
} |
| 123 |
|
| 124 |
write (row_fd, buf, (char *)bufptr - (char *)buf); |
| 125 |
close (row_fd); |
| 126 |
} |
| 127 |
|
| 128 |
#if 0 |
| 129 |
#!/usr/bin/perl |
| 130 |
|
| 131 |
use Carp; |
| 132 |
|
| 133 |
use PApp::SQL; |
| 134 |
use BerkeleyDB; |
| 135 |
|
| 136 |
# Directory used by the conversion loop kept after __END__; the column
# pass below does not read it.
$DEST = "data/tsv";

# Columns to process; each one is read from data/dump/<column>.
my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);

local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";

#sql_exec "truncate $_"
# for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

# For every column: count how often each distinct input line occurs, number
# the distinct lines by descending count (0 = most frequent), write the
# number/value table to data/col/<column> and then write, for each input
# line in order, its number to data/row/<column>.
# Every open reports the failing path together with $!, and the write
# handles are checked on close so buffered write errors are not lost.
for my $col (@cols) {
   print "$col ANALYZE\n";

   my %col;
   my $dmp_path = "data/dump/$col";
   my $col_path = "data/col/$col";
   my $row_path = "data/row/$col";

   open my $dmp_fh, "<:raw", $dmp_path or die "$dmp_path: $!";
   $col{$_}++ while <$dmp_fh>;
   close $dmp_fh;

   print "$col SORT\n";
   my @sort = sort { $col{$b} <=> $col{$a} } keys %col;

   print "$col RENUMBER\n";
   open my $col_fh, ">:raw", $col_path or die "$col_path: $!";
   for (0 .. $#sort) {
      # NOTE(review): input lines are never chomped, so each value printed
      # here still ends with its own line terminator - confirm intended.
      print $col_fh "$_\t$sort[$_]\n";
      # From now on %col maps the value to its number, not its count.
      $col{$sort[$_]} = $_;
   }
   close $col_fh or die "$col_path: $!";

   print "$col CLEAR\n";
   @sort = ();

   print "$col WRITE\n";

   open $dmp_fh, "<:raw", $dmp_path or die "$dmp_path: $!";
   open my $row_fh, ">:raw", $row_path or die "$row_path: $!";

   print $row_fh "$col{$_}\n" while <$dmp_fh>;

   close $row_fh or die "$row_path: $!";
   close $dmp_fh;
}
| 178 |
|
| 179 |
__END__ |
| 180 |
# Retired conversion loop.  It sits after __END__, so perl does not execute
# it, and the whole script is inside a disabled preprocessor section.
# It reads one tab-separated record per line from STDIN, normalises the
# phone number fields, replaces some fields by numeric ids and prints a
# 12-column tab-separated row to STDOUT.
# NOTE(review): @cvtid, %cvtid, %fh, @seq, %cache and %cache2 are used below
# but are not defined anywhere in the visible text.
while (<STDIN>) { |
| 181 |
chomp; |
| 182 |
my @data = split /\t/; |
| 183 |
|
| 184 |
# Drop a leading comma-space from field 3.
$data[3] =~ s/^, //; |
| 185 |
# When field 7 ends in a dot, remove the dot and, only in that case, turn a
# trailing " Geb" on field 6 into "Geb.".
$data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./; |
| 186 |
|
| 187 |
# Split field 6 into an optional prefix ($1: a letter followed by letters,
# digits, "-", "." or "/"), an optional "+", and the number proper ($2:
# digits and spaces).  Records that do not match are reported and skipped.
$data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do { |
| 188 |
warn "ERR: unparseable telnr. '$data[6]'"; |
| 189 |
next; |
| 190 |
}; |
| 191 |
|
| 192 |
# Field 6 keeps the number, field 13 receives the prefix.
$data[6] = $2; |
| 193 |
$data[13] = $1; |
| 194 |
|
| 195 |
# Strip field 5 from the front of the number when the number starts with it
# (field 5 is presumably the area code, going by the column list further
# down - confirm).
($data[5] eq substr $data[6], 0, length $data[5]) |
| 196 |
and substr $data[6], 0, length $data[5], ""; |
| 197 |
|
| 198 |
# Reject numbers longer than 12 characters; spaces are still included in
# the length at this point, they are removed further down.
if (length $data[6] > 12) { |
| 199 |
warn "ERR: number too long '$data[6]'"; |
| 200 |
next; |
| 201 |
} |
| 202 |
|
| 203 |
# Replace each field listed in @cvtid by a numeric id.  Ids are cached in
# %cache under "index,value"; a value seen for the first time gets a new id
# and an "id<TAB>value" line in that field's file $DEST/<name>.txt.
for (@cvtid) { |
| 204 |
$data[$_] = ($cache{"$_,$data[$_]"} ||= do { |
| 205 |
my $data = $data[$_]; |
| 206 |
# One output handle per field index, opened on first use.
my $fh = $fh{$_} ||= do { |
| 207 |
open my $fh, ">", "$DEST/$cvtid{$_}.txt" |
| 208 |
or die "$DEST/$cvtid{$_}.txt: $!"; |
| 209 |
$fh; |
| 210 |
}; |
| 211 |
# The empty string always gets id 1; other values are numbered from 2 up,
# with a separate sequence per field.
my $id = $data eq "" ? 1 : 1 + ++$seq[$_]; |
| 212 |
# Backslash-escape backslash, LF and TAB, and write NUL as \0.
$data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g; |
| 213 |
|
| 214 |
print $fh "$id\t$data\n"; |
| 215 |
|
| 216 |
$id; |
| 217 |
}); |
| 218 |
} |
| 219 |
|
| 220 |
# Field 13 is formatted as one hex digit below, so it must be below 16.
# NOTE(review): this presumably relies on index 13 being listed in @cvtid,
# so that the field holds a numeric id here - confirm.
$data[13] < 16 or die "ERR: too many typ's"; |
| 221 |
|
| 222 |
# Pack the number: remove spaces, pad with zeros to 12 characters, append
# one hex digit for the amount of padding and one for field 13, then pack
# the 14 hex digits into 7 bytes.
$data[6] =~ s/ //g; |
| 223 |
$data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13]; |
| 224 |
#warn unpack "H*", $data[6]; |
| 225 |
# Same escaping as for the id files, applied to the packed bytes.
$data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g; |
| 226 |
|
| 227 |
#name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche |
| 228 |
|
| 229 |
# Emit the row; input fields 11 and 13 are not printed.
print +(join "\t", |
| 230 |
$data[0], $data[1], $data[2], $data[3], $data[4], |
| 231 |
$data[5], $data[6], |
| 232 |
$data[7], $data[8], $data[9], $data[10], $data[12], |
| 233 |
), "\n"; |
| 234 |
|
| 235 |
# Progress report on STDERR once every 4096 records.
$count++ & 4095 or warn time . " $count"; |
| 236 |
|
| 237 |
# After 20 million records, move every entry of %cache into %cache2 and
# then make the name %cache refer to %cache2.
# NOTE(review): the messages suggest %cache2 is backed by a database;
# nothing in the visible text sets that up - confirm.
if ($count == 20_000_000) { |
| 238 |
while (%cache) { |
| 239 |
warn "copying cache to db...\n"; |
| 240 |
while (my ($k, $v) = each %cache) { |
| 241 |
$cache2{$k} = delete $cache{$k}; |
| 242 |
} |
| 243 |
} |
| 244 |
warn "nuking mem cache...\n"; |
| 245 |
undef %cache; |
| 246 |
warn "assigning db cache...\n"; |
| 247 |
*cache = \%cache2; |
| 248 |
} |
| 249 |
} |
| 250 |
|
| 251 |
#endif |
| 252 |
|