1 |
#include <sys/types.h> |
2 |
#include <sys/stat.h> |
3 |
#include <fcntl.h> |
4 |
#include <unistd.h> |
5 |
#include <sys/mman.h> |
6 |
|
7 |
#include <cstdio> |
8 |
#include <cstring> |
9 |
#include <cassert> |
10 |
#include <map> |
11 |
#include <vector> |
12 |
#include <algorithm> |
13 |
|
14 |
/*
 * Return the size in bytes of the file open on descriptor fd.
 *
 * Returns 0 when fstat() fails (for example on an invalid descriptor)
 * instead of reading an uninitialised struct stat.
 */
unsigned long sze (int fd)
{
  struct stat st;

  if (fstat (fd, &st) != 0)
    return 0;

  return st.st_size;
}
22 |
|
23 |
// Thin wrapper around int whose default-constructed value is 0, so that
// a freshly inserted std::map entry starts out at zero.
struct my_int {
  int val;

  // Also acts as an implicit conversion from int.
  my_int(int initial = 0)
  {
    val = initial;
  }
};
28 |
|
29 |
// Strict weak ordering on C strings: a sorts before b when strcmp says so.
// Compares the pointed-to characters, not the pointer values.
struct lt_string {
  // const-qualified: associative containers invoke their comparator
  // through a const object, so a non-const call operator is rejected
  // by current standard libraries.
  bool operator ()(char * const &a, char * const &b) const
  {
    return strcmp (a, b) < 0;
  }
};
35 |
|
36 |
// Map from a C string to its my_int value; keys are ordered by their
// character contents (lt_string), not by pointer value.
typedef std::map<char *, my_int, lt_string> col_map;
37 |
|
38 |
struct { |
39 |
bool operator () (col_map::value_type *a, col_map::value_type *b) |
40 |
{ |
41 |
return a->second.val > b->second.val; |
42 |
} |
43 |
} lt_pair; |
44 |
|
45 |
// Number of progress () calls made so far.
static int count = 0;

// Bump the call counter and, each time its low 20 bits are all zero
// (every 0x100000 calls), redraw the counter on the current stdout line.
static inline void progress (void)
{
  ++count;

  if (count & 0xfffff)
    return;

  printf ("\r%d ", count);
  fflush (stdout);
}
55 |
|
56 |
int main(int argc, char *argv[]) |
57 |
{ |
58 |
char dmp_n[200]; sprintf (dmp_n, "data/dump/%s", argv[1]); |
59 |
char row_n[200]; sprintf (row_n, "data/row/%s", argv[1]); |
60 |
char col_n[200]; sprintf (col_n, "data/col/%s.txt", argv[1]); |
61 |
|
62 |
int dmp_fd = open (dmp_n, O_RDWR); |
63 |
assert (dmp_fd >= 0); |
64 |
|
65 |
int dmp_sze = sze (dmp_fd); |
66 |
char *dmp = (char *)mmap (0, dmp_sze, PROT_READ|PROT_WRITE, MAP_SHARED, dmp_fd, 0); |
67 |
|
68 |
col_map col; |
69 |
|
70 |
printf (" count\n"); |
71 |
for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++) |
72 |
{ |
73 |
if (!*c) |
74 |
{ |
75 |
++col[b].val; |
76 |
b = c + 1; |
77 |
progress (); |
78 |
} |
79 |
} |
80 |
|
81 |
printf (" copy (%d)\n", count); |
82 |
std::vector<col_map::value_type *> srt; |
83 |
|
84 |
for (col_map::iterator i = col.begin (); i != col.end (); ++i) |
85 |
srt.push_back (&*i); |
86 |
|
87 |
printf (" sort\n"); |
88 |
sort (srt.begin (), srt.end (), lt_pair); |
89 |
|
90 |
FILE *col_f = fopen (col_n, "w"); |
91 |
int c = 0; |
92 |
for (std::vector<col_map::value_type *>::iterator i = srt.begin (); i != srt.end(); ++i, ++c) |
93 |
{ |
94 |
if (c < 100) |
95 |
printf ("%3d %8d %8d %s\n", c, (*i)->second.val, (*(i+1))->second.val - (*i)->second.val, (*i)->first); |
96 |
|
97 |
(*i)->second.val = c; |
98 |
|
99 |
fprintf (col_f, "%d\t%s\n", (*i)->second.val, (*i)->first); |
100 |
} |
101 |
|
102 |
fclose (col_f); |
103 |
|
104 |
printf (" xfrm\n"); |
105 |
int buf[1024]; |
106 |
int *bufptr = buf; |
107 |
int row_fd = open (row_n, O_RDWR|O_CREAT|O_TRUNC, 0666); |
108 |
for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++) |
109 |
{ |
110 |
if (!*c) |
111 |
{ |
112 |
*bufptr++ = col[b].val; |
113 |
if (bufptr > buf + 1023) |
114 |
{ |
115 |
write (row_fd, buf, (char *)bufptr - (char *)buf); |
116 |
bufptr = buf; |
117 |
} |
118 |
|
119 |
b = c + 1; |
120 |
progress (); |
121 |
} |
122 |
} |
123 |
|
124 |
write (row_fd, buf, (char *)bufptr - (char *)buf); |
125 |
close (row_fd); |
126 |
} |
127 |
|
128 |
#if 0 |
129 |
#!/usr/bin/perl

# Disabled Perl variant of the renumbering pass.  For every column it
#   1. counts how often each line of data/dump/COLUMN occurs,
#   2. orders the distinct lines by descending count,
#   3. writes "index<TAB>line" to data/col/COLUMN,
#   4. writes the index of every input line to data/row/COLUMN.
# Lines are never chomped, so each key still carries its line terminator
# and the data/col output therefore has an empty line after every entry.

use strict;
use warnings;

use Carp;

use PApp::SQL;
use BerkeleyDB;

# Not used by this part of the script; referenced by the code kept after
# the __END__ marker.
our $DEST = "data/tsv";

my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);

local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";

#sql_exec "truncate $_"
#   for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

for my $col (@cols) {
    my $dump_path = "data/dump/$col";
    my $col_path  = "data/col/$col";
    my $row_path  = "data/row/$col";

    print "$col ANALYZE\n";

    # line => occurrence count; later overwritten with line => index
    my %col;

    open my $dmp, "<:raw", $dump_path or die "$dump_path: $!";
    $col{$_}++ while <$dmp>;
    close $dmp;

    print "$col SORT\n";
    my @sort = sort { $col{$b} <=> $col{$a} } keys %col;

    print "$col RENUMBER\n";
    open my $col_fh, ">:raw", $col_path or die "$col_path: $!";
    for (0 .. $#sort) {
        print {$col_fh} "$_\t$sort[$_]\n";
        $col{$sort[$_]} = $_;
    }
    # buffered write errors only show up at close time
    close $col_fh or die "$col_path: $!";

    print "$col CLEAR\n";
    @sort = ();

    print "$col WRITE\n";

    open $dmp, "<:raw", $dump_path or die "$dump_path: $!";
    open my $row, ">:raw", $row_path or die "$row_path: $!";

    print {$row} "$col{$_}\n" while <$dmp>;

    close $row or die "$row_path: $!";
    close $dmp;
}
178 |
|
179 |
__END__ |
180 |
# Record converter kept after the __END__ marker, so perl does not run it
# as program text.  It reads tab-separated records from STDIN, cleans up
# the number in field 6, replaces selected fields by numeric ids and
# prints the resulting record to STDOUT.
# NOTE(review): @cvtid, %cvtid, %cache, %cache2, %fh and @seq are not
# defined anywhere in the visible part of the file.
while (<STDIN>) {
    chomp;
    my @data = split /\t/;

    # drop a leading ", " from field 3
    $data[3] =~ s/^, //;
    # only when a trailing "." was removed from field 7: a trailing " Geb"
    # in field 6 becomes "Geb."
    $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;

    # field 6 must be: optional prefix starting with a letter, optional "+",
    # then digits and spaces; anything different is reported and skipped
    $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
        warn "ERR: unparseable telnr. '$data[6]'";
        next;
    };

    # keep the digits in field 6 and store the prefix in field 13
    $data[6] = $2;
    $data[13] = $1;

    # strip field 5 from the front of field 6 when field 6 starts with it
    # (presumably the area code, going by the column list below - confirm)
    ($data[5] eq substr $data[6], 0, length $data[5])
        and substr $data[6], 0, length $data[5], "";

    if (length $data[6] > 12) {
        warn "ERR: number too long '$data[6]'";
        next;
    }

    # replace each field listed in @cvtid by a numeric id, remembered in
    # %cache under the key "index,value"
    for (@cvtid) {
        $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
            my $data = $data[$_];
            # one lazily opened output file per converted field
            my $fh = $fh{$_} ||= do {
                open my $fh, ">", "$DEST/$cvtid{$_}.txt"
                    or die "$DEST/$cvtid{$_}.txt: $!";
                $fh;
            };
            # the empty string always maps to id 1; other values get
            # consecutive ids starting at 2, counted per field
            my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
            # backslash-escape backslash, newline and tab; NUL becomes \0
            $data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g;

            print $fh "$id\t$data\n";

            $id;
        });
    }

    # field 13 is emitted as a single hex digit below, so it must be < 16
    $data[13] < 16 or die "ERR: too many typ's";

    $data[6] =~ s/ //g;
    # 12 digits (zero-padded on the right), one hex digit for the amount of
    # padding, one hex digit for field 13: 14 hex digits packed into 7 bytes
    $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
    #warn unpack "H*", $data[6];
    # same escaping as for the id files above
    $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;

    #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche

    # fields 11 and 13 are not part of the printed record
    print +(join "\t",
        $data[0], $data[1], $data[2], $data[3], $data[4],
        $data[5], $data[6],
        $data[7], $data[8], $data[9], $data[10], $data[12],
    ), "\n";

    # progress report on every 4096th record
    $count++ & 4095 or warn time . " $count";

    # after 20 million records move every cache entry into %cache2 and make
    # %cache an alias for it (the messages suggest %cache2 is database
    # backed - TODO confirm, the tie is not visible here)
    if ($count == 20_000_000) {
        while (%cache) {
            warn "copying cache to db...\n";
            while (my ($k, $v) = each %cache) {
                $cache2{$k} = delete $cache{$k};
            }
        }
        warn "nuking mem cache...\n";
        undef %cache;
        warn "assigning db cache...\n";
        *cache = \%cache2;
    }
}
250 |
|
251 |
#endif |
252 |
|