1 |
root |
1.1 |
#include <sys/types.h> |
2 |
|
|
#include <sys/stat.h> |
3 |
|
|
#include <fcntl.h> |
4 |
|
|
#include <unistd.h> |
5 |
|
|
#include <sys/mman.h> |
6 |
|
|
|
7 |
|
|
#include <cstdio> |
8 |
|
|
#include <cstring> |
9 |
|
|
#include <cassert> |
10 |
|
|
#include <map> |
11 |
|
|
#include <vector> |
12 |
|
|
#include <algorithm> |
13 |
|
|
|
14 |
|
|
// Return the size in bytes of the file behind the open descriptor fd,
// as reported by fstat() in st_size.
//
// If fstat() fails (for example because fd is not a valid descriptor)
// the stat buffer is left unset, so 0 is returned instead of reading it.
unsigned long sze (int fd)
{
  struct stat st;

  if (fstat (fd, &st) != 0)
    return 0;

  return st.st_size;
}
22 |
|
|
|
23 |
|
|
// Thin wrapper around an int whose default-constructed value is 0.
// It is the mapped type of col_map, so an entry created on first
// access through operator[] starts out with val == 0.
struct my_int {
  int val;

  // Implicitly convertible from int; a missing argument means 0.
  my_int (int i = 0)
  {
    val = i;
  }
};
28 |
|
|
|
29 |
|
|
// Strict weak ordering on C strings: compares the characters the
// pointers refer to (via strcmp), not the pointer values themselves.
// Both arguments must point to NUL-terminated strings.
struct lt_string {
  // const-qualified because associative containers invoke their
  // comparator through a const object; without it the map that uses
  // this functor is rejected by current standard libraries.
  bool operator ()(char * const &a, char * const &b) const
  {
    return strcmp (a, b) < 0;
  }
};
35 |
|
|
|
36 |
|
|
// Ordered map from a C string to its my_int value. Keys are compared
// by string content through lt_string; the map stores only the
// pointers, it does not copy or own the strings.
typedef std::map<char *, my_int, lt_string> col_map;
37 |
|
|
|
38 |
|
|
struct { |
39 |
|
|
bool operator () (col_map::value_type *a, col_map::value_type *b) |
40 |
|
|
{ |
41 |
|
|
return a->second.val > b->second.val; |
42 |
|
|
} |
43 |
|
|
} lt_pair; |
44 |
|
|
|
45 |
|
|
// Number of times progress() has been called so far.
static int count = 0;

// Bump the global call counter and, each time its low 20 bits wrap to
// zero (every 0x100000 calls), rewrite the current value on the same
// terminal line and flush stdout so it shows up immediately.
static inline void progress (void)
{
  ++count;

  if (count & 0xfffff)
    return;

  printf ("\r%d ", count);
  fflush (stdout);
}
55 |
|
|
|
56 |
|
|
int main(int argc, char *argv[]) |
57 |
|
|
{ |
58 |
|
|
char dmp_n[200]; sprintf (dmp_n, "data/dump/%s", argv[1]); |
59 |
|
|
char row_n[200]; sprintf (row_n, "data/row/%s", argv[1]); |
60 |
|
|
char col_n[200]; sprintf (col_n, "data/col/%s.txt", argv[1]); |
61 |
|
|
|
62 |
|
|
int dmp_fd = open (dmp_n, O_RDWR); |
63 |
|
|
assert (dmp_fd >= 0); |
64 |
|
|
|
65 |
|
|
int dmp_sze = sze (dmp_fd); |
66 |
|
|
char *dmp = (char *)mmap (0, dmp_sze, PROT_READ|PROT_WRITE, MAP_SHARED, dmp_fd, 0); |
67 |
|
|
|
68 |
|
|
col_map col; |
69 |
|
|
|
70 |
|
|
printf (" count\n"); |
71 |
|
|
for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++) |
72 |
|
|
{ |
73 |
|
|
if (!*c) |
74 |
|
|
{ |
75 |
|
|
++col[b].val; |
76 |
|
|
b = c + 1; |
77 |
|
|
progress (); |
78 |
|
|
} |
79 |
|
|
} |
80 |
|
|
|
81 |
|
|
printf (" copy (%d)\n", count); |
82 |
|
|
std::vector<col_map::value_type *> srt; |
83 |
|
|
|
84 |
|
|
for (col_map::iterator i = col.begin (); i != col.end (); ++i) |
85 |
|
|
srt.push_back (&*i); |
86 |
|
|
|
87 |
|
|
printf (" sort\n"); |
88 |
|
|
sort (srt.begin (), srt.end (), lt_pair); |
89 |
|
|
|
90 |
|
|
FILE *col_f = fopen (col_n, "w"); |
91 |
|
|
int c = 0; |
92 |
|
|
for (std::vector<col_map::value_type *>::iterator i = srt.begin (); i != srt.end(); ++i, ++c) |
93 |
|
|
{ |
94 |
|
|
if (c < 100) |
95 |
|
|
printf ("%3d %8d %8d %s\n", c, (*i)->second.val, (*(i+1))->second.val - (*i)->second.val, (*i)->first); |
96 |
|
|
|
97 |
|
|
(*i)->second.val = c; |
98 |
|
|
|
99 |
|
|
fprintf (col_f, "%d\t%s\n", (*i)->second.val, (*i)->first); |
100 |
|
|
} |
101 |
|
|
|
102 |
|
|
fclose (col_f); |
103 |
|
|
|
104 |
|
|
printf (" xfrm\n"); |
105 |
|
|
int buf[1024]; |
106 |
|
|
int *bufptr = buf; |
107 |
|
|
int row_fd = open (row_n, O_RDWR|O_CREAT|O_TRUNC, 0666); |
108 |
|
|
for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++) |
109 |
|
|
{ |
110 |
|
|
if (!*c) |
111 |
|
|
{ |
112 |
|
|
*bufptr++ = col[b].val; |
113 |
|
|
if (bufptr > buf + 1023) |
114 |
|
|
{ |
115 |
|
|
write (row_fd, buf, (char *)bufptr - (char *)buf); |
116 |
|
|
bufptr = buf; |
117 |
|
|
} |
118 |
|
|
|
119 |
|
|
b = c + 1; |
120 |
|
|
progress (); |
121 |
|
|
} |
122 |
|
|
} |
123 |
|
|
|
124 |
|
|
write (row_fd, buf, (char *)bufptr - (char *)buf); |
125 |
|
|
close (row_fd); |
126 |
|
|
} |
127 |
|
|
|
128 |
|
|
#if 0 |
129 |
|
|
#!/usr/bin/perl
# Disabled Perl script kept inside the surrounding preprocessor guard.
# The part before __END__ performs, per column, the same
# count / sort / renumber / rewrite steps on text files under data/.
# NOTE(review): comment lines in this region are still scanned by the C
# preprocessor, so they must not start with a conditional directive
# keyword.

use Carp;

use PApp::SQL;
use BerkeleyDB;

$DEST = "data/tsv";

# Columns to process; each one names a file under data/dump/.
my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);

# NOTE(review): no SQL statement is issued in the visible code (the only
# one is commented out below) -- confirm whether this handle is needed.
local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";

#sql_exec "truncate $_"
# for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);

for my $col (@cols) {
    print "$col ANALYZE\n";

    # Occurrence count per input line; later reused to hold the rank.
    my %col;

    # Lines are not chomped, so every key keeps its trailing newline.
    open DMP, "<:raw", "data/dump/$col" or die;
    $col{$_}++ while <DMP>;
    close DMP;

    print "$col SORT\n";
    # Distinct lines, most frequent first.
    my @sort = sort { $col{$b} <=> $col{$a} } keys %col;

    print "$col RENUMBER\n";
    open COL, ">:raw", "data/col/$col" or die;
    for (0 .. $#sort) {
        # Emit index and value, then replace the count by the index.
        print COL "$_\t$sort[$_]\n";
        $col{$sort[$_]} = $_;
    }
    close COL;

    print "$col CLEAR\n";
    @sort = ();

    print "$col WRITE\n";

    # Second pass over the dump: one index per input line, in input order.
    open DMP, "<:raw", "data/dump/$col" or die;
    open ROW, ">:raw", "data/row/$col" or die;

    print ROW "$col{$_}\n" while <DMP>;

    close ROW;
    close DMP;
}

# Perl stops reading program text at the marker below, so the loop after
# it is never executed. It uses @cvtid, %cvtid, %cache, %cache2, %fh and
# @seq, none of which are defined in the visible code.
__END__
while (<STDIN>) {
    chomp;
    # One tab-separated record per input line.
    my @data = split /\t/;

    # Drop a leading comma-space from field 3.
    $data[3] =~ s/^, //;
    # When field 7 ends in a dot, strip it and turn a trailing " Geb" in
    # field 6 into "Geb.".
    $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;

    # Field 6 must be an optional alphanumeric prefix, an optional plus
    # sign, then digits and spaces; records that do not match are skipped.
    $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
        warn "ERR: unparseable telnr. '$data[6]'";
        next;
    };

    # Keep the digit part in field 6 and the prefix in field 13.
    $data[6] = $2;
    $data[13] = $1;

    # Remove field 5 from the front of field 6 when it is a prefix of it.
    ($data[5] eq substr $data[6], 0, length $data[5])
        and substr $data[6], 0, length $data[5], "";

    if (length $data[6] > 12) {
        warn "ERR: number too long '$data[6]'";
        next;
    }

    # Replace each field listed in @cvtid by a numeric id. Ids are cached
    # per "index,value" pair; a value seen for the first time is also
    # appended to that field's file under $DEST.
    for (@cvtid) {
        $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
            my $data = $data[$_];
            # Output handle for this field, opened on first use.
            my $fh = $fh{$_} ||= do {
                open my $fh, ">", "$DEST/$cvtid{$_}.txt"
                    or die "$DEST/$cvtid{$_}.txt: $!";
                $fh;
            };
            # The empty string always gets id 1; other values count up from 2.
            my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
            # Backslash-escape backslash, newline and tab; write NUL as \0.
            $data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g;

            print $fh "$id\t$data\n";

            $id;
        });
    }

    # Field 13 is emitted below as a single hex digit, hence the limit.
    $data[13] < 16 or die "ERR: too many typ's";

    $data[6] =~ s/ //g;
    # Pack the number as 12 hex digits (zero padded on the right) followed
    # by two more hex digits: the amount of padding and field 13.
    $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
    #warn unpack "H*", $data[6];
    $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;

    #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche

    # Fields 0 to 10 and field 12 are printed; field 11 is not.
    print +(join "\t",
        $data[0], $data[1], $data[2], $data[3], $data[4],
        $data[5], $data[6],
        $data[7], $data[8], $data[9], $data[10], $data[12],
    ), "\n";

    # Progress message once every 4096 records.
    $count++ & 4095 or warn time . " $count";

    # After 20 million records every cache entry is moved into %cache2,
    # which then replaces %cache through a glob assignment.
    # NOTE(review): the messages suggest %cache2 is database backed
    # (BerkeleyDB is loaded above), but no tie is visible here -- confirm.
    if ($count == 20_000_000) {
        while (%cache) {
            warn "copying cache to db...\n";
            while (my ($k, $v) = each %cache) {
                $cache2{$k} = delete $cache{$k};
            }
        }
        warn "nuking mem cache...\n";
        undef %cache;
        warn "assigning db cache...\n";
        *cache = \%cache2;
    }
}
250 |
|
|
|
251 |
|
|
#endif |
252 |
|
|
|