ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/dinfo/dump2fsv.C
Revision: 1.2
Committed: Fri Aug 29 20:17:57 2003 UTC (20 years, 8 months ago) by root
Content type: text/plain
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +0 -0 lines
State: FILE REMOVED
Log Message:
*** empty log message ***

File Contents

# Content
1 #include <sys/types.h>
2 #include <sys/stat.h>
3 #include <fcntl.h>
4 #include <unistd.h>
5 #include <sys/mman.h>
6
7 #include <cstdio>
8 #include <cstring>
9 #include <cassert>
10 #include <map>
11 #include <vector>
12 #include <algorithm>
13
/* Return the size in bytes of the file open on descriptor FD.

   Returns 0 when fstat() fails (for example when FD is not a valid
   descriptor), so callers never see the contents of an uninitialised
   struct stat.  */
unsigned long sze (int fd)
{
  struct stat st;

  // fstat() leaves ST untouched on failure; do not read st_size in that case.
  if (fstat (fd, &st) != 0)
    return 0;

  return static_cast<unsigned long> (st.st_size);
}
22
/* Thin wrapper around an int whose default-constructed value is 0.
   It is the mapped type of col_map, so looking up a new key with
   operator[] yields an entry whose val starts at 0.  */
struct my_int {
  int val;

  // Implicit conversion from int is intentional; the default argument
  // makes this the default constructor as well.
  my_int (int i = 0)
  {
    val = i;
  }
};
28
/* Strict-weak-ordering comparator for C strings: orders two
   NUL-terminated strings by strcmp() rather than by pointer value.
   Used as the key comparator of col_map.  */
struct lt_string {
  // Must be const: associative containers invoke their comparator
  // through a const object.
  bool operator ()(char * const &a, char * const &b) const
  {
    return strcmp (a, b) < 0;
  }
};
35
36 std::map<char *, my_int, lt_string> typedef col_map;
37
38 struct {
39 bool operator () (col_map::value_type *a, col_map::value_type *b)
40 {
41 return a->second.val > b->second.val;
42 }
43 } lt_pair;
44
// Number of times progress () has been called so far.
static int count = 0;

/* Bump the global call counter and, whenever its low 20 bits are all
   zero (i.e. every 0x100000 calls), rewrite the current line on stdout
   with the running total.  */
static inline void progress ()
{
  ++count;

  // Nothing to show unless the counter just reached a multiple of 0x100000.
  if ((count & 0xfffff) != 0)
    return;

  printf ("\r%d ", count);
  fflush (stdout);
}
55
56 int main(int argc, char *argv[])
57 {
58 char dmp_n[200]; sprintf (dmp_n, "data/dump/%s", argv[1]);
59 char row_n[200]; sprintf (row_n, "data/row/%s", argv[1]);
60 char col_n[200]; sprintf (col_n, "data/col/%s.txt", argv[1]);
61
62 int dmp_fd = open (dmp_n, O_RDWR);
63 assert (dmp_fd >= 0);
64
65 int dmp_sze = sze (dmp_fd);
66 char *dmp = (char *)mmap (0, dmp_sze, PROT_READ|PROT_WRITE, MAP_SHARED, dmp_fd, 0);
67
68 col_map col;
69
70 printf (" count\n");
71 for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++)
72 {
73 if (!*c)
74 {
75 ++col[b].val;
76 b = c + 1;
77 progress ();
78 }
79 }
80
81 printf (" copy (%d)\n", count);
82 std::vector<col_map::value_type *> srt;
83
84 for (col_map::iterator i = col.begin (); i != col.end (); ++i)
85 srt.push_back (&*i);
86
87 printf (" sort\n");
88 sort (srt.begin (), srt.end (), lt_pair);
89
90 FILE *col_f = fopen (col_n, "w");
91 int c = 0;
92 for (std::vector<col_map::value_type *>::iterator i = srt.begin (); i != srt.end(); ++i, ++c)
93 {
94 if (c < 100)
95 printf ("%3d %8d %8d %s\n", c, (*i)->second.val, (*(i+1))->second.val - (*i)->second.val, (*i)->first);
96
97 (*i)->second.val = c;
98
99 fprintf (col_f, "%d\t%s\n", (*i)->second.val, (*i)->first);
100 }
101
102 fclose (col_f);
103
104 printf (" xfrm\n");
105 int buf[1024];
106 int *bufptr = buf;
107 int row_fd = open (row_n, O_RDWR|O_CREAT|O_TRUNC, 0666);
108 for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++)
109 {
110 if (!*c)
111 {
112 *bufptr++ = col[b].val;
113 if (bufptr > buf + 1023)
114 {
115 write (row_fd, buf, (char *)bufptr - (char *)buf);
116 bufptr = buf;
117 }
118
119 b = c + 1;
120 progress ();
121 }
122 }
123
124 write (row_fd, buf, (char *)bufptr - (char *)buf);
125 close (row_fd);
126 }
127
128 #if 0
129 #!/usr/bin/perl
130
131 use Carp;
132
133 use PApp::SQL;
134 use BerkeleyDB;
135
136 $DEST = "data/tsv";
137
138 my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);
139
140 local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";
141
142 #sql_exec "truncate $_"
143 # for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);
144
145 for my $col (@cols) {
146 print "$col ANALYZE\n";
147
148 my %col;
149
150 open DMP, "<:raw", "data/dump/$col" or die;
151 $col{$_}++ while <DMP>;
152 close DMP;
153
154 print "$col SORT\n";
155 my @sort = sort { $col{$b} <=> $col{$a} } keys %col;
156
157 print "$col RENUMBER\n";
158 open COL, ">:raw", "data/col/$col" or die;
159 for (0 .. $#sort) {
160 print COL "$_\t$sort[$_]\n";
161 $col{$sort[$_]} = $_;
162 }
163 close COL;
164
165 print "$col CLEAR\n";
166 @sort = ();
167
168 print "$col WRITE\n";
169
170 open DMP, "<:raw", "data/dump/$col" or die;
171 open ROW, ">:raw", "data/row/$col" or die;
172
173 print ROW "$col{$_}\n" while <DMP>;
174
175 close ROW;
176 close DMP;
177 }
178
179 __END__
180 while (<STDIN>) {
181 chomp;
182 my @data = split /\t/;
183
184 $data[3] =~ s/^, //;
185 $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;
186
187 $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
188 warn "ERR: unparseable telnr. '$data[6]'";
189 next;
190 };
191
192 $data[6] = $2;
193 $data[13] = $1;
194
195 ($data[5] eq substr $data[6], 0, length $data[5])
196 and substr $data[6], 0, length $data[5], "";
197
198 if (length $data[6] > 12) {
199 warn "ERR: number too long '$data[6]'";
200 next;
201 }
202
203 for (@cvtid) {
204 $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
205 my $data = $data[$_];
206 my $fh = $fh{$_} ||= do {
207 open my $fh, ">", "$DEST/$cvtid{$_}.txt"
208 or die "$DEST/$cvtid{$_}.txt: $!";
209 $fh;
210 };
211 my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
212 $data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g;
213
214 print $fh "$id\t$data\n";
215
216 $id;
217 });
218 }
219
220 $data[13] < 16 or die "ERR: too many typ's";
221
222 $data[6] =~ s/ //g;
223 $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
224 #warn unpack "H*", $data[6];
225 $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;
226
227 #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
228
229 print +(join "\t",
230 $data[0], $data[1], $data[2], $data[3], $data[4],
231 $data[5], $data[6],
232 $data[7], $data[8], $data[9], $data[10], $data[12],
233 ), "\n";
234
235 $count++ & 4095 or warn time . " $count";
236
237 if ($count == 20_000_000) {
238 while (%cache) {
239 warn "copying cache to db...\n";
240 while (my ($k, $v) = each %cache) {
241 $cache2{$k} = delete $cache{$k};
242 }
243 }
244 warn "nuking mem cache...\n";
245 undef %cache;
246 warn "assigning db cache...\n";
247 *cache = \%cache2;
248 }
249 }
250
251 #endif
252