ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/dinfo/dump2fsv.C
Revision: 1.1
Committed: Fri Aug 29 04:15:01 2003 UTC (20 years, 10 months ago) by root
Content type: text/plain
Branch: MAIN
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 #include <sys/types.h>
2     #include <sys/stat.h>
3     #include <fcntl.h>
4     #include <unistd.h>
5     #include <sys/mman.h>
6    
7     #include <cstdio>
8     #include <cstring>
9     #include <cassert>
10     #include <map>
11     #include <vector>
12     #include <algorithm>
13    
// Returns the size in bytes of the file behind descriptor `fd`, as reported
// by fstat().
//
// Returns 0 when fstat() fails (for example on an invalid descriptor) so the
// caller never sees an uninitialised `st_size`.
unsigned long sze (int fd)
{
  struct stat st;

  if (fstat (fd, &st) != 0 || st.st_size < 0)
    return 0;

  return static_cast<unsigned long> (st.st_size);
}
22    
// Thin wrapper around an int whose default-constructed value is 0.
// The constructor is intentionally implicit so a plain int converts to my_int.
struct my_int {
  int val;

  my_int (int initial = 0)
    : val (initial)
  {
  }
};
28    
// Strict-weak "less than" ordering for C strings, comparing the characters
// with strcmp() rather than the pointer values.
//
// The call operator is const because associative containers invoke their
// comparator through a const object; a non-const operator() is rejected by
// current standard libraries. Taking `const char *` still accepts the
// `char *` keys used by callers.
struct lt_string {
  bool operator () (const char *a, const char *b) const
  {
    return strcmp (a, b) < 0;
  }
};
35    
36     std::map<char *, my_int, lt_string> typedef col_map;
37    
38     struct {
39     bool operator () (col_map::value_type *a, col_map::value_type *b)
40     {
41     return a->second.val > b->second.val;
42     }
43     } lt_pair;
44    
// Number of progress() calls made so far.
static int count = 0;

// Bumps `count` and, each time its low 20 bits wrap to zero (every 0x100000
// calls), rewrites the current value on the same stdout line.
static inline void progress (void)
{
  ++count;

  if ((count & 0xfffff) != 0)
    return;

  printf ("\r%d ", count);
  fflush (stdout);
}
55    
56     int main(int argc, char *argv[])
57     {
58     char dmp_n[200]; sprintf (dmp_n, "data/dump/%s", argv[1]);
59     char row_n[200]; sprintf (row_n, "data/row/%s", argv[1]);
60     char col_n[200]; sprintf (col_n, "data/col/%s.txt", argv[1]);
61    
62     int dmp_fd = open (dmp_n, O_RDWR);
63     assert (dmp_fd >= 0);
64    
65     int dmp_sze = sze (dmp_fd);
66     char *dmp = (char *)mmap (0, dmp_sze, PROT_READ|PROT_WRITE, MAP_SHARED, dmp_fd, 0);
67    
68     col_map col;
69    
70     printf (" count\n");
71     for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++)
72     {
73     if (!*c)
74     {
75     ++col[b].val;
76     b = c + 1;
77     progress ();
78     }
79     }
80    
81     printf (" copy (%d)\n", count);
82     std::vector<col_map::value_type *> srt;
83    
84     for (col_map::iterator i = col.begin (); i != col.end (); ++i)
85     srt.push_back (&*i);
86    
87     printf (" sort\n");
88     sort (srt.begin (), srt.end (), lt_pair);
89    
90     FILE *col_f = fopen (col_n, "w");
91     int c = 0;
92     for (std::vector<col_map::value_type *>::iterator i = srt.begin (); i != srt.end(); ++i, ++c)
93     {
94     if (c < 100)
95     printf ("%3d %8d %8d %s\n", c, (*i)->second.val, (*(i+1))->second.val - (*i)->second.val, (*i)->first);
96    
97     (*i)->second.val = c;
98    
99     fprintf (col_f, "%d\t%s\n", (*i)->second.val, (*i)->first);
100     }
101    
102     fclose (col_f);
103    
104     printf (" xfrm\n");
105     int buf[1024];
106     int *bufptr = buf;
107     int row_fd = open (row_n, O_RDWR|O_CREAT|O_TRUNC, 0666);
108     for (char *c = dmp, *b = dmp; c < dmp + dmp_sze; c++)
109     {
110     if (!*c)
111     {
112     *bufptr++ = col[b].val;
113     if (bufptr > buf + 1023)
114     {
115     write (row_fd, buf, (char *)bufptr - (char *)buf);
116     bufptr = buf;
117     }
118    
119     b = c + 1;
120     progress ();
121     }
122     }
123    
124     write (row_fd, buf, (char *)bufptr - (char *)buf);
125     close (row_fd);
126     }
127    
// Disabled text: everything between `#if 0` and `#endif` is skipped by the
// preprocessor. The content is Perl, not C++.
//
// The first script (up to `__END__`) loops over a fixed list of column names
// and, for each column: counts how often every line of data/dump/<col>
// occurs, sorts the distinct lines by descending count, writes
// "<index>\t<line>" to data/col/<col> while replacing each count with that
// index, and finally writes the index of every dump line to data/row/<col>.
//
// The text after `__END__` reads tab-separated records from STDIN, parses
// and repacks the phone-number field ($data[6]), replaces the fields listed
// in @cvtid by numeric ids (writing "<id>\t<value>" lines to files under
// $DEST) and prints the resulting record tab-joined.
// NOTE(review): @cvtid, %cvtid, %cache2 and @seq are not defined in the
// visible text -- presumably leftovers from an older script; confirm before
// reusing.
128     #if 0
129     #!/usr/bin/perl
130    
131     use Carp;
132    
133     use PApp::SQL;
134     use BerkeleyDB;
135    
136     $DEST = "data/tsv";
137    
138     my @cols = qw(hausnr name vorname zusatz1 zusatz2 zusatz3 vorwahl strasse plz ort branche typ);
139    
140     local $PApp::SQL::DBH = PApp::SQL::connect_cached "dinfo::", "DBI:mysql:dinfo";
141    
142     #sql_exec "truncate $_"
143     # for qw(branche haus name ort plz row strasse typ vorname vorwahl zusatz1 zusatz2 zusatz3);
144    
145     for my $col (@cols) {
146     print "$col ANALYZE\n";
147    
148     my %col;
149    
150     open DMP, "<:raw", "data/dump/$col" or die;
151     $col{$_}++ while <DMP>;
152     close DMP;
153    
154     print "$col SORT\n";
155     my @sort = sort { $col{$b} <=> $col{$a} } keys %col;
156    
157     print "$col RENUMBER\n";
158     open COL, ">:raw", "data/col/$col" or die;
159     for (0 .. $#sort) {
160     print COL "$_\t$sort[$_]\n";
161     $col{$sort[$_]} = $_;
162     }
163     close COL;
164    
165     print "$col CLEAR\n";
166     @sort = ();
167    
168     print "$col WRITE\n";
169    
170     open DMP, "<:raw", "data/dump/$col" or die;
171     open ROW, ">:raw", "data/row/$col" or die;
172    
173     print ROW "$col{$_}\n" while <DMP>;
174    
175     close ROW;
176     close DMP;
177     }
178    
179     __END__
180     while (<STDIN>) {
181     chomp;
182     my @data = split /\t/;
183    
184     $data[3] =~ s/^, //;
185     $data[7] =~ s/\.$// && $data[6] =~ s/ Geb$/Geb./;
186    
187     $data[6] =~ /^((?:[a-zA-Z][-.\/a-zA-Z0-9]*)?) (?:\s*\+)? \s* ([0-9\ ]+)$/x or do {
188     warn "ERR: unparseable telnr. '$data[6]'";
189     next;
190     };
191    
192     $data[6] = $2;
193     $data[13] = $1;
194    
195     ($data[5] eq substr $data[6], 0, length $data[5])
196     and substr $data[6], 0, length $data[5], "";
197    
198     if (length $data[6] > 12) {
199     warn "ERR: number too long '$data[6]'";
200     next;
201     }
202    
203     for (@cvtid) {
204     $data[$_] = ($cache{"$_,$data[$_]"} ||= do {
205     my $data = $data[$_];
206     my $fh = $fh{$_} ||= do {
207     open my $fh, ">", "$DEST/$cvtid{$_}.txt"
208     or die "$DEST/$cvtid{$_}.txt: $!";
209     $fh;
210     };
211     my $id = $data eq "" ? 1 : 1 + ++$seq[$_];
212     $data =~ s/([\\\x0a\x09])/\\$1/g; $data =~ s/\x00/\\0/g;
213    
214     print $fh "$id\t$data\n";
215    
216     $id;
217     });
218     }
219    
220     $data[13] < 16 or die "ERR: too many typ's";
221    
222     $data[6] =~ s/ //g;
223     $data[6] = pack "H*", (substr "$data[6]000000000000", 0, 12) . sprintf "%x%x", 12 - length $data[6], $data[13];
224     #warn unpack "H*", $data[6];
225     $data[6] =~ s/([\\\x0a\x09])/\\$1/g; $data[6] =~ s/\x00/\\0/g;
226    
227     #name vorname zusatz1 zusatz2 zusatz3 vorwahl nummer strasse haus plz ort branche
228    
229     print +(join "\t",
230     $data[0], $data[1], $data[2], $data[3], $data[4],
231     $data[5], $data[6],
232     $data[7], $data[8], $data[9], $data[10], $data[12],
233     ), "\n";
234    
235     $count++ & 4095 or warn time . " $count";
236    
237     if ($count == 20_000_000) {
238     while (%cache) {
239     warn "copying cache to db...\n";
240     while (my ($k, $v) = each %cache) {
241     $cache2{$k} = delete $cache{$k};
242     }
243     }
244     warn "nuking mem cache...\n";
245     undef %cache;
246     warn "assigning db cache...\n";
247     *cache = \%cache2;
248     }
249     }
250    
251     #endif
252