ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/tpb/scrape
Revision: 1.1
Committed: Sun Sep 27 07:55:20 2015 UTC (8 years, 8 months ago) by root
Branch: MAIN
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 #!/opt/bin/perl
2    
3     BEGIN { require "common.pl" }
4     our ($TXN, %stat, $db_env, $TPB, $db_info, $TODAY);
5    
6     use common::sense;
7    
8     our $NUM_DOWNLOADERS = 6; # number of downloader coroutines created below (1..$NUM_DOWNLOADERS)
9    
10     our $db_http = table "http"; # per-id page HTML; the finisher stores it dic_compress'ed
11     our $db_torrent = table "torrent"; # per-id data fetched from the page's "Torrent File" link
12    
13     our $TODO; # Set::IntSpan of ids still to be handled; built from the stored "todo" value in the main coroutine
14    
15     #############################################################################
16    
# Job queue for the generic worker pool.  Each job is an array reference
# holding a code reference and the arguments to call it with; pool()
# queues [@_], i.e. the code block comes FIRST (see its (&@) prototype).
my $pool = Coro::Channel->new(10000);

# Worker coroutines (currently exactly one) that drain $pool until the
# channel is shut down.
my @pool = map {
    async {
        while (my $args = $pool->get) {
            # Take the callback from the front of the job: with the (&@)
            # prototype of pool() the code block is the first element, so
            # popping the last element would try to call an argument as
            # code whenever a job carries arguments.
            my $cb = shift @$args;
            $cb->(@$args);
        }
    }
} 1..1;
26    
# Queue a job on the $pool channel: a fresh array holding a copy of the
# argument list (code block first, as dictated by the prototype).
sub pool(&@) {
    my @job = @_;
    $pool->put(\@job);
}
30    
# Return the number found in the id='NumComments' element of the given
# HTML string; dies with a "no numcomments" message when it is absent.
sub http_extract($) {
    if ($_[0] =~ / id='NumComments'>(\d+)</) {
        return $1;
    }

    die "no numcomments in <$_[0]>\n";
}
36    
# Queue of completed downloads.  Each entry is [$id, $html, $torrent];
# $torrent is undef when no torrent data accompanies the page.
my $finish = Coro::Channel->new(100);

# The (single) finisher coroutine: trims and compresses each downloaded
# page, adapts the re-check interval and writes everything to the tables.
my @finishers = async {
    {
        # Register a private variable to be swapped with $TXN for this
        # coroutine (Coro's swap_sv).
        my $txn;
        $Coro::current->swap_sv(\$TXN, \$txn);
    }

    # Note: per-job transactions (txn_begin / db_txn_finish) are currently
    # not used here.
    while (my $item = $finish->get) {
        my ($id, $page, $torrent_data) = @$item;

        # Drop everything up to the details heading and from the footer on.
        $page =~ s/^.*?Details for this torrent//s;
        $page =~ s/<div id="foot".*$//s;

        my $info = iget($id);

        # $! carries the status of this lookup, so it is tested right away.
        my $stored = get($db_http, $id);

        if (!$!) {
            my $stored_page = dic_decompress(0, $stored);

            # $info->[1] is the re-check interval (added to $info->[0] and
            # compared against $TODAY by the refresh pass).  An unchanged
            # comment count doubles it (capped at 365), a changed one
            # halves it (never below 1).  A stored page without a counter
            # yields undef from the eval and therefore counts as changed.
            if (http_extract($page) eq eval { http_extract($stored_page) }) {
                ++$stat{same};
                $info->[1] *= 2;
                $info->[1] = 365 if $info->[1] > 365;
            }
            else {
                ++$stat{change};
                $info->[1] = ($info->[1] >> 1) || 1;
            }
        }

        # Size statistics before and after compression.
        $stat{data} += length $page;
        if (length $page) {
            $page = dic_compress(0, $page);
        }
        $stat{cmpr} += length $page;

        $stat{torrent} += length $torrent_data;

        if (defined $torrent_data) {
            put($db_torrent, $id, $torrent_data);
        }
        put($db_http, $id, $page);

        # Record today as the last check together with the interval.
        iput($id, [$TODAY, $info->[1]]);

        $TODO->remove($id);
    }
};
85    
86     #############################################################################
87    
# Queue of ids whose detail page should be fetched.
my $download = Coro::Channel->new(200);

# $NUM_DOWNLOADERS coroutines that fetch the detail page for each queued id
# and, when the torrent table has no entry for it yet, the linked torrent
# file as well.  Results are handed to the finisher via $finish.
my @downloaders = map {
    async {
        while (my $id = $download->get) {
            my ($html, $page_hdr) = GET("https://$TPB/torrent/$id");
            my $page_status = $page_hdr->{Status};

            if ($page_status == 404) {
                # The page is gone: forget the id entirely.
                $TODO->remove($id);
                del($db_info, $id);
                next;
            }

            if ($page_status != 200) {
                warn "https://$TPB/torrent/$id: status $page_hdr->{Status}\n";
                next;
            }

            # $! is set when the torrent table has no entry for this id.
            BDB::db_exists($db_torrent, undef, $id, 0);
            my $torrent_missing = $! ? 1 : 0;

            if (!$torrent_missing) {
                # Torrent already stored, only the page needs refreshing.
                $finish->put([$id, $html, undef]);
                next;
            }

            my ($url) = $html =~ m%<a href="([^"]+)" title="Torrent File"%;

            if (!defined $url) {
                ++$stat{notorrentlink};
                $finish->put([$id, $html, undef]);
                next;
            }

            # Turn the link into an absolute https URL.
            $url =~ s/ /%20/g;
            $url =~ s%^//%https://%;
            $url =~ s%^/+%https://$TPB/%;

            my ($torrent, $torrent_hdr) = GET($url);
            my $torrent_status = $torrent_hdr->{Status};

            if ($torrent_status == 200) {
                $finish->put([$id, $html, $torrent]);
            }
            elsif ($torrent_status == 404) {
                # Page exists but the torrent file does not: store the page.
                ++$stat{notorrent};
                $finish->put([$id, $html, undef]);
            }
            else {
                # Any other status: report it and queue nothing for the
                # finisher.
                warn "$url https://$TPB/torrent/$id: status $torrent_hdr->{Status}\n";
            }
        }
    }
} 1..$NUM_DOWNLOADERS;
131    
132     #############################################################################
133    
134     if (0) { # disabled manual switch: overwrite the stored "last_recent" value by hand
135     #sput "last_recent", 7100000;
136     sput "last_recent", 3210000;
137     }
138    
139     if (0) { # disabled manual switch: checkpoint, compact $db_info, checkpoint again, then return
140     db_env_txn_checkpoint $db_env;
141     db_compact $db_info;
142     db_env_txn_checkpoint $db_env;
143     return;
144     }
145    
# Main driver coroutine: extends the todo set with newly published ids,
# marks missing ids as due, queues every due id for download, then waits
# for all worker coroutines and stops the event loop.
async {
    $TODO = Set::IntSpan->new(sget("todo"));

    # Add everything between the last seen id and the newest id listed on
    # the "recent" page to the todo set.
    if (1) {
        my $last = sget("last_recent");

        my ($html) = GET("https://$TPB/recent");

        my ($newest) = $html =~ m%<div class="detName">\s*<a href="/torrent/(\d+)/%
            or die "https://$TPB/recent: unable to parse recent torrent ($html)";

        $TODO->U([[$last, $newest]]);

        sput(todo => "$TODO");
        sput(last_recent => $newest);
    }

    # Persist the todo set every 10 seconds while work is in progress; the
    # timer lives as long as this variable does.
    my $sputter = AE::timer(10, 10, sub {
        async_pool {
            sput(todo => "$TODO");
        };
    });

    # New torrents: ids for which both page and torrent are stored are
    # dropped from the todo set, all others get an info record of [0, 1].
    if (1) {
        for my $span (spans $TODO) {
            my ($first, $final) = ($span->[0], $span->[1]);

            for my $id ($first .. $final) {
                $stat{stat} = $id;

                BDB::db_exists($db_http, undef, $id, 0);
                if (!$!) {
                    BDB::db_exists($db_torrent, undef, $id, 0);
                    if (!$!) {
                        # already have both
                        $TODO->remove($id);
                        next;
                    }
                }

                iput($id, [0, 1]);
            }
        }
    }

    # Refresh: walk the info table and collect every key whose record is
    # due ($info->[0] + $info->[1] <= $TODAY).  Keys are queued only after
    # the cursor has been released.
    if (1) {
        my $cursor = $db_info->cursor(undef, 0);
        my ($key, $value);
        my @due;

        db_c_get($cursor, $key, $value, BDB::SET_RANGE);

        while (!$!) {
            my $info = decode_cbor($value);

            if ($info->[0] + $info->[1] <= $TODAY) {
                push @due, $key;
            }

            db_c_get($cursor, $key, $value, BDB::NEXT);
        }

        undef $cursor;

        for my $id (@due) {
            $download->put($id);
        }
    }

    # Shut the stages down in pipeline order, waiting for each to drain.
    $download->shutdown;
    for my $coro (@downloaders) {
        $coro->join;
    }

    $finish->shutdown;
    for my $coro (@finishers) {
        $coro->join;
    }

    $pool->shutdown;
    for my $coro (@pool) {
        $coro->join;
    }

    sput(todo => "$TODO");

    EV::unloop;
};

EV::loop;
231