ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/tpb/scrape
Revision: 1.4
Committed: Fri Aug 17 21:18:14 2018 UTC (5 years, 9 months ago) by root
Branch: MAIN
CVS Tags: HEAD
Changes since 1.3: +19 -13 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 #!/opt/bin/perl
2
3 BEGIN { require "common.pl" }
4 our ($TXN, %stat, $db_env, $TPB, $db_info, $TODAY);
5
6 use common::sense;
7
# Turn the raw DOWNLOADERS setting into a usable downloader count.
#
#   $raw - value of $ENV{DOWNLOADERS}; may be undef
#
# Returns 4 when the setting is unset, empty or 0 (the default), 4 with a
# warning when it is not a plain non-negative integer, and otherwise the
# requested number, raised to the documented minimum of 2.
sub _num_downloaders($) {
   my ($raw) = @_;

   return 4 unless $raw; # unset, "" and 0 all select the default

   unless ($raw =~ /\A[0-9]+\z/) {
      # a non-numeric or negative value would make the 1..$NUM_DOWNLOADERS
      # range empty, so nothing would ever drain the download queue
      warn "DOWNLOADERS=$raw is not a positive integer, using 4\n";
      return 4;
   }

   $raw < 2 ? 2 : $raw + 0
}

# number of concurrent downloader coroutines, taken from the environment
our $NUM_DOWNLOADERS = _num_downloaders $ENV{DOWNLOADERS}; # minimum 2
9
# table handles used throughout the script:
#   $db_http    - compressed details-page html, keyed by torrent id
#   $db_torrent - downloaded torrent file contents, keyed by torrent id
our $db_http    = table ("http");
our $db_torrent = table ("torrent");

# set of torrent ids that still have to be fetched; it is created by the
# main coroutine from the stored "todo" value
our $TODO;
15 #############################################################################
16
# Generic job queue: every job is an array reference whose LAST element is
# the code reference to run and whose remaining elements are its arguments.
my $pool = Coro::Channel->new (10000);

# the worker coroutines (currently exactly one) that drain the job queue
# until the channel is shut down
my @pool = map {
   async {
      while (my $job = $pool->get) {
         my $callback = pop @$job;
         $callback->(@$job);
      }
   }
} 1..1;
26
# Queue a block for execution by the pool worker coroutines.
#
#   pool { ... } @args;
#
# The (&@) prototype delivers the code block as the FIRST argument, while
# the workers pop the callback off the END of each job, so the callback is
# moved behind its arguments before the job is queued. (Queueing [@_]
# unchanged only worked for calls without any extra arguments.)
#
# Returns whatever the channel's put returns.
sub pool(&@) {
   my ($cb, @args) = @_;

   $pool->put ([@args, $cb]);
}
30
# Extract the comment counter from a torrent details page.
#
#   $_[0] - html of the page
#
# Returns the digits found inside the element whose id is NumComments,
# and dies (message ends in a newline, so no file/line is appended) when
# the page contains no such element.
sub http_extract($) {
   my ($count) = $_[0] =~ / id=['"]NumComments['"]>(\d+)</
      or die "no numcomments in <$_[0]>\n";

   $count
}
36
# Jobs for the finisher are [$id, $html, $torrent] triples queued by the
# downloaders; $torrent is undef when no torrent file was fetched.
my $finish = Coro::Channel->new (100);

# The finisher trims each details page, adapts the re-check interval of the
# torrent, stores page and torrent file and strikes the id off $TODO.
my @finishers = async {
   {
      # NOTE(review): presumably gives this coroutine its own private $TXN
      # via Coro's swap_sv - confirm against the Coro::State documentation
      my $txn;
      $Coro::current->swap_sv (\$TXN, \$txn);
   }

   while (my $job = $finish->get) {
      my ($i, $html, $torrent) = @$job;

      # cut away everything up to and including the "Details for this
      # torrent" heading, and everything from the footer div onwards
      for ($html) {
         s/^.*?Details for this torrent//s;
         s/<div id="foot".*$//s;
      }

      my $info = iget $i;
      my $old  = get $db_http, $i;

      # $! has to be inspected directly after the get: it is clear only
      # when a previously stored page was found
      unless ($!) {
         my $old_html  = dic_decompress 0, $old;
         my $new_count = http_extract ($html);            # fatal if missing
         my $old_count = eval { http_extract ($old_html) }; # undef if missing

         if ($new_count eq $old_count) {
            # comment count unchanged: double the interval, at most 365
            ++$stat{same};
            $info->[1] *= 2;
            $info->[1] = 365 if $info->[1] > 365;
         } else {
            # comment count changed: halve the interval, at least 1
            ++$stat{change};
            $info->[1] = ($info->[1] >> 1) || 1;
         }
      }

      # size statistics, before and after compression
      $stat{torrent} += length $torrent;
      $stat{data}    += length $html;
      $html = dic_compress 0, $html if length $html;
      $stat{cmpr}    += length $html;

      put $db_torrent, $i, $torrent if defined $torrent;
      put $db_http, $i, $html;
      iput $i, [$TODAY, $info->[1]]; # last check is today, plus the interval

      $stat{cur} = $i;

      $TODO->remove ($i);
   }
};
87
88 #############################################################################
89
# Torrent ids waiting to be fetched; filled by the main coroutine.
my $download = Coro::Channel->new (200);

# $NUM_DOWNLOADERS coroutines, each of which fetches the details page of
# an id and, when the torrent file is not stored yet, the torrent file as
# well, then hands the result over to the finisher.
my @downloaders = map {
   async {
      ITEM:
      while (my $i = $download->get) {
         my ($html, $hdr) = GET "$TPB/torrent/$i";

         if ($hdr->{Status} == 404) {
            # the torrent is gone: forget about it completely
            $TODO->remove ($i);
            del $db_info, $i;
            next ITEM;
         }

         if ($hdr->{Status} != 200) {
            # any other failure: report it, the id is not marked as done
            warn "$TPB/torrent/$i: status $hdr->{Status}\n";
            next ITEM;
         }

         # $! has to be inspected directly after db_exists: it is clear
         # when the torrent file is already stored
         BDB::db_exists $db_torrent, undef, $i, 0;
         unless ($!) {
            $finish->put ([$i, $html, undef]);
            next ITEM;
         }

         my ($url) = $html =~ m%<a href="([^"]+)" title="Torrent File"%;

         unless (defined $url) {
            # page without a torrent link: store the page only
            ++$stat{notorrentlink};
            $finish->put ([$i, $html, undef]);
            next ITEM;
         }

         # turn the link into an absolute, space-free url
         $url =~ s/ /%20/g;
         $url =~ s%^//%http://%;
         $url =~ s%^/+%$TPB/%;

         # the page headers are no longer needed, so $hdr is reused for
         # the headers of the torrent file request
         (my $torrent, $hdr) = GET $url;

         if ($hdr->{Status} == 200) {
            $finish->put ([$i, $html, $torrent]);
         } elsif ($hdr->{Status} == 404) {
            # linked torrent file does not exist: store the page only
            ++$stat{notorrent};
            $finish->put ([$i, $html, undef]);
         } else {
            # nothing is queued here, so the id is not marked as done
            warn "$url $TPB/torrent/$i: status $hdr->{Status}\n";
         }
      }
   }
} 1..$NUM_DOWNLOADERS;
133
134 #############################################################################
135
# One-off maintenance switches. Both are disabled (if (0)) for normal
# runs; change the condition to 1 to use one of them.

# overwrite the stored "last_recent" value with a fixed torrent id
if (0) {
   #sput "last_recent", 7100000;
   sput "last_recent", 3210000;
}

# checkpoint the environment, compact $db_info, checkpoint again and stop
# without scraping anything
if (0) {
   db_env_txn_checkpoint $db_env;
   db_compact $db_info;
   db_env_txn_checkpoint $db_env;

   # this is the top level of the script, where "return" would die with
   # "Can't return outside a subroutine", so end the program with exit
   exit;
}
147
# Main coroutine: extends the todo set with newly published torrents,
# registers every todo id, queues all ids that are due for a (re)check,
# waits for the worker coroutines to finish and then stops the event loop.
async {
   $TODO = Set::IntSpan->new (sget "todo");

   # find the newest torrent id on the "recent" page and add everything
   # since the previous run to the todo set
   if (1) {
      my $last = sget "last_recent";

      my ($html) = GET "$TPB/recent";

      my ($next) = $html =~ m%<div class="detName">\s*<a href="/torrent/(\d+)/%
         or die "$TPB/recent: unable to parse recent torrent ($html)";

      $TODO->U ([[$last, $next]]);

      sput (todo => "$TODO");
      sput (last_recent => $next);
   }

   # save the todo set every ten seconds while the scrape is running
   my $sputter = AE::timer (10, 10, sub {
      async_pool {
         sput (todo => "$TODO");
      };
   });

   # new torrents
   if (1) {
      for my $span (spans $TODO) {
         my ($first, $final) = @$span;

         for my $i ($first .. $final) {
            $stat{stat} = $i;

            # each $! test belongs to the db_exists directly in front of
            # it; $! is clear when the record exists
            BDB::db_exists $db_http, undef, $i, 0;
            if (!$!) {
               BDB::db_exists $db_torrent, undef, $i, 0;
               if (!$!) {
                  # already have both
                  $TODO->remove ($i);
                  next;
               }
            }

            # info [0, 1] is always due, so the refresh pass queues the id
            iput $i, [0, 1];
         }
      }
   }

   # refresh
   if (1) {
      my $cursor = $db_info->cursor (undef, 0);
      my ($k, $v);
      my @due;

      # walk over all info records; the walk ends as soon as a cursor
      # read sets $!
      db_c_get $cursor, $k, $v, BDB::SET_RANGE;

      while (!$!) {
         # some buggy? keys + info
         # ["", [0, 1]]
         # ["", 10026234, 10027247]
         next unless $k;

         my $info = decode_cbor $v;

         # due when last check plus interval is not in the future
         push @due, $k
            if $info->[0] + $info->[1] <= $TODAY;
      } continue {
         db_c_get $cursor, $k, $v, BDB::NEXT;
      }

      undef $cursor;

      # queue the highest ids first
      for my $id (sort { $b <=> $a } @due) {
         $download->put ($id);
      }
   }

   # shut the stages down in pipeline order, waiting for each to drain
   $download->shutdown;
   for my $coro (@downloaders) {
      $coro->join;
   }

   $finish->shutdown;
   for my $coro (@finishers) {
      $coro->join;
   }

   $pool->shutdown;
   for my $coro (@pool) {
      $coro->join;
   }

   sput (todo => "$TODO");

   EV::unloop;
};

# run the event loop until the main coroutine calls EV::unloop
EV::loop;
239