ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/tpb/scrape
Revision: 1.2
Committed: Sat Jan 2 21:45:30 2016 UTC (8 years, 5 months ago) by root
Branch: MAIN
Changes since 1.1: +1 -1 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 #!/opt/bin/perl
2
3 BEGIN { require "common.pl" }
4 our ($TXN, %stat, $db_env, $TPB, $db_info, $TODAY);
5
6 use common::sense;
7
# Number of downloader coroutines started in parallel (minimum 2).
our $NUM_DOWNLOADERS = 2;

# Table handles: "http" receives the compressed page body for each torrent
# id, "torrent" receives the downloaded torrent file for each id.
our $db_http    = table ("http");
our $db_torrent = table ("torrent");

# Set of torrent ids still to be processed; assigned a Set::IntSpan by the
# main coroutine before any worker touches it.
our $TODO;
14
15 #############################################################################
16
# Generic job pool: each job is an array ref [callback, args...] queued on a
# channel with room for 10000 entries and executed by a single worker
# coroutine.
my $pool = new Coro::Channel 10000;
my @pool = map {
   async {
      # The loop ends when get returns a false value (presumably once the
      # channel has been shut down and drained -- see Coro::Channel).
      while (my $job = $pool->get) {
         # pool() is declared with the (&@) prototype, so the callback is
         # always the FIRST element of the job and everything after it is
         # an argument. Taking the last element (pop) would pick an
         # argument instead of the callback whenever arguments are passed.
         my $cb = shift @$job;
         $cb->(@$job);
      }
   }
} 1..1;

# pool { BLOCK } LIST
#
# Queue BLOCK for execution by the pool worker; LIST is passed to BLOCK as
# its arguments. Returns whatever the channel's put returns.
sub pool(&@) {
   $pool->put ([@_]);
}
30
# http_extract $html
#
# Return the digits that follow " id='NumComments'>" in the given page text.
# Dies with "no numcomments in <...>\n" when the marker is not present.
sub http_extract($) {
   # A list-context match yields the capture on success and the empty list
   # on failure, so the assignment is false exactly when nothing matched.
   my ($count) = $_[0] =~ / id='NumComments'>(\d+)</
      or die "no numcomments in <$_[0]>\n";

   $count
}
36
# Finisher stage: consumes [id, html, torrent] jobs from a channel (capacity
# 100), adapts the re-check interval of the id and stores page and torrent.
my $finish = new Coro::Channel 100;
my @finishers = async {
   {
      # swap a private variable in for the global $TXN while this coroutine
      # is running
      my $txn;
      $Coro::current->swap_sv (\$TXN, \$txn);
   }

   while (my $job = $finish->get) {
      # $TXN = $db_env->txn_begin;

      my ($id, $html, $torrent) = @$job;

      # keep only the text after the details heading and before the footer
      s/^.*?Details for this torrent//s, s/<div id="foot".*$//s
         for $html;

      my $info     = iget ($id);
      my $previous = get ($db_http, $id);

      # $! must be inspected right after the lookup: it is clear only when a
      # previously stored page was found
      if (!$!) {
         my $old_html = dic_decompress (0, $previous);

         # the new page must parse (dies otherwise); the old one may not
         my $new_count = http_extract $html;
         my $old_count = eval { http_extract $old_html };

         if ($new_count ne $old_count) {
            # comment count changed: halve the interval, never below 1
            ++$stat{change};
            $info->[1] = ($info->[1] >> 1) || 1;
         } else {
            # unchanged: double the interval, capped at 365
            ++$stat{same};
            $info->[1] *= 2;
            $info->[1] > 365
               and $info->[1] = 365;
         }
      }

      # size statistics before and after compression
      $stat{data} += length $html;
      length $html
         and $html = dic_compress (0, $html);
      $stat{cmpr} += length $html;

      $stat{torrent} += length $torrent;

      defined $torrent
         and put ($db_torrent, $id, $torrent);
      put ($db_http, $id, $html);
      iput ($id, [$TODAY, $info->[1]]); # remember today's check and the interval

      # db_txn_finish $TXN;
      # die "transaction failed" if $!;

      $TODO->remove ($id);
   }
};
85
86 #############################################################################
87
# Downloader stage: $NUM_DOWNLOADERS coroutines take torrent ids from a
# channel (capacity 200), fetch the details page and, when no torrent file is
# stored yet, the torrent file as well, then hand the result to $finish.
my $download = new Coro::Channel 200;
my @downloaders = map {
   async {
      while (my $i = $download->get) {
         my ($html, $hdr) = GET "https://$TPB/torrent/$i";

         if ($hdr->{Status} == 404) {
            # page does not exist: forget the id completely
            $TODO->remove ($i);
            del ($db_info, $i);
            next;
         }

         if ($hdr->{Status} != 200) {
            # any other failure: report it, the id stays in $TODO
            warn "https://$TPB/torrent/$i: status $hdr->{Status}\n";
            next;
         }

         # $! is clear when the torrent file is already stored; then only
         # the page needs to be passed on
         BDB::db_exists $db_torrent, undef, $i, 0;
         unless ($!) {
            $finish->put ([$i, $html, undef]);
            next;
         }

         my ($url) = $html =~ m%<a href="([^"]+)" title="Torrent File"%;

         if (defined $url) {
            # turn the link into an absolute https url
            $url =~ s/ /%20/g;
            $url =~ s%^//%https://%;
            $url =~ s%^/+%https://$TPB/%;

            my ($torrent, $hdr) = GET $url;

            if ($hdr->{Status} == 200) {
               $finish->put ([$i, $html, $torrent]);
            } elsif ($hdr->{Status} == 404) {
               # link present but file missing: store the page only
               ++$stat{notorrent};
               $finish->put ([$i, $html, undef]);

               #warn "https://$TPB/torrent/$i $url\n";
               #remove $TODO $i;
            } else {
               warn "$url https://$TPB/torrent/$i: status $hdr->{Status}\n";
            }
         } else {
            # page carries no torrent link at all: store the page only
            ++$stat{notorrentlink};
            $finish->put ([$i, $html, undef]);
         }
      }
   }
} 1..$NUM_DOWNLOADERS;
131
132 #############################################################################
133
# Manual maintenance switches: change a condition to 1 for a one-off run.

# Overwrite the stored "last_recent" value.
if (0) {
   #sput "last_recent", 7100000;
   sput "last_recent", 3210000;
}

# Checkpoint, compact the info database, checkpoint again, then stop without
# starting the scraper.
if (0) {
   db_env_txn_checkpoint $db_env;
   db_compact $db_info;
   db_env_txn_checkpoint $db_env;
   # "return" at file level is a runtime error ("Can't return outside a
   # subroutine") when the script is run directly, so leave via exit.
   exit;
}
145
# Main coroutine: extends the todo set with newly published ids, queues all
# ids that are due for a check, then shuts the worker stages down in order
# and leaves the event loop.
async {
   $TODO = Set::IntSpan->new (sget "todo");

   # add everything between the last id seen and the newest id on /recent
   if (1) {
      my $last = sget "last_recent";

      my ($html, $hdr) = GET "https://$TPB/recent";

      my ($next) = $html =~ m%<div class="detName">\s*<a href="/torrent/(\d+)/%
         or die "https://$TPB/recent: unable to parse recent torrent ($html)";
      #$next = 7043798;#d#

      $TODO->U ([[$last, $next]]);

      sput ("todo", "$TODO");
      sput ("last_recent", $next);
   }

   # persist the todo set every 10 seconds while the scrape is running
   my $sputter = AE::timer 10, 10, sub {
      async_pool { sput todo => "$TODO" };
   };

   # new torrents
   if (1) {
      for my $span ($TODO->spans) {
         my ($first, $final) = @$span;

         for my $i ($first .. $final) {
            $stat{stat} = $i;

            # $! is clear after db_exists when the key is present
            BDB::db_exists $db_http, undef, $i, 0;
            if (!$!) {
               BDB::db_exists $db_torrent, undef, $i, 0;
               if (!$!) {
                  # already have both
                  $TODO->remove ($i);
                  next;
               }
            }

            # mark as immediately due; presumably picked up by the refresh
            # pass below instead of being queued here -- TODO confirm
            iput ($i, [0, 1]);
            # $download->put ($i);#d#
         }
      }
   }

   # refresh
   if (1) {
      my $cursor = $db_info->cursor (undef, 0);
      my ($key, $value, @due);

      db_c_get $cursor, $key, $value, BDB::SET_RANGE;

      # walk all info records until the cursor reports an error via $!
      while (!$!) {
         my $info = decode_cbor $value;

         # due when last check + interval is not in the future
         push @due, $key
            # $download->put ($key)
            if $info->[0] + $info->[1] <= $TODAY;

         db_c_get $cursor, $key, $value, BDB::NEXT;
         #d# last if @due > 200000;#d#
      }

      # release the cursor before queueing the collected ids
      undef $cursor;

      for my $id (@due) {
         $download->put ($id);
      }
   }

   # drain the stages in pipeline order: downloaders, finishers, pool
   for my $stage (
      [$download, \@downloaders],
      [$finish,   \@finishers],
      [$pool,     \@pool],
   ) {
      my ($channel, $workers) = @$stage;

      $channel->shutdown;
      $_->join
         for @$workers;
   }

   sput ("todo", "$TODO");

   EV::unloop;
};

EV::loop;
231