1 |
#!/opt/bin/perl |
2 |
|
3 |
BEGIN { require "common.pl" } |
4 |
our ($TXN, %stat, $db_env, $TPB, $db_info, $TODAY); |
5 |
|
6 |
use common::sense; |
7 |
use Compress::LZF; |
8 |
use BDB; |
9 |
use JSON::XS; |
10 |
use HTML::Entities; |
11 |
use Date::Parse; |
12 |
|
13 |
############################################################################# |
14 |
|
15 |
our $db_http = table "http"; |
16 |
our $db_torrent = table "torrent"; |
17 |
|
18 |
############################################################################# |
19 |
|
20 |
#db_compact $db_http, undef, undef, undef, undef, 0, undef; |
21 |
|
22 |
my $c = $db_http->cursor; |
23 |
my @del; |
24 |
|
25 |
#open my $out, "| mbuffer -s4M -m64M | lzma -9 >tpb.json.lzma" |
26 |
open my $out, "| mbuffer -s4M -m64M >tpb.json" |
27 |
or die "tpb.json: $!"; |
28 |
|
29 |
sub ashtml($) { |
30 |
local $_ = shift; |
31 |
|
32 |
s/^\s+//; |
33 |
s/\s+$//; |
34 |
s/\015//g; |
35 |
|
36 |
utf8::decode $_; |
37 |
|
38 |
$_ |
39 |
} |
40 |
|
41 |
sub astext($) { |
42 |
local $_ = ashtml shift; |
43 |
|
44 |
#s%<a href="([^"]+).*?>.*?</a>%URL<$1>%g; |
45 |
#s%<br\s*/?>%\n%g; |
46 |
|
47 |
HTML::Entities::decode_entities $_; |
48 |
utf8::decode $_; |
49 |
|
50 |
$_ |
51 |
} |
52 |
|
53 |
sub astime($) { |
54 |
str2time $_[0] |
55 |
} |
56 |
|
57 |
while () { |
58 |
db_c_get $c, my $key, my $data, BDB::NEXT; |
59 |
last if $!; |
60 |
|
61 |
syswrite STDOUT, "\r$key "; |
62 |
|
63 |
eval { |
64 |
for (dic_decompress 0, $data) { |
65 |
my @info; |
66 |
|
67 |
m%title="More from this category">(.*?)<%s or die "$_: no category"; |
68 |
$info[0] = [split / > /, $1]; |
69 |
|
70 |
$info[1] = $key+0; |
71 |
|
72 |
m%<div id="title">(.*?)</div>%s or die "$_: no title"; |
73 |
$info[2] = astext $1; |
74 |
|
75 |
m% href="magnet:\?xt=urn:btih:([0-9a-f]*)&dn% or die "$_: no magnet"; |
76 |
$info[3] = $1; |
77 |
|
78 |
$info[4] = [m%<a href="/tag/([^"]+)%g]; |
79 |
|
80 |
m%<dt>By:</dt>.*?<dd>.*?<a href="/user/([^/]+)/%s |
81 |
and $info[5] = astext $1; |
82 |
|
83 |
m%<dt>Uploaded:</dt>.*?<dd>(.*?)</dd>%s or die "$_: no uploaded"; |
84 |
$info[6] = astime $1; |
85 |
|
86 |
m%<div class="nfo">\s*<pre>(.*?)</pre>%s or die "$_: no nfo"; |
87 |
$info[7] = astext $1; |
88 |
|
89 |
my @comments; |
90 |
|
91 |
while ( |
92 |
m% |
93 |
<div\sid="comment-\d+"> |
94 |
<p\sclass="byline"> |
95 |
\s* |
96 |
<a\shref="/user/([^/]+)/".*?\sat\s(....-..-..\s..:..\sCET):\s*</p> |
97 |
<div\sclass="comment">(.*?)</div>\s*</div> |
98 |
%gsx |
99 |
) { |
100 |
my ($user, $date, $comment) = ($1, $2, $3); |
101 |
push @comments, [$user, astime $date, astext $comment]; |
102 |
} |
103 |
|
104 |
$info[8] = \@comments; |
105 |
|
106 |
print $out JSON::XS::encode_json \@info, "\n"; |
107 |
} |
108 |
}; |
109 |
|
110 |
if ($@) { |
111 |
print STDOUT "$@"; |
112 |
push @del, $key; |
113 |
} |
114 |
} |
115 |
|
116 |
undef $c; |
117 |
|
118 |
if (@del) { |
119 |
for (@del) { |
120 |
iput $_, [0, 1]; |
121 |
db_del $db_torrent, undef, $_; |
122 |
db_del $db_http , undef, $_; |
123 |
} |
124 |
} |
125 |
|