ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.11
Committed: Thu Jun 5 15:34:00 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
Changes since 1.10: +175 -122 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 =head1 NAME
2
3 AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4
5 =head1 SYNOPSIS
6
7 use AnyEvent::HTTP;
8
9 =head1 DESCRIPTION
10
11 This module is an L<AnyEvent> user, you need to make sure that you use and
12 run a supported event loop.
13
14 This module implements a simple, stateless and non-blocking HTTP
15 client. It supports GET, POST and other request methods, cookies and more,
16 all on a very low level. It can follow redirects supports proxies and
17 automatically limits the number of connections to the values specified in
18 the RFC.
19
20 It should generally be a "good client" that is enough for most HTTP
21 tasks. Simple tasks should be simple, but complex tasks should still be
22 possible as the user retains control over request and response headers.
23
24 The caller is responsible for authentication management, cookies (if
25 the simplistic implementation in this module doesn't suffice), referer
26 and other high-level protocol details for which this module offers only
27 limited support.
28
29 =head2 METHODS
30
31 =over 4
32
33 =cut
34
35 package AnyEvent::HTTP;
36
37 use strict;
38 no warnings;
39
40 use Carp;
41
42 use AnyEvent ();
43 use AnyEvent::Util ();
44 use AnyEvent::Socket ();
45 use AnyEvent::Handle ();
46
47 use base Exporter::;
48
49 our $VERSION = '1.0';
50
51 our @EXPORT = qw(http_get http_request);
52
53 our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
54 our $MAX_RECURSE = 10;
55 our $MAX_PERSISTENT = 8;
56 our $PERSISTENT_TIMEOUT = 2;
57 our $TIMEOUT = 300;
58
59 # changing these is evil
60 our $MAX_PERSISTENT_PER_HOST = 2;
61 our $MAX_PER_HOST = 4;
62
63 our $PROXY;
64
65 my %KA_COUNT; # number of open keep-alive connections per host
66 my %CO_SLOT; # number of open connections, and wait queue, per host
67
68 =item http_get $url, key => value..., $cb->($data, $headers)
69
70 Executes an HTTP-GET request. See the http_request function for details on
71 additional parameters.
72
73 =item http_head $url, key => value..., $cb->($data, $headers)
74
75 Executes an HTTP-HEAD request. See the http_request function for details on
76 additional parameters.
77
78 =item http_post $url, $body, key => value..., $cb->($data, $headers)
79
80 Executes an HTTP-POST request with a request body of C<$bod>. See the
81 http_request function for details on additional parameters.
82
83 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84
85 Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86 must be an absolute http or https URL.
87
88 The callback will be called with the response data as first argument
89 (or C<undef> if it wasn't available due to errors), and a hash-ref with
90 response headers as second argument.
91
92 All the headers in that hash are lowercased. In addition to the response
93 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
94 C<Reason> contain the three parts of the HTTP Status-Line of the same
95 name. If the server sends a header multiple lines, then their contents
96 will be joined together with C<\x00>.
97
98 If an internal error occurs, such as not being able to resolve a hostname,
99 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
100 and the C<Reason> pseudo-header will contain an error message.
101
102 A typical callback might look like this:
103
104 sub {
105 my ($body, $hdr) = @_;
106
107 if ($hdr->{Status} =~ /^2/) {
108 ... everything should be ok
109 } else {
110 print "error, $hdr->{Status} $hdr->{Reason}\n";
111 }
112 }
113
114 Additional parameters are key-value pairs, and are fully optional. They
115 include:
116
117 =over 4
118
119 =item recurse => $count (default: $MAX_RECURSE)
120
121 Whether to recurse requests or not, e.g. on redirects, authentication
122 retries and so on, and how often to do so.
123
124 =item headers => hashref
125
126 The request headers to use.
127
128 =item timeout => $seconds
129
130 The time-out to use for various stages - each connect attempt will reset
131 the timeout, as will read or write activity. Default timeout is 5 minutes.
132
133 =item proxy => [$host, $port[, $scheme]] or undef
134
135 Use the given http proxy for all requests. If not specified, then the
136 default proxy (as specified by C<$ENV{http_proxy}>) is used.
137
138 C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
139 HTTPS.
140
141 =item body => $string
142
143 The request body, usually empty. Will be-sent as-is (future versions of
144 this module might offer more options).
145
146 =item cookie_jar => $hash_ref
147
148 Passing this parameter enables (simplified) cookie-processing, loosely
149 based on the original netscape specification.
150
151 The C<$hash_ref> must be an (initially empty) hash reference which will
152 get updated automatically. It is possible to save the cookie_jar to
153 persistent storage with something like JSON or Storable, but this is not
154 recommended, as expire times are currently being ignored.
155
156 Note that this cookie implementation is not of very high quality, nor
157 meant to be complete. If you want complete cookie management you have to
158 do that on your own. C<cookie_jar> is meant as a quick fix to get some
159 cookie-using sites working. Cookies are a privacy disaster, do not use
160 them unless required to.
161
162 =back
163
164 Example: make a simple HTTP GET request for http://www.nethype.de/
165
166 http_request GET => "http://www.nethype.de/", sub {
167 my ($body, $hdr) = @_;
168 print "$body\n";
169 };
170
171 Example: make a HTTP HEAD request on https://www.google.com/, use a
172 timeout of 30 seconds.
173
174 http_request
175 GET => "https://www.google.com",
176 timeout => 30,
177 sub {
178 my ($body, $hdr) = @_;
179 use Data::Dumper;
180 print Dumper $hdr;
181 }
182 ;
183
184 =cut
185
186 sub _slot_schedule($) {
187 my $host = shift;
188
189 while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
190 if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
191 # somebody wnats that slot
192 ++$CO_SLOT{$host}[0];
193
194 $cb->(AnyEvent::Util::guard {
195 --$CO_SLOT{$host}[0];
196 _slot_schedule $host;
197 });
198 } else {
199 # nobody wants the slot, maybe we can forget about it
200 delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
201 warn "$host deleted" unless $CO_SLOT{$host}[0];#d#
202 last;
203 }
204 }
205 }
206
207 # wait for a free slot on host, call callback
208 sub _get_slot($$) {
209 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
210
211 _slot_schedule $_[0];
212 }
213
214 sub http_request($$$;@) {
215 my $cb = pop;
216 my ($method, $url, %arg) = @_;
217
218 my %hdr;
219
220 $method = uc $method;
221
222 if (my $hdr = $arg{headers}) {
223 while (my ($k, $v) = each %$hdr) {
224 $hdr{lc $k} = $v;
225 }
226 }
227
228 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
229
230 return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
231 if $recurse < 0;
232
233 my $proxy = $arg{proxy} || $PROXY;
234 my $timeout = $arg{timeout} || $TIMEOUT;
235
236 $hdr{"user-agent"} ||= $USERAGENT;
237
238 my ($scheme, $authority, $upath, $query, $fragment) =
239 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
240
241 $scheme = lc $scheme;
242
243 my $uport = $scheme eq "http" ? 80
244 : $scheme eq "https" ? 443
245 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
246
247 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
248 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
249
250 my $uhost = $1;
251 $uport = $2 if defined $2;
252
253 $uhost =~ s/^\[(.*)\]$/$1/;
254 $upath .= "?$query" if length $query;
255
256 $upath =~ s%^/?%/%;
257
258 # cookie processing
259 if (my $jar = $arg{cookie_jar}) {
260 %$jar = () if $jar->{version} < 1;
261
262 my @cookie;
263
264 while (my ($chost, $v) = each %$jar) {
265 next unless $chost eq substr $uhost, -length $chost;
266 next unless $chost =~ /^\./;
267
268 while (my ($cpath, $v) = each %$v) {
269 next unless $cpath eq substr $upath, 0, length $cpath;
270
271 while (my ($k, $v) = each %$v) {
272 next if $scheme ne "https" && exists $v->{secure};
273 push @cookie, "$k=$v->{value}";
274 }
275 }
276 }
277
278 $hdr{cookie} = join "; ", @cookie
279 if @cookie;
280 }
281
282 my ($rhost, $rport, $rpath); # request host, port, path
283
284 if ($proxy) {
285 ($rhost, $rport, $scheme) = @$proxy;
286 $rpath = $url;
287 } else {
288 ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
289 $hdr{host} = $uhost;
290 }
291
292 $hdr{"content-length"} = length $arg{body};
293
294 my %state = (connect_guard => 1);
295
296 _get_slot $uhost, sub {
297 $state{slot_guard} = shift;
298
299 return unless $state{connect_guard};
300
301 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
302 $state{fh} = shift
303 or return $cb->(undef, { Status => 599, Reason => "$!" });
304
305 delete $state{connect_guard}; # reduce memory usage, save a tree
306
307 # get handle
308 $state{handle} = new AnyEvent::Handle
309 fh => $state{fh},
310 ($scheme eq "https" ? (tls => "connect") : ());
311
312 # limit the number of persistent connections
313 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
314 ++$KA_COUNT{$_[1]};
315 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
316 $hdr{connection} = "keep-alive";
317 delete $hdr{connection}; # keep-alive not yet supported
318 } else {
319 delete $hdr{connection};
320 }
321
322 # (re-)configure handle
323 $state{handle}->timeout ($timeout);
324 $state{handle}->on_error (sub {
325 %state = ();
326 $cb->(undef, { Status => 599, Reason => "$!" });
327 });
328 $state{handle}->on_eof (sub {
329 %state = ();
330 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
331 });
332
333 # send request
334 $state{handle}->push_write (
335 "$method $rpath HTTP/1.0\015\012"
336 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
337 . "\015\012"
338 . (delete $arg{body})
339 );
340
341 %hdr = (); # reduce memory usage, save a kitten
342
343 # status line
344 $state{handle}->push_read (line => qr/\015?\012/, sub {
345 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
346 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
347
348 my %hdr = ( # response headers
349 HTTPVersion => "\x00$1",
350 Status => "\x00$2",
351 Reason => "\x00$3",
352 );
353
354 # headers, could be optimized a bit
355 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
356 for ("$_[1]\012") {
357 # we support spaces in field names, as lotus domino
358 # creates them.
359 $hdr{lc $1} .= "\x00$2"
360 while /\G
361 ([^:\000-\037]+):
362 [\011\040]*
363 ((?: [^\015\012]+ | \015?\012[\011\040] )*)
364 \015?\012
365 /gxc;
366
367 /\G$/
368 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
369 }
370
371 substr $_, 0, 1, ""
372 for values %hdr;
373
374 my $finish = sub {
375 %state = ();
376
377 # set-cookie processing
378 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
379 for (split /\x00/, $hdr{"set-cookie"}) {
380 my ($cookie, @arg) = split /;\s*/;
381 my ($name, $value) = split /=/, $cookie, 2;
382 my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
383
384 my $cdom = (delete $kv{domain}) || $uhost;
385 my $cpath = (delete $kv{path}) || "/";
386
387 $cdom =~ s/^.?/./; # make sure it starts with a "."
388
389 next if $cdom =~ /\.$/;
390
391 # this is not rfc-like and not netscape-like. go figure.
392 my $ndots = $cdom =~ y/.//;
393 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
394
395 # store it
396 $arg{cookie_jar}{version} = 1;
397 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
398 }
399 }
400
401 if ($_[1]{Status} =~ /^x30[12]$/ && $recurse) {
402 # microsoft and other assholes don't give a shit for following standards,
403 # try to support a common form of broken Location header.
404 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
405
406 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
407 } else {
408 $cb->($_[0], $_[1]);
409 }
410 };
411
412 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
413 $finish->(undef, \%hdr);
414 } else {
415 if (exists $hdr{"content-length"}) {
416 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
417 # could cache persistent connection now
418 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
419 # but we don't, due to misdesigns, this is annoyingly complex
420 };
421
422 $finish->($_[1], \%hdr);
423 });
424 } else {
425 # too bad, need to read until we get an error or EOF,
426 # no way to detect winged data.
427 $_[0]->on_error (sub {
428 $finish->($_[0]{rbuf}, \%hdr);
429 });
430 $_[0]->on_eof (undef);
431 $_[0]->on_read (sub { });
432 }
433 }
434 });
435 });
436 }, sub {
437 $timeout
438 };
439 };
440
441 defined wantarray && AnyEvent::Util::guard { %state = () }
442 }
443
444 sub http_get($$;@) {
445 unshift @_, "GET";
446 &http_request
447 }
448
449 sub http_head($$;@) {
450 unshift @_, "HEAD";
451 &http_request
452 }
453
454 sub http_post($$$;@) {
455 unshift @_, "POST", "body";
456 &http_request
457 }
458
459 =back
460
461 =head2 GLOBAL FUNCTIONS AND VARIABLES
462
463 =over 4
464
465 =item AnyEvent::HTTP::set_proxy "proxy-url"
466
467 Sets the default proxy server to use. The proxy-url must begin with a
468 string of the form C<http://host:port> (optionally C<https:...>).
469
470 =item $AnyEvent::HTTP::MAX_RECURSE
471
472 The default value for the C<recurse> request parameter (default: C<10>).
473
474 =item $AnyEvent::HTTP::USERAGENT
475
476 The default value for the C<User-Agent> header (the default is
477 C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
478
479 =item $AnyEvent::HTTP::MAX_PERSISTENT
480
481 The maximum number of persistent connections to keep open (default: 8).
482
483 Not implemented currently.
484
485 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
486
487 The maximum time to cache a persistent connection, in seconds (default: 2).
488
489 Not implemented currently.
490
491 =back
492
493 =cut
494
495 sub set_proxy($) {
496 $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
497 }
498
499 # initialise proxy from environment
500 set_proxy $ENV{http_proxy};
501
502 =head1 SEE ALSO
503
504 L<AnyEvent>.
505
506 =head1 AUTHOR
507
508 Marc Lehmann <schmorp@schmorp.de>
509 http://home.schmorp.de/
510
511 =cut
512
513 1
514