ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.30
Committed: Thu Oct 23 02:46:20 2008 UTC (15 years, 6 months ago) by root
Branch: MAIN
Changes since 1.29: +27 -11 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects, supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44     use Carp;
45    
46     use AnyEvent ();
47     use AnyEvent::Util ();
48     use AnyEvent::Socket ();
49     use AnyEvent::Handle ();
50    
51     use base Exporter::;
52    
# module version, also interpolated into the default User-Agent string
our $VERSION = '1.05';

# functions exported into the caller's namespace by default
our @EXPORT = qw(
   http_get
   http_post
   http_head
   http_request
);

# user-tunable defaults
our $USERAGENT          = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
our $MAX_RECURSE        = 10;  # default for the "recurse" request parameter
our $MAX_PERSISTENT     = 8;   # documented below as not implemented currently
our $PERSISTENT_TIMEOUT = 2;   # documented below as not implemented currently
our $TIMEOUT            = 300; # default for the "timeout" request parameter, in seconds

# changing these is evil
our $MAX_PERSISTENT_PER_HOST = 2;
our $MAX_PER_HOST            = 4; # connection slots handed out per host by _slot_schedule

our $PROXY;      # default proxy as [$host, $port, $scheme], see set_proxy
our $ACTIVE = 0; # number of connection slots currently handed out

my %KA_COUNT; # number of open keep-alive connections per host
my %CO_SLOT;  # number of open connections, and wait queue, per host
72 root 1.1
73     =item http_get $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-GET request. See the http_request function for details on
76 root 1.29 additional parameters and the return value.
77 root 1.1
78 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
79    
80 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
81     on additional parameters and the return value.
82 root 1.5
83     =item http_post $url, $body, key => value..., $cb->($data, $headers)
84 root 1.3
85 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
86 root 1.29 http_request function for details on additional parameters and the return
87     value.
88 root 1.3
89 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
90    
91     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
92     must be an absolute http or https URL.
93    
94 root 1.29 When called in void context, nothing is returned. In other contexts,
95     C<http_request> returns a "cancellation guard" - you have to keep the
96     object alive at least until the callback gets called. If the object gets
97     destroyed before the callback is called, the request will be cancelled.
98    
99 root 1.2 The callback will be called with the response data as first argument
100     (or C<undef> if it wasn't available due to errors), and a hash-ref with
101     response headers as second argument.
102    
103 root 1.7 All the headers in that hash are lowercased. In addition to the response
104 root 1.20 headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason>
105     contain the three parts of the HTTP Status-Line of the same name. The
106     pseudo-header C<URL> contains the original URL (which can differ from the
107     requested URL when following redirects).
108    
109     If the server sends the same header multiple times, then their contents will be
110     joined together with C<\x00>.
111 root 1.2
112     If an internal error occurs, such as not being able to resolve a hostname,
113     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
114     and the C<Reason> pseudo-header will contain an error message.
115    
116 root 1.6 A typical callback might look like this:
117    
118     sub {
119     my ($body, $hdr) = @_;
120    
121     if ($hdr->{Status} =~ /^2/) {
122     ... everything should be ok
123     } else {
124     print "error, $hdr->{Status} $hdr->{Reason}\n";
125     }
126     }
127    
128 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
129     include:
130    
131     =over 4
132    
133 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
134 root 1.1
135     Whether to recurse requests or not, e.g. on redirects, authentication
136 root 1.3 retries and so on, and how often to do so.
137 root 1.1
138     =item headers => hashref
139    
140 root 1.12 The request headers to use. Currently, C<http_request> may provide its
141     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
142     and will provide defaults for C<User-Agent:> and C<Referer:>.
143 root 1.1
144     =item timeout => $seconds
145    
146     The time-out to use for various stages - each connect attempt will reset
147 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
148    
149     =item proxy => [$host, $port[, $scheme]] or undef
150    
151     Use the given http proxy for all requests. If not specified, then the
152     default proxy (as specified by C<$ENV{http_proxy}>) is used.
153    
154     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
155     HTTPS.
156 root 1.1
157 root 1.3 =item body => $string
158    
159     The request body, usually empty. Will be sent as-is (future versions of
160     this module might offer more options).
161    
162 root 1.10 =item cookie_jar => $hash_ref
163    
164     Passing this parameter enables (simplified) cookie-processing, loosely
165     based on the original netscape specification.
166    
167     The C<$hash_ref> must be an (initially empty) hash reference which will
168     get updated automatically. It is possible to save the cookie_jar to
169     persistent storage with something like JSON or Storable, but this is not
170     recommended, as expire times are currently being ignored.
171    
172     Note that this cookie implementation is not of very high quality, nor
173     meant to be complete. If you want complete cookie management you have to
174     do that on your own. C<cookie_jar> is meant as a quick fix to get some
175     cookie-using sites working. Cookies are a privacy disaster, do not use
176     them unless required to.
177    
178 root 1.1 =back
179    
180 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
181    
182     http_request GET => "http://www.nethype.de/", sub {
183     my ($body, $hdr) = @_;
184     print "$body\n";
185     };
186    
187     Example: make a HTTP HEAD request on https://www.google.com/, use a
188     timeout of 30 seconds.
189    
190     http_request
191     HEAD => "https://www.google.com",
192     timeout => 30,
193     sub {
194     my ($body, $hdr) = @_;
195     use Data::Dumper;
196     print Dumper $hdr;
197     }
198     ;
199 root 1.1
200 root 1.29 Example: make another simple HTTP GET request, but immediately try to
201     cancel it.
202    
203     my $request = http_request GET => "http://www.nethype.de/", sub {
204     my ($body, $hdr) = @_;
205     print "$body\n";
206     };
207    
208     undef $request;
209    
210 root 1.1 =cut
211    
# Hand out free connection slots for $host to the requests waiting for one.
#
# $CO_SLOT{$host} is a pair: [0] counts the slots currently in use, [1] is
# the queue of callbacks waiting for a slot. Each waiter is called with a
# guard object; when that guard is destroyed the slot is given back and the
# queue for the host is scheduled again.
sub _slot_schedule($) {
   my ($host) = @_;

   for (;;) {
      # all slots for this host are taken, waiters stay queued
      last unless $CO_SLOT{$host}[0] < $MAX_PER_HOST;

      my $waiter = shift @{ $CO_SLOT{$host}[1] };

      unless ($waiter) {
         # nobody wants the slot, maybe we can forget about the host entry
         delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
         last;
      }

      # somebody wants that slot
      ++$CO_SLOT{$host}[0];
      ++$ACTIVE;

      $waiter->(AnyEvent::Util::guard (sub {
         --$ACTIVE;
         --$CO_SLOT{$host}[0];
         _slot_schedule ($host);
      }));
   }
}
234    
# Queue $cb as a waiter for a connection slot on $host, then run the
# scheduler so it is called as soon as a slot is free (possibly at once).
# $cb receives the slot guard as its only argument.
sub _get_slot($$) {
   my ($host, $cb) = @_;

   push @{ $CO_SLOT{$host}[1] }, $cb;

   _slot_schedule ($host);
}
241    
# http_request $method => $url, key => value..., $cb->($data, $headers)
#
# Issues one HTTP/1.0 request and reports the outcome through $cb, which is
# called with the body (or undef) and a hash-ref of lowercased response
# headers plus the pseudo-headers HTTPVersion, Status, Reason and URL.
# Internal failures are reported as Status 599 with the message in Reason.
#
# Recognised keys in the key/value list: recurse, headers, timeout, proxy,
# body and cookie_jar.
#
# Returns a cancellation guard in non-void context (destroying it drops all
# request state); early parameter errors return whatever $cb returned.
#
# Implementation outline: parse the URL, build the request headers (including
# cookies), wait for a per-host connection slot, connect, write the request,
# then read status line, headers and body, following redirects by calling
# http_request again with a decremented recurse count.
sub http_request($$@) {
   my $cb = pop;
   my ($method, $url, %arg) = @_;

   my %hdr; # request headers, keys lowercased

   $method = uc $method;

   if (my $hdr = $arg{headers}) {
      while (my ($k, $v) = each %$hdr) {
         $hdr{lc $k} = $v;
      }
   }

   my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;

   return $cb->(undef, { Status => 599, Reason => "recursion limit reached", URL => $url })
      if $recurse < 0;

   my $proxy   = $arg{proxy}   || $PROXY;
   my $timeout = $arg{timeout} || $TIMEOUT;

   $hdr{"user-agent"} ||= $USERAGENT;

   my ($scheme, $authority, $upath, $query, $fragment) =
      $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

   $scheme = lc $scheme;

   my $uport = $scheme eq "http"  ?  80
             : $scheme eq "https" ? 443
             : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url });

   # default referer: leave out fragment and query string, just a heuristic.
   # any "user:pass@" userinfo is stripped so credentials are not leaked.
   unless ($hdr{referer}) {
      (my $ref_authority = $authority) =~ s/^.*\@//;
      $hdr{referer} = "$scheme://$ref_authority$upath";
   }

   $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
      or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url });

   my $uhost = $1;
   $uport = $2 if defined $2;

   $uhost =~ s/^\[(.*)\]$/$1/;
   $upath .= "?$query" if length $query;

   $upath =~ s%^/?%/%; # make sure the path starts with a slash

   # cookie processing: collect all cookies in the jar matching host, path and scheme
   if (my $jar = $arg{cookie_jar}) {
      %$jar = () if $jar->{version} < 1;

      my @cookie;

      while (my ($chost, $v) = each %$jar) {
         if ($chost =~ /^\./) {
            # domain cookie, must match the tail of the host name
            next unless $chost eq substr $uhost, -length $chost;
         } elsif ($chost =~ /\./) {
            # host cookie, must match exactly
            next unless $chost eq $uhost;
         } else {
            # not a host name (e.g. the "version" key)
            next;
         }

         while (my ($cpath, $v) = each %$v) {
            next unless $cpath eq substr $upath, 0, length $cpath;

            while (my ($k, $v) = each %$v) {
               next if $scheme ne "https" && exists $v->{secure};
               push @cookie, "$k=$v->{value}";
            }
         }
      }

      $hdr{cookie} = join "; ", @cookie
         if @cookie;
   }

   # request host, port, scheme, path - these describe the peer we actually
   # connect to, which is the proxy if one is in use. $scheme itself is left
   # untouched, as it is needed later to resolve relative redirects.
   my ($rhost, $rport, $rscheme, $rpath);

   if ($proxy) {
      ($rhost, $rport, $rscheme) = @$proxy;
      $rpath = $url;
   } else {
      ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $scheme, $upath);
      $hdr{host} = $uhost;
   }

   # always send a numeric value, length (undef) is undef on newer perls
   $hdr{"content-length"} = defined $arg{body} ? length ($arg{body}) : 0;

   # everything belonging to the running request lives in %state,
   # emptying it cancels the request and releases the connection slot
   my %state = (connect_guard => 1);

   _get_slot ($uhost, sub {
      $state{slot_guard} = shift;

      # request was cancelled while waiting for the slot
      return unless $state{connect_guard};

      $state{connect_guard} = AnyEvent::Socket::tcp_connect ($rhost, $rport, sub {
         $state{fh} = shift
            or do {
               # connect failed: release slot and other state before reporting,
               # otherwise the slot stays taken as long as the caller keeps
               # the cancellation guard
               my $err = "$!";
               %state = ();
               return $cb->(undef, { Status => 599, Reason => $err, URL => $url });
            };

         delete $state{connect_guard}; # reduce memory usage, save a tree

         # get handle
         $state{handle} = AnyEvent::Handle->new (
            fh => $state{fh},
            ($rscheme eq "https" ? (tls => "connect") : ()),
         );

         # limit the number of persistent connections. keep-alive is not yet
         # supported, so this is bookkeeping only and no Connection header
         # is ever sent.
         if ($KA_COUNT{$uhost} < $MAX_PERSISTENT_PER_HOST) {
            ++$KA_COUNT{$uhost};
            $state{handle}{ka_count_guard} = AnyEvent::Util::guard (sub {
               --$KA_COUNT{$uhost} or delete $KA_COUNT{$uhost};
            });
         }

         delete $hdr{connection};

         # (re-)configure handle
         $state{handle}->timeout ($timeout);
         $state{handle}->on_error (sub {
            my $errno = "$!";
            %state = ();
            $cb->(undef, { Status => 599, Reason => $errno, URL => $url });
         });
         $state{handle}->on_eof (sub {
            %state = ();
            $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url });
         });

         # send request
         $state{handle}->push_write (
            "$method $rpath HTTP/1.0\015\012"
            . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
            . "\015\012"
            . (delete $arg{body})
         );

         %hdr = (); # reduce memory usage, save a kitten

         # status line
         $state{handle}->push_read (line => qr/\015?\012/, sub {
            $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
               or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));

            # response headers. every value starts with a \x00 byte for now,
            # as repeated headers are appended as "\x00value" below; the
            # leading byte is cut off once all headers have been parsed.
            my %hdr = (
               HTTPVersion => "\x00$1",
               Status      => "\x00$2",
               Reason      => "\x00$3",
               URL         => "\x00$url"
            );

            # headers, could be optimized a bit
            $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
               for ("$_[1]\012") {
                  # we support spaces in field names, as lotus domino
                  # creates them (the original author notes that spaces
                  # around separators are a security issue).
                  $hdr{lc $1} .= "\x00$2"
                     while /\G
                           ([^:\000-\037]+):
                           [\011\040]*
                           ((?: [^\015\012]+ | \015?\012[\011\040] )*)
                           \015?\012
                        /gxc;

                  # anything left over means the header block was malformed
                  /\G$/
                    or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url }));
               }

               # remove the leading \x00 separator from every value
               substr $_, 0, 1, ""
                  for values %hdr;

               # called as $finish->($body, \%hdr) once the response is complete
               my $finish = sub {
                  # TODO: use destroy method, when/if available
                  #$state{handle}->destroy;
                  $state{handle}->on_eof (undef);
                  $state{handle}->on_error (undef);
                  %state = ();

                  # set-cookie processing
                  if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
                     for (split /\x00/, $hdr{"set-cookie"}) {
                        my ($cookie, @arg) = split /;\s*/;
                        my ($name, $value) = split /=/, $cookie, 2;

                        # attributes may come without a value ("secure") and
                        # in any case ("Path"), so build the hash pair by pair
                        # instead of flattening the split results
                        my %kv;
                        for my $attr (@arg) {
                           my ($k, $v) = split /=/, $attr, 2;
                           next unless length $k;
                           $kv{lc $k} = $v;
                        }
                        $kv{value} = $value;

                        my $cdom;
                        my $cpath = (delete $kv{path}) || "/";

                        if (exists $kv{domain}) {
                           $cdom = delete $kv{domain};

                           $cdom =~ s/^\.?/./; # make sure it starts with a "."

                           next if $cdom =~ /\.$/;

                           # this is not rfc-like and not netscape-like. go figure.
                           my $ndots = $cdom =~ y/.//;
                           next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);

                           # a host may only set cookies for domains it belongs to
                           next unless lc ($cdom) eq lc (substr ".$uhost", -length $cdom);
                        } else {
                           $cdom = $uhost;
                        }

                        # store it
                        $arg{cookie_jar}{version} = 1;
                        $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
                     }
                  }

                  # many servers send relative or otherwise non-conforming
                  # Location headers, try to support some common forms by
                  # resolving them against the URL that was requested.
                  if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
                     $_[1]{location} =~ s/^\.\/+//;

                     my $base = "$scheme://$uhost:$uport";

                     unless ($_[1]{location} =~ s/^\///) {
                        # relative to the directory of the requested path
                        $base .= $upath;
                        $base =~ s/\/[^\/]*$//;
                     }

                     $_[1]{location} = "$base/$_[1]{location}";
                  }

                  if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
                     # apparently, mozilla et al. just change POST to GET here
                     # more research is needed before we do the same
                     http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
                  } elsif ($_[1]{Status} == 303 && $recurse) {
                     # even http/1.1 is unclear on how to mutate the method
                     $method = "GET" unless $method eq "HEAD";
                     http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
                  } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
                     http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
                  } else {
                     $cb->($_[0], $_[1]);
                  }
               };

               if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
                  # these responses carry no body
                  $finish->(undef, \%hdr);
               } else {
                  if (exists $hdr{"content-length"}) {
                     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
                        # a persistent connection could be cached here,
                        # but keep-alive is not implemented

                        $finish->($_[1], \%hdr);
                     });
                  } else {
                     # too bad, need to read until we get an error or EOF,
                     # no way to detect winged data.
                     $_[0]->on_error (sub {
                        $finish->($_[0]{rbuf}, \%hdr);
                     });
                     $_[0]->on_eof (undef);
                     $_[0]->on_read (sub { });
                  }
               }
            });
         });
      }, sub {
         # prepare callback for tcp_connect, its return value is the connect timeout
         $timeout
      });
   });

   defined wantarray && AnyEvent::Util::guard (sub { %state = () })
}
510    
# http_get $url, key => value..., $cb - shorthand for a GET http_request.
# Called with the & sigil so the ($$@) prototype of http_request does not
# apply to the argument list; the caller's context is passed through.
sub http_get($@) {
   return &http_request ("GET", @_);
}
515    
# http_head $url, key => value..., $cb - shorthand for a HEAD http_request.
# Called with the & sigil so the ($$@) prototype of http_request does not
# apply to the argument list; the caller's context is passed through.
sub http_head($@) {
   return &http_request ("HEAD", @_);
}
520    
# http_post $url, $body, key => value..., $cb - shorthand for a POST
# http_request; the second argument is handed over as the "body" parameter.
sub http_post($$@) {
   my $url = shift;

   # what is left in @_ starts with the body, so "body" pairs up with it
   return &http_request ("POST", $url, "body", @_);
}
526    
527 root 1.9 =back
528    
529 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
530 root 1.1
531     =over 4
532    
533 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
534    
535     Sets the default proxy server to use. The proxy-url must begin with a
536     string of the form C<http://host:port> (optionally C<https:...>).
537    
538 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
539 root 1.1
540 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
541 root 1.1
542     =item $AnyEvent::HTTP::USERAGENT
543    
544     The default value for the C<User-Agent> header (the default is
545     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
546    
547     =item $AnyEvent::HTTP::MAX_PERSISTENT
548    
549     The maximum number of persistent connections to keep open (default: 8).
550    
551 root 1.3 Not implemented currently.
552    
553 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
554    
555 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
556 root 1.1
557 root 1.3 Not implemented currently.
558    
559 root 1.14 =item $AnyEvent::HTTP::ACTIVE
560    
561     The number of active connections. This is not the number of currently
562     running requests, but the number of currently open and non-idle TCP
563     connections. This number can be useful for load-leveling.
564    
565 root 1.1 =back
566    
567     =cut
568    
# set_proxy "proxy-url"
#
# Sets the default proxy ($PROXY) from a URL starting with http://host[:port]
# or https://host[:port]. The port defaults to 3128 when missing or empty.
# $PROXY is stored as [$host, $port, $scheme]; the scheme is lowercased
# because http_request compares it against "https" to decide whether to use
# TLS, while the URL itself is matched case-insensitively.
#
# An undefined or unparsable argument leaves the current proxy unchanged.
sub set_proxy($) {
   my ($proxy_url) = @_;

   return unless defined $proxy_url;

   $PROXY = [$2, $3 || 3128, lc $1]
      if $proxy_url =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
}

# initialise proxy from environment
set_proxy ($ENV{http_proxy});
575    
576 root 1.1 =head1 SEE ALSO
577    
578     L<AnyEvent>.
579    
580     =head1 AUTHOR
581    
582 root 1.18 Marc Lehmann <schmorp@schmorp.de>
583     http://home.schmorp.de/
584 root 1.1
585     =cut
586    
587     1
588