ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.29
Committed: Wed Oct 22 23:28:11 2008 UTC (15 years, 6 months ago) by root
Branch: MAIN
Changes since 1.28: +20 -4 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects, supports proxies, and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44     use Carp;
45    
46     use AnyEvent ();
47     use AnyEvent::Util ();
48     use AnyEvent::Socket ();
49     use AnyEvent::Handle ();
50    
51     use base Exporter::;
52    
# --- module identity and exports -------------------------------------------

our $VERSION = '1.05';

our @EXPORT = qw(http_get http_post http_head http_request);

# --- user-tunable defaults ---------------------------------------------------

# default User-Agent header value
our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";

our $MAX_RECURSE        = 10;  # default for the "recurse" request parameter
our $MAX_PERSISTENT     = 8;   # persistent connection limit (not implemented currently)
our $PERSISTENT_TIMEOUT = 2;   # seconds to cache a persistent connection (not implemented currently)
our $TIMEOUT            = 300; # default per-stage timeout, in seconds

# changing these is evil
our $MAX_PERSISTENT_PER_HOST = 2;
our $MAX_PER_HOST            = 4;

# --- runtime state -------------------------------------------------------------

our $PROXY;      # default proxy as [$host, $port, $scheme], see set_proxy
our $ACTIVE = 0; # number of connection slots currently in use

my %KA_COUNT; # number of open keep-alive connections per host
my %CO_SLOT;  # number of open connections, and wait queue, per host
72 root 1.1
73     =item http_get $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-GET request. See the http_request function for details on
76 root 1.29 additional parameters and the return value.
77 root 1.1
78 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
79    
80 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
81     on additional parameters and the return value.
82 root 1.5
83     =item http_post $url, $body, key => value..., $cb->($data, $headers)
84 root 1.3
85 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
86 root 1.29 http_request function for details on additional parameters and the return
87     value.
88 root 1.3
89 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
90    
91     Executes an HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
92     must be an absolute http or https URL.
93    
94 root 1.29 When called in void context, nothing is returned. In other contexts,
95     C<http_request> returns a "cancellation guard" - you have to keep the
96     object alive at least until the callback gets called. If the object gets
97     destroyed before the callback is called, the request will be cancelled.
98    
99 root 1.2 The callback will be called with the response data as first argument
100     (or C<undef> if it wasn't available due to errors), and a hash-ref with
101     response headers as second argument.
102    
103 root 1.7 All the headers in that hash are lowercased. In addition to the response
104 root 1.20 headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason>
105     contain the three parts of the HTTP Status-Line of the same name. The
106     pseudo-header C<URL> contains the original URL (which can differ from the
107     requested URL when following redirects).
108    
109     If the server sends a header multiple times, then their contents will be
110     joined together with C<\x00>.
111 root 1.2
112     If an internal error occurs, such as not being able to resolve a hostname,
113     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
114     and the C<Reason> pseudo-header will contain an error message.
115    
116 root 1.6 A typical callback might look like this:
117    
118     sub {
119     my ($body, $hdr) = @_;
120    
121     if ($hdr->{Status} =~ /^2/) {
122     ... everything should be ok
123     } else {
124     print "error, $hdr->{Status} $hdr->{Reason}\n";
125     }
126     }
127    
128 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
129     include:
130    
131     =over 4
132    
133 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
134 root 1.1
135     Whether to recurse requests or not, e.g. on redirects, authentication
136 root 1.3 retries and so on, and how often to do so.
137 root 1.1
138     =item headers => hashref
139    
140 root 1.12 The request headers to use. Currently, C<http_request> may provide its
141     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
142     and will provide defaults for C<User-Agent:> and C<Referer:>.
143 root 1.1
144     =item timeout => $seconds
145    
146     The time-out to use for various stages - each connect attempt will reset
147 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
148    
149     =item proxy => [$host, $port[, $scheme]] or undef
150    
151     Use the given http proxy for all requests. If not specified, then the
152     default proxy (as specified by C<$ENV{http_proxy}>) is used.
153    
154     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
155     HTTPS.
156 root 1.1
157 root 1.3 =item body => $string
158    
159     The request body, usually empty. Will be sent as-is (future versions of
160     this module might offer more options).
161    
162 root 1.10 =item cookie_jar => $hash_ref
163    
164     Passing this parameter enables (simplified) cookie-processing, loosely
165     based on the original netscape specification.
166    
167     The C<$hash_ref> must be an (initially empty) hash reference which will
168     get updated automatically. It is possible to save the cookie_jar to
169     persistent storage with something like JSON or Storable, but this is not
170     recommended, as expire times are currently being ignored.
171    
172     Note that this cookie implementation is not of very high quality, nor
173     meant to be complete. If you want complete cookie management you have to
174     do that on your own. C<cookie_jar> is meant as a quick fix to get some
175     cookie-using sites working. Cookies are a privacy disaster, do not use
176     them unless required to.
177    
178 root 1.1 =back
179    
180 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
181    
182     http_request GET => "http://www.nethype.de/", sub {
183     my ($body, $hdr) = @_;
184     print "$body\n";
185     };
186    
187     Example: make an HTTP HEAD request on https://www.google.com/, use a
188     timeout of 30 seconds.
189    
190     http_request
191     HEAD => "https://www.google.com",
192     timeout => 30,
193     sub {
194     my ($body, $hdr) = @_;
195     use Data::Dumper;
196     print Dumper $hdr;
197     }
198     ;
199 root 1.1
200 root 1.29 Example: make another simple HTTP GET request, but immediately try to
201     cancel it.
202    
203     my $request = http_request GET => "http://www.nethype.de/", sub {
204     my ($body, $hdr) = @_;
205     print "$body\n";
206     };
207    
208     undef $request;
209    
210 root 1.1 =cut
211    
sub _slot_schedule;

# Hand out free connection slots for $host to queued waiters.
#
# $CO_SLOT{$host} is [number of slots in use, [queued callbacks]]. Each
# waiter is called with a guard object; when that guard is destroyed the
# slot is returned and scheduling runs again for the same host.
sub _slot_schedule($) {
   my ($host) = @_;

   for (;;) {
      # all slots for this host are taken, waiters stay queued
      last unless $CO_SLOT{$host}[0] < $MAX_PER_HOST;

      my $waiter = shift @{ $CO_SLOT{$host}[1] };

      unless ($waiter) {
         # nobody wants the slot, maybe we can forget about the host entry
         delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
         last;
      }

      # somebody wants that slot
      ++$CO_SLOT{$host}[0];
      ++$ACTIVE;

      $waiter->(AnyEvent::Util::guard(sub {
         --$ACTIVE;
         --$CO_SLOT{$host}[0];
         _slot_schedule($host);
      }));
   }
}
234    
# Queue $cb for a connection slot on $host; it is called with the slot
# guard as soon as a slot is free (possibly immediately).
sub _get_slot($$) {
   my ($host, $cb) = @_;

   push @{ $CO_SLOT{$host}[1] }, $cb;

   _slot_schedule($host);
}
241    
# http_request $method => $url, key => value..., $cb->($data, $headers)
#
# Issues a single HTTP/1.0 request and calls $cb with the body (or undef)
# and a hash-ref of lowercased response headers plus the pseudo-headers
# HTTPVersion, Status, Reason and URL. Internal errors are reported with
# Status 599. Options read from the key/value pairs: headers, recurse,
# proxy, timeout, body, cookie_jar.
#
# In non-void context a guard is returned; destroying it clears the
# per-request state (%state), which drops the connection and slot.
# NOTE(review): a follow-up request started for a redirect is not tied to
# the returned guard, so cancelling after a redirect has no effect.
sub http_request($$@) {
   my $cb = pop;
   my ($method, $url, %arg) = @_;

   my %hdr; # request headers, keyed by lowercased name

   $method = uc $method;

   if (my $hdr = $arg{headers}) {
      while (my ($k, $v) = each %$hdr) {
         $hdr{lc $k} = $v;
      }
   }

   # "recurse" is removed from %arg because %arg is passed on when redirecting
   my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;

   return $cb->(undef, { Status => 599, Reason => "recursion limit reached", URL => $url })
      if $recurse < 0;

   my $proxy   = $arg{proxy}   || $PROXY;
   my $timeout = $arg{timeout} || $TIMEOUT;

   $hdr{"user-agent"} ||= $USERAGENT;

   # split the URL into its generic components
   my ($scheme, $authority, $upath, $query, $fragment) =
      $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;

   $scheme = lc $scheme;

   my $uport = $scheme eq "http"  ?  80
             : $scheme eq "https" ? 443
             : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url });

   $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic

   # strip optional userinfo, split host and port
   $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
      or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url });

   my $uhost = $1;
   $uport = $2 if defined $2;

   $uhost =~ s/^\[(.*)\]$/$1/;
   $upath .= "?$query" if length $query;

   $upath =~ s%^/?%/%; # the request path always starts with a slash

   # cookie processing
   if (my $jar = $arg{cookie_jar}) {
      %$jar = () if $jar->{version} < 1;

      my @cookie;

      while (my ($chost, $v) = each %$jar) {
         # only keys starting with "." are cookie domains ("version" is metadata)
         next unless $chost =~ /^\./;

         # jar domains always carry a leading ".", so match against ".$uhost":
         # that way a cookie stored for ".www.example.com" is also sent back
         # to www.example.com itself, not only to its subdomains.
         next unless $chost eq substr ".$uhost", -length $chost;

         while (my ($cpath, $v) = each %$v) {
            next unless $cpath eq substr $upath, 0, length $cpath;

            while (my ($k, $v) = each %$v) {
               next if $scheme ne "https" && exists $v->{secure};
               push @cookie, "$k=$v->{value}";
            }
         }
      }

      $hdr{cookie} = join "; ", @cookie
         if @cookie;
   }

   # where we actually connect to: the origin server or the proxy. the
   # connection scheme is kept apart from the URL scheme, which is still
   # needed below to rebuild relative Location headers.
   my ($rhost, $rport, $rscheme, $rpath);

   if ($proxy) {
      ($rhost, $rport, $rscheme) = @$proxy;
      $rscheme = defined $rscheme ? lc $rscheme : "http"; # a missing scheme means http
      $rpath   = $url;
   } else {
      ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $scheme, $upath);
      $hdr{host} = $uhost;
   }

   $hdr{"content-length"} = length $arg{body};

   # all per-request resources live in %state, clearing it aborts the request.
   # connect_guard is a placeholder (1) until the real connect guard exists.
   my %state = (connect_guard => 1);

   _get_slot($uhost, sub {
      $state{slot_guard} = shift;

      # the request was cancelled while it was waiting for a slot
      return unless $state{connect_guard};

      $state{connect_guard} = AnyEvent::Socket::tcp_connect($rhost, $rport, sub {
         my $fh = shift;

         unless ($fh) {
            # save $! first, then release the slot and connect guards held in
            # %state - otherwise the per-host slot would never be freed.
            my $err = "$!";
            %state = ();
            return $cb->(undef, { Status => 599, Reason => $err, URL => $url });
         }

         $state{fh} = $fh;

         delete $state{connect_guard}; # reduce memory usage, save a tree

         # get handle
         $state{handle} = AnyEvent::Handle->new (
            fh => $state{fh},
            ($rscheme eq "https" ? (tls => "connect") : ()),
         );

         # limit the number of persistent connections (counted per host)
         if ($KA_COUNT{$uhost} < $MAX_PERSISTENT_PER_HOST) {
            ++$KA_COUNT{$uhost};
            $state{handle}{ka_count_guard} = AnyEvent::Util::guard(sub { --$KA_COUNT{$uhost} });
            $hdr{connection} = "keep-alive";
            delete $hdr{connection}; # keep-alive not yet supported
         } else {
            delete $hdr{connection};
         }

         # (re-)configure handle
         $state{handle}->timeout ($timeout);
         $state{handle}->on_error (sub {
            my $errno = "$!";
            %state = ();
            $cb->(undef, { Status => 599, Reason => $errno, URL => $url });
         });
         $state{handle}->on_eof (sub {
            %state = ();
            $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url });
         });

         # send request
         $state{handle}->push_write (
            "$method $rpath HTTP/1.0\015\012"
            . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
            . "\015\012"
            . (delete $arg{body})
         );

         %hdr = (); # reduce memory usage, save a kitten

         # status line
         $state{handle}->push_read (line => qr/\015?\012/, sub {
            $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
               or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));

            # response headers. every value is built with a leading \x00
            # separator, which is stripped again once all headers are parsed.
            my %hdr = (
               HTTPVersion => "\x00$1",
               Status      => "\x00$2",
               Reason      => "\x00$3",
               URL         => "\x00$url"
            );

            # headers, could be optimized a bit
            $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
               for ("$_[1]\012") {
                  # we support spaces in field names, as lotus domino
                  # creates them.
                  $hdr{lc $1} .= "\x00$2"
                     while /\G
                           ([^:\000-\037]+):
                           [\011\040]*
                           ((?: [^\015\012]+ | \015?\012[\011\040] )*)
                           \015?\012
                        /gxc;

                  # anything left over means the header block did not parse
                  /\G$/
                    or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url }));
               }

               # remove the leading \x00 separator from every value
               substr $_, 0, 1, ""
                  for values %hdr;

               # called as $finish->($body, \%hdr) once the response is complete
               my $finish = sub {
                  %state = ();

                  # set-cookie processing
                  # NOTE(review): the cookie domain is not checked against the
                  # host that sent it - confirm whether that is acceptable.
                  if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
                     for (split /\x00/, $hdr{"set-cookie"}) {
                        my ($cookie, @arg) = split /;\s*/;
                        my ($name, $value) = split /=/, $cookie, 2;

                        # always produce a key/value pair, so valueless
                        # attributes such as "secure" cannot misalign the
                        # hash, and lowercase the (case-insensitive)
                        # attribute names so "Path", "Domain" and "Secure"
                        # are recognised.
                        my %kv = (
                           value => $value,
                           map {
                              my ($attr, $val) = split /=/, $_, 2;
                              (lc ($attr) => $val)
                           } @arg
                        );

                        my $cdom  = (delete $kv{domain}) || $uhost;
                        my $cpath = (delete $kv{path})   || "/";

                        $cdom =~ s/^\.?/./; # make sure it starts with a "."

                        next if $cdom =~ /\.$/;

                        # this is not rfc-like and not netscape-like. go figure.
                        my $ndots = $cdom =~ y/.//;
                        next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);

                        # store it
                        $arg{cookie_jar}{version} = 1;
                        $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
                     }
                  }

                  # some servers send relative Location headers, which the
                  # standard does not allow. try to support some common
                  # forms by making them absolute against the requested URL.
                  if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
                     $_[1]{location} =~ s/^\.\/+//;

                     my $base = "$scheme://$uhost:$uport";

                     unless ($_[1]{location} =~ s/^\///) {
                        # relative path: resolve against the directory of the request path
                        $base .= $upath;
                        $base =~ s/\/[^\/]*$//;
                     }

                     $_[1]{location} = "$base/$_[1]{location}";
                  }

                  if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
                     # apparently, mozilla et al. just change POST to GET here
                     # more research is needed before we do the same
                     http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
                  } elsif ($_[1]{Status} == 303 && $recurse) {
                     # even http/1.1 is unclear on how to mutate the method
                     $method = "GET" unless $method eq "HEAD";
                     http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
                  } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
                     http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
                  } else {
                     $cb->($_[0], $_[1]);
                  }
               };

               if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
                  # these responses carry no body
                  $finish->(undef, \%hdr);
               } else {
                  if (exists $hdr{"content-length"}) {
                     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
                        # a persistent connection could be cached at this point,
                        # but keep-alive is not supported yet.
                        $finish->($_[1], \%hdr);
                     });
                  } else {
                     # too bad, need to read until we get an error or EOF,
                     # no way to detect winged data.
                     $_[0]->on_error (sub {
                        $finish->($_[0]{rbuf}, \%hdr);
                     });
                     $_[0]->on_eof (undef);
                     $_[0]->on_read (sub { });
                  }
               }
            });
         });
      }, sub {
         # prepare callback: its return value is used as the connect timeout
         $timeout
      });
   });

   # only create the cancellation guard when the caller wants it
   defined wantarray && AnyEvent::Util::guard(sub { %state = () })
}
494    
# http_get $url, key => value..., $cb - shorthand for http_request GET => ...
sub http_get($@) {
   # called with & so the ($$@) prototype does not force scalar context on @_
   return &http_request (GET => @_);
}
499    
# http_head $url, key => value..., $cb - shorthand for http_request HEAD => ...
sub http_head($@) {
   # called with & so the ($$@) prototype does not force scalar context on @_
   return &http_request (HEAD => @_);
}
504    
# http_post $url, $body, key => value..., $cb - shorthand for
# http_request POST => $url, body => $body, ...
sub http_post($$@) {
   my $url = shift;

   # what is left in @_ starts with the body, so "body" turns it into the
   # body => $body option. & bypasses the ($$@) prototype.
   return &http_request (POST => $url, "body", @_);
}
510    
511 root 1.9 =back
512    
513 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
514 root 1.1
515     =over 4
516    
517 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
518    
519     Sets the default proxy server to use. The proxy-url must begin with a
520     string of the form C<http://host:port> (optionally C<https:...>).
521    
522 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
523 root 1.1
524 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
525 root 1.1
526     =item $AnyEvent::HTTP::USERAGENT
527    
528     The default value for the C<User-Agent> header (the default is
529     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
530    
531     =item $AnyEvent::HTTP::MAX_PERSISTENT
532    
533     The maximum number of persistent connections to keep open (default: 8).
534    
535 root 1.3 Not implemented currently.
536    
537 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
538    
539 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
540 root 1.1
541 root 1.3 Not implemented currently.
542    
543 root 1.14 =item $AnyEvent::HTTP::ACTIVE
544    
545     The number of active connections. This is not the number of currently
546     running requests, but the number of currently open and non-idle TCP
547     connections. This number can be useful for load-leveling.
548    
549 root 1.1 =back
550    
551     =cut
552    
# AnyEvent::HTTP::set_proxy "proxy-url"
#
# Sets the default proxy ($PROXY) to [$host, $port, $scheme] from a URL that
# begins with http://host[:port] or https://host[:port]. The port defaults
# to 3128. The scheme is stored lowercased, because http_request compares
# it against the literal "https" to decide whether to use TLS. An
# undefined, empty or unparsable value leaves $PROXY unchanged.
sub set_proxy($) {
   my ($proxy_url) = @_;

   # nothing to do, e.g. no http_proxy in the environment
   return unless defined $proxy_url && length $proxy_url;

   $PROXY = [$2, $3 || 3128, lc $1]
      if $proxy_url =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
}

# initialise proxy from environment
set_proxy $ENV{http_proxy};
559    
560 root 1.1 =head1 SEE ALSO
561    
562     L<AnyEvent>.
563    
564     =head1 AUTHOR
565    
566 root 1.18 Marc Lehmann <schmorp@schmorp.de>
567     http://home.schmorp.de/
568 root 1.1
569     =cut
570    
571     1
572