ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.66
Committed: Fri Dec 31 06:18:30 2010 UTC (13 years, 4 months ago) by root
Branch: MAIN
Changes since 1.65: +88 -28 lines
Log Message:
implemented chunked, and therefore hopefully http/1.1

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.65 our $VERSION = '1.5';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99     (or C<undef> if an error occured), and a hash-ref with response headers as
100     second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
106     during the body phase of a request, then the original C<Status> and
107     C<Reason> values from the header are available as C<OrigStatus> and
108     C<OrigReason>.
109 root 1.55
110     The pseudo-header C<URL> contains the actual URL (which can differ from
111     the requested URL when following redirects - for example, you might get
112     an error that your URL scheme is not supported even though your URL is a
113     valid http URL because it redirected to an ftp URL, in which case you can
114     look at the URL pseudo header).
115    
116     The pseudo-header C<Redirect> only exists when the request was a result
117     of an internal redirect. In that case it is an array reference with
118     the C<($data, $headers)> from the redirect response. Note that this
119     response could in turn be the result of a redirect itself, and C<<
120     $headers->{Redirect}[1]{Redirect} >> will then contain the original
121     response, and so on.
122 root 1.20
123 root 1.32 If the server sends a header multiple times, then their contents will be
124     joined together with a comma (C<,>), as per the HTTP spec.
125 root 1.2
126     If an internal error occurs, such as not being able to resolve a hostname,
127 root 1.41 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<59x>
128     (usually C<599>) and the C<Reason> pseudo-header will contain an error
129     message.
130 root 1.2
131 root 1.6 A typical callback might look like this:
132    
133     sub {
134     my ($body, $hdr) = @_;
135    
136     if ($hdr->{Status} =~ /^2/) {
137     ... everything should be ok
138     } else {
139     print "error, $hdr->{Status} $hdr->{Reason}\n";
140     }
141     }
142    
143 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
144     include:
145    
146     =over 4
147    
148 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
149 root 1.1
150     Whether to recurse requests or not, e.g. on redirects, authentication
151 root 1.3 retries and so on, and how often to do so.
152 root 1.1
153     =item headers => hashref
154    
155 root 1.63 The request headers to use. Currently, C<http_request> may provide its
156     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
157     and will provide defaults for C<User-Agent:> and C<Referer:> (this can be
158     suppressed by using C<undef> for these headers in which case they won't be
159     sent at all).
160 root 1.1
161     =item timeout => $seconds
162    
163     The time-out to use for various stages - each connect attempt will reset
164 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
165     timeout.
166    
167     Default timeout is 5 minutes.
168 root 1.2
169     =item proxy => [$host, $port[, $scheme]] or undef
170    
171     Use the given http proxy for all requests. If not specified, then the
172     default proxy (as specified by C<$ENV{http_proxy}>) is used.
173    
174 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
175 root 1.2 HTTPS.
176 root 1.1
177 root 1.3 =item body => $string
178    
179     The request body, usually empty. Will be-sent as-is (future versions of
180     this module might offer more options).
181    
182 root 1.10 =item cookie_jar => $hash_ref
183    
184     Passing this parameter enables (simplified) cookie-processing, loosely
185     based on the original netscape specification.
186    
187     The C<$hash_ref> must be an (initially empty) hash reference which will
188     get updated automatically. It is possible to save the cookie_jar to
189     persistent storage with something like JSON or Storable, but this is not
190 root 1.40 recommended, as expiry times are currently being ignored.
191 root 1.10
192     Note that this cookie implementation is not of very high quality, nor
193     meant to be complete. If you want complete cookie management you have to
194     do that on your own. C<cookie_jar> is meant as a quick fix to get some
195     cookie-using sites working. Cookies are a privacy disaster, do not use
196     them unless required to.
197    
198 root 1.40 =item tls_ctx => $scheme | $tls_ctx
199    
200     Specifies the AnyEvent::TLS context to be used for https connections. This
201     parameter follows the same rules as the C<tls_ctx> parameter to
202     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
203     C<high> can be specified, which give you a predefined low-security (no
204     verification, highest compatibility) and high-security (CA and common-name
205     verification) TLS context.
206    
207     The default for this option is C<low>, which could be interpreted as "give
208     me the page, no matter what".
209    
210 root 1.51 =item on_prepare => $callback->($fh)
211    
212     In rare cases you need to "tune" the socket before it is used to
213     connect (for exmaple, to bind it on a given IP address). This parameter
214     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
215     and behaves exactly the same way (e.g. it has to provide a
216     timeout). See the description for the C<$prepare_cb> argument of
217     C<AnyEvent::Socket::tcp_connect> for details.
218    
219 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
220    
221     In even rarer cases you want total control over how AnyEvent::HTTP
222     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
223     to do this, but you can provide your own C<tcp_connect> function -
224 root 1.60 obviously, it has to follow the same calling conventions, except that it
225     may always return a connection guard object.
226 root 1.59
227     There are probably lots of weird uses for this function, starting from
228     tracing the hosts C<http_request> actually tries to connect, to (inexact
229     but fast) host => IP address caching or even socks protocol support.
230    
231 root 1.42 =item on_header => $callback->($headers)
232 root 1.41
233     When specified, this callback will be called with the header hash as soon
234     as headers have been successfully received from the remote server (not on
235     locally-generated errors).
236    
237     It has to return either true (in which case AnyEvent::HTTP will continue),
238     or false, in which case AnyEvent::HTTP will cancel the download (and call
239     the finish callback with an error code of C<598>).
240    
241     This callback is useful, among other things, to quickly reject unwanted
242     content, which, if it is supposed to be rare, can be faster than first
243     doing a C<HEAD> request.
244    
245 root 1.42 Example: cancel the request unless the content-type is "text/html".
246 root 1.41
247 root 1.42 on_header => sub {
248     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
249     },
250 root 1.41
251 root 1.42 =item on_body => $callback->($partial_body, $headers)
252 root 1.41
253 root 1.42 When specified, all body data will be passed to this callback instead of
254     to the completion callback. The completion callback will get the empty
255     string instead of the body data.
256 root 1.41
257 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
258     or false, in which case AnyEvent::HTTP will cancel the download (and call
259     the completion callback with an error code of C<598>).
260    
261     This callback is useful when the data is too large to be held in memory
262     (so the callback writes it to a file) or when only some information should
263     be extracted, or when the body should be processed incrementally.
264 root 1.41
265     It is usually preferred over doing your own body handling via
266 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
267     only used to create a connection, C<want_body_handle> is the better
268     alternative, as it allows you to install your own event handler, reducing
269     resource usage.
270 root 1.41
271     =item want_body_handle => $enable
272    
273     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
274     changes considerably: after parsing the headers, and instead of
275     downloading the body (if any), the completion callback will be
276     called. Instead of the C<$body> argument containing the body data, the
277     callback will receive the L<AnyEvent::Handle> object associated with the
278     connection. In error cases, C<undef> will be passed. When there is no body
279     (e.g. status C<304>), the empty string will be passed.
280    
281     The handle object might or might not be in TLS mode, might be connected to
282     a proxy, be a persistent connection etc., and configured in unspecified
283     ways. The user is responsible for this handle (it will not be used by this
284     module anymore).
285    
286     This is useful with some push-type services, where, after the initial
287     headers, an interactive protocol is used (typical example would be the
288     push-style twitter API which starts a JSON/XML stream).
289    
290     If you think you need this, first have a look at C<on_body>, to see if
291 root 1.45 that doesn't solve your problem in a better way.
292 root 1.41
293 root 1.1 =back
294    
295 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
296    
297     http_request GET => "http://www.nethype.de/", sub {
298     my ($body, $hdr) = @_;
299     print "$body\n";
300     };
301    
302     Example: make a HTTP HEAD request on https://www.google.com/, use a
303     timeout of 30 seconds.
304    
305     http_request
306     GET => "https://www.google.com",
307     timeout => 30,
308     sub {
309     my ($body, $hdr) = @_;
310     use Data::Dumper;
311     print Dumper $hdr;
312     }
313     ;
314 root 1.1
315 root 1.29 Example: make another simple HTTP GET request, but immediately try to
316     cancel it.
317    
318     my $request = http_request GET => "http://www.nethype.de/", sub {
319     my ($body, $hdr) = @_;
320     print "$body\n";
321     };
322    
323     undef $request;
324    
325 root 1.1 =cut
326    
327 root 1.12 sub _slot_schedule;
328 root 1.11 sub _slot_schedule($) {
329     my $host = shift;
330    
331     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
332     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
333 root 1.12 # somebody wants that slot
334 root 1.11 ++$CO_SLOT{$host}[0];
335 root 1.14 ++$ACTIVE;
336 root 1.11
337     $cb->(AnyEvent::Util::guard {
338 root 1.14 --$ACTIVE;
339 root 1.11 --$CO_SLOT{$host}[0];
340     _slot_schedule $host;
341     });
342     } else {
343     # nobody wants the slot, maybe we can forget about it
344     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
345     last;
346     }
347     }
348     }
349    
350     # wait for a free slot on host, call callback
351     sub _get_slot($$) {
352     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
353    
354     _slot_schedule $_[0];
355     }
356    
357 root 1.66 # continue to parse $_ for headers and place them into the arg
358     sub parse_hdr() {
359     my %hdr;
360    
361     # things seen, not parsed:
362     # p3pP="NON CUR OTPi OUR NOR UNI"
363    
364     $hdr{lc $1} .= ",$2"
365     while /\G
366     ([^:\000-\037]*):
367     [\011\040]*
368     ((?: [^\012]+ | \012[\011\040] )*)
369     \012
370     /gxc;
371    
372     /\G$/
373     or return;
374    
375     # remove the "," prefix we added to all headers above
376     substr $_, 0, 1, ""
377     for values %hdr;
378    
379     \%hdr
380     }
381    
382 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
383 root 1.34
384 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
385     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
386 root 1.40
387 elmex 1.15 sub http_request($$@) {
388 root 1.1 my $cb = pop;
389     my ($method, $url, %arg) = @_;
390    
391     my %hdr;
392    
393 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
394     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
395    
396 root 1.3 $method = uc $method;
397    
398 root 1.8 if (my $hdr = $arg{headers}) {
399 root 1.1 while (my ($k, $v) = each %$hdr) {
400     $hdr{lc $k} = $v;
401     }
402     }
403    
404 root 1.55 # pseudo headers for all subsequent responses
405     my @pseudo = (URL => $url);
406     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
407    
408 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
409 root 1.8
410 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
411 root 1.8 if $recurse < 0;
412    
413 root 1.2 my $proxy = $arg{proxy} || $PROXY;
414 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
415    
416 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
417 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
418 root 1.2
419 root 1.31 $uscheme = lc $uscheme;
420 root 1.1
421 root 1.31 my $uport = $uscheme eq "http" ? 80
422     : $uscheme eq "https" ? 443
423 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
424 root 1.13
425 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
426 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
427 root 1.10
428     my $uhost = $1;
429     $uport = $2 if defined $2;
430    
431 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
432     unless exists $hdr{host};
433 root 1.43
434 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
435 root 1.56 $upath .= $query if length $query;
436 root 1.10
437     $upath =~ s%^/?%/%;
438    
439     # cookie processing
440     if (my $jar = $arg{cookie_jar}) {
441 root 1.31 %$jar = () if $jar->{version} != 1;
442 root 1.10
443     my @cookie;
444    
445     while (my ($chost, $v) = each %$jar) {
446 root 1.30 if ($chost =~ /^\./) {
447     next unless $chost eq substr $uhost, -length $chost;
448     } elsif ($chost =~ /\./) {
449     next unless $chost eq $uhost;
450     } else {
451     next;
452     }
453 root 1.10
454     while (my ($cpath, $v) = each %$v) {
455     next unless $cpath eq substr $upath, 0, length $cpath;
456    
457     while (my ($k, $v) = each %$v) {
458 root 1.31 next if $uscheme ne "https" && exists $v->{secure};
459     my $value = $v->{value};
460     $value =~ s/([\\"])/\\$1/g;
461     push @cookie, "$k=\"$value\"";
462 root 1.10 }
463     }
464     }
465    
466     $hdr{cookie} = join "; ", @cookie
467     if @cookie;
468     }
469 root 1.1
470 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
471 root 1.2
472 root 1.10 if ($proxy) {
473 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
474 root 1.31
475 root 1.47 $rscheme = "http" unless defined $rscheme;
476    
477 root 1.31 # don't support https requests over https-proxy transport,
478 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
479 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
480 root 1.10 } else {
481 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
482 root 1.2 }
483    
484 root 1.47 # leave out fragment and query string, just a heuristic
485 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
486     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
487 root 1.41
488 root 1.53 $hdr{"content-length"} = length $arg{body}
489     if length $arg{body} || $method ne "GET";
490 root 1.1
491 root 1.66 $hdr{connection} = "close TE";
492     $hdr{te} = "trailers" unless exists $hdr{te};
493    
494 root 1.11 my %state = (connect_guard => 1);
495    
496     _get_slot $uhost, sub {
497     $state{slot_guard} = shift;
498 root 1.1
499 root 1.11 return unless $state{connect_guard};
500 root 1.1
501 root 1.64 my $connect_cb = sub {
502     $state{fh} = shift
503     or do {
504     my $err = "$!";
505     %state = ();
506     return $cb->(undef, { @pseudo, Status => 599, Reason => $err });
507     };
508 root 1.44
509 root 1.64 pop; # free memory, save a tree
510 root 1.11
511 root 1.64 return unless delete $state{connect_guard};
512 root 1.11
513 root 1.64 # get handle
514     $state{handle} = new AnyEvent::Handle
515     fh => $state{fh},
516     peername => $rhost,
517     tls_ctx => $arg{tls_ctx},
518     # these need to be reconfigured on keepalive handles
519     timeout => $timeout,
520     on_error => sub {
521     %state = ();
522     $cb->(undef, { @pseudo, Status => 599, Reason => $_[2] });
523     },
524     on_eof => sub {
525     %state = ();
526     $cb->(undef, { @pseudo, Status => 599, Reason => "Unexpected end-of-file" });
527     },
528     ;
529 root 1.11
530 root 1.64 # limit the number of persistent connections
531     # keepalive not yet supported
532 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
533     # ++$KA_COUNT{$_[1]};
534     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
535     # --$KA_COUNT{$_[1]}
536     # };
537     # $hdr{connection} = "keep-alive";
538     # } else {
539 root 1.66 # delete $hdr{connection};
540 root 1.56 # }
541 root 1.1
542 root 1.64 $state{handle}->starttls ("connect") if $rscheme eq "https";
543    
544     # handle actual, non-tunneled, request
545     my $handle_actual_request = sub {
546     $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
547    
548     # send request
549     $state{handle}->push_write (
550 root 1.66 "$method $rpath HTTP/1.1\015\012"
551 root 1.64 . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
552     . "\015\012"
553     . (delete $arg{body})
554     );
555    
556     # return if error occured during push_write()
557     return unless %state;
558    
559     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
560    
561     # status line and headers
562     $state{handle}->push_read (line => $qr_nlnl, sub {
563     my $keepalive = pop;
564    
565     for ("$_[1]") {
566     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
567    
568 root 1.66 /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/igxc
569 root 1.64 or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid server response" }));
570    
571     push @pseudo,
572     HTTPVersion => $1,
573     Status => $2,
574     Reason => $3,
575     ;
576    
577 root 1.66 my $hdr = parse_hdr
578     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Garbled response headers" }));
579 root 1.64
580 root 1.66 %hdr = (%$hdr, @pseudo);
581 root 1.64 }
582    
583     # redirect handling
584     # microsoft and other shitheads don't give a shit for following standards,
585     # try to support some common forms of broken Location headers.
586     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
587     $hdr{location} =~ s/^\.\/+//;
588    
589     my $url = "$rscheme://$uhost:$uport";
590    
591     unless ($hdr{location} =~ s/^\///) {
592     $url .= $upath;
593     $url =~ s/\/[^\/]*$//;
594     }
595 root 1.59
596 root 1.64 $hdr{location} = "$url/$hdr{location}";
597     }
598 root 1.31
599 root 1.64 my $redirect;
600 root 1.41
601 root 1.64 if ($recurse) {
602     my $status = $hdr{Status};
603 root 1.59
604 root 1.64 # industry standard is to redirect POST as GET for
605     # 301, 302 and 303, in contrast to http/1.0 and 1.1.
606     # also, the UA should ask the user for 301 and 307 and POST,
607     # industry standard seems to be to simply follow.
608     # we go with the industry standard.
609     if ($status == 301 or $status == 302 or $status == 303) {
610     # HTTP/1.1 is unclear on how to mutate the method
611     $method = "GET" unless $method eq "HEAD";
612     $redirect = 1;
613     } elsif ($status == 307) {
614     $redirect = 1;
615 root 1.59 }
616 root 1.64 }
617 root 1.57
618 root 1.64 my $finish = sub { # ($data, $err_status, $err_reason[, $keepalive])
619     $state{handle}->destroy if $state{handle};
620     %state = ();
621 root 1.55
622 root 1.64 if (defined $_[1]) {
623     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
624     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
625 root 1.41 }
626    
627 root 1.64 # set-cookie processing
628     if ($arg{cookie_jar}) {
629     for ($hdr{"set-cookie"}) {
630     # parse NAME=VALUE
631     my @kv;
632    
633     while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
634     my $name = $1;
635     my $value = $3;
636    
637     unless ($value) {
638     $value = $2;
639     $value =~ s/\\(.)/$1/gs;
640     }
641 root 1.59
642 root 1.64 push @kv, $name => $value;
643 root 1.31
644 root 1.64 last unless /\G\s*;/gc;
645     }
646 root 1.57
647 root 1.64 last unless @kv;
648 root 1.31
649 root 1.64 my $name = shift @kv;
650     my %kv = (value => shift @kv, @kv);
651 root 1.31
652 root 1.64 my $cdom;
653     my $cpath = (delete $kv{path}) || "/";
654 root 1.10
655 root 1.64 if (exists $kv{domain}) {
656     $cdom = delete $kv{domain};
657    
658     $cdom =~ s/^\.?/./; # make sure it starts with a "."
659 root 1.11
660 root 1.64 next if $cdom =~ /\.$/;
661 root 1.59
662 root 1.64 # this is not rfc-like and not netscape-like. go figure.
663     my $ndots = $cdom =~ y/.//;
664     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
665     } else {
666     $cdom = $uhost;
667     }
668    
669     # store it
670     $arg{cookie_jar}{version} = 1;
671     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
672 root 1.11
673 root 1.64 redo if /\G\s*,/gc;
674 root 1.59 }
675 root 1.64 }
676 root 1.31
677 root 1.64 if ($redirect && exists $hdr{location}) {
678     # we ignore any errors, as it is very common to receive
679     # Content-Length != 0 but no actual body
680     # we also access %hdr, as $_[1] might be an erro
681     http_request (
682     $method => $hdr{location},
683     %arg,
684     recurse => $recurse - 1,
685     Redirect => [$_[0], \%hdr],
686     $cb);
687     } else {
688     $cb->($_[0], \%hdr);
689     }
690     };
691    
692     my $len = $hdr{"content-length"};
693    
694     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
695     $finish->(undef, 598 => "Request cancelled by on_header");
696     } elsif (
697     $hdr{Status} =~ /^(?:1..|204|205|304)$/
698     or $method eq "HEAD"
699     or (defined $len && !$len)
700     ) {
701     # no body
702     $finish->("", undef, undef, 1);
703     } else {
704 root 1.66 # body handling, many different code paths
705     # - no body expected
706     # - want_body_handle
707     # - te chunked
708     # - 2x length known (with or without on_body)
709     # - 2x length not known (with or without on_body)
710 root 1.64 if (!$redirect && $arg{want_body_handle}) {
711     $_[0]->on_eof (undef);
712     $_[0]->on_error (undef);
713     $_[0]->on_read (undef);
714    
715     $finish->(delete $state{handle});
716    
717 root 1.66 } elsif ($hdr{"transfer-encoding"} =~ /chunked/) {
718     my $body = undef;
719     my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
720    
721     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
722    
723     my $read_chunk; $read_chunk = sub {
724     warn $_[1];#d#
725     $_[1] =~ /^([0-9a-fA-F]+)/
726     or $finish->(undef, 599 => "Garbled chunked transfer encoding");
727    
728     my $len = hex $1;
729    
730     if ($len) {
731     $_[0]->push_read (chunk => hex $1, sub {
732     $on_body->($_[1], \%hdr)
733     or return $finish->(undef, 598 => "Request cancelled by on_body");
734    
735     $_[0]->push_read (line => sub {
736     length $_[1]
737     and return $finish->(undef, 599 => "Garbled chunked transfer encoding");
738     $_[0]->push_read (line => $read_chunk);
739     });
740     });
741     } else {
742     $_[0]->push_read (line => $qr_nlnl, sub {
743     if (length $_[1]) {
744     for ("$_[1]") {
745     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
746    
747     my $hdr = parse_hdr
748     or return $finish->(undef, 599 => "Garbled response trailers");
749    
750     %hdr = (%hdr, %$hdr);
751     }
752     }
753    
754     $finish->($body, undef, undef, 1);
755     });
756     }
757     };
758    
759     $_[0]->push_read (line => $read_chunk);
760    
761 root 1.64 } elsif ($arg{on_body}) {
762     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
763 root 1.66
764 root 1.64 if ($len) {
765     $_[0]->on_read (sub {
766     $len -= length $_[0]{rbuf};
767    
768     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
769 root 1.66 or return $finish->(undef, 598 => "Request cancelled by on_body");
770 root 1.64
771     $len > 0
772     or $finish->("", undef, undef, 1);
773     });
774 root 1.59 } else {
775 root 1.64 $_[0]->on_eof (sub {
776     $finish->("");
777     });
778     $_[0]->on_read (sub {
779     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
780     or $finish->(undef, 598 => "Request cancelled by on_body");
781     });
782 root 1.11 }
783 root 1.64 } else {
784     $_[0]->on_eof (undef);
785 root 1.59
786 root 1.64 if ($len) {
787     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
788     $_[0]->on_read (sub {
789     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
790     if $len <= length $_[0]{rbuf};
791     });
792 root 1.59 } else {
793 root 1.64 $_[0]->on_error (sub {
794     ($! == Errno::EPIPE || !$!)
795     ? $finish->(delete $_[0]{rbuf})
796     : $finish->(undef, 599 => $_[2]);
797     });
798     $_[0]->on_read (sub { });
799 root 1.59 }
800     }
801 root 1.64 }
802     });
803     };
804    
805     # now handle proxy-CONNECT method
806     if ($proxy && $uscheme eq "https") {
807     # oh dear, we have to wrap it into a connect request
808    
809     # maybe re-use $uauthority with patched port?
810     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
811     $state{handle}->push_read (line => $qr_nlnl, sub {
812     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
813     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" }));
814    
815     if ($2 == 200) {
816     $rpath = $upath;
817     &$handle_actual_request;
818     } else {
819     %state = ();
820     $cb->(undef, { @pseudo, Status => $2, Reason => $3 });
821     }
822     });
823     } else {
824     &$handle_actual_request;
825     }
826     };
827    
828     my $tcp_connect = $arg{tcp_connect}
829     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
830 root 1.57
831 root 1.64 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
832 root 1.31
833 root 1.1 };
834    
835     defined wantarray && AnyEvent::Util::guard { %state = () }
836     }
837    
838 elmex 1.15 sub http_get($@) {
839 root 1.1 unshift @_, "GET";
840     &http_request
841     }
842    
843 elmex 1.15 sub http_head($@) {
844 root 1.4 unshift @_, "HEAD";
845     &http_request
846     }
847    
848 elmex 1.15 sub http_post($$@) {
849 root 1.22 my $url = shift;
850     unshift @_, "POST", $url, "body";
851 root 1.3 &http_request
852     }
853    
854 root 1.9 =back
855    
856 root 1.55 =head2 DNS CACHING
857    
858     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
859     the actual connection, which in turn uses AnyEvent::DNS to resolve
860     hostnames. The latter is a simple stub resolver and does no caching
861     on its own. If you want DNS caching, you currently have to provide
862     your own default resolver (by storing a suitable resolver object in
863     C<$AnyEvent::DNS::RESOLVER>).
864    
865 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
866 root 1.1
867     =over 4
868    
869 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
870    
871     Sets the default proxy server to use. The proxy-url must begin with a
872 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
873     otherwise.
874    
875     To clear an already-set proxy, use C<undef>.
876 root 1.2
877 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
878    
879     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
880     Date (RFC 2616).
881    
882     =item $timestamp = AnyEvent::HTTP::parse_date $date
883    
884     Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX
885     timestamp, or C<undef> if the date cannot be parsed.
886    
887 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
888 root 1.1
889 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
890 root 1.1
891     =item $AnyEvent::HTTP::USERAGENT
892    
893     The default value for the C<User-Agent> header (the default is
894 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
895 root 1.1
896 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
897 root 1.1
898 root 1.47 The maximum number of concurrent connections to the same host (identified
899 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
900     are queued until previous connections are closed.
901 root 1.1
902 root 1.43 The default value for this is C<4>, and it is highly advisable to not
903     increase it.
904 root 1.3
905 root 1.14 =item $AnyEvent::HTTP::ACTIVE
906    
907     The number of active connections. This is not the number of currently
908     running requests, but the number of currently open and non-idle TCP
909     connections. This number of can be useful for load-leveling.
910    
911 root 1.1 =back
912    
913     =cut
914    
915 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
916     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
917    
918     sub format_date($) {
919     my ($time) = @_;
920    
921     # RFC 822/1123 format
922     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
923    
924     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
925     $weekday[$wday], $mday, $month[$mon], $year + 1900,
926     $H, $M, $S;
927     }
928    
929     sub parse_date($) {
930     my ($date) = @_;
931    
932     my ($d, $m, $y, $H, $M, $S);
933    
934     if ($date =~ /^[A-Z][a-z][a-z], ([0-9][0-9]) ([A-Z][a-z][a-z]) ([0-9][0-9][0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
935     # RFC 822/1123, required by RFC 2616
936     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
937    
938     } elsif ($date =~ /^[A-Z][a-z]+, ([0-9][0-9])-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
939     # RFC 850
940     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
941    
942     } elsif ($date =~ /^[A-Z][a-z][a-z] ([A-Z][a-z][a-z]) ([0-9 ][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) ([0-9][0-9][0-9][0-9])$/) {
943     # ISO C's asctime
944     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
945     }
946     # other formats fail in the loop below
947    
948     for (0..11) {
949     if ($m eq $month[$_]) {
950     require Time::Local;
951     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
952     }
953     }
954    
955     undef
956     }
957    
958 root 1.2 sub set_proxy($) {
959 root 1.52 if (length $_[0]) {
960     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
961     or Carp::croak "$_[0]: invalid proxy URL";
962     $PROXY = [$2, $3 || 3128, $1]
963     } else {
964     undef $PROXY;
965     }
966 root 1.2 }
967    
968     # initialise proxy from environment
969 root 1.52 eval {
970     set_proxy $ENV{http_proxy};
971     };
972 root 1.2
973 root 1.60 =head2 SOCKS PROXIES
974    
975     Socks proxies are not directly supported by AnyEvent::HTTP. You can
976     compile your perl to support socks, or use an external program such as
977     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
978     transparently.
979    
980     Alternatively, for AnyEvent::HTTP only, you can use your own
981     C<tcp_connect> function that does the proxy handshake - here is an example
982     that works with socks4a proxies:
983    
984     use Errno;
985     use AnyEvent::Util;
986     use AnyEvent::Socket;
987     use AnyEvent::Handle;
988    
989     # host, port and username of/for your socks4a proxy
990     my $socks_host = "10.0.0.23";
991     my $socks_port = 9050;
992     my $socks_user = "";
993    
994     sub socks4a_connect {
995     my ($host, $port, $connect_cb, $prepare_cb) = @_;
996    
997     my $hdl = new AnyEvent::Handle
998     connect => [$socks_host, $socks_port],
999     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1000     on_error => sub { $connect_cb->() },
1001     ;
1002    
1003     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1004    
1005     $hdl->push_read (chunk => 8, sub {
1006     my ($hdl, $chunk) = @_;
1007     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1008    
1009     if ($status == 0x5a) {
1010     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1011     } else {
1012     $! = Errno::ENXIO; $connect_cb->();
1013     }
1014     });
1015    
1016     $hdl
1017     }
1018    
1019     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1020     possibly after switching off other proxy types:
1021    
1022     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1023    
1024     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1025     my ($data, $headers) = @_;
1026     ...
1027     };
1028    
1029 root 1.1 =head1 SEE ALSO
1030    
1031     L<AnyEvent>.
1032    
1033     =head1 AUTHOR
1034    
1035 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1036     http://home.schmorp.de/
1037 root 1.1
1038 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1039     testcases and bugreports.
1040    
1041 root 1.1 =cut
1042    
1043     1
1044