ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.68
Committed: Fri Dec 31 19:22:18 2010 UTC (13 years, 4 months ago) by root
Branch: MAIN
Changes since 1.67: +42 -21 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.65 our $VERSION = '1.5';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
100     (and trailers) as second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
106     during the body phase of a request, then the original C<Status> and
107     C<Reason> values from the header are available as C<OrigStatus> and
108     C<OrigReason>.
109 root 1.55
110     The pseudo-header C<URL> contains the actual URL (which can differ from
111     the requested URL when following redirects - for example, you might get
112     an error that your URL scheme is not supported even though your URL is a
113     valid http URL because it redirected to an ftp URL, in which case you can
114     look at the URL pseudo header).
115    
116     The pseudo-header C<Redirect> only exists when the request was a result
117     of an internal redirect. In that case it is an array reference with
118     the C<($data, $headers)> from the redirect response. Note that this
119     response could in turn be the result of a redirect itself, and C<<
120     $headers->{Redirect}[1]{Redirect} >> will then contain the original
121     response, and so on.
122 root 1.20
123 root 1.32 If the server sends a header multiple times, then their contents will be
124     joined together with a comma (C<,>), as per the HTTP spec.
125 root 1.2
126     If an internal error occurs, such as not being able to resolve a hostname,
127 root 1.41 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<59x>
128     (usually C<599>) and the C<Reason> pseudo-header will contain an error
129     message.
130 root 1.2
131 root 1.6 A typical callback might look like this:
132    
133     sub {
134     my ($body, $hdr) = @_;
135    
136     if ($hdr->{Status} =~ /^2/) {
137     ... everything should be ok
138     } else {
139     print "error, $hdr->{Status} $hdr->{Reason}\n";
140     }
141     }
142    
143 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
144     include:
145    
146     =over 4
147    
148 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
149 root 1.1
150     Whether to recurse requests or not, e.g. on redirects, authentication
151 root 1.3 retries and so on, and how often to do so.
152 root 1.1
153     =item headers => hashref
154    
155 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
156     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
157     will provide defaults for C<TE:>, C<Referer:> and C<User-Agent:> (this can
158     be suppressed by using C<undef> for these headers in which case they won't
159     be sent at all).
160 root 1.1
161     =item timeout => $seconds
162    
163     The time-out to use for various stages - each connect attempt will reset
164 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
165     timeout.
166    
167     Default timeout is 5 minutes.
168 root 1.2
169     =item proxy => [$host, $port[, $scheme]] or undef
170    
171     Use the given http proxy for all requests. If not specified, then the
172     default proxy (as specified by C<$ENV{http_proxy}>) is used.
173    
174 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
175 root 1.2 HTTPS.
176 root 1.1
177 root 1.3 =item body => $string
178    
179 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
180 root 1.3 this module might offer more options).
181    
182 root 1.10 =item cookie_jar => $hash_ref
183    
184     Passing this parameter enables (simplified) cookie-processing, loosely
185     based on the original netscape specification.
186    
187     The C<$hash_ref> must be an (initially empty) hash reference which will
188     get updated automatically. It is possible to save the cookie_jar to
189     persistent storage with something like JSON or Storable, but this is not
190 root 1.40 recommended, as expiry times are currently being ignored.
191 root 1.10
192     Note that this cookie implementation is not of very high quality, nor
193     meant to be complete. If you want complete cookie management you have to
194     do that on your own. C<cookie_jar> is meant as a quick fix to get some
195     cookie-using sites working. Cookies are a privacy disaster, do not use
196     them unless required to.
197    
198 root 1.40 =item tls_ctx => $scheme | $tls_ctx
199    
200     Specifies the AnyEvent::TLS context to be used for https connections. This
201     parameter follows the same rules as the C<tls_ctx> parameter to
202     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
203     C<high> can be specified, which give you a predefined low-security (no
204     verification, highest compatibility) and high-security (CA and common-name
205     verification) TLS context.
206    
207     The default for this option is C<low>, which could be interpreted as "give
208     me the page, no matter what".
209    
210 root 1.51 =item on_prepare => $callback->($fh)
211    
212     In rare cases you need to "tune" the socket before it is used to
213     connect (for exmaple, to bind it on a given IP address). This parameter
214     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
215     and behaves exactly the same way (e.g. it has to provide a
216     timeout). See the description for the C<$prepare_cb> argument of
217     C<AnyEvent::Socket::tcp_connect> for details.
218    
219 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
220    
221     In even rarer cases you want total control over how AnyEvent::HTTP
222     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
223     to do this, but you can provide your own C<tcp_connect> function -
224 root 1.60 obviously, it has to follow the same calling conventions, except that it
225     may always return a connection guard object.
226 root 1.59
227     There are probably lots of weird uses for this function, starting from
228     tracing the hosts C<http_request> actually tries to connect, to (inexact
229     but fast) host => IP address caching or even socks protocol support.
230    
231 root 1.42 =item on_header => $callback->($headers)
232 root 1.41
233     When specified, this callback will be called with the header hash as soon
234     as headers have been successfully received from the remote server (not on
235     locally-generated errors).
236    
237     It has to return either true (in which case AnyEvent::HTTP will continue),
238     or false, in which case AnyEvent::HTTP will cancel the download (and call
239     the finish callback with an error code of C<598>).
240    
241     This callback is useful, among other things, to quickly reject unwanted
242     content, which, if it is supposed to be rare, can be faster than first
243     doing a C<HEAD> request.
244    
245 root 1.68 The downside is that cancelling the request makes it impossible to re-use
246     the connection. Also, the C<on_header> callback will not receive any
247     trailer (headers sent after the response body).
248    
249 root 1.42 Example: cancel the request unless the content-type is "text/html".
250 root 1.41
251 root 1.42 on_header => sub {
252     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
253     },
254 root 1.41
255 root 1.42 =item on_body => $callback->($partial_body, $headers)
256 root 1.41
257 root 1.42 When specified, all body data will be passed to this callback instead of
258     to the completion callback. The completion callback will get the empty
259     string instead of the body data.
260 root 1.41
261 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
262     or false, in which case AnyEvent::HTTP will cancel the download (and call
263     the completion callback with an error code of C<598>).
264    
265 root 1.68 The downside to cancelling the request is that it makes it impossible to
266     re-use the connection.
267    
268 root 1.42 This callback is useful when the data is too large to be held in memory
269     (so the callback writes it to a file) or when only some information should
270     be extracted, or when the body should be processed incrementally.
271 root 1.41
272     It is usually preferred over doing your own body handling via
273 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
274     only used to create a connection, C<want_body_handle> is the better
275     alternative, as it allows you to install your own event handler, reducing
276     resource usage.
277 root 1.41
278     =item want_body_handle => $enable
279    
280     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
281     changes considerably: after parsing the headers, and instead of
282     downloading the body (if any), the completion callback will be
283     called. Instead of the C<$body> argument containing the body data, the
284     callback will receive the L<AnyEvent::Handle> object associated with the
285     connection. In error cases, C<undef> will be passed. When there is no body
286     (e.g. status C<304>), the empty string will be passed.
287    
288     The handle object might or might not be in TLS mode, might be connected to
289     a proxy, be a persistent connection etc., and configured in unspecified
290     ways. The user is responsible for this handle (it will not be used by this
291     module anymore).
292    
293     This is useful with some push-type services, where, after the initial
294     headers, an interactive protocol is used (typical example would be the
295     push-style twitter API which starts a JSON/XML stream).
296    
297     If you think you need this, first have a look at C<on_body>, to see if
298 root 1.45 that doesn't solve your problem in a better way.
299 root 1.41
300 root 1.1 =back
301    
302 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
303     the response body.
304 root 1.9
305     http_request GET => "http://www.nethype.de/", sub {
306     my ($body, $hdr) = @_;
307     print "$body\n";
308     };
309    
310 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
311 root 1.9 timeout of 30 seconds.
312    
313     http_request
314     GET => "https://www.google.com",
315     timeout => 30,
316     sub {
317     my ($body, $hdr) = @_;
318     use Data::Dumper;
319     print Dumper $hdr;
320     }
321     ;
322 root 1.1
323 root 1.68 Example: do another simple HTTP GET request, but immediately try to
324 root 1.29 cancel it.
325    
326     my $request = http_request GET => "http://www.nethype.de/", sub {
327     my ($body, $hdr) = @_;
328     print "$body\n";
329     };
330    
331     undef $request;
332    
333 root 1.1 =cut
334    
335 root 1.12 sub _slot_schedule;
336 root 1.11 sub _slot_schedule($) {
337     my $host = shift;
338    
339     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
340     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
341 root 1.12 # somebody wants that slot
342 root 1.11 ++$CO_SLOT{$host}[0];
343 root 1.14 ++$ACTIVE;
344 root 1.11
345     $cb->(AnyEvent::Util::guard {
346 root 1.14 --$ACTIVE;
347 root 1.11 --$CO_SLOT{$host}[0];
348     _slot_schedule $host;
349     });
350     } else {
351     # nobody wants the slot, maybe we can forget about it
352     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
353     last;
354     }
355     }
356     }
357    
358     # wait for a free slot on host, call callback
359     sub _get_slot($$) {
360     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
361    
362     _slot_schedule $_[0];
363     }
364    
365 root 1.66 # continue to parse $_ for headers and place them into the arg
366     sub parse_hdr() {
367     my %hdr;
368    
369     # things seen, not parsed:
370     # p3pP="NON CUR OTPi OUR NOR UNI"
371    
372     $hdr{lc $1} .= ",$2"
373     while /\G
374     ([^:\000-\037]*):
375     [\011\040]*
376     ((?: [^\012]+ | \012[\011\040] )*)
377     \012
378     /gxc;
379    
380     /\G$/
381     or return;
382    
383     # remove the "," prefix we added to all headers above
384     substr $_, 0, 1, ""
385     for values %hdr;
386    
387     \%hdr
388     }
389    
390 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
391 root 1.34
392 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
393     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
394 root 1.40
395 elmex 1.15 sub http_request($$@) {
396 root 1.1 my $cb = pop;
397     my ($method, $url, %arg) = @_;
398    
399     my %hdr;
400    
401 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
402     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
403    
404 root 1.3 $method = uc $method;
405    
406 root 1.8 if (my $hdr = $arg{headers}) {
407 root 1.1 while (my ($k, $v) = each %$hdr) {
408     $hdr{lc $k} = $v;
409     }
410     }
411    
412 root 1.55 # pseudo headers for all subsequent responses
413     my @pseudo = (URL => $url);
414     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
415    
416 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
417 root 1.8
418 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
419 root 1.8 if $recurse < 0;
420    
421 root 1.2 my $proxy = $arg{proxy} || $PROXY;
422 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
423    
424 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
425 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
426 root 1.2
427 root 1.31 $uscheme = lc $uscheme;
428 root 1.1
429 root 1.31 my $uport = $uscheme eq "http" ? 80
430     : $uscheme eq "https" ? 443
431 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
432 root 1.13
433 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
434 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
435 root 1.10
436     my $uhost = $1;
437     $uport = $2 if defined $2;
438    
439 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
440     unless exists $hdr{host};
441 root 1.43
442 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
443 root 1.56 $upath .= $query if length $query;
444 root 1.10
445     $upath =~ s%^/?%/%;
446    
447     # cookie processing
448     if (my $jar = $arg{cookie_jar}) {
449 root 1.31 %$jar = () if $jar->{version} != 1;
450 root 1.10
451     my @cookie;
452    
453     while (my ($chost, $v) = each %$jar) {
454 root 1.30 if ($chost =~ /^\./) {
455     next unless $chost eq substr $uhost, -length $chost;
456     } elsif ($chost =~ /\./) {
457     next unless $chost eq $uhost;
458     } else {
459     next;
460     }
461 root 1.10
462     while (my ($cpath, $v) = each %$v) {
463     next unless $cpath eq substr $upath, 0, length $cpath;
464    
465     while (my ($k, $v) = each %$v) {
466 root 1.31 next if $uscheme ne "https" && exists $v->{secure};
467     my $value = $v->{value};
468     $value =~ s/([\\"])/\\$1/g;
469     push @cookie, "$k=\"$value\"";
470 root 1.10 }
471     }
472     }
473    
474     $hdr{cookie} = join "; ", @cookie
475     if @cookie;
476     }
477 root 1.1
478 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
479 root 1.2
480 root 1.10 if ($proxy) {
481 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
482 root 1.31
483 root 1.47 $rscheme = "http" unless defined $rscheme;
484    
485 root 1.31 # don't support https requests over https-proxy transport,
486 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
487 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
488 root 1.10 } else {
489 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
490 root 1.2 }
491    
492 root 1.47 # leave out fragment and query string, just a heuristic
493 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
494     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
495 root 1.41
496 root 1.53 $hdr{"content-length"} = length $arg{body}
497     if length $arg{body} || $method ne "GET";
498 root 1.1
499 root 1.68 $hdr{connection} = "close TE"; #1.1
500     $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
501 root 1.66
502 root 1.11 my %state = (connect_guard => 1);
503    
504     _get_slot $uhost, sub {
505     $state{slot_guard} = shift;
506 root 1.1
507 root 1.11 return unless $state{connect_guard};
508 root 1.1
509 root 1.64 my $connect_cb = sub {
510     $state{fh} = shift
511     or do {
512     my $err = "$!";
513     %state = ();
514     return $cb->(undef, { @pseudo, Status => 599, Reason => $err });
515     };
516 root 1.44
517 root 1.64 pop; # free memory, save a tree
518 root 1.11
519 root 1.64 return unless delete $state{connect_guard};
520 root 1.11
521 root 1.64 # get handle
522     $state{handle} = new AnyEvent::Handle
523     fh => $state{fh},
524     peername => $rhost,
525     tls_ctx => $arg{tls_ctx},
526     # these need to be reconfigured on keepalive handles
527     timeout => $timeout,
528     on_error => sub {
529     %state = ();
530     $cb->(undef, { @pseudo, Status => 599, Reason => $_[2] });
531     },
532     on_eof => sub {
533     %state = ();
534     $cb->(undef, { @pseudo, Status => 599, Reason => "Unexpected end-of-file" });
535     },
536     ;
537 root 1.11
538 root 1.64 # limit the number of persistent connections
539     # keepalive not yet supported
540 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
541     # ++$KA_COUNT{$_[1]};
542     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
543     # --$KA_COUNT{$_[1]}
544     # };
545     # $hdr{connection} = "keep-alive";
546     # }
547 root 1.1
548 root 1.64 $state{handle}->starttls ("connect") if $rscheme eq "https";
549    
550     # handle actual, non-tunneled, request
551     my $handle_actual_request = sub {
552     $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
553    
554     # send request
555     $state{handle}->push_write (
556 root 1.66 "$method $rpath HTTP/1.1\015\012"
557 root 1.64 . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
558     . "\015\012"
559     . (delete $arg{body})
560     );
561    
562     # return if error occured during push_write()
563     return unless %state;
564    
565     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
566    
567     # status line and headers
568 root 1.68 $state{read_response} = sub {
569 root 1.64 for ("$_[1]") {
570     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
571    
572 root 1.66 /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/igxc
573 root 1.64 or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid server response" }));
574    
575 root 1.68 # 100 Continue handling
576     # should not happen as we don't send expect: 100-continue,
577     # but we handle it just in case.
578     # since we send the request body regardless, if we get an error
579     # we are out of-sync, which we currently do NOT handle correctly.
580     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
581     if $2 eq 100;
582    
583 root 1.64 push @pseudo,
584     HTTPVersion => $1,
585     Status => $2,
586     Reason => $3,
587     ;
588    
589 root 1.66 my $hdr = parse_hdr
590     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Garbled response headers" }));
591 root 1.64
592 root 1.66 %hdr = (%$hdr, @pseudo);
593 root 1.64 }
594    
595     # redirect handling
596     # microsoft and other shitheads don't give a shit for following standards,
597     # try to support some common forms of broken Location headers.
598     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
599     $hdr{location} =~ s/^\.\/+//;
600    
601     my $url = "$rscheme://$uhost:$uport";
602    
603     unless ($hdr{location} =~ s/^\///) {
604     $url .= $upath;
605     $url =~ s/\/[^\/]*$//;
606     }
607 root 1.59
608 root 1.64 $hdr{location} = "$url/$hdr{location}";
609     }
610 root 1.31
611 root 1.64 my $redirect;
612 root 1.41
613 root 1.64 if ($recurse) {
614     my $status = $hdr{Status};
615 root 1.59
616 root 1.64 # industry standard is to redirect POST as GET for
617     # 301, 302 and 303, in contrast to http/1.0 and 1.1.
618     # also, the UA should ask the user for 301 and 307 and POST,
619     # industry standard seems to be to simply follow.
620     # we go with the industry standard.
621     if ($status == 301 or $status == 302 or $status == 303) {
622     # HTTP/1.1 is unclear on how to mutate the method
623     $method = "GET" unless $method eq "HEAD";
624     $redirect = 1;
625     } elsif ($status == 307) {
626     $redirect = 1;
627 root 1.59 }
628 root 1.64 }
629 root 1.57
630 root 1.64 my $finish = sub { # ($data, $err_status, $err_reason[, $keepalive])
631 root 1.68 my $keepalive = pop;
632    
633 root 1.64 $state{handle}->destroy if $state{handle};
634     %state = ();
635 root 1.55
636 root 1.64 if (defined $_[1]) {
637     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
638     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
639 root 1.41 }
640    
641 root 1.64 # set-cookie processing
642     if ($arg{cookie_jar}) {
643     for ($hdr{"set-cookie"}) {
644     # parse NAME=VALUE
645     my @kv;
646    
647     while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
648     my $name = $1;
649     my $value = $3;
650    
651     unless ($value) {
652     $value = $2;
653     $value =~ s/\\(.)/$1/gs;
654     }
655 root 1.59
656 root 1.64 push @kv, $name => $value;
657 root 1.31
658 root 1.64 last unless /\G\s*;/gc;
659     }
660 root 1.57
661 root 1.64 last unless @kv;
662 root 1.31
663 root 1.64 my $name = shift @kv;
664     my %kv = (value => shift @kv, @kv);
665 root 1.31
666 root 1.64 my $cdom;
667     my $cpath = (delete $kv{path}) || "/";
668 root 1.10
669 root 1.64 if (exists $kv{domain}) {
670     $cdom = delete $kv{domain};
671    
672     $cdom =~ s/^\.?/./; # make sure it starts with a "."
673 root 1.11
674 root 1.64 next if $cdom =~ /\.$/;
675 root 1.59
676 root 1.64 # this is not rfc-like and not netscape-like. go figure.
677     my $ndots = $cdom =~ y/.//;
678     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
679     } else {
680     $cdom = $uhost;
681     }
682    
683     # store it
684     $arg{cookie_jar}{version} = 1;
685     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
686 root 1.11
687 root 1.64 redo if /\G\s*,/gc;
688 root 1.59 }
689 root 1.64 }
690 root 1.31
691 root 1.64 if ($redirect && exists $hdr{location}) {
692     # we ignore any errors, as it is very common to receive
693     # Content-Length != 0 but no actual body
694     # we also access %hdr, as $_[1] might be an erro
695     http_request (
696     $method => $hdr{location},
697     %arg,
698     recurse => $recurse - 1,
699     Redirect => [$_[0], \%hdr],
700     $cb);
701     } else {
702     $cb->($_[0], \%hdr);
703     }
704     };
705    
706     my $len = $hdr{"content-length"};
707    
708     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
709     $finish->(undef, 598 => "Request cancelled by on_header");
710     } elsif (
711     $hdr{Status} =~ /^(?:1..|204|205|304)$/
712     or $method eq "HEAD"
713     or (defined $len && !$len)
714     ) {
715     # no body
716     $finish->("", undef, undef, 1);
717     } else {
718 root 1.66 # body handling, many different code paths
719     # - no body expected
720     # - want_body_handle
721     # - te chunked
722     # - 2x length known (with or without on_body)
723     # - 2x length not known (with or without on_body)
724 root 1.64 if (!$redirect && $arg{want_body_handle}) {
725     $_[0]->on_eof (undef);
726     $_[0]->on_error (undef);
727     $_[0]->on_read (undef);
728    
729     $finish->(delete $state{handle});
730    
731 root 1.68 } elsif ($hdr{"transfer-encoding"} =~ /\bchunked\b/i) {
732     my $cl = 0;
733 root 1.66 my $body = undef;
734     my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
735    
736     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
737    
738     my $read_chunk; $read_chunk = sub {
739     $_[1] =~ /^([0-9a-fA-F]+)/
740     or $finish->(undef, 599 => "Garbled chunked transfer encoding");
741    
742     my $len = hex $1;
743    
744     if ($len) {
745 root 1.68 $cl += $len;
746    
747     $_[0]->push_read (chunk => $len, sub {
748 root 1.66 $on_body->($_[1], \%hdr)
749     or return $finish->(undef, 598 => "Request cancelled by on_body");
750    
751     $_[0]->push_read (line => sub {
752     length $_[1]
753     and return $finish->(undef, 599 => "Garbled chunked transfer encoding");
754     $_[0]->push_read (line => $read_chunk);
755     });
756     });
757     } else {
758 root 1.68 $hdr{"content-length"} ||= $cl;
759    
760 root 1.66 $_[0]->push_read (line => $qr_nlnl, sub {
761     if (length $_[1]) {
762     for ("$_[1]") {
763     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
764    
765     my $hdr = parse_hdr
766     or return $finish->(undef, 599 => "Garbled response trailers");
767    
768     %hdr = (%hdr, %$hdr);
769     }
770     }
771    
772     $finish->($body, undef, undef, 1);
773     });
774     }
775     };
776    
777     $_[0]->push_read (line => $read_chunk);
778    
779 root 1.64 } elsif ($arg{on_body}) {
780     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
781 root 1.66
782 root 1.64 if ($len) {
783     $_[0]->on_read (sub {
784     $len -= length $_[0]{rbuf};
785    
786     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
787 root 1.66 or return $finish->(undef, 598 => "Request cancelled by on_body");
788 root 1.64
789     $len > 0
790     or $finish->("", undef, undef, 1);
791     });
792 root 1.59 } else {
793 root 1.64 $_[0]->on_eof (sub {
794     $finish->("");
795     });
796     $_[0]->on_read (sub {
797     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
798     or $finish->(undef, 598 => "Request cancelled by on_body");
799     });
800 root 1.11 }
801 root 1.64 } else {
802     $_[0]->on_eof (undef);
803 root 1.59
804 root 1.64 if ($len) {
805     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
806     $_[0]->on_read (sub {
807     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
808     if $len <= length $_[0]{rbuf};
809     });
810 root 1.59 } else {
811 root 1.64 $_[0]->on_error (sub {
812     ($! == Errno::EPIPE || !$!)
813     ? $finish->(delete $_[0]{rbuf})
814     : $finish->(undef, 599 => $_[2]);
815     });
816     $_[0]->on_read (sub { });
817 root 1.59 }
818     }
819 root 1.64 }
820 root 1.68 };
821    
822     $state{handle}->push_read (line => $qr_nlnl, $state{read_response});
823 root 1.64 };
824    
825     # now handle proxy-CONNECT method
826     if ($proxy && $uscheme eq "https") {
827     # oh dear, we have to wrap it into a connect request
828    
829     # maybe re-use $uauthority with patched port?
830     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
831     $state{handle}->push_read (line => $qr_nlnl, sub {
832     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
833     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" }));
834    
835     if ($2 == 200) {
836     $rpath = $upath;
837     &$handle_actual_request;
838     } else {
839     %state = ();
840     $cb->(undef, { @pseudo, Status => $2, Reason => $3 });
841     }
842     });
843     } else {
844     &$handle_actual_request;
845     }
846     };
847    
848     my $tcp_connect = $arg{tcp_connect}
849     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
850 root 1.57
851 root 1.64 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
852 root 1.31
853 root 1.1 };
854    
855     defined wantarray && AnyEvent::Util::guard { %state = () }
856     }
857    
858 elmex 1.15 sub http_get($@) {
859 root 1.1 unshift @_, "GET";
860     &http_request
861     }
862    
863 elmex 1.15 sub http_head($@) {
864 root 1.4 unshift @_, "HEAD";
865     &http_request
866     }
867    
868 elmex 1.15 sub http_post($$@) {
869 root 1.22 my $url = shift;
870     unshift @_, "POST", $url, "body";
871 root 1.3 &http_request
872     }
873    
874 root 1.9 =back
875    
876 root 1.55 =head2 DNS CACHING
877    
878     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
879     the actual connection, which in turn uses AnyEvent::DNS to resolve
880     hostnames. The latter is a simple stub resolver and does no caching
881     on its own. If you want DNS caching, you currently have to provide
882     your own default resolver (by storing a suitable resolver object in
883     C<$AnyEvent::DNS::RESOLVER>).
884    
885 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
886 root 1.1
887     =over 4
888    
889 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
890    
891     Sets the default proxy server to use. The proxy-url must begin with a
892 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
893     otherwise.
894    
895     To clear an already-set proxy, use C<undef>.
896 root 1.2
897 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
898    
899     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
900     Date (RFC 2616).
901    
902     =item $timestamp = AnyEvent::HTTP::parse_date $date
903    
904     Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX
905     timestamp, or C<undef> if the date cannot be parsed.
906    
907 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
908 root 1.1
909 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
910 root 1.1
911     =item $AnyEvent::HTTP::USERAGENT
912    
913     The default value for the C<User-Agent> header (the default is
914 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
915 root 1.1
916 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
917 root 1.1
918 root 1.47 The maximum number of concurrent connections to the same host (identified
919 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
920     are queued until previous connections are closed.
921 root 1.1
922 root 1.43 The default value for this is C<4>, and it is highly advisable to not
923     increase it.
924 root 1.3
925 root 1.14 =item $AnyEvent::HTTP::ACTIVE
926    
927     The number of active connections. This is not the number of currently
928     running requests, but the number of currently open and non-idle TCP
929     connections. This number of can be useful for load-leveling.
930    
931 root 1.1 =back
932    
933     =cut
934    
935 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
936     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
937    
938     sub format_date($) {
939     my ($time) = @_;
940    
941     # RFC 822/1123 format
942     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
943    
944     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
945     $weekday[$wday], $mday, $month[$mon], $year + 1900,
946     $H, $M, $S;
947     }
948    
949     sub parse_date($) {
950     my ($date) = @_;
951    
952     my ($d, $m, $y, $H, $M, $S);
953    
954     if ($date =~ /^[A-Z][a-z][a-z], ([0-9][0-9]) ([A-Z][a-z][a-z]) ([0-9][0-9][0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
955     # RFC 822/1123, required by RFC 2616
956     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
957    
958     } elsif ($date =~ /^[A-Z][a-z]+, ([0-9][0-9])-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
959     # RFC 850
960     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
961    
962     } elsif ($date =~ /^[A-Z][a-z][a-z] ([A-Z][a-z][a-z]) ([0-9 ][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) ([0-9][0-9][0-9][0-9])$/) {
963     # ISO C's asctime
964     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
965     }
966     # other formats fail in the loop below
967    
968     for (0..11) {
969     if ($m eq $month[$_]) {
970     require Time::Local;
971     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
972     }
973     }
974    
975     undef
976     }
977    
978 root 1.2 sub set_proxy($) {
979 root 1.52 if (length $_[0]) {
980     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
981     or Carp::croak "$_[0]: invalid proxy URL";
982     $PROXY = [$2, $3 || 3128, $1]
983     } else {
984     undef $PROXY;
985     }
986 root 1.2 }
987    
988     # initialise proxy from environment
989 root 1.52 eval {
990     set_proxy $ENV{http_proxy};
991     };
992 root 1.2
993 root 1.60 =head2 SOCKS PROXIES
994    
995     Socks proxies are not directly supported by AnyEvent::HTTP. You can
996     compile your perl to support socks, or use an external program such as
997     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
998     transparently.
999    
1000     Alternatively, for AnyEvent::HTTP only, you can use your own
1001     C<tcp_connect> function that does the proxy handshake - here is an example
1002     that works with socks4a proxies:
1003    
1004     use Errno;
1005     use AnyEvent::Util;
1006     use AnyEvent::Socket;
1007     use AnyEvent::Handle;
1008    
1009     # host, port and username of/for your socks4a proxy
1010     my $socks_host = "10.0.0.23";
1011     my $socks_port = 9050;
1012     my $socks_user = "";
1013    
1014     sub socks4a_connect {
1015     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1016    
1017     my $hdl = new AnyEvent::Handle
1018     connect => [$socks_host, $socks_port],
1019     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1020     on_error => sub { $connect_cb->() },
1021     ;
1022    
1023     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1024    
1025     $hdl->push_read (chunk => 8, sub {
1026     my ($hdl, $chunk) = @_;
1027     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1028    
1029     if ($status == 0x5a) {
1030     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1031     } else {
1032     $! = Errno::ENXIO; $connect_cb->();
1033     }
1034     });
1035    
1036     $hdl
1037     }
1038    
1039     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1040     possibly after switching off other proxy types:
1041    
1042     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1043    
1044     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1045     my ($data, $headers) = @_;
1046     ...
1047     };
1048    
1049 root 1.1 =head1 SEE ALSO
1050    
1051     L<AnyEvent>.
1052    
1053     =head1 AUTHOR
1054    
1055 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1056     http://home.schmorp.de/
1057 root 1.1
1058 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1059     testcases and bugreports.
1060    
1061 root 1.1 =cut
1062    
1063     1
1064