ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.62
Committed: Thu Dec 30 04:30:24 2010 UTC (13 years, 4 months ago) by root
Branch: MAIN
Changes since 1.61: +7 -5 lines
Log Message:
lowercase, really

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.57 our $VERSION = '1.46';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99     (or C<undef> if an error occured), and a hash-ref with response headers as
100     second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105     three parts of the HTTP Status-Line of the same name.
106    
107     The pseudo-header C<URL> contains the actual URL (which can differ from
108     the requested URL when following redirects - for example, you might get
109     an error that your URL scheme is not supported even though your URL is a
110     valid http URL because it redirected to an ftp URL, in which case you can
111     look at the URL pseudo header).
112    
113     The pseudo-header C<Redirect> only exists when the request was a result
114     of an internal redirect. In that case it is an array reference with
115     the C<($data, $headers)> from the redirect response. Note that this
116     response could in turn be the result of a redirect itself, and C<<
117     $headers->{Redirect}[1]{Redirect} >> will then contain the original
118     response, and so on.
119 root 1.20
120 root 1.32 If the server sends a header multiple times, then their contents will be
121     joined together with a comma (C<,>), as per the HTTP spec.
122 root 1.2
123     If an internal error occurs, such as not being able to resolve a hostname,
124 root 1.41 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<59x>
125     (usually C<599>) and the C<Reason> pseudo-header will contain an error
126     message.
127 root 1.2
128 root 1.6 A typical callback might look like this:
129    
130     sub {
131     my ($body, $hdr) = @_;
132    
133     if ($hdr->{Status} =~ /^2/) {
134     ... everything should be ok
135     } else {
136     print "error, $hdr->{Status} $hdr->{Reason}\n";
137     }
138     }
139    
140 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
141     include:
142    
143     =over 4
144    
145 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
146 root 1.1
147     Whether to recurse requests or not, e.g. on redirects, authentication
148 root 1.3 retries and so on, and how often to do so.
149 root 1.1
150     =item headers => hashref
151    
152 root 1.62 The request headers to use, with the header name (I<MUST be in lowercase>)
153     as key and header value as hash value.
154    
155     Currently, http_request> may provide its own C<host>, C<content-length>,
156     C<connection> and C<cookie> headers and will provide defaults for
157     C<user-agent> and C<referer> (this can be suppressed by using a value of
158     C<undef> for these headers in which case they won't be sent at all).
159 root 1.1
160     =item timeout => $seconds
161    
162     The time-out to use for various stages - each connect attempt will reset
163 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
164     timeout.
165    
166     Default timeout is 5 minutes.
167 root 1.2
168     =item proxy => [$host, $port[, $scheme]] or undef
169    
170     Use the given http proxy for all requests. If not specified, then the
171     default proxy (as specified by C<$ENV{http_proxy}>) is used.
172    
173 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
174 root 1.2 HTTPS.
175 root 1.1
176 root 1.3 =item body => $string
177    
178     The request body, usually empty. Will be-sent as-is (future versions of
179     this module might offer more options).
180    
181 root 1.10 =item cookie_jar => $hash_ref
182    
183     Passing this parameter enables (simplified) cookie-processing, loosely
184     based on the original netscape specification.
185    
186     The C<$hash_ref> must be an (initially empty) hash reference which will
187     get updated automatically. It is possible to save the cookie_jar to
188     persistent storage with something like JSON or Storable, but this is not
189 root 1.40 recommended, as expiry times are currently being ignored.
190 root 1.10
191     Note that this cookie implementation is not of very high quality, nor
192     meant to be complete. If you want complete cookie management you have to
193     do that on your own. C<cookie_jar> is meant as a quick fix to get some
194     cookie-using sites working. Cookies are a privacy disaster, do not use
195     them unless required to.
196    
197 root 1.40 =item tls_ctx => $scheme | $tls_ctx
198    
199     Specifies the AnyEvent::TLS context to be used for https connections. This
200     parameter follows the same rules as the C<tls_ctx> parameter to
201     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
202     C<high> can be specified, which give you a predefined low-security (no
203     verification, highest compatibility) and high-security (CA and common-name
204     verification) TLS context.
205    
206     The default for this option is C<low>, which could be interpreted as "give
207     me the page, no matter what".
208    
209 root 1.51 =item on_prepare => $callback->($fh)
210    
211     In rare cases you need to "tune" the socket before it is used to
212     connect (for exmaple, to bind it on a given IP address). This parameter
213     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
214     and behaves exactly the same way (e.g. it has to provide a
215     timeout). See the description for the C<$prepare_cb> argument of
216     C<AnyEvent::Socket::tcp_connect> for details.
217    
218 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
219    
220     In even rarer cases you want total control over how AnyEvent::HTTP
221     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
222     to do this, but you can provide your own C<tcp_connect> function -
223 root 1.60 obviously, it has to follow the same calling conventions, except that it
224     may always return a connection guard object.
225 root 1.59
226     There are probably lots of weird uses for this function, starting from
227     tracing the hosts C<http_request> actually tries to connect, to (inexact
228     but fast) host => IP address caching or even socks protocol support.
229    
230 root 1.42 =item on_header => $callback->($headers)
231 root 1.41
232     When specified, this callback will be called with the header hash as soon
233     as headers have been successfully received from the remote server (not on
234     locally-generated errors).
235    
236     It has to return either true (in which case AnyEvent::HTTP will continue),
237     or false, in which case AnyEvent::HTTP will cancel the download (and call
238     the finish callback with an error code of C<598>).
239    
240     This callback is useful, among other things, to quickly reject unwanted
241     content, which, if it is supposed to be rare, can be faster than first
242     doing a C<HEAD> request.
243    
244 root 1.42 Example: cancel the request unless the content-type is "text/html".
245 root 1.41
246 root 1.42 on_header => sub {
247     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
248     },
249 root 1.41
250 root 1.42 =item on_body => $callback->($partial_body, $headers)
251 root 1.41
252 root 1.42 When specified, all body data will be passed to this callback instead of
253     to the completion callback. The completion callback will get the empty
254     string instead of the body data.
255 root 1.41
256 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
257     or false, in which case AnyEvent::HTTP will cancel the download (and call
258     the completion callback with an error code of C<598>).
259    
260     This callback is useful when the data is too large to be held in memory
261     (so the callback writes it to a file) or when only some information should
262     be extracted, or when the body should be processed incrementally.
263 root 1.41
264     It is usually preferred over doing your own body handling via
265 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
266     only used to create a connection, C<want_body_handle> is the better
267     alternative, as it allows you to install your own event handler, reducing
268     resource usage.
269 root 1.41
270     =item want_body_handle => $enable
271    
272     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
273     changes considerably: after parsing the headers, and instead of
274     downloading the body (if any), the completion callback will be
275     called. Instead of the C<$body> argument containing the body data, the
276     callback will receive the L<AnyEvent::Handle> object associated with the
277     connection. In error cases, C<undef> will be passed. When there is no body
278     (e.g. status C<304>), the empty string will be passed.
279    
280     The handle object might or might not be in TLS mode, might be connected to
281     a proxy, be a persistent connection etc., and configured in unspecified
282     ways. The user is responsible for this handle (it will not be used by this
283     module anymore).
284    
285     This is useful with some push-type services, where, after the initial
286     headers, an interactive protocol is used (typical example would be the
287     push-style twitter API which starts a JSON/XML stream).
288    
289     If you think you need this, first have a look at C<on_body>, to see if
290 root 1.45 that doesn't solve your problem in a better way.
291 root 1.41
292 root 1.1 =back
293    
294 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
295    
296     http_request GET => "http://www.nethype.de/", sub {
297     my ($body, $hdr) = @_;
298     print "$body\n";
299     };
300    
301     Example: make a HTTP HEAD request on https://www.google.com/, use a
302     timeout of 30 seconds.
303    
304     http_request
305     GET => "https://www.google.com",
306     timeout => 30,
307     sub {
308     my ($body, $hdr) = @_;
309     use Data::Dumper;
310     print Dumper $hdr;
311     }
312     ;
313 root 1.1
314 root 1.29 Example: make another simple HTTP GET request, but immediately try to
315     cancel it.
316    
317     my $request = http_request GET => "http://www.nethype.de/", sub {
318     my ($body, $hdr) = @_;
319     print "$body\n";
320     };
321    
322     undef $request;
323    
324 root 1.1 =cut
325    
326 root 1.12 sub _slot_schedule;
327 root 1.11 sub _slot_schedule($) {
328     my $host = shift;
329    
330     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
331     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
332 root 1.12 # somebody wants that slot
333 root 1.11 ++$CO_SLOT{$host}[0];
334 root 1.14 ++$ACTIVE;
335 root 1.11
336     $cb->(AnyEvent::Util::guard {
337 root 1.14 --$ACTIVE;
338 root 1.11 --$CO_SLOT{$host}[0];
339     _slot_schedule $host;
340     });
341     } else {
342     # nobody wants the slot, maybe we can forget about it
343     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
344     last;
345     }
346     }
347     }
348    
349     # wait for a free slot on host, call callback
350     sub _get_slot($$) {
351     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
352    
353     _slot_schedule $_[0];
354     }
355    
356 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
357 root 1.34
358 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
359     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
360 root 1.40
361 elmex 1.15 sub http_request($$@) {
362 root 1.1 my $cb = pop;
363     my ($method, $url, %arg) = @_;
364    
365     my %hdr;
366    
367 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
368     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
369    
370 root 1.3 $method = uc $method;
371    
372 root 1.8 if (my $hdr = $arg{headers}) {
373 root 1.1 while (my ($k, $v) = each %$hdr) {
374     $hdr{lc $k} = $v;
375     }
376     }
377    
378 root 1.55 # pseudo headers for all subsequent responses
379     my @pseudo = (URL => $url);
380     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
381    
382 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
383 root 1.8
384 root 1.55 return $cb->(undef, { Status => 599, Reason => "Too many redirections", @pseudo })
385 root 1.8 if $recurse < 0;
386    
387 root 1.2 my $proxy = $arg{proxy} || $PROXY;
388 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
389    
390 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
391 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
392 root 1.2
393 root 1.31 $uscheme = lc $uscheme;
394 root 1.1
395 root 1.31 my $uport = $uscheme eq "http" ? 80
396     : $uscheme eq "https" ? 443
397 root 1.55 : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported", @pseudo });
398 root 1.13
399 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
400 root 1.55 or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", @pseudo });
401 root 1.10
402     my $uhost = $1;
403     $uport = $2 if defined $2;
404    
405 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
406     unless exists $hdr{host};
407 root 1.43
408 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
409 root 1.56 $upath .= $query if length $query;
410 root 1.10
411     $upath =~ s%^/?%/%;
412    
413     # cookie processing
414     if (my $jar = $arg{cookie_jar}) {
415 root 1.31 %$jar = () if $jar->{version} != 1;
416 root 1.10
417     my @cookie;
418    
419     while (my ($chost, $v) = each %$jar) {
420 root 1.30 if ($chost =~ /^\./) {
421     next unless $chost eq substr $uhost, -length $chost;
422     } elsif ($chost =~ /\./) {
423     next unless $chost eq $uhost;
424     } else {
425     next;
426     }
427 root 1.10
428     while (my ($cpath, $v) = each %$v) {
429     next unless $cpath eq substr $upath, 0, length $cpath;
430    
431     while (my ($k, $v) = each %$v) {
432 root 1.31 next if $uscheme ne "https" && exists $v->{secure};
433     my $value = $v->{value};
434     $value =~ s/([\\"])/\\$1/g;
435     push @cookie, "$k=\"$value\"";
436 root 1.10 }
437     }
438     }
439    
440     $hdr{cookie} = join "; ", @cookie
441     if @cookie;
442     }
443 root 1.1
444 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
445 root 1.2
446 root 1.10 if ($proxy) {
447 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
448 root 1.31
449 root 1.47 $rscheme = "http" unless defined $rscheme;
450    
451 root 1.31 # don't support https requests over https-proxy transport,
452 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
453 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
454 root 1.10 } else {
455 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
456 root 1.2 }
457    
458 root 1.47 # leave out fragment and query string, just a heuristic
459     $hdr{referer} ||= "$uscheme://$uauthority$upath" unless exists $hdr{referer};
460     $hdr{"user-agent"} ||= $USERAGENT unless exists $hdr{"user-agent"};
461 root 1.41
462 root 1.53 $hdr{"content-length"} = length $arg{body}
463     if length $arg{body} || $method ne "GET";
464 root 1.1
465 root 1.11 my %state = (connect_guard => 1);
466    
467     _get_slot $uhost, sub {
468     $state{slot_guard} = shift;
469 root 1.1
470 root 1.11 return unless $state{connect_guard};
471 root 1.1
472 root 1.59 my $tcp_connect = $arg{tcp_connect}
473     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
474    
475     $state{connect_guard} = $tcp_connect->(
476     $rhost,
477     $rport,
478     sub {
479     $state{fh} = shift
480     or do {
481     my $err = "$!";
482     %state = ();
483     return $cb->(undef, { Status => 599, Reason => $err, @pseudo });
484     };
485 root 1.44
486 root 1.59 pop; # free memory, save a tree
487 root 1.11
488 root 1.59 return unless delete $state{connect_guard};
489 root 1.11
490 root 1.59 # get handle
491     $state{handle} = new AnyEvent::Handle
492     fh => $state{fh},
493     peername => $rhost,
494     tls_ctx => $arg{tls_ctx},
495     # these need to be reconfigured on keepalive handles
496     timeout => $timeout,
497     on_error => sub {
498     %state = ();
499     $cb->(undef, { Status => 599, Reason => $_[2], @pseudo });
500     },
501     on_eof => sub {
502     %state = ();
503     $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", @pseudo });
504     },
505     ;
506 root 1.11
507 root 1.59 # limit the number of persistent connections
508     # keepalive not yet supported
509 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
510     # ++$KA_COUNT{$_[1]};
511     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
512     # --$KA_COUNT{$_[1]}
513     # };
514     # $hdr{connection} = "keep-alive";
515     # } else {
516 root 1.59 delete $hdr{connection};
517 root 1.56 # }
518 root 1.1
519 root 1.59 $state{handle}->starttls ("connect") if $rscheme eq "https";
520    
521     # handle actual, non-tunneled, request
522     my $handle_actual_request = sub {
523     $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
524    
525     # send request
526     $state{handle}->push_write (
527     "$method $rpath HTTP/1.0\015\012"
528     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
529     . "\015\012"
530     . (delete $arg{body})
531     );
532    
533     # return if error occured during push_write()
534     return unless %state;
535    
536     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
537    
538     # status line and headers
539     $state{handle}->push_read (line => $qr_nlnl, sub {
540     for ("$_[1]") {
541     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
542    
543     /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )? \015?\012/igxc
544     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response", @pseudo }));
545    
546     push @pseudo,
547     HTTPVersion => $1,
548     Status => $2,
549     Reason => $3,
550     ;
551    
552     # things seen, not parsed:
553     # p3pP="NON CUR OTPi OUR NOR UNI"
554    
555     $hdr{lc $1} .= ",$2"
556     while /\G
557     ([^:\000-\037]*):
558     [\011\040]*
559     ((?: [^\012]+ | \012[\011\040] )*)
560     \012
561     /gxc;
562 root 1.31
563 root 1.59 /\G$/
564     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", @pseudo }));
565 root 1.41 }
566    
567 root 1.59 # remove the "," prefix we added to all headers above
568     substr $_, 0, 1, ""
569     for values %hdr;
570    
571     # patch in all pseudo headers
572     %hdr = (%hdr, @pseudo);
573    
574     # redirect handling
575     # microsoft and other shitheads don't give a shit for following standards,
576     # try to support some common forms of broken Location headers.
577     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
578     $hdr{location} =~ s/^\.\/+//;
579    
580     my $url = "$rscheme://$uhost:$uport";
581    
582     unless ($hdr{location} =~ s/^\///) {
583     $url .= $upath;
584     $url =~ s/\/[^\/]*$//;
585     }
586    
587     $hdr{location} = "$url/$hdr{location}";
588     }
589 root 1.57
590 root 1.59 my $redirect;
591 root 1.41
592 root 1.59 if ($recurse) {
593     my $status = $hdr{Status};
594 root 1.55
595 root 1.59 # industry standard is to redirect POST as GET for
596     # 301, 302 and 303, in contrast to http/1.0 and 1.1.
597     # also, the UA should ask the user for 301 and 307 and POST,
598     # industry standard seems to be to simply follow.
599     # we go with the industry standard.
600     if ($status == 301 or $status == 302 or $status == 303) {
601     # HTTP/1.1 is unclear on how to mutate the method
602     $method = "GET" unless $method eq "HEAD";
603     $redirect = 1;
604     } elsif ($status == 307) {
605     $redirect = 1;
606     }
607 root 1.41 }
608    
609 root 1.59 my $finish = sub {
610     $state{handle}->destroy if $state{handle};
611     %state = ();
612    
613     # set-cookie processing
614     if ($arg{cookie_jar}) {
615     for ($_[1]{"set-cookie"}) {
616     # parse NAME=VALUE
617     my @kv;
618    
619     while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
620     my $name = $1;
621     my $value = $3;
622    
623     unless ($value) {
624     $value = $2;
625     $value =~ s/\\(.)/$1/gs;
626     }
627    
628     push @kv, $name => $value;
629 root 1.31
630 root 1.59 last unless /\G\s*;/gc;
631 root 1.57 }
632    
633 root 1.59 last unless @kv;
634 root 1.31
635 root 1.59 my $name = shift @kv;
636     my %kv = (value => shift @kv, @kv);
637 root 1.31
638 root 1.59 my $cdom;
639     my $cpath = (delete $kv{path}) || "/";
640 root 1.10
641 root 1.59 if (exists $kv{domain}) {
642     $cdom = delete $kv{domain};
643    
644     $cdom =~ s/^\.?/./; # make sure it starts with a "."
645 root 1.11
646 root 1.59 next if $cdom =~ /\.$/;
647    
648     # this is not rfc-like and not netscape-like. go figure.
649     my $ndots = $cdom =~ y/.//;
650     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
651     } else {
652     $cdom = $uhost;
653     }
654    
655     # store it
656     $arg{cookie_jar}{version} = 1;
657     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
658 root 1.11
659 root 1.59 redo if /\G\s*,/gc;
660 root 1.57 }
661 root 1.59 }
662 root 1.31
663 root 1.59 if ($redirect && exists $hdr{location}) {
664     # we ignore any errors, as it is very common to receive
665     # Content-Length != 0 but no actual body
666     # we also access %hdr, as $_[1] might be an erro
667     http_request (
668     $method => $hdr{location},
669     %arg,
670     recurse => $recurse - 1,
671     Redirect => \@_,
672     $cb);
673     } else {
674     $cb->($_[0], $_[1]);
675 root 1.11 }
676 root 1.59 };
677    
678     my $len = $hdr{"content-length"};
679 root 1.8
680 root 1.59 if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
681     $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", @pseudo });
682     } elsif (
683     $hdr{Status} =~ /^(?:1..|[23]04)$/
684     or $method eq "HEAD"
685     or (defined $len && !$len)
686     ) {
687     # no body
688     $finish->("", \%hdr);
689 root 1.57 } else {
690 root 1.59 # body handling, four different code paths
691     # for want_body_handle, on_body (2x), normal (2x)
692     # we might read too much here, but it does not matter yet (no pers. connections)
693     if (!$redirect && $arg{want_body_handle}) {
694     $_[0]->on_eof (undef);
695     $_[0]->on_error (undef);
696     $_[0]->on_read (undef);
697 root 1.24
698 root 1.59 $finish->(delete $state{handle}, \%hdr);
699 root 1.41
700 root 1.59 } elsif ($arg{on_body}) {
701     $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
702     if ($len) {
703     $_[0]->on_eof (undef);
704     $_[0]->on_read (sub {
705     $len -= length $_[0]{rbuf};
706    
707     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
708     or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
709    
710     $len > 0
711     or $finish->("", \%hdr);
712     });
713     } else {
714     $_[0]->on_eof (sub {
715     $finish->("", \%hdr);
716     });
717     $_[0]->on_read (sub {
718     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
719     or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
720     });
721     }
722     } else {
723 root 1.41 $_[0]->on_eof (undef);
724    
725 root 1.59 if ($len) {
726     $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
727     $_[0]->on_read (sub {
728     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), \%hdr)
729     if $len <= length $_[0]{rbuf};
730     });
731     } else {
732     $_[0]->on_error (sub {
733     ($! == Errno::EPIPE || !$!)
734     ? $finish->(delete $_[0]{rbuf}, \%hdr)
735     : $finish->(undef, { Status => 599, Reason => $_[2], @pseudo });
736     });
737     $_[0]->on_read (sub { });
738     }
739     }
740     }
741     });
742     };
743 root 1.57
744 root 1.59 # now handle proxy-CONNECT method
745     if ($proxy && $uscheme eq "https") {
746     # oh dear, we have to wrap it into a connect request
747    
748     # maybe re-use $uauthority with patched port?
749     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
750     $state{handle}->push_read (line => $qr_nlnl, sub {
751     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
752     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", @pseudo }));
753    
754     if ($2 == 200) {
755     $rpath = $upath;
756     &$handle_actual_request;
757 root 1.57 } else {
758 root 1.59 %state = ();
759     $cb->(undef, { Status => $2, Reason => $3, @pseudo });
760 root 1.11 }
761 root 1.59 });
762     } else {
763     &$handle_actual_request;
764     }
765 root 1.31
766 root 1.59 },
767     $arg{on_prepare} || sub { $timeout }
768     );
769 root 1.1 };
770    
771     defined wantarray && AnyEvent::Util::guard { %state = () }
772     }
773    
774 elmex 1.15 sub http_get($@) {
775 root 1.1 unshift @_, "GET";
776     &http_request
777     }
778    
779 elmex 1.15 sub http_head($@) {
780 root 1.4 unshift @_, "HEAD";
781     &http_request
782     }
783    
784 elmex 1.15 sub http_post($$@) {
785 root 1.22 my $url = shift;
786     unshift @_, "POST", $url, "body";
787 root 1.3 &http_request
788     }
789    
790 root 1.9 =back
791    
792 root 1.55 =head2 DNS CACHING
793    
794     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
795     the actual connection, which in turn uses AnyEvent::DNS to resolve
796     hostnames. The latter is a simple stub resolver and does no caching
797     on its own. If you want DNS caching, you currently have to provide
798     your own default resolver (by storing a suitable resolver object in
799     C<$AnyEvent::DNS::RESOLVER>).
800    
801 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
802 root 1.1
803     =over 4
804    
805 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
806    
807     Sets the default proxy server to use. The proxy-url must begin with a
808 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
809     otherwise.
810    
811     To clear an already-set proxy, use C<undef>.
812 root 1.2
813 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
814    
815     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
816     Date (RFC 2616).
817    
818     =item $timestamp = AnyEvent::HTTP::parse_date $date
819    
820     Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX
821     timestamp, or C<undef> if the date cannot be parsed.
822    
823 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
824 root 1.1
825 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
826 root 1.1
827     =item $AnyEvent::HTTP::USERAGENT
828    
829     The default value for the C<User-Agent> header (the default is
830 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
831 root 1.1
832 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
833 root 1.1
834 root 1.47 The maximum number of concurrent connections to the same host (identified
835 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
836     are queued until previous connections are closed.
837 root 1.1
838 root 1.43 The default value for this is C<4>, and it is highly advisable to not
839     increase it.
840 root 1.3
841 root 1.14 =item $AnyEvent::HTTP::ACTIVE
842    
843     The number of active connections. This is not the number of currently
844     running requests, but the number of currently open and non-idle TCP
845     connections. This number of can be useful for load-leveling.
846    
847 root 1.1 =back
848    
849     =cut
850    
851 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
852     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
853    
854     sub format_date($) {
855     my ($time) = @_;
856    
857     # RFC 822/1123 format
858     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
859    
860     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
861     $weekday[$wday], $mday, $month[$mon], $year + 1900,
862     $H, $M, $S;
863     }
864    
865     sub parse_date($) {
866     my ($date) = @_;
867    
868     my ($d, $m, $y, $H, $M, $S);
869    
870     if ($date =~ /^[A-Z][a-z][a-z], ([0-9][0-9]) ([A-Z][a-z][a-z]) ([0-9][0-9][0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
871     # RFC 822/1123, required by RFC 2616
872     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
873    
874     } elsif ($date =~ /^[A-Z][a-z]+, ([0-9][0-9])-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
875     # RFC 850
876     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
877    
878     } elsif ($date =~ /^[A-Z][a-z][a-z] ([A-Z][a-z][a-z]) ([0-9 ][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) ([0-9][0-9][0-9][0-9])$/) {
879     # ISO C's asctime
880     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
881     }
882     # other formats fail in the loop below
883    
884     for (0..11) {
885     if ($m eq $month[$_]) {
886     require Time::Local;
887     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
888     }
889     }
890    
891     undef
892     }
893    
894 root 1.2 sub set_proxy($) {
895 root 1.52 if (length $_[0]) {
896     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
897     or Carp::croak "$_[0]: invalid proxy URL";
898     $PROXY = [$2, $3 || 3128, $1]
899     } else {
900     undef $PROXY;
901     }
902 root 1.2 }
903    
904     # initialise proxy from environment
905 root 1.52 eval {
906     set_proxy $ENV{http_proxy};
907     };
908 root 1.2
909 root 1.60 =head2 SOCKS PROXIES
910    
911     Socks proxies are not directly supported by AnyEvent::HTTP. You can
912     compile your perl to support socks, or use an external program such as
913     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
914     transparently.
915    
916     Alternatively, for AnyEvent::HTTP only, you can use your own
917     C<tcp_connect> function that does the proxy handshake - here is an example
918     that works with socks4a proxies:
919    
920     use Errno;
921     use AnyEvent::Util;
922     use AnyEvent::Socket;
923     use AnyEvent::Handle;
924    
925     # host, port and username of/for your socks4a proxy
926     my $socks_host = "10.0.0.23";
927     my $socks_port = 9050;
928     my $socks_user = "";
929    
930     sub socks4a_connect {
931     my ($host, $port, $connect_cb, $prepare_cb) = @_;
932    
933     my $hdl = new AnyEvent::Handle
934     connect => [$socks_host, $socks_port],
935     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
936     on_error => sub { $connect_cb->() },
937     ;
938    
939     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
940    
941     $hdl->push_read (chunk => 8, sub {
942     my ($hdl, $chunk) = @_;
943     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
944    
945     if ($status == 0x5a) {
946     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
947     } else {
948     $! = Errno::ENXIO; $connect_cb->();
949     }
950     });
951    
952     $hdl
953     }
954    
955     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
956     possibly after switching off other proxy types:
957    
958     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
959    
960     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
961     my ($data, $headers) = @_;
962     ...
963     };
964    
965 root 1.1 =head1 SEE ALSO
966    
967     L<AnyEvent>.
968    
969     =head1 AUTHOR
970    
971 root 1.18 Marc Lehmann <schmorp@schmorp.de>
972     http://home.schmorp.de/
973 root 1.1
974 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
975     testcases and bugreports.
976    
977 root 1.1 =cut
978    
979     1
980