ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.63
Committed: Thu Dec 30 04:31:55 2010 UTC (13 years, 4 months ago) by root
Branch: MAIN
Changes since 1.62: +5 -7 lines
Log Message:
ok, stupid idea, keep it as-is

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.57 our $VERSION = '1.46';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99     (or C<undef> if an error occured), and a hash-ref with response headers as
100     second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105     three parts of the HTTP Status-Line of the same name.
106    
107     The pseudo-header C<URL> contains the actual URL (which can differ from
108     the requested URL when following redirects - for example, you might get
109     an error that your URL scheme is not supported even though your URL is a
110     valid http URL because it redirected to an ftp URL, in which case you can
111     look at the URL pseudo header).
112    
113     The pseudo-header C<Redirect> only exists when the request was a result
114     of an internal redirect. In that case it is an array reference with
115     the C<($data, $headers)> from the redirect response. Note that this
116     response could in turn be the result of a redirect itself, and C<<
117     $headers->{Redirect}[1]{Redirect} >> will then contain the original
118     response, and so on.
119 root 1.20
120 root 1.32 If the server sends a header multiple times, then their contents will be
121     joined together with a comma (C<,>), as per the HTTP spec.
122 root 1.2
123     If an internal error occurs, such as not being able to resolve a hostname,
124 root 1.41 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<59x>
125     (usually C<599>) and the C<Reason> pseudo-header will contain an error
126     message.
127 root 1.2
128 root 1.6 A typical callback might look like this:
129    
130     sub {
131     my ($body, $hdr) = @_;
132    
133     if ($hdr->{Status} =~ /^2/) {
134     ... everything should be ok
135     } else {
136     print "error, $hdr->{Status} $hdr->{Reason}\n";
137     }
138     }
139    
140 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
141     include:
142    
143     =over 4
144    
145 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
146 root 1.1
147     Whether to recurse requests or not, e.g. on redirects, authentication
148 root 1.3 retries and so on, and how often to do so.
149 root 1.1
150     =item headers => hashref
151    
152 root 1.63 The request headers to use. Currently, C<http_request> may provide its
153     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
154     and will provide defaults for C<User-Agent:> and C<Referer:> (this can be
155     suppressed by using C<undef> for these headers in which case they won't be
156     sent at all).
157 root 1.1
158     =item timeout => $seconds
159    
160     The time-out to use for various stages - each connect attempt will reset
161 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
162     timeout.
163    
164     Default timeout is 5 minutes.
165 root 1.2
166     =item proxy => [$host, $port[, $scheme]] or undef
167    
168     Use the given http proxy for all requests. If not specified, then the
169     default proxy (as specified by C<$ENV{http_proxy}>) is used.
170    
171 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
172 root 1.2 HTTPS.
173 root 1.1
174 root 1.3 =item body => $string
175    
176     The request body, usually empty. Will be-sent as-is (future versions of
177     this module might offer more options).
178    
179 root 1.10 =item cookie_jar => $hash_ref
180    
181     Passing this parameter enables (simplified) cookie-processing, loosely
182     based on the original netscape specification.
183    
184     The C<$hash_ref> must be an (initially empty) hash reference which will
185     get updated automatically. It is possible to save the cookie_jar to
186     persistent storage with something like JSON or Storable, but this is not
187 root 1.40 recommended, as expiry times are currently being ignored.
188 root 1.10
189     Note that this cookie implementation is not of very high quality, nor
190     meant to be complete. If you want complete cookie management you have to
191     do that on your own. C<cookie_jar> is meant as a quick fix to get some
192     cookie-using sites working. Cookies are a privacy disaster, do not use
193     them unless required to.
194    
195 root 1.40 =item tls_ctx => $scheme | $tls_ctx
196    
197     Specifies the AnyEvent::TLS context to be used for https connections. This
198     parameter follows the same rules as the C<tls_ctx> parameter to
199     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
200     C<high> can be specified, which give you a predefined low-security (no
201     verification, highest compatibility) and high-security (CA and common-name
202     verification) TLS context.
203    
204     The default for this option is C<low>, which could be interpreted as "give
205     me the page, no matter what".
206    
207 root 1.51 =item on_prepare => $callback->($fh)
208    
209     In rare cases you need to "tune" the socket before it is used to
210     connect (for exmaple, to bind it on a given IP address). This parameter
211     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
212     and behaves exactly the same way (e.g. it has to provide a
213     timeout). See the description for the C<$prepare_cb> argument of
214     C<AnyEvent::Socket::tcp_connect> for details.
215    
216 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
217    
218     In even rarer cases you want total control over how AnyEvent::HTTP
219     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
220     to do this, but you can provide your own C<tcp_connect> function -
221 root 1.60 obviously, it has to follow the same calling conventions, except that it
222     may always return a connection guard object.
223 root 1.59
224     There are probably lots of weird uses for this function, starting from
225     tracing the hosts C<http_request> actually tries to connect, to (inexact
226     but fast) host => IP address caching or even socks protocol support.
227    
228 root 1.42 =item on_header => $callback->($headers)
229 root 1.41
230     When specified, this callback will be called with the header hash as soon
231     as headers have been successfully received from the remote server (not on
232     locally-generated errors).
233    
234     It has to return either true (in which case AnyEvent::HTTP will continue),
235     or false, in which case AnyEvent::HTTP will cancel the download (and call
236     the finish callback with an error code of C<598>).
237    
238     This callback is useful, among other things, to quickly reject unwanted
239     content, which, if it is supposed to be rare, can be faster than first
240     doing a C<HEAD> request.
241    
242 root 1.42 Example: cancel the request unless the content-type is "text/html".
243 root 1.41
244 root 1.42 on_header => sub {
245     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
246     },
247 root 1.41
248 root 1.42 =item on_body => $callback->($partial_body, $headers)
249 root 1.41
250 root 1.42 When specified, all body data will be passed to this callback instead of
251     to the completion callback. The completion callback will get the empty
252     string instead of the body data.
253 root 1.41
254 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
255     or false, in which case AnyEvent::HTTP will cancel the download (and call
256     the completion callback with an error code of C<598>).
257    
258     This callback is useful when the data is too large to be held in memory
259     (so the callback writes it to a file) or when only some information should
260     be extracted, or when the body should be processed incrementally.
261 root 1.41
262     It is usually preferred over doing your own body handling via
263 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
264     only used to create a connection, C<want_body_handle> is the better
265     alternative, as it allows you to install your own event handler, reducing
266     resource usage.
267 root 1.41
268     =item want_body_handle => $enable
269    
270     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
271     changes considerably: after parsing the headers, and instead of
272     downloading the body (if any), the completion callback will be
273     called. Instead of the C<$body> argument containing the body data, the
274     callback will receive the L<AnyEvent::Handle> object associated with the
275     connection. In error cases, C<undef> will be passed. When there is no body
276     (e.g. status C<304>), the empty string will be passed.
277    
278     The handle object might or might not be in TLS mode, might be connected to
279     a proxy, be a persistent connection etc., and configured in unspecified
280     ways. The user is responsible for this handle (it will not be used by this
281     module anymore).
282    
283     This is useful with some push-type services, where, after the initial
284     headers, an interactive protocol is used (typical example would be the
285     push-style twitter API which starts a JSON/XML stream).
286    
287     If you think you need this, first have a look at C<on_body>, to see if
288 root 1.45 that doesn't solve your problem in a better way.
289 root 1.41
290 root 1.1 =back
291    
292 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
293    
294     http_request GET => "http://www.nethype.de/", sub {
295     my ($body, $hdr) = @_;
296     print "$body\n";
297     };
298    
299     Example: make a HTTP HEAD request on https://www.google.com/, use a
300     timeout of 30 seconds.
301    
302     http_request
303     GET => "https://www.google.com",
304     timeout => 30,
305     sub {
306     my ($body, $hdr) = @_;
307     use Data::Dumper;
308     print Dumper $hdr;
309     }
310     ;
311 root 1.1
312 root 1.29 Example: make another simple HTTP GET request, but immediately try to
313     cancel it.
314    
315     my $request = http_request GET => "http://www.nethype.de/", sub {
316     my ($body, $hdr) = @_;
317     print "$body\n";
318     };
319    
320     undef $request;
321    
322 root 1.1 =cut
323    
324 root 1.12 sub _slot_schedule;
325 root 1.11 sub _slot_schedule($) {
326     my $host = shift;
327    
328     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
329     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
330 root 1.12 # somebody wants that slot
331 root 1.11 ++$CO_SLOT{$host}[0];
332 root 1.14 ++$ACTIVE;
333 root 1.11
334     $cb->(AnyEvent::Util::guard {
335 root 1.14 --$ACTIVE;
336 root 1.11 --$CO_SLOT{$host}[0];
337     _slot_schedule $host;
338     });
339     } else {
340     # nobody wants the slot, maybe we can forget about it
341     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
342     last;
343     }
344     }
345     }
346    
347     # wait for a free slot on host, call callback
348     sub _get_slot($$) {
349     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
350    
351     _slot_schedule $_[0];
352     }
353    
354 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
355 root 1.34
356 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
357     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
358 root 1.40
359 elmex 1.15 sub http_request($$@) {
360 root 1.1 my $cb = pop;
361     my ($method, $url, %arg) = @_;
362    
363     my %hdr;
364    
365 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
366     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
367    
368 root 1.3 $method = uc $method;
369    
370 root 1.8 if (my $hdr = $arg{headers}) {
371 root 1.1 while (my ($k, $v) = each %$hdr) {
372     $hdr{lc $k} = $v;
373     }
374     }
375    
376 root 1.55 # pseudo headers for all subsequent responses
377     my @pseudo = (URL => $url);
378     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
379    
380 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
381 root 1.8
382 root 1.55 return $cb->(undef, { Status => 599, Reason => "Too many redirections", @pseudo })
383 root 1.8 if $recurse < 0;
384    
385 root 1.2 my $proxy = $arg{proxy} || $PROXY;
386 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
387    
388 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
389 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
390 root 1.2
391 root 1.31 $uscheme = lc $uscheme;
392 root 1.1
393 root 1.31 my $uport = $uscheme eq "http" ? 80
394     : $uscheme eq "https" ? 443
395 root 1.55 : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported", @pseudo });
396 root 1.13
397 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
398 root 1.55 or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", @pseudo });
399 root 1.10
400     my $uhost = $1;
401     $uport = $2 if defined $2;
402    
403 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
404     unless exists $hdr{host};
405 root 1.43
406 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
407 root 1.56 $upath .= $query if length $query;
408 root 1.10
409     $upath =~ s%^/?%/%;
410    
411     # cookie processing
412     if (my $jar = $arg{cookie_jar}) {
413 root 1.31 %$jar = () if $jar->{version} != 1;
414 root 1.10
415     my @cookie;
416    
417     while (my ($chost, $v) = each %$jar) {
418 root 1.30 if ($chost =~ /^\./) {
419     next unless $chost eq substr $uhost, -length $chost;
420     } elsif ($chost =~ /\./) {
421     next unless $chost eq $uhost;
422     } else {
423     next;
424     }
425 root 1.10
426     while (my ($cpath, $v) = each %$v) {
427     next unless $cpath eq substr $upath, 0, length $cpath;
428    
429     while (my ($k, $v) = each %$v) {
430 root 1.31 next if $uscheme ne "https" && exists $v->{secure};
431     my $value = $v->{value};
432     $value =~ s/([\\"])/\\$1/g;
433     push @cookie, "$k=\"$value\"";
434 root 1.10 }
435     }
436     }
437    
438     $hdr{cookie} = join "; ", @cookie
439     if @cookie;
440     }
441 root 1.1
442 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
443 root 1.2
444 root 1.10 if ($proxy) {
445 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
446 root 1.31
447 root 1.47 $rscheme = "http" unless defined $rscheme;
448    
449 root 1.31 # don't support https requests over https-proxy transport,
450 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
451 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
452 root 1.10 } else {
453 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
454 root 1.2 }
455    
456 root 1.47 # leave out fragment and query string, just a heuristic
457     $hdr{referer} ||= "$uscheme://$uauthority$upath" unless exists $hdr{referer};
458     $hdr{"user-agent"} ||= $USERAGENT unless exists $hdr{"user-agent"};
459 root 1.41
460 root 1.53 $hdr{"content-length"} = length $arg{body}
461     if length $arg{body} || $method ne "GET";
462 root 1.1
463 root 1.11 my %state = (connect_guard => 1);
464    
465     _get_slot $uhost, sub {
466     $state{slot_guard} = shift;
467 root 1.1
468 root 1.11 return unless $state{connect_guard};
469 root 1.1
470 root 1.59 my $tcp_connect = $arg{tcp_connect}
471     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
472    
473     $state{connect_guard} = $tcp_connect->(
474     $rhost,
475     $rport,
476     sub {
477     $state{fh} = shift
478     or do {
479     my $err = "$!";
480     %state = ();
481     return $cb->(undef, { Status => 599, Reason => $err, @pseudo });
482     };
483 root 1.44
484 root 1.59 pop; # free memory, save a tree
485 root 1.11
486 root 1.59 return unless delete $state{connect_guard};
487 root 1.11
488 root 1.59 # get handle
489     $state{handle} = new AnyEvent::Handle
490     fh => $state{fh},
491     peername => $rhost,
492     tls_ctx => $arg{tls_ctx},
493     # these need to be reconfigured on keepalive handles
494     timeout => $timeout,
495     on_error => sub {
496     %state = ();
497     $cb->(undef, { Status => 599, Reason => $_[2], @pseudo });
498     },
499     on_eof => sub {
500     %state = ();
501     $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", @pseudo });
502     },
503     ;
504 root 1.11
505 root 1.59 # limit the number of persistent connections
506     # keepalive not yet supported
507 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
508     # ++$KA_COUNT{$_[1]};
509     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
510     # --$KA_COUNT{$_[1]}
511     # };
512     # $hdr{connection} = "keep-alive";
513     # } else {
514 root 1.59 delete $hdr{connection};
515 root 1.56 # }
516 root 1.1
517 root 1.59 $state{handle}->starttls ("connect") if $rscheme eq "https";
518    
519     # handle actual, non-tunneled, request
520     my $handle_actual_request = sub {
521     $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
522    
523     # send request
524     $state{handle}->push_write (
525     "$method $rpath HTTP/1.0\015\012"
526     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
527     . "\015\012"
528     . (delete $arg{body})
529     );
530    
531     # return if error occured during push_write()
532     return unless %state;
533    
534     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
535    
536     # status line and headers
537     $state{handle}->push_read (line => $qr_nlnl, sub {
538     for ("$_[1]") {
539     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
540    
541     /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )? \015?\012/igxc
542     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response", @pseudo }));
543    
544     push @pseudo,
545     HTTPVersion => $1,
546     Status => $2,
547     Reason => $3,
548     ;
549    
550     # things seen, not parsed:
551     # p3pP="NON CUR OTPi OUR NOR UNI"
552    
553     $hdr{lc $1} .= ",$2"
554     while /\G
555     ([^:\000-\037]*):
556     [\011\040]*
557     ((?: [^\012]+ | \012[\011\040] )*)
558     \012
559     /gxc;
560 root 1.31
561 root 1.59 /\G$/
562     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", @pseudo }));
563 root 1.41 }
564    
565 root 1.59 # remove the "," prefix we added to all headers above
566     substr $_, 0, 1, ""
567     for values %hdr;
568    
569     # patch in all pseudo headers
570     %hdr = (%hdr, @pseudo);
571    
572     # redirect handling
573     # microsoft and other shitheads don't give a shit for following standards,
574     # try to support some common forms of broken Location headers.
575     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
576     $hdr{location} =~ s/^\.\/+//;
577    
578     my $url = "$rscheme://$uhost:$uport";
579    
580     unless ($hdr{location} =~ s/^\///) {
581     $url .= $upath;
582     $url =~ s/\/[^\/]*$//;
583     }
584    
585     $hdr{location} = "$url/$hdr{location}";
586     }
587 root 1.57
588 root 1.59 my $redirect;
589 root 1.41
590 root 1.59 if ($recurse) {
591     my $status = $hdr{Status};
592 root 1.55
593 root 1.59 # industry standard is to redirect POST as GET for
594     # 301, 302 and 303, in contrast to http/1.0 and 1.1.
595     # also, the UA should ask the user for 301 and 307 and POST,
596     # industry standard seems to be to simply follow.
597     # we go with the industry standard.
598     if ($status == 301 or $status == 302 or $status == 303) {
599     # HTTP/1.1 is unclear on how to mutate the method
600     $method = "GET" unless $method eq "HEAD";
601     $redirect = 1;
602     } elsif ($status == 307) {
603     $redirect = 1;
604     }
605 root 1.41 }
606    
607 root 1.59 my $finish = sub {
608     $state{handle}->destroy if $state{handle};
609     %state = ();
610    
611     # set-cookie processing
612     if ($arg{cookie_jar}) {
613     for ($_[1]{"set-cookie"}) {
614     # parse NAME=VALUE
615     my @kv;
616    
617     while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
618     my $name = $1;
619     my $value = $3;
620    
621     unless ($value) {
622     $value = $2;
623     $value =~ s/\\(.)/$1/gs;
624     }
625    
626     push @kv, $name => $value;
627 root 1.31
628 root 1.59 last unless /\G\s*;/gc;
629 root 1.57 }
630    
631 root 1.59 last unless @kv;
632 root 1.31
633 root 1.59 my $name = shift @kv;
634     my %kv = (value => shift @kv, @kv);
635 root 1.31
636 root 1.59 my $cdom;
637     my $cpath = (delete $kv{path}) || "/";
638 root 1.10
639 root 1.59 if (exists $kv{domain}) {
640     $cdom = delete $kv{domain};
641    
642     $cdom =~ s/^\.?/./; # make sure it starts with a "."
643 root 1.11
644 root 1.59 next if $cdom =~ /\.$/;
645    
646     # this is not rfc-like and not netscape-like. go figure.
647     my $ndots = $cdom =~ y/.//;
648     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
649     } else {
650     $cdom = $uhost;
651     }
652    
653     # store it
654     $arg{cookie_jar}{version} = 1;
655     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
656 root 1.11
657 root 1.59 redo if /\G\s*,/gc;
658 root 1.57 }
659 root 1.59 }
660 root 1.31
661 root 1.59 if ($redirect && exists $hdr{location}) {
662     # we ignore any errors, as it is very common to receive
663     # Content-Length != 0 but no actual body
664     # we also access %hdr, as $_[1] might be an erro
665     http_request (
666     $method => $hdr{location},
667     %arg,
668     recurse => $recurse - 1,
669     Redirect => \@_,
670     $cb);
671     } else {
672     $cb->($_[0], $_[1]);
673 root 1.11 }
674 root 1.59 };
675    
676     my $len = $hdr{"content-length"};
677 root 1.8
678 root 1.59 if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
679     $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", @pseudo });
680     } elsif (
681     $hdr{Status} =~ /^(?:1..|[23]04)$/
682     or $method eq "HEAD"
683     or (defined $len && !$len)
684     ) {
685     # no body
686     $finish->("", \%hdr);
687 root 1.57 } else {
688 root 1.59 # body handling, four different code paths
689     # for want_body_handle, on_body (2x), normal (2x)
690     # we might read too much here, but it does not matter yet (no pers. connections)
691     if (!$redirect && $arg{want_body_handle}) {
692     $_[0]->on_eof (undef);
693     $_[0]->on_error (undef);
694     $_[0]->on_read (undef);
695 root 1.24
696 root 1.59 $finish->(delete $state{handle}, \%hdr);
697 root 1.41
698 root 1.59 } elsif ($arg{on_body}) {
699     $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
700     if ($len) {
701     $_[0]->on_eof (undef);
702     $_[0]->on_read (sub {
703     $len -= length $_[0]{rbuf};
704    
705     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
706     or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
707    
708     $len > 0
709     or $finish->("", \%hdr);
710     });
711     } else {
712     $_[0]->on_eof (sub {
713     $finish->("", \%hdr);
714     });
715     $_[0]->on_read (sub {
716     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
717     or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
718     });
719     }
720     } else {
721 root 1.41 $_[0]->on_eof (undef);
722    
723 root 1.59 if ($len) {
724     $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
725     $_[0]->on_read (sub {
726     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), \%hdr)
727     if $len <= length $_[0]{rbuf};
728     });
729     } else {
730     $_[0]->on_error (sub {
731     ($! == Errno::EPIPE || !$!)
732     ? $finish->(delete $_[0]{rbuf}, \%hdr)
733     : $finish->(undef, { Status => 599, Reason => $_[2], @pseudo });
734     });
735     $_[0]->on_read (sub { });
736     }
737     }
738     }
739     });
740     };
741 root 1.57
742 root 1.59 # now handle proxy-CONNECT method
743     if ($proxy && $uscheme eq "https") {
744     # oh dear, we have to wrap it into a connect request
745    
746     # maybe re-use $uauthority with patched port?
747     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
748     $state{handle}->push_read (line => $qr_nlnl, sub {
749     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
750     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", @pseudo }));
751    
752     if ($2 == 200) {
753     $rpath = $upath;
754     &$handle_actual_request;
755 root 1.57 } else {
756 root 1.59 %state = ();
757     $cb->(undef, { Status => $2, Reason => $3, @pseudo });
758 root 1.11 }
759 root 1.59 });
760     } else {
761     &$handle_actual_request;
762     }
763 root 1.31
764 root 1.59 },
765     $arg{on_prepare} || sub { $timeout }
766     );
767 root 1.1 };
768    
769     defined wantarray && AnyEvent::Util::guard { %state = () }
770     }
771    
772 elmex 1.15 sub http_get($@) {
773 root 1.1 unshift @_, "GET";
774     &http_request
775     }
776    
777 elmex 1.15 sub http_head($@) {
778 root 1.4 unshift @_, "HEAD";
779     &http_request
780     }
781    
782 elmex 1.15 sub http_post($$@) {
783 root 1.22 my $url = shift;
784     unshift @_, "POST", $url, "body";
785 root 1.3 &http_request
786     }
787    
788 root 1.9 =back
789    
790 root 1.55 =head2 DNS CACHING
791    
792     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
793     the actual connection, which in turn uses AnyEvent::DNS to resolve
794     hostnames. The latter is a simple stub resolver and does no caching
795     on its own. If you want DNS caching, you currently have to provide
796     your own default resolver (by storing a suitable resolver object in
797     C<$AnyEvent::DNS::RESOLVER>).
798    
799 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
800 root 1.1
801     =over 4
802    
803 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
804    
805     Sets the default proxy server to use. The proxy-url must begin with a
806 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
807     otherwise.
808    
809     To clear an already-set proxy, use C<undef>.
810 root 1.2
811 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
812    
813     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
814     Date (RFC 2616).
815    
816     =item $timestamp = AnyEvent::HTTP::parse_date $date
817    
818     Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX
819     timestamp, or C<undef> if the date cannot be parsed.
820    
821 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
822 root 1.1
823 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
824 root 1.1
825     =item $AnyEvent::HTTP::USERAGENT
826    
827     The default value for the C<User-Agent> header (the default is
828 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
829 root 1.1
830 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
831 root 1.1
832 root 1.47 The maximum number of concurrent connections to the same host (identified
833 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
834     are queued until previous connections are closed.
835 root 1.1
836 root 1.43 The default value for this is C<4>, and it is highly advisable to not
837     increase it.
838 root 1.3
839 root 1.14 =item $AnyEvent::HTTP::ACTIVE
840    
841     The number of active connections. This is not the number of currently
842     running requests, but the number of currently open and non-idle TCP
843     connections. This number of can be useful for load-leveling.
844    
845 root 1.1 =back
846    
847     =cut
848    
849 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
850     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
851    
852     sub format_date($) {
853     my ($time) = @_;
854    
855     # RFC 822/1123 format
856     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
857    
858     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
859     $weekday[$wday], $mday, $month[$mon], $year + 1900,
860     $H, $M, $S;
861     }
862    
863     sub parse_date($) {
864     my ($date) = @_;
865    
866     my ($d, $m, $y, $H, $M, $S);
867    
868     if ($date =~ /^[A-Z][a-z][a-z], ([0-9][0-9]) ([A-Z][a-z][a-z]) ([0-9][0-9][0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
869     # RFC 822/1123, required by RFC 2616
870     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
871    
872     } elsif ($date =~ /^[A-Z][a-z]+, ([0-9][0-9])-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
873     # RFC 850
874     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
875    
876     } elsif ($date =~ /^[A-Z][a-z][a-z] ([A-Z][a-z][a-z]) ([0-9 ][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) ([0-9][0-9][0-9][0-9])$/) {
877     # ISO C's asctime
878     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
879     }
880     # other formats fail in the loop below
881    
882     for (0..11) {
883     if ($m eq $month[$_]) {
884     require Time::Local;
885     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
886     }
887     }
888    
889     undef
890     }
891    
892 root 1.2 sub set_proxy($) {
893 root 1.52 if (length $_[0]) {
894     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
895     or Carp::croak "$_[0]: invalid proxy URL";
896     $PROXY = [$2, $3 || 3128, $1]
897     } else {
898     undef $PROXY;
899     }
900 root 1.2 }
901    
902     # initialise proxy from environment
903 root 1.52 eval {
904     set_proxy $ENV{http_proxy};
905     };
906 root 1.2
907 root 1.60 =head2 SOCKS PROXIES
908    
909     Socks proxies are not directly supported by AnyEvent::HTTP. You can
910     compile your perl to support socks, or use an external program such as
911     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
912     transparently.
913    
914     Alternatively, for AnyEvent::HTTP only, you can use your own
915     C<tcp_connect> function that does the proxy handshake - here is an example
916     that works with socks4a proxies:
917    
918     use Errno;
919     use AnyEvent::Util;
920     use AnyEvent::Socket;
921     use AnyEvent::Handle;
922    
923     # host, port and username of/for your socks4a proxy
924     my $socks_host = "10.0.0.23";
925     my $socks_port = 9050;
926     my $socks_user = "";
927    
928     sub socks4a_connect {
929     my ($host, $port, $connect_cb, $prepare_cb) = @_;
930    
931     my $hdl = new AnyEvent::Handle
932     connect => [$socks_host, $socks_port],
933     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
934     on_error => sub { $connect_cb->() },
935     ;
936    
937     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
938    
939     $hdl->push_read (chunk => 8, sub {
940     my ($hdl, $chunk) = @_;
941     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
942    
943     if ($status == 0x5a) {
944     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
945     } else {
946     $! = Errno::ENXIO; $connect_cb->();
947     }
948     });
949    
950     $hdl
951     }
952    
953     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
954     possibly after switching off other proxy types:
955    
956     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
957    
958     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
959     my ($data, $headers) = @_;
960     ...
961     };
962    
963 root 1.1 =head1 SEE ALSO
964    
965     L<AnyEvent>.
966    
967     =head1 AUTHOR
968    
969 root 1.18 Marc Lehmann <schmorp@schmorp.de>
970     http://home.schmorp.de/
971 root 1.1
972 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
973     testcases and bugreports.
974    
975 root 1.1 =cut
976    
977     1
978