ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.65
Committed: Fri Dec 31 03:47:32 2010 UTC (13 years, 4 months ago) by root
Branch: MAIN
CVS Tags: rel-1_5
Changes since 1.64: +1 -1 lines
Log Message:
1.5

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.65 our $VERSION = '1.5';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99     (or C<undef> if an error occured), and a hash-ref with response headers as
100     second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
106     during the body phase of a request, then the original C<Status> and
107     C<Reason> values from the header are available as C<OrigStatus> and
108     C<OrigReason>.
109 root 1.55
110     The pseudo-header C<URL> contains the actual URL (which can differ from
111     the requested URL when following redirects - for example, you might get
112     an error that your URL scheme is not supported even though your URL is a
113     valid http URL because it redirected to an ftp URL, in which case you can
114     look at the URL pseudo header).
115    
116     The pseudo-header C<Redirect> only exists when the request was a result
117     of an internal redirect. In that case it is an array reference with
118     the C<($data, $headers)> from the redirect response. Note that this
119     response could in turn be the result of a redirect itself, and C<<
120     $headers->{Redirect}[1]{Redirect} >> will then contain the original
121     response, and so on.
122 root 1.20
123 root 1.32 If the server sends a header multiple times, then their contents will be
124     joined together with a comma (C<,>), as per the HTTP spec.
125 root 1.2
126     If an internal error occurs, such as not being able to resolve a hostname,
127 root 1.41 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<59x>
128     (usually C<599>) and the C<Reason> pseudo-header will contain an error
129     message.
130 root 1.2
131 root 1.6 A typical callback might look like this:
132    
133     sub {
134     my ($body, $hdr) = @_;
135    
136     if ($hdr->{Status} =~ /^2/) {
137     ... everything should be ok
138     } else {
139     print "error, $hdr->{Status} $hdr->{Reason}\n";
140     }
141     }
142    
143 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
144     include:
145    
146     =over 4
147    
148 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
149 root 1.1
150     Whether to recurse requests or not, e.g. on redirects, authentication
151 root 1.3 retries and so on, and how often to do so.
152 root 1.1
153     =item headers => hashref
154    
155 root 1.63 The request headers to use. Currently, C<http_request> may provide its
156     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
157     and will provide defaults for C<User-Agent:> and C<Referer:> (this can be
158     suppressed by using C<undef> for these headers in which case they won't be
159     sent at all).
160 root 1.1
161     =item timeout => $seconds
162    
163     The time-out to use for various stages - each connect attempt will reset
164 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
165     timeout.
166    
167     Default timeout is 5 minutes.
168 root 1.2
169     =item proxy => [$host, $port[, $scheme]] or undef
170    
171     Use the given http proxy for all requests. If not specified, then the
172     default proxy (as specified by C<$ENV{http_proxy}>) is used.
173    
174 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
175 root 1.2 HTTPS.
176 root 1.1
177 root 1.3 =item body => $string
178    
179     The request body, usually empty. Will be-sent as-is (future versions of
180     this module might offer more options).
181    
182 root 1.10 =item cookie_jar => $hash_ref
183    
184     Passing this parameter enables (simplified) cookie-processing, loosely
185     based on the original netscape specification.
186    
187     The C<$hash_ref> must be an (initially empty) hash reference which will
188     get updated automatically. It is possible to save the cookie_jar to
189     persistent storage with something like JSON or Storable, but this is not
190 root 1.40 recommended, as expiry times are currently being ignored.
191 root 1.10
192     Note that this cookie implementation is not of very high quality, nor
193     meant to be complete. If you want complete cookie management you have to
194     do that on your own. C<cookie_jar> is meant as a quick fix to get some
195     cookie-using sites working. Cookies are a privacy disaster, do not use
196     them unless required to.
197    
198 root 1.40 =item tls_ctx => $scheme | $tls_ctx
199    
200     Specifies the AnyEvent::TLS context to be used for https connections. This
201     parameter follows the same rules as the C<tls_ctx> parameter to
202     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
203     C<high> can be specified, which give you a predefined low-security (no
204     verification, highest compatibility) and high-security (CA and common-name
205     verification) TLS context.
206    
207     The default for this option is C<low>, which could be interpreted as "give
208     me the page, no matter what".
209    
210 root 1.51 =item on_prepare => $callback->($fh)
211    
212     In rare cases you need to "tune" the socket before it is used to
213     connect (for exmaple, to bind it on a given IP address). This parameter
214     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
215     and behaves exactly the same way (e.g. it has to provide a
216     timeout). See the description for the C<$prepare_cb> argument of
217     C<AnyEvent::Socket::tcp_connect> for details.
218    
219 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
220    
221     In even rarer cases you want total control over how AnyEvent::HTTP
222     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
223     to do this, but you can provide your own C<tcp_connect> function -
224 root 1.60 obviously, it has to follow the same calling conventions, except that it
225     may always return a connection guard object.
226 root 1.59
227     There are probably lots of weird uses for this function, starting from
228     tracing the hosts C<http_request> actually tries to connect, to (inexact
229     but fast) host => IP address caching or even socks protocol support.
230    
231 root 1.42 =item on_header => $callback->($headers)
232 root 1.41
233     When specified, this callback will be called with the header hash as soon
234     as headers have been successfully received from the remote server (not on
235     locally-generated errors).
236    
237     It has to return either true (in which case AnyEvent::HTTP will continue),
238     or false, in which case AnyEvent::HTTP will cancel the download (and call
239     the finish callback with an error code of C<598>).
240    
241     This callback is useful, among other things, to quickly reject unwanted
242     content, which, if it is supposed to be rare, can be faster than first
243     doing a C<HEAD> request.
244    
245 root 1.42 Example: cancel the request unless the content-type is "text/html".
246 root 1.41
247 root 1.42 on_header => sub {
248     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
249     },
250 root 1.41
251 root 1.42 =item on_body => $callback->($partial_body, $headers)
252 root 1.41
253 root 1.42 When specified, all body data will be passed to this callback instead of
254     to the completion callback. The completion callback will get the empty
255     string instead of the body data.
256 root 1.41
257 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
258     or false, in which case AnyEvent::HTTP will cancel the download (and call
259     the completion callback with an error code of C<598>).
260    
261     This callback is useful when the data is too large to be held in memory
262     (so the callback writes it to a file) or when only some information should
263     be extracted, or when the body should be processed incrementally.
264 root 1.41
265     It is usually preferred over doing your own body handling via
266 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
267     only used to create a connection, C<want_body_handle> is the better
268     alternative, as it allows you to install your own event handler, reducing
269     resource usage.
270 root 1.41
271     =item want_body_handle => $enable
272    
273     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
274     changes considerably: after parsing the headers, and instead of
275     downloading the body (if any), the completion callback will be
276     called. Instead of the C<$body> argument containing the body data, the
277     callback will receive the L<AnyEvent::Handle> object associated with the
278     connection. In error cases, C<undef> will be passed. When there is no body
279     (e.g. status C<304>), the empty string will be passed.
280    
281     The handle object might or might not be in TLS mode, might be connected to
282     a proxy, be a persistent connection etc., and configured in unspecified
283     ways. The user is responsible for this handle (it will not be used by this
284     module anymore).
285    
286     This is useful with some push-type services, where, after the initial
287     headers, an interactive protocol is used (typical example would be the
288     push-style twitter API which starts a JSON/XML stream).
289    
290     If you think you need this, first have a look at C<on_body>, to see if
291 root 1.45 that doesn't solve your problem in a better way.
292 root 1.41
293 root 1.1 =back
294    
295 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
296    
297     http_request GET => "http://www.nethype.de/", sub {
298     my ($body, $hdr) = @_;
299     print "$body\n";
300     };
301    
302     Example: make a HTTP HEAD request on https://www.google.com/, use a
303     timeout of 30 seconds.
304    
305     http_request
306     GET => "https://www.google.com",
307     timeout => 30,
308     sub {
309     my ($body, $hdr) = @_;
310     use Data::Dumper;
311     print Dumper $hdr;
312     }
313     ;
314 root 1.1
315 root 1.29 Example: make another simple HTTP GET request, but immediately try to
316     cancel it.
317    
318     my $request = http_request GET => "http://www.nethype.de/", sub {
319     my ($body, $hdr) = @_;
320     print "$body\n";
321     };
322    
323     undef $request;
324    
325 root 1.1 =cut
326    
327 root 1.12 sub _slot_schedule;
328 root 1.11 sub _slot_schedule($) {
329     my $host = shift;
330    
331     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
332     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
333 root 1.12 # somebody wants that slot
334 root 1.11 ++$CO_SLOT{$host}[0];
335 root 1.14 ++$ACTIVE;
336 root 1.11
337     $cb->(AnyEvent::Util::guard {
338 root 1.14 --$ACTIVE;
339 root 1.11 --$CO_SLOT{$host}[0];
340     _slot_schedule $host;
341     });
342     } else {
343     # nobody wants the slot, maybe we can forget about it
344     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
345     last;
346     }
347     }
348     }
349    
350     # wait for a free slot on host, call callback
351     sub _get_slot($$) {
352     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
353    
354     _slot_schedule $_[0];
355     }
356    
357 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
358 root 1.34
359 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
360     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
361 root 1.40
362 elmex 1.15 sub http_request($$@) {
363 root 1.1 my $cb = pop;
364     my ($method, $url, %arg) = @_;
365    
366     my %hdr;
367    
368 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
369     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
370    
371 root 1.3 $method = uc $method;
372    
373 root 1.8 if (my $hdr = $arg{headers}) {
374 root 1.1 while (my ($k, $v) = each %$hdr) {
375     $hdr{lc $k} = $v;
376     }
377     }
378    
379 root 1.55 # pseudo headers for all subsequent responses
380     my @pseudo = (URL => $url);
381     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
382    
383 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
384 root 1.8
385 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
386 root 1.8 if $recurse < 0;
387    
388 root 1.2 my $proxy = $arg{proxy} || $PROXY;
389 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
390    
391 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
392 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
393 root 1.2
394 root 1.31 $uscheme = lc $uscheme;
395 root 1.1
396 root 1.31 my $uport = $uscheme eq "http" ? 80
397     : $uscheme eq "https" ? 443
398 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
399 root 1.13
400 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
401 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
402 root 1.10
403     my $uhost = $1;
404     $uport = $2 if defined $2;
405    
406 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
407     unless exists $hdr{host};
408 root 1.43
409 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
410 root 1.56 $upath .= $query if length $query;
411 root 1.10
412     $upath =~ s%^/?%/%;
413    
414     # cookie processing
415     if (my $jar = $arg{cookie_jar}) {
416 root 1.31 %$jar = () if $jar->{version} != 1;
417 root 1.10
418     my @cookie;
419    
420     while (my ($chost, $v) = each %$jar) {
421 root 1.30 if ($chost =~ /^\./) {
422     next unless $chost eq substr $uhost, -length $chost;
423     } elsif ($chost =~ /\./) {
424     next unless $chost eq $uhost;
425     } else {
426     next;
427     }
428 root 1.10
429     while (my ($cpath, $v) = each %$v) {
430     next unless $cpath eq substr $upath, 0, length $cpath;
431    
432     while (my ($k, $v) = each %$v) {
433 root 1.31 next if $uscheme ne "https" && exists $v->{secure};
434     my $value = $v->{value};
435     $value =~ s/([\\"])/\\$1/g;
436     push @cookie, "$k=\"$value\"";
437 root 1.10 }
438     }
439     }
440    
441     $hdr{cookie} = join "; ", @cookie
442     if @cookie;
443     }
444 root 1.1
445 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
446 root 1.2
447 root 1.10 if ($proxy) {
448 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
449 root 1.31
450 root 1.47 $rscheme = "http" unless defined $rscheme;
451    
452 root 1.31 # don't support https requests over https-proxy transport,
453 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
454 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
455 root 1.10 } else {
456 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
457 root 1.2 }
458    
459 root 1.47 # leave out fragment and query string, just a heuristic
460     $hdr{referer} ||= "$uscheme://$uauthority$upath" unless exists $hdr{referer};
461     $hdr{"user-agent"} ||= $USERAGENT unless exists $hdr{"user-agent"};
462 root 1.41
463 root 1.53 $hdr{"content-length"} = length $arg{body}
464     if length $arg{body} || $method ne "GET";
465 root 1.1
466 root 1.11 my %state = (connect_guard => 1);
467    
468     _get_slot $uhost, sub {
469     $state{slot_guard} = shift;
470 root 1.1
471 root 1.11 return unless $state{connect_guard};
472 root 1.1
473 root 1.64 my $connect_cb = sub {
474     $state{fh} = shift
475     or do {
476     my $err = "$!";
477     %state = ();
478     return $cb->(undef, { @pseudo, Status => 599, Reason => $err });
479     };
480 root 1.44
481 root 1.64 pop; # free memory, save a tree
482 root 1.11
483 root 1.64 return unless delete $state{connect_guard};
484 root 1.11
485 root 1.64 # get handle
486     $state{handle} = new AnyEvent::Handle
487     fh => $state{fh},
488     peername => $rhost,
489     tls_ctx => $arg{tls_ctx},
490     # these need to be reconfigured on keepalive handles
491     timeout => $timeout,
492     on_error => sub {
493     %state = ();
494     $cb->(undef, { @pseudo, Status => 599, Reason => $_[2] });
495     },
496     on_eof => sub {
497     %state = ();
498     $cb->(undef, { @pseudo, Status => 599, Reason => "Unexpected end-of-file" });
499     },
500     ;
501 root 1.11
502 root 1.64 # limit the number of persistent connections
503     # keepalive not yet supported
504 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
505     # ++$KA_COUNT{$_[1]};
506     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
507     # --$KA_COUNT{$_[1]}
508     # };
509     # $hdr{connection} = "keep-alive";
510     # } else {
511 root 1.64 delete $hdr{connection};
512 root 1.56 # }
513 root 1.1
514 root 1.64 $state{handle}->starttls ("connect") if $rscheme eq "https";
515    
516     # handle actual, non-tunneled, request
517     my $handle_actual_request = sub {
518     $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
519    
520     # send request
521     $state{handle}->push_write (
522     "$method $rpath HTTP/1.0\015\012"
523     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
524     . "\015\012"
525     . (delete $arg{body})
526     );
527    
528     # return if error occured during push_write()
529     return unless %state;
530    
531     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
532    
533     # status line and headers
534     $state{handle}->push_read (line => $qr_nlnl, sub {
535     my $keepalive = pop;
536    
537     for ("$_[1]") {
538     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
539    
540     /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )? \015?\012/igxc
541     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid server response" }));
542    
543     push @pseudo,
544     HTTPVersion => $1,
545     Status => $2,
546     Reason => $3,
547     ;
548    
549     # things seen, not parsed:
550     # p3pP="NON CUR OTPi OUR NOR UNI"
551    
552     $hdr{lc $1} .= ",$2"
553     while /\G
554     ([^:\000-\037]*):
555     [\011\040]*
556     ((?: [^\012]+ | \012[\011\040] )*)
557     \012
558     /gxc;
559    
560     /\G$/
561     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Garbled response headers" }));
562     }
563    
564     # remove the "," prefix we added to all headers above
565     substr $_, 0, 1, ""
566     for values %hdr;
567    
568     # patch in all pseudo headers
569     %hdr = (%hdr, @pseudo);
570    
571     # redirect handling
572     # microsoft and other shitheads don't give a shit for following standards,
573     # try to support some common forms of broken Location headers.
574     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
575     $hdr{location} =~ s/^\.\/+//;
576    
577     my $url = "$rscheme://$uhost:$uport";
578    
579     unless ($hdr{location} =~ s/^\///) {
580     $url .= $upath;
581     $url =~ s/\/[^\/]*$//;
582     }
583 root 1.59
584 root 1.64 $hdr{location} = "$url/$hdr{location}";
585     }
586 root 1.31
587 root 1.64 my $redirect;
588 root 1.41
589 root 1.64 if ($recurse) {
590     my $status = $hdr{Status};
591 root 1.59
592 root 1.64 # industry standard is to redirect POST as GET for
593     # 301, 302 and 303, in contrast to http/1.0 and 1.1.
594     # also, the UA should ask the user for 301 and 307 and POST,
595     # industry standard seems to be to simply follow.
596     # we go with the industry standard.
597     if ($status == 301 or $status == 302 or $status == 303) {
598     # HTTP/1.1 is unclear on how to mutate the method
599     $method = "GET" unless $method eq "HEAD";
600     $redirect = 1;
601     } elsif ($status == 307) {
602     $redirect = 1;
603 root 1.59 }
604 root 1.64 }
605 root 1.57
606 root 1.64 my $finish = sub { # ($data, $err_status, $err_reason[, $keepalive])
607     $state{handle}->destroy if $state{handle};
608     %state = ();
609 root 1.55
610 root 1.64 if (defined $_[1]) {
611     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
612     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
613 root 1.41 }
614    
615 root 1.64 # set-cookie processing
616     if ($arg{cookie_jar}) {
617     for ($hdr{"set-cookie"}) {
618     # parse NAME=VALUE
619     my @kv;
620    
621     while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
622     my $name = $1;
623     my $value = $3;
624    
625     unless ($value) {
626     $value = $2;
627     $value =~ s/\\(.)/$1/gs;
628     }
629 root 1.59
630 root 1.64 push @kv, $name => $value;
631 root 1.31
632 root 1.64 last unless /\G\s*;/gc;
633     }
634 root 1.57
635 root 1.64 last unless @kv;
636 root 1.31
637 root 1.64 my $name = shift @kv;
638     my %kv = (value => shift @kv, @kv);
639 root 1.31
640 root 1.64 my $cdom;
641     my $cpath = (delete $kv{path}) || "/";
642 root 1.10
643 root 1.64 if (exists $kv{domain}) {
644     $cdom = delete $kv{domain};
645    
646     $cdom =~ s/^\.?/./; # make sure it starts with a "."
647 root 1.11
648 root 1.64 next if $cdom =~ /\.$/;
649 root 1.59
650 root 1.64 # this is not rfc-like and not netscape-like. go figure.
651     my $ndots = $cdom =~ y/.//;
652     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
653     } else {
654     $cdom = $uhost;
655     }
656    
657     # store it
658     $arg{cookie_jar}{version} = 1;
659     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
660 root 1.11
661 root 1.64 redo if /\G\s*,/gc;
662 root 1.59 }
663 root 1.64 }
664 root 1.31
665 root 1.64 if ($redirect && exists $hdr{location}) {
666     # we ignore any errors, as it is very common to receive
667     # Content-Length != 0 but no actual body
668     # we also access %hdr, as $_[1] might be an erro
669     http_request (
670     $method => $hdr{location},
671     %arg,
672     recurse => $recurse - 1,
673     Redirect => [$_[0], \%hdr],
674     $cb);
675     } else {
676     $cb->($_[0], \%hdr);
677     }
678     };
679    
680     my $len = $hdr{"content-length"};
681    
682     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
683     $finish->(undef, 598 => "Request cancelled by on_header");
684     } elsif (
685     $hdr{Status} =~ /^(?:1..|204|205|304)$/
686     or $method eq "HEAD"
687     or (defined $len && !$len)
688     ) {
689     # no body
690     $finish->("", undef, undef, 1);
691     } else {
692     # body handling, four different code paths
693     # for want_body_handle, on_body (2x), normal (2x)
694     # we might read too much here, but it does not matter yet (no pipelining)
695     if (!$redirect && $arg{want_body_handle}) {
696     $_[0]->on_eof (undef);
697     $_[0]->on_error (undef);
698     $_[0]->on_read (undef);
699    
700     $finish->(delete $state{handle});
701    
702     } elsif ($arg{on_body}) {
703     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
704     if ($len) {
705     $_[0]->on_read (sub {
706     $len -= length $_[0]{rbuf};
707    
708     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
709     or $finish->(undef, 598 => "Request cancelled by on_body");
710    
711     $len > 0
712     or $finish->("", undef, undef, 1);
713     });
714 root 1.59 } else {
715 root 1.64 $_[0]->on_eof (sub {
716     $finish->("");
717     });
718     $_[0]->on_read (sub {
719     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
720     or $finish->(undef, 598 => "Request cancelled by on_body");
721     });
722 root 1.11 }
723 root 1.64 } else {
724     $_[0]->on_eof (undef);
725 root 1.59
726 root 1.64 if ($len) {
727     $_[0]->on_error (sub { $finish->(undef, 599 => $_[2]) });
728     $_[0]->on_read (sub {
729     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
730     if $len <= length $_[0]{rbuf};
731     });
732 root 1.59 } else {
733 root 1.64 $_[0]->on_error (sub {
734     ($! == Errno::EPIPE || !$!)
735     ? $finish->(delete $_[0]{rbuf})
736     : $finish->(undef, 599 => $_[2]);
737     });
738     $_[0]->on_read (sub { });
739 root 1.59 }
740     }
741 root 1.64 }
742     });
743     };
744    
745     # now handle proxy-CONNECT method
746     if ($proxy && $uscheme eq "https") {
747     # oh dear, we have to wrap it into a connect request
748    
749     # maybe re-use $uauthority with patched port?
750     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
751     $state{handle}->push_read (line => $qr_nlnl, sub {
752     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
753     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" }));
754    
755     if ($2 == 200) {
756     $rpath = $upath;
757     &$handle_actual_request;
758     } else {
759     %state = ();
760     $cb->(undef, { @pseudo, Status => $2, Reason => $3 });
761     }
762     });
763     } else {
764     &$handle_actual_request;
765     }
766     };
767    
768     my $tcp_connect = $arg{tcp_connect}
769     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
770 root 1.57
771 root 1.64 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
772 root 1.31
773 root 1.1 };
774    
775     defined wantarray && AnyEvent::Util::guard { %state = () }
776     }
777    
778 elmex 1.15 sub http_get($@) {
779 root 1.1 unshift @_, "GET";
780     &http_request
781     }
782    
783 elmex 1.15 sub http_head($@) {
784 root 1.4 unshift @_, "HEAD";
785     &http_request
786     }
787    
788 elmex 1.15 sub http_post($$@) {
789 root 1.22 my $url = shift;
790     unshift @_, "POST", $url, "body";
791 root 1.3 &http_request
792     }
793    
794 root 1.9 =back
795    
796 root 1.55 =head2 DNS CACHING
797    
798     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
799     the actual connection, which in turn uses AnyEvent::DNS to resolve
800     hostnames. The latter is a simple stub resolver and does no caching
801     on its own. If you want DNS caching, you currently have to provide
802     your own default resolver (by storing a suitable resolver object in
803     C<$AnyEvent::DNS::RESOLVER>).
804    
805 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
806 root 1.1
807     =over 4
808    
809 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
810    
811     Sets the default proxy server to use. The proxy-url must begin with a
812 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
813     otherwise.
814    
815     To clear an already-set proxy, use C<undef>.
816 root 1.2
817 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
818    
819     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
820     Date (RFC 2616).
821    
822     =item $timestamp = AnyEvent::HTTP::parse_date $date
823    
824     Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX
825     timestamp, or C<undef> if the date cannot be parsed.
826    
827 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
828 root 1.1
829 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
830 root 1.1
831     =item $AnyEvent::HTTP::USERAGENT
832    
833     The default value for the C<User-Agent> header (the default is
834 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
835 root 1.1
836 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
837 root 1.1
838 root 1.47 The maximum number of concurrent connections to the same host (identified
839 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
840     are queued until previous connections are closed.
841 root 1.1
842 root 1.43 The default value for this is C<4>, and it is highly advisable to not
843     increase it.
844 root 1.3
845 root 1.14 =item $AnyEvent::HTTP::ACTIVE
846    
847     The number of active connections. This is not the number of currently
848     running requests, but the number of currently open and non-idle TCP
849     connections. This number of can be useful for load-leveling.
850    
851 root 1.1 =back
852    
853     =cut
854    
855 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
856     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
857    
858     sub format_date($) {
859     my ($time) = @_;
860    
861     # RFC 822/1123 format
862     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
863    
864     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
865     $weekday[$wday], $mday, $month[$mon], $year + 1900,
866     $H, $M, $S;
867     }
868    
869     sub parse_date($) {
870     my ($date) = @_;
871    
872     my ($d, $m, $y, $H, $M, $S);
873    
874     if ($date =~ /^[A-Z][a-z][a-z], ([0-9][0-9]) ([A-Z][a-z][a-z]) ([0-9][0-9][0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
875     # RFC 822/1123, required by RFC 2616
876     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
877    
878     } elsif ($date =~ /^[A-Z][a-z]+, ([0-9][0-9])-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
879     # RFC 850
880     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
881    
882     } elsif ($date =~ /^[A-Z][a-z][a-z] ([A-Z][a-z][a-z]) ([0-9 ][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) ([0-9][0-9][0-9][0-9])$/) {
883     # ISO C's asctime
884     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
885     }
886     # other formats fail in the loop below
887    
888     for (0..11) {
889     if ($m eq $month[$_]) {
890     require Time::Local;
891     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
892     }
893     }
894    
895     undef
896     }
897    
898 root 1.2 sub set_proxy($) {
899 root 1.52 if (length $_[0]) {
900     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
901     or Carp::croak "$_[0]: invalid proxy URL";
902     $PROXY = [$2, $3 || 3128, $1]
903     } else {
904     undef $PROXY;
905     }
906 root 1.2 }
907    
908     # initialise proxy from environment
909 root 1.52 eval {
910     set_proxy $ENV{http_proxy};
911     };
912 root 1.2
913 root 1.60 =head2 SOCKS PROXIES
914    
915     Socks proxies are not directly supported by AnyEvent::HTTP. You can
916     compile your perl to support socks, or use an external program such as
917     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
918     transparently.
919    
920     Alternatively, for AnyEvent::HTTP only, you can use your own
921     C<tcp_connect> function that does the proxy handshake - here is an example
922     that works with socks4a proxies:
923    
924     use Errno;
925     use AnyEvent::Util;
926     use AnyEvent::Socket;
927     use AnyEvent::Handle;
928    
929     # host, port and username of/for your socks4a proxy
930     my $socks_host = "10.0.0.23";
931     my $socks_port = 9050;
932     my $socks_user = "";
933    
934     sub socks4a_connect {
935     my ($host, $port, $connect_cb, $prepare_cb) = @_;
936    
937     my $hdl = new AnyEvent::Handle
938     connect => [$socks_host, $socks_port],
939     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
940     on_error => sub { $connect_cb->() },
941     ;
942    
943     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
944    
945     $hdl->push_read (chunk => 8, sub {
946     my ($hdl, $chunk) = @_;
947     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
948    
949     if ($status == 0x5a) {
950     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
951     } else {
952     $! = Errno::ENXIO; $connect_cb->();
953     }
954     });
955    
956     $hdl
957     }
958    
959     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
960     possibly after switching off other proxy types:
961    
962     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
963    
964     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
965     my ($data, $headers) = @_;
966     ...
967     };
968    
969 root 1.1 =head1 SEE ALSO
970    
971     L<AnyEvent>.
972    
973     =head1 AUTHOR
974    
975 root 1.18 Marc Lehmann <schmorp@schmorp.de>
976     http://home.schmorp.de/
977 root 1.1
978 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
979     testcases and bugreports.
980    
981 root 1.1 =cut
982    
983     1
984