ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.59
Committed: Wed Dec 29 23:59:36 2010 UTC (13 years, 4 months ago) by root
Branch: MAIN
Changes since 1.58: +266 -248 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.57 our $VERSION = '1.46';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99     (or C<undef> if an error occured), and a hash-ref with response headers as
100     second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105     three parts of the HTTP Status-Line of the same name.
106    
107     The pseudo-header C<URL> contains the actual URL (which can differ from
108     the requested URL when following redirects - for example, you might get
109     an error that your URL scheme is not supported even though your URL is a
110     valid http URL because it redirected to an ftp URL, in which case you can
111     look at the URL pseudo header).
112    
113     The pseudo-header C<Redirect> only exists when the request was a result
114     of an internal redirect. In that case it is an array reference with
115     the C<($data, $headers)> from the redirect response. Note that this
116     response could in turn be the result of a redirect itself, and C<<
117     $headers->{Redirect}[1]{Redirect} >> will then contain the original
118     response, and so on.
119 root 1.20
120 root 1.32 If the server sends a header multiple times, then their contents will be
121     joined together with a comma (C<,>), as per the HTTP spec.
122 root 1.2
123     If an internal error occurs, such as not being able to resolve a hostname,
124 root 1.41 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<59x>
125     (usually C<599>) and the C<Reason> pseudo-header will contain an error
126     message.
127 root 1.2
128 root 1.6 A typical callback might look like this:
129    
130     sub {
131     my ($body, $hdr) = @_;
132    
133     if ($hdr->{Status} =~ /^2/) {
134     ... everything should be ok
135     } else {
136     print "error, $hdr->{Status} $hdr->{Reason}\n";
137     }
138     }
139    
140 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
141     include:
142    
143     =over 4
144    
145 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
146 root 1.1
147     Whether to recurse requests or not, e.g. on redirects, authentication
148 root 1.3 retries and so on, and how often to do so.
149 root 1.1
150     =item headers => hashref
151    
152 root 1.12 The request headers to use. Currently, C<http_request> may provide its
153     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
154 root 1.47 and will provide defaults for C<User-Agent:> and C<Referer:> (this can be
155     suppressed by using C<undef> for these headers in which case they won't be
156 root 1.49 sent at all).
157 root 1.1
158     =item timeout => $seconds
159    
160     The time-out to use for various stages - each connect attempt will reset
161 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
162     timeout.
163    
164     Default timeout is 5 minutes.
165 root 1.2
166     =item proxy => [$host, $port[, $scheme]] or undef
167    
168     Use the given http proxy for all requests. If not specified, then the
169     default proxy (as specified by C<$ENV{http_proxy}>) is used.
170    
171 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
172 root 1.2 HTTPS.
173 root 1.1
174 root 1.3 =item body => $string
175    
176     The request body, usually empty. Will be-sent as-is (future versions of
177     this module might offer more options).
178    
179 root 1.10 =item cookie_jar => $hash_ref
180    
181     Passing this parameter enables (simplified) cookie-processing, loosely
182     based on the original netscape specification.
183    
184     The C<$hash_ref> must be an (initially empty) hash reference which will
185     get updated automatically. It is possible to save the cookie_jar to
186     persistent storage with something like JSON or Storable, but this is not
187 root 1.40 recommended, as expiry times are currently being ignored.
188 root 1.10
189     Note that this cookie implementation is not of very high quality, nor
190     meant to be complete. If you want complete cookie management you have to
191     do that on your own. C<cookie_jar> is meant as a quick fix to get some
192     cookie-using sites working. Cookies are a privacy disaster, do not use
193     them unless required to.
194    
195 root 1.40 =item tls_ctx => $scheme | $tls_ctx
196    
197     Specifies the AnyEvent::TLS context to be used for https connections. This
198     parameter follows the same rules as the C<tls_ctx> parameter to
199     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
200     C<high> can be specified, which give you a predefined low-security (no
201     verification, highest compatibility) and high-security (CA and common-name
202     verification) TLS context.
203    
204     The default for this option is C<low>, which could be interpreted as "give
205     me the page, no matter what".
206    
207 root 1.51 =item on_prepare => $callback->($fh)
208    
209     In rare cases you need to "tune" the socket before it is used to
210     connect (for exmaple, to bind it on a given IP address). This parameter
211     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
212     and behaves exactly the same way (e.g. it has to provide a
213     timeout). See the description for the C<$prepare_cb> argument of
214     C<AnyEvent::Socket::tcp_connect> for details.
215    
216 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
217    
218     In even rarer cases you want total control over how AnyEvent::HTTP
219     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
220     to do this, but you can provide your own C<tcp_connect> function -
221     obviously, it has to follow the same calling conventions.
222    
223     There are probably lots of weird uses for this function, starting from
224     tracing the hosts C<http_request> actually tries to connect, to (inexact
225     but fast) host => IP address caching or even socks protocol support.
226    
227 root 1.42 =item on_header => $callback->($headers)
228 root 1.41
229     When specified, this callback will be called with the header hash as soon
230     as headers have been successfully received from the remote server (not on
231     locally-generated errors).
232    
233     It has to return either true (in which case AnyEvent::HTTP will continue),
234     or false, in which case AnyEvent::HTTP will cancel the download (and call
235     the finish callback with an error code of C<598>).
236    
237     This callback is useful, among other things, to quickly reject unwanted
238     content, which, if it is supposed to be rare, can be faster than first
239     doing a C<HEAD> request.
240    
241 root 1.42 Example: cancel the request unless the content-type is "text/html".
242 root 1.41
243 root 1.42 on_header => sub {
244     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
245     },
246 root 1.41
247 root 1.42 =item on_body => $callback->($partial_body, $headers)
248 root 1.41
249 root 1.42 When specified, all body data will be passed to this callback instead of
250     to the completion callback. The completion callback will get the empty
251     string instead of the body data.
252 root 1.41
253 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
254     or false, in which case AnyEvent::HTTP will cancel the download (and call
255     the completion callback with an error code of C<598>).
256    
257     This callback is useful when the data is too large to be held in memory
258     (so the callback writes it to a file) or when only some information should
259     be extracted, or when the body should be processed incrementally.
260 root 1.41
261     It is usually preferred over doing your own body handling via
262 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
263     only used to create a connection, C<want_body_handle> is the better
264     alternative, as it allows you to install your own event handler, reducing
265     resource usage.
266 root 1.41
267     =item want_body_handle => $enable
268    
269     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
270     changes considerably: after parsing the headers, and instead of
271     downloading the body (if any), the completion callback will be
272     called. Instead of the C<$body> argument containing the body data, the
273     callback will receive the L<AnyEvent::Handle> object associated with the
274     connection. In error cases, C<undef> will be passed. When there is no body
275     (e.g. status C<304>), the empty string will be passed.
276    
277     The handle object might or might not be in TLS mode, might be connected to
278     a proxy, be a persistent connection etc., and configured in unspecified
279     ways. The user is responsible for this handle (it will not be used by this
280     module anymore).
281    
282     This is useful with some push-type services, where, after the initial
283     headers, an interactive protocol is used (typical example would be the
284     push-style twitter API which starts a JSON/XML stream).
285    
286     If you think you need this, first have a look at C<on_body>, to see if
287 root 1.45 that doesn't solve your problem in a better way.
288 root 1.41
289 root 1.1 =back
290    
291 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
292    
293     http_request GET => "http://www.nethype.de/", sub {
294     my ($body, $hdr) = @_;
295     print "$body\n";
296     };
297    
298     Example: make a HTTP HEAD request on https://www.google.com/, use a
299     timeout of 30 seconds.
300    
301     http_request
302     GET => "https://www.google.com",
303     timeout => 30,
304     sub {
305     my ($body, $hdr) = @_;
306     use Data::Dumper;
307     print Dumper $hdr;
308     }
309     ;
310 root 1.1
311 root 1.29 Example: make another simple HTTP GET request, but immediately try to
312     cancel it.
313    
314     my $request = http_request GET => "http://www.nethype.de/", sub {
315     my ($body, $hdr) = @_;
316     print "$body\n";
317     };
318    
319     undef $request;
320    
321 root 1.1 =cut
322    
323 root 1.12 sub _slot_schedule;
324 root 1.11 sub _slot_schedule($) {
325     my $host = shift;
326    
327     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
328     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
329 root 1.12 # somebody wants that slot
330 root 1.11 ++$CO_SLOT{$host}[0];
331 root 1.14 ++$ACTIVE;
332 root 1.11
333     $cb->(AnyEvent::Util::guard {
334 root 1.14 --$ACTIVE;
335 root 1.11 --$CO_SLOT{$host}[0];
336     _slot_schedule $host;
337     });
338     } else {
339     # nobody wants the slot, maybe we can forget about it
340     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
341     last;
342     }
343     }
344     }
345    
346     # wait for a free slot on host, call callback
347     sub _get_slot($$) {
348     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
349    
350     _slot_schedule $_[0];
351     }
352    
353 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
354 root 1.34
355 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
356     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
357 root 1.40
358 elmex 1.15 sub http_request($$@) {
359 root 1.1 my $cb = pop;
360     my ($method, $url, %arg) = @_;
361    
362     my %hdr;
363    
364 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
365     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
366    
367 root 1.3 $method = uc $method;
368    
369 root 1.8 if (my $hdr = $arg{headers}) {
370 root 1.1 while (my ($k, $v) = each %$hdr) {
371     $hdr{lc $k} = $v;
372     }
373     }
374    
375 root 1.55 # pseudo headers for all subsequent responses
376     my @pseudo = (URL => $url);
377     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
378    
379 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
380 root 1.8
381 root 1.55 return $cb->(undef, { Status => 599, Reason => "Too many redirections", @pseudo })
382 root 1.8 if $recurse < 0;
383    
384 root 1.2 my $proxy = $arg{proxy} || $PROXY;
385 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
386    
387 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
388 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
389 root 1.2
390 root 1.31 $uscheme = lc $uscheme;
391 root 1.1
392 root 1.31 my $uport = $uscheme eq "http" ? 80
393     : $uscheme eq "https" ? 443
394 root 1.55 : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported", @pseudo });
395 root 1.13
396 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
397 root 1.55 or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", @pseudo });
398 root 1.10
399     my $uhost = $1;
400     $uport = $2 if defined $2;
401    
402 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
403     unless exists $hdr{host};
404 root 1.43
405 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
406 root 1.56 $upath .= $query if length $query;
407 root 1.10
408     $upath =~ s%^/?%/%;
409    
410     # cookie processing
411     if (my $jar = $arg{cookie_jar}) {
412 root 1.31 %$jar = () if $jar->{version} != 1;
413 root 1.10
414     my @cookie;
415    
416     while (my ($chost, $v) = each %$jar) {
417 root 1.30 if ($chost =~ /^\./) {
418     next unless $chost eq substr $uhost, -length $chost;
419     } elsif ($chost =~ /\./) {
420     next unless $chost eq $uhost;
421     } else {
422     next;
423     }
424 root 1.10
425     while (my ($cpath, $v) = each %$v) {
426     next unless $cpath eq substr $upath, 0, length $cpath;
427    
428     while (my ($k, $v) = each %$v) {
429 root 1.31 next if $uscheme ne "https" && exists $v->{secure};
430     my $value = $v->{value};
431     $value =~ s/([\\"])/\\$1/g;
432     push @cookie, "$k=\"$value\"";
433 root 1.10 }
434     }
435     }
436    
437     $hdr{cookie} = join "; ", @cookie
438     if @cookie;
439     }
440 root 1.1
441 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
442 root 1.2
443 root 1.10 if ($proxy) {
444 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
445 root 1.31
446 root 1.47 $rscheme = "http" unless defined $rscheme;
447    
448 root 1.31 # don't support https requests over https-proxy transport,
449 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
450 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
451 root 1.10 } else {
452 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
453 root 1.2 }
454    
455 root 1.47 # leave out fragment and query string, just a heuristic
456     $hdr{referer} ||= "$uscheme://$uauthority$upath" unless exists $hdr{referer};
457     $hdr{"user-agent"} ||= $USERAGENT unless exists $hdr{"user-agent"};
458 root 1.41
459 root 1.53 $hdr{"content-length"} = length $arg{body}
460     if length $arg{body} || $method ne "GET";
461 root 1.1
462 root 1.11 my %state = (connect_guard => 1);
463    
464     _get_slot $uhost, sub {
465     $state{slot_guard} = shift;
466 root 1.1
467 root 1.11 return unless $state{connect_guard};
468 root 1.1
469 root 1.59 my $tcp_connect = $arg{tcp_connect}
470     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
471    
472     $state{connect_guard} = $tcp_connect->(
473     $rhost,
474     $rport,
475     sub {
476     $state{fh} = shift
477     or do {
478     my $err = "$!";
479     %state = ();
480     return $cb->(undef, { Status => 599, Reason => $err, @pseudo });
481     };
482 root 1.44
483 root 1.59 pop; # free memory, save a tree
484 root 1.11
485 root 1.59 return unless delete $state{connect_guard};
486 root 1.11
487 root 1.59 # get handle
488     $state{handle} = new AnyEvent::Handle
489     fh => $state{fh},
490     peername => $rhost,
491     tls_ctx => $arg{tls_ctx},
492     # these need to be reconfigured on keepalive handles
493     timeout => $timeout,
494     on_error => sub {
495     %state = ();
496     $cb->(undef, { Status => 599, Reason => $_[2], @pseudo });
497     },
498     on_eof => sub {
499     %state = ();
500     $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", @pseudo });
501     },
502     ;
503 root 1.11
504 root 1.59 # limit the number of persistent connections
505     # keepalive not yet supported
506 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
507     # ++$KA_COUNT{$_[1]};
508     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
509     # --$KA_COUNT{$_[1]}
510     # };
511     # $hdr{connection} = "keep-alive";
512     # } else {
513 root 1.59 delete $hdr{connection};
514 root 1.56 # }
515 root 1.1
516 root 1.59 $state{handle}->starttls ("connect") if $rscheme eq "https";
517    
518     # handle actual, non-tunneled, request
519     my $handle_actual_request = sub {
520     $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
521    
522     # send request
523     $state{handle}->push_write (
524     "$method $rpath HTTP/1.0\015\012"
525     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
526     . "\015\012"
527     . (delete $arg{body})
528     );
529    
530     # return if error occured during push_write()
531     return unless %state;
532    
533     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
534    
535     # status line and headers
536     $state{handle}->push_read (line => $qr_nlnl, sub {
537     for ("$_[1]") {
538     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
539    
540     /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )? \015?\012/igxc
541     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response", @pseudo }));
542    
543     push @pseudo,
544     HTTPVersion => $1,
545     Status => $2,
546     Reason => $3,
547     ;
548    
549     # things seen, not parsed:
550     # p3pP="NON CUR OTPi OUR NOR UNI"
551    
552     $hdr{lc $1} .= ",$2"
553     while /\G
554     ([^:\000-\037]*):
555     [\011\040]*
556     ((?: [^\012]+ | \012[\011\040] )*)
557     \012
558     /gxc;
559 root 1.31
560 root 1.59 /\G$/
561     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", @pseudo }));
562 root 1.41 }
563    
564 root 1.59 # remove the "," prefix we added to all headers above
565     substr $_, 0, 1, ""
566     for values %hdr;
567    
568     # patch in all pseudo headers
569     %hdr = (%hdr, @pseudo);
570    
571     # redirect handling
572     # microsoft and other shitheads don't give a shit for following standards,
573     # try to support some common forms of broken Location headers.
574     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
575     $hdr{location} =~ s/^\.\/+//;
576    
577     my $url = "$rscheme://$uhost:$uport";
578    
579     unless ($hdr{location} =~ s/^\///) {
580     $url .= $upath;
581     $url =~ s/\/[^\/]*$//;
582     }
583    
584     $hdr{location} = "$url/$hdr{location}";
585     }
586 root 1.57
587 root 1.59 my $redirect;
588 root 1.41
589 root 1.59 if ($recurse) {
590     my $status = $hdr{Status};
591 root 1.55
592 root 1.59 # industry standard is to redirect POST as GET for
593     # 301, 302 and 303, in contrast to http/1.0 and 1.1.
594     # also, the UA should ask the user for 301 and 307 and POST,
595     # industry standard seems to be to simply follow.
596     # we go with the industry standard.
597     if ($status == 301 or $status == 302 or $status == 303) {
598     # HTTP/1.1 is unclear on how to mutate the method
599     $method = "GET" unless $method eq "HEAD";
600     $redirect = 1;
601     } elsif ($status == 307) {
602     $redirect = 1;
603     }
604 root 1.41 }
605    
606 root 1.59 my $finish = sub {
607     $state{handle}->destroy if $state{handle};
608     %state = ();
609    
610     # set-cookie processing
611     if ($arg{cookie_jar}) {
612     for ($_[1]{"set-cookie"}) {
613     # parse NAME=VALUE
614     my @kv;
615    
616     while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
617     my $name = $1;
618     my $value = $3;
619    
620     unless ($value) {
621     $value = $2;
622     $value =~ s/\\(.)/$1/gs;
623     }
624    
625     push @kv, $name => $value;
626 root 1.31
627 root 1.59 last unless /\G\s*;/gc;
628 root 1.57 }
629    
630 root 1.59 last unless @kv;
631 root 1.31
632 root 1.59 my $name = shift @kv;
633     my %kv = (value => shift @kv, @kv);
634 root 1.31
635 root 1.59 my $cdom;
636     my $cpath = (delete $kv{path}) || "/";
637 root 1.10
638 root 1.59 if (exists $kv{domain}) {
639     $cdom = delete $kv{domain};
640    
641     $cdom =~ s/^\.?/./; # make sure it starts with a "."
642 root 1.11
643 root 1.59 next if $cdom =~ /\.$/;
644    
645     # this is not rfc-like and not netscape-like. go figure.
646     my $ndots = $cdom =~ y/.//;
647     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
648     } else {
649     $cdom = $uhost;
650     }
651    
652     # store it
653     $arg{cookie_jar}{version} = 1;
654     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
655 root 1.11
656 root 1.59 redo if /\G\s*,/gc;
657 root 1.57 }
658 root 1.59 }
659 root 1.31
660 root 1.59 if ($redirect && exists $hdr{location}) {
661     # we ignore any errors, as it is very common to receive
662     # Content-Length != 0 but no actual body
663     # we also access %hdr, as $_[1] might be an erro
664     http_request (
665     $method => $hdr{location},
666     %arg,
667     recurse => $recurse - 1,
668     Redirect => \@_,
669     $cb);
670     } else {
671     $cb->($_[0], $_[1]);
672 root 1.11 }
673 root 1.59 };
674    
675     my $len = $hdr{"content-length"};
676 root 1.8
677 root 1.59 if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
678     $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", @pseudo });
679     } elsif (
680     $hdr{Status} =~ /^(?:1..|[23]04)$/
681     or $method eq "HEAD"
682     or (defined $len && !$len)
683     ) {
684     # no body
685     $finish->("", \%hdr);
686 root 1.57 } else {
687 root 1.59 # body handling, four different code paths
688     # for want_body_handle, on_body (2x), normal (2x)
689     # we might read too much here, but it does not matter yet (no pers. connections)
690     if (!$redirect && $arg{want_body_handle}) {
691     $_[0]->on_eof (undef);
692     $_[0]->on_error (undef);
693     $_[0]->on_read (undef);
694 root 1.24
695 root 1.59 $finish->(delete $state{handle}, \%hdr);
696 root 1.41
697 root 1.59 } elsif ($arg{on_body}) {
698     $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
699     if ($len) {
700     $_[0]->on_eof (undef);
701     $_[0]->on_read (sub {
702     $len -= length $_[0]{rbuf};
703    
704     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
705     or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
706    
707     $len > 0
708     or $finish->("", \%hdr);
709     });
710     } else {
711     $_[0]->on_eof (sub {
712     $finish->("", \%hdr);
713     });
714     $_[0]->on_read (sub {
715     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
716     or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
717     });
718     }
719     } else {
720 root 1.41 $_[0]->on_eof (undef);
721    
722 root 1.59 if ($len) {
723     $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
724     $_[0]->on_read (sub {
725     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), \%hdr)
726     if $len <= length $_[0]{rbuf};
727     });
728     } else {
729     $_[0]->on_error (sub {
730     ($! == Errno::EPIPE || !$!)
731     ? $finish->(delete $_[0]{rbuf}, \%hdr)
732     : $finish->(undef, { Status => 599, Reason => $_[2], @pseudo });
733     });
734     $_[0]->on_read (sub { });
735     }
736     }
737     }
738     });
739     };
740 root 1.57
741 root 1.59 # now handle proxy-CONNECT method
742     if ($proxy && $uscheme eq "https") {
743     # oh dear, we have to wrap it into a connect request
744    
745     # maybe re-use $uauthority with patched port?
746     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
747     $state{handle}->push_read (line => $qr_nlnl, sub {
748     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
749     or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", @pseudo }));
750    
751     if ($2 == 200) {
752     $rpath = $upath;
753     &$handle_actual_request;
754 root 1.57 } else {
755 root 1.59 %state = ();
756     $cb->(undef, { Status => $2, Reason => $3, @pseudo });
757 root 1.11 }
758 root 1.59 });
759     } else {
760     &$handle_actual_request;
761     }
762 root 1.31
763 root 1.59 },
764     $arg{on_prepare} || sub { $timeout }
765     );
766 root 1.1 };
767    
768     defined wantarray && AnyEvent::Util::guard { %state = () }
769     }
770    
771 elmex 1.15 sub http_get($@) {
772 root 1.1 unshift @_, "GET";
773     &http_request
774     }
775    
776 elmex 1.15 sub http_head($@) {
777 root 1.4 unshift @_, "HEAD";
778     &http_request
779     }
780    
781 elmex 1.15 sub http_post($$@) {
782 root 1.22 my $url = shift;
783     unshift @_, "POST", $url, "body";
784 root 1.3 &http_request
785     }
786    
787 root 1.9 =back
788    
789 root 1.55 =head2 DNS CACHING
790    
791     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
792     the actual connection, which in turn uses AnyEvent::DNS to resolve
793     hostnames. The latter is a simple stub resolver and does no caching
794     on its own. If you want DNS caching, you currently have to provide
795     your own default resolver (by storing a suitable resolver object in
796     C<$AnyEvent::DNS::RESOLVER>).
797    
798 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
799 root 1.1
800     =over 4
801    
802 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
803    
804     Sets the default proxy server to use. The proxy-url must begin with a
805 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
806     otherwise.
807    
808     To clear an already-set proxy, use C<undef>.
809 root 1.2
810 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
811 root 1.1
812 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
813 root 1.1
814     =item $AnyEvent::HTTP::USERAGENT
815    
816     The default value for the C<User-Agent> header (the default is
817 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
818 root 1.1
819 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
820 root 1.1
821 root 1.47 The maximum number of concurrent connections to the same host (identified
822 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
823     are queued until previous connections are closed.
824 root 1.1
825 root 1.43 The default value for this is C<4>, and it is highly advisable to not
826     increase it.
827 root 1.3
828 root 1.14 =item $AnyEvent::HTTP::ACTIVE
829    
830     The number of active connections. This is not the number of currently
831     running requests, but the number of currently open and non-idle TCP
832     connections. This number of can be useful for load-leveling.
833    
834 root 1.1 =back
835    
836     =cut
837    
838 root 1.2 sub set_proxy($) {
839 root 1.52 if (length $_[0]) {
840     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
841     or Carp::croak "$_[0]: invalid proxy URL";
842     $PROXY = [$2, $3 || 3128, $1]
843     } else {
844     undef $PROXY;
845     }
846 root 1.2 }
847    
848     # initialise proxy from environment
849 root 1.52 eval {
850     set_proxy $ENV{http_proxy};
851     };
852 root 1.2
853 root 1.1 =head1 SEE ALSO
854    
855     L<AnyEvent>.
856    
857     =head1 AUTHOR
858    
859 root 1.18 Marc Lehmann <schmorp@schmorp.de>
860     http://home.schmorp.de/
861 root 1.1
862 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
863     testcases and bugreports.
864    
865 root 1.1 =cut
866    
867     1
868