ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.56
Committed: Mon Sep 6 05:30:54 2010 UTC (13 years, 8 months ago) by root
Branch: MAIN
Changes since 1.55: +22 -22 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Socket ();
49     use AnyEvent::Handle ();
50    
51     use base Exporter::;
52    
53 root 1.55 our $VERSION = '1.45';
54 root 1.1
55 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
56 root 1.1
57 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
58 root 1.3 our $MAX_RECURSE = 10;
59 root 1.2 our $MAX_PERSISTENT = 8;
60     our $PERSISTENT_TIMEOUT = 2;
61     our $TIMEOUT = 300;
62 root 1.1
63     # changing these is evil
64 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
65 root 1.11 our $MAX_PER_HOST = 4;
66 root 1.1
67 root 1.2 our $PROXY;
68 root 1.14 our $ACTIVE = 0;
69 root 1.2
70 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
71 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
72 root 1.1
73     =item http_get $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-GET request. See the http_request function for details on
76 root 1.29 additional parameters and the return value.
77 root 1.1
78 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
79    
80 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
81     on additional parameters and the return value.
82 root 1.5
83     =item http_post $url, $body, key => value..., $cb->($data, $headers)
84 root 1.3
85 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
86 root 1.29 http_request function for details on additional parameters and the return
87     value.
88 root 1.3
89 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
90    
91     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
92     must be an absolute http or https URL.
93    
94 root 1.29 When called in void context, nothing is returned. In other contexts,
95     C<http_request> returns a "cancellation guard" - you have to keep the
96     object at least alive until the callback get called. If the object gets
97     destroyed before the callbakc is called, the request will be cancelled.
98    
99 root 1.42 The callback will be called with the response body data as first argument
100     (or C<undef> if an error occured), and a hash-ref with response headers as
101     second argument.
102 root 1.2
103 root 1.7 All the headers in that hash are lowercased. In addition to the response
104 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
105     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
106     three parts of the HTTP Status-Line of the same name.
107    
108     The pseudo-header C<URL> contains the actual URL (which can differ from
109     the requested URL when following redirects - for example, you might get
110     an error that your URL scheme is not supported even though your URL is a
111     valid http URL because it redirected to an ftp URL, in which case you can
112     look at the URL pseudo header).
113    
114     The pseudo-header C<Redirect> only exists when the request was a result
115     of an internal redirect. In that case it is an array reference with
116     the C<($data, $headers)> from the redirect response. Note that this
117     response could in turn be the result of a redirect itself, and C<<
118     $headers->{Redirect}[1]{Redirect} >> will then contain the original
119     response, and so on.
120 root 1.20
121 root 1.32 If the server sends a header multiple times, then their contents will be
122     joined together with a comma (C<,>), as per the HTTP spec.
123 root 1.2
124     If an internal error occurs, such as not being able to resolve a hostname,
125 root 1.41 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<59x>
126     (usually C<599>) and the C<Reason> pseudo-header will contain an error
127     message.
128 root 1.2
129 root 1.6 A typical callback might look like this:
130    
131     sub {
132     my ($body, $hdr) = @_;
133    
134     if ($hdr->{Status} =~ /^2/) {
135     ... everything should be ok
136     } else {
137     print "error, $hdr->{Status} $hdr->{Reason}\n";
138     }
139     }
140    
141 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
142     include:
143    
144     =over 4
145    
146 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
147 root 1.1
148     Whether to recurse requests or not, e.g. on redirects, authentication
149 root 1.3 retries and so on, and how often to do so.
150 root 1.1
151     =item headers => hashref
152    
153 root 1.12 The request headers to use. Currently, C<http_request> may provide its
154     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
155 root 1.47 and will provide defaults for C<User-Agent:> and C<Referer:> (this can be
156     suppressed by using C<undef> for these headers in which case they won't be
157 root 1.49 sent at all).
158 root 1.1
159     =item timeout => $seconds
160    
161     The time-out to use for various stages - each connect attempt will reset
162 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
163     timeout.
164    
165     Default timeout is 5 minutes.
166 root 1.2
167     =item proxy => [$host, $port[, $scheme]] or undef
168    
169     Use the given http proxy for all requests. If not specified, then the
170     default proxy (as specified by C<$ENV{http_proxy}>) is used.
171    
172 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
173 root 1.2 HTTPS.
174 root 1.1
175 root 1.3 =item body => $string
176    
177     The request body, usually empty. Will be-sent as-is (future versions of
178     this module might offer more options).
179    
180 root 1.10 =item cookie_jar => $hash_ref
181    
182     Passing this parameter enables (simplified) cookie-processing, loosely
183     based on the original netscape specification.
184    
185     The C<$hash_ref> must be an (initially empty) hash reference which will
186     get updated automatically. It is possible to save the cookie_jar to
187     persistent storage with something like JSON or Storable, but this is not
188 root 1.40 recommended, as expiry times are currently being ignored.
189 root 1.10
190     Note that this cookie implementation is not of very high quality, nor
191     meant to be complete. If you want complete cookie management you have to
192     do that on your own. C<cookie_jar> is meant as a quick fix to get some
193     cookie-using sites working. Cookies are a privacy disaster, do not use
194     them unless required to.
195    
196 root 1.40 =item tls_ctx => $scheme | $tls_ctx
197    
198     Specifies the AnyEvent::TLS context to be used for https connections. This
199     parameter follows the same rules as the C<tls_ctx> parameter to
200     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
201     C<high> can be specified, which give you a predefined low-security (no
202     verification, highest compatibility) and high-security (CA and common-name
203     verification) TLS context.
204    
205     The default for this option is C<low>, which could be interpreted as "give
206     me the page, no matter what".
207    
208 root 1.51 =item on_prepare => $callback->($fh)
209    
210     In rare cases you need to "tune" the socket before it is used to
211     connect (for exmaple, to bind it on a given IP address). This parameter
212     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
213     and behaves exactly the same way (e.g. it has to provide a
214     timeout). See the description for the C<$prepare_cb> argument of
215     C<AnyEvent::Socket::tcp_connect> for details.
216    
217 root 1.42 =item on_header => $callback->($headers)
218 root 1.41
219     When specified, this callback will be called with the header hash as soon
220     as headers have been successfully received from the remote server (not on
221     locally-generated errors).
222    
223     It has to return either true (in which case AnyEvent::HTTP will continue),
224     or false, in which case AnyEvent::HTTP will cancel the download (and call
225     the finish callback with an error code of C<598>).
226    
227     This callback is useful, among other things, to quickly reject unwanted
228     content, which, if it is supposed to be rare, can be faster than first
229     doing a C<HEAD> request.
230    
231 root 1.42 Example: cancel the request unless the content-type is "text/html".
232 root 1.41
233 root 1.42 on_header => sub {
234     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
235     },
236 root 1.41
237 root 1.42 =item on_body => $callback->($partial_body, $headers)
238 root 1.41
239 root 1.42 When specified, all body data will be passed to this callback instead of
240     to the completion callback. The completion callback will get the empty
241     string instead of the body data.
242 root 1.41
243 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
244     or false, in which case AnyEvent::HTTP will cancel the download (and call
245     the completion callback with an error code of C<598>).
246    
247     This callback is useful when the data is too large to be held in memory
248     (so the callback writes it to a file) or when only some information should
249     be extracted, or when the body should be processed incrementally.
250 root 1.41
251     It is usually preferred over doing your own body handling via
252 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
253     only used to create a connection, C<want_body_handle> is the better
254     alternative, as it allows you to install your own event handler, reducing
255     resource usage.
256 root 1.41
257     =item want_body_handle => $enable
258    
259     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
260     changes considerably: after parsing the headers, and instead of
261     downloading the body (if any), the completion callback will be
262     called. Instead of the C<$body> argument containing the body data, the
263     callback will receive the L<AnyEvent::Handle> object associated with the
264     connection. In error cases, C<undef> will be passed. When there is no body
265     (e.g. status C<304>), the empty string will be passed.
266    
267     The handle object might or might not be in TLS mode, might be connected to
268     a proxy, be a persistent connection etc., and configured in unspecified
269     ways. The user is responsible for this handle (it will not be used by this
270     module anymore).
271    
272     This is useful with some push-type services, where, after the initial
273     headers, an interactive protocol is used (typical example would be the
274     push-style twitter API which starts a JSON/XML stream).
275    
276     If you think you need this, first have a look at C<on_body>, to see if
277 root 1.45 that doesn't solve your problem in a better way.
278 root 1.41
279 root 1.1 =back
280    
281 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
282    
283     http_request GET => "http://www.nethype.de/", sub {
284     my ($body, $hdr) = @_;
285     print "$body\n";
286     };
287    
288     Example: make a HTTP HEAD request on https://www.google.com/, use a
289     timeout of 30 seconds.
290    
291     http_request
292     GET => "https://www.google.com",
293     timeout => 30,
294     sub {
295     my ($body, $hdr) = @_;
296     use Data::Dumper;
297     print Dumper $hdr;
298     }
299     ;
300 root 1.1
301 root 1.29 Example: make another simple HTTP GET request, but immediately try to
302     cancel it.
303    
304     my $request = http_request GET => "http://www.nethype.de/", sub {
305     my ($body, $hdr) = @_;
306     print "$body\n";
307     };
308    
309     undef $request;
310    
311 root 1.1 =cut
312    
313 root 1.12 sub _slot_schedule;
314 root 1.11 sub _slot_schedule($) {
315     my $host = shift;
316    
317     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
318     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
319 root 1.12 # somebody wants that slot
320 root 1.11 ++$CO_SLOT{$host}[0];
321 root 1.14 ++$ACTIVE;
322 root 1.11
323     $cb->(AnyEvent::Util::guard {
324 root 1.14 --$ACTIVE;
325 root 1.11 --$CO_SLOT{$host}[0];
326     _slot_schedule $host;
327     });
328     } else {
329     # nobody wants the slot, maybe we can forget about it
330     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
331     last;
332     }
333     }
334     }
335    
336     # wait for a free slot on host, call callback
337     sub _get_slot($$) {
338     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
339    
340     _slot_schedule $_[0];
341     }
342    
343 root 1.46 our $qr_nl = qr{\015?\012};
344     our $qr_nlnl = qr{(?<![^\012])\015?\012};
345 root 1.34
346 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
347     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
348 root 1.40
349 elmex 1.15 sub http_request($$@) {
350 root 1.1 my $cb = pop;
351     my ($method, $url, %arg) = @_;
352    
353     my %hdr;
354    
355 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
356     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
357    
358 root 1.3 $method = uc $method;
359    
360 root 1.8 if (my $hdr = $arg{headers}) {
361 root 1.1 while (my ($k, $v) = each %$hdr) {
362     $hdr{lc $k} = $v;
363     }
364     }
365    
366 root 1.55 # pseudo headers for all subsequent responses
367     my @pseudo = (URL => $url);
368     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
369    
370 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
371 root 1.8
372 root 1.55 return $cb->(undef, { Status => 599, Reason => "Too many redirections", @pseudo })
373 root 1.8 if $recurse < 0;
374    
375 root 1.2 my $proxy = $arg{proxy} || $PROXY;
376 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
377    
378 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
379 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
380 root 1.2
381 root 1.31 $uscheme = lc $uscheme;
382 root 1.1
383 root 1.31 my $uport = $uscheme eq "http" ? 80
384     : $uscheme eq "https" ? 443
385 root 1.55 : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported", @pseudo });
386 root 1.13
387 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
388 root 1.55 or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", @pseudo });
389 root 1.10
390     my $uhost = $1;
391     $uport = $2 if defined $2;
392    
393 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
394     unless exists $hdr{host};
395 root 1.43
396 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
397 root 1.56 $upath .= $query if length $query;
398 root 1.10
399     $upath =~ s%^/?%/%;
400    
401     # cookie processing
402     if (my $jar = $arg{cookie_jar}) {
403 root 1.31 %$jar = () if $jar->{version} != 1;
404 root 1.10
405     my @cookie;
406    
407     while (my ($chost, $v) = each %$jar) {
408 root 1.30 if ($chost =~ /^\./) {
409     next unless $chost eq substr $uhost, -length $chost;
410     } elsif ($chost =~ /\./) {
411     next unless $chost eq $uhost;
412     } else {
413     next;
414     }
415 root 1.10
416     while (my ($cpath, $v) = each %$v) {
417     next unless $cpath eq substr $upath, 0, length $cpath;
418    
419     while (my ($k, $v) = each %$v) {
420 root 1.31 next if $uscheme ne "https" && exists $v->{secure};
421     my $value = $v->{value};
422     $value =~ s/([\\"])/\\$1/g;
423     push @cookie, "$k=\"$value\"";
424 root 1.10 }
425     }
426     }
427    
428     $hdr{cookie} = join "; ", @cookie
429     if @cookie;
430     }
431 root 1.1
432 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
433 root 1.2
434 root 1.10 if ($proxy) {
435 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
436 root 1.31
437 root 1.47 $rscheme = "http" unless defined $rscheme;
438    
439 root 1.31 # don't support https requests over https-proxy transport,
440 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
441 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
442 root 1.10 } else {
443 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
444 root 1.2 }
445    
446 root 1.47 # leave out fragment and query string, just a heuristic
447     $hdr{referer} ||= "$uscheme://$uauthority$upath" unless exists $hdr{referer};
448     $hdr{"user-agent"} ||= $USERAGENT unless exists $hdr{"user-agent"};
449 root 1.41
450 root 1.53 $hdr{"content-length"} = length $arg{body}
451     if length $arg{body} || $method ne "GET";
452 root 1.1
453 root 1.11 my %state = (connect_guard => 1);
454    
455     _get_slot $uhost, sub {
456     $state{slot_guard} = shift;
457 root 1.1
458 root 1.11 return unless $state{connect_guard};
459 root 1.1
460 root 1.11 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
461     $state{fh} = shift
462 root 1.44 or do {
463     my $err = "$!";
464     %state = ();
465 root 1.55 return $cb->(undef, { Status => 599, Reason => $err, @pseudo });
466 root 1.44 };
467    
468 root 1.34 pop; # free memory, save a tree
469 root 1.11
470 root 1.34 return unless delete $state{connect_guard};
471 root 1.11
472     # get handle
473     $state{handle} = new AnyEvent::Handle
474 root 1.40 fh => $state{fh},
475 root 1.56 peername => $rhost,
476     tls_ctx => $arg{tls_ctx},
477     # these need to be reconfigured on keepalive handles
478 root 1.40 timeout => $timeout,
479 root 1.56 on_error => sub {
480     %state = ();
481     $cb->(undef, { Status => 599, Reason => $_[2], @pseudo });
482     },
483     on_eof => sub {
484     %state = ();
485     $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", @pseudo });
486     },
487     ;
488 root 1.11
489     # limit the number of persistent connections
490 root 1.34 # keepalive not yet supported
491 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
492     # ++$KA_COUNT{$_[1]};
493     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
494     # --$KA_COUNT{$_[1]}
495     # };
496     # $hdr{connection} = "keep-alive";
497     # } else {
498 root 1.11 delete $hdr{connection};
499 root 1.56 # }
500 root 1.1
501 root 1.31 $state{handle}->starttls ("connect") if $rscheme eq "https";
502    
503     # handle actual, non-tunneled, request
504     my $handle_actual_request = sub {
505 root 1.34 $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
506 root 1.31
507     # send request
508     $state{handle}->push_write (
509     "$method $rpath HTTP/1.0\015\012"
510 root 1.47 . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
511 root 1.31 . "\015\012"
512     . (delete $arg{body})
513 root 1.11 );
514    
515 root 1.54 # return if error occured during push_write()
516     return unless %state;
517    
518 root 1.55 %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
519 root 1.10
520 root 1.31 # status line
521 root 1.34 $state{handle}->push_read (line => $qr_nl, sub {
522 root 1.31 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
523 root 1.55 or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response ($_[1])", @pseudo }));
524 root 1.31
525 root 1.55 push @pseudo,
526     HTTPVersion => $1,
527     Status => $2,
528     Reason => $3,
529     ;
530 root 1.31
531     # headers, could be optimized a bit
532 root 1.34 $state{handle}->unshift_read (line => $qr_nlnl, sub {
533 root 1.46 for ("$_[1]") {
534 root 1.31 y/\015//d; # weed out any \015, as they show up in the weirdest of places.
535    
536 root 1.40 # things seen, not parsed:
537     # p3pP="NON CUR OTPi OUR NOR UNI"
538    
539 root 1.31 $hdr{lc $1} .= ",$2"
540     while /\G
541 root 1.43 ([^:\000-\037]*):
542 root 1.31 [\011\040]*
543     ((?: [^\012]+ | \012[\011\040] )*)
544     \012
545     /gxc;
546    
547     /\G$/
548 root 1.55 or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", @pseudo }));
549 root 1.31 }
550    
551 root 1.55 # remove the "," prefix we added to all headers above
552 root 1.31 substr $_, 0, 1, ""
553     for values %hdr;
554    
555 root 1.55 # patch in all pseudo headers
556     %hdr = (%hdr, @pseudo);
557    
558 root 1.41 # redirect handling
559     # microsoft and other shitheads don't give a shit for following standards,
560     # try to support some common forms of broken Location headers.
561     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
562     $hdr{location} =~ s/^\.\/+//;
563    
564     my $url = "$rscheme://$uhost:$uport";
565    
566     unless ($hdr{location} =~ s/^\///) {
567     $url .= $upath;
568     $url =~ s/\/[^\/]*$//;
569     }
570    
571     $hdr{location} = "$url/$hdr{location}";
572     }
573    
574     my $redirect;
575    
576     if ($recurse) {
577 root 1.55 my $status = $hdr{Status};
578    
579     if (($status == 301 || $status == 302) && $method ne "POST") {
580 root 1.41 # apparently, mozilla et al. just change POST to GET here
581     # more research is needed before we do the same
582     $redirect = 1;
583 root 1.55 } elsif ($status == 303) {
584 root 1.41 # even http/1.1 is unclear on how to mutate the method
585     $method = "GET" unless $method eq "HEAD";
586     $redirect = 1;
587 root 1.55 } elsif ($status == 307 && $method =~ /^(?:GET|HEAD)$/) {
588 root 1.41 $redirect = 1;
589     }
590     }
591    
592 root 1.31 my $finish = sub {
593 root 1.41 $state{handle}->destroy if $state{handle};
594 root 1.31 %state = ();
595    
596     # set-cookie processing
597     if ($arg{cookie_jar}) {
598 root 1.41 for ($_[1]{"set-cookie"}) {
599 root 1.31 # parse NAME=VALUE
600     my @kv;
601    
602     while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
603     my $name = $1;
604     my $value = $3;
605    
606     unless ($value) {
607     $value = $2;
608     $value =~ s/\\(.)/$1/gs;
609     }
610    
611     push @kv, $name => $value;
612    
613     last unless /\G\s*;/gc;
614     }
615    
616     last unless @kv;
617 root 1.10
618 root 1.31 my $name = shift @kv;
619     my %kv = (value => shift @kv, @kv);
620 root 1.11
621 root 1.31 my $cdom;
622     my $cpath = (delete $kv{path}) || "/";
623 root 1.10
624 root 1.31 if (exists $kv{domain}) {
625     $cdom = delete $kv{domain};
626    
627     $cdom =~ s/^\.?/./; # make sure it starts with a "."
628 root 1.11
629 root 1.31 next if $cdom =~ /\.$/;
630    
631     # this is not rfc-like and not netscape-like. go figure.
632     my $ndots = $cdom =~ y/.//;
633     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
634     } else {
635     $cdom = $uhost;
636     }
637 root 1.30
638 root 1.31 # store it
639     $arg{cookie_jar}{version} = 1;
640     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
641    
642     redo if /\G\s*,/gc;
643 root 1.30 }
644 root 1.11 }
645 root 1.8
646 root 1.47 if ($redirect && exists $hdr{location}) {
647 root 1.41 # we ignore any errors, as it is very common to receive
648     # Content-Length != 0 but no actual body
649     # we also access %hdr, as $_[1] might be an erro
650 root 1.55 http_request (
651     $method => $hdr{location},
652     %arg,
653     recurse => $recurse - 1,
654     Redirect => \@_,
655     $cb);
656 root 1.31 } else {
657     $cb->($_[0], $_[1]);
658     }
659     };
660 root 1.24
661 root 1.41 my $len = $hdr{"content-length"};
662    
663     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
664 root 1.55 $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", @pseudo });
665 root 1.41 } elsif (
666 root 1.42 $hdr{Status} =~ /^(?:1..|[23]04)$/
667 root 1.41 or $method eq "HEAD"
668     or (defined $len && !$len)
669     ) {
670     # no body
671     $finish->("", \%hdr);
672 root 1.11 } else {
673 root 1.41 # body handling, four different code paths
674     # for want_body_handle, on_body (2x), normal (2x)
675     # we might read too much here, but it does not matter yet (no pers. connections)
676     if (!$redirect && $arg{want_body_handle}) {
677     $_[0]->on_eof (undef);
678     $_[0]->on_error (undef);
679     $_[0]->on_read (undef);
680    
681     $finish->(delete $state{handle}, \%hdr);
682 root 1.31
683 root 1.41 } elsif ($arg{on_body}) {
684 root 1.55 $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
685 root 1.41 if ($len) {
686     $_[0]->on_eof (undef);
687     $_[0]->on_read (sub {
688     $len -= length $_[0]{rbuf};
689    
690     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
691 root 1.55 or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
692 root 1.41
693     $len > 0
694     or $finish->("", \%hdr);
695     });
696     } else {
697     $_[0]->on_eof (sub {
698     $finish->("", \%hdr);
699     });
700     $_[0]->on_read (sub {
701     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
702 root 1.55 or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo });
703 root 1.41 });
704     }
705 root 1.31 } else {
706     $_[0]->on_eof (undef);
707 root 1.41
708     if ($len) {
709 root 1.55 $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) });
710 root 1.41 $_[0]->on_read (sub {
711     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), \%hdr)
712     if $len <= length $_[0]{rbuf};
713     });
714     } else {
715     $_[0]->on_error (sub {
716 root 1.43 $! == Errno::EPIPE || !$!
717 root 1.41 ? $finish->(delete $_[0]{rbuf}, \%hdr)
718 root 1.55 : $finish->(undef, { Status => 599, Reason => $_[2], @pseudo });
719 root 1.41 });
720     $_[0]->on_read (sub { });
721     }
722 root 1.31 }
723 root 1.11 }
724 root 1.31 });
725     });
726     };
727 root 1.3
728 root 1.31 # now handle proxy-CONNECT method
729     if ($proxy && $uscheme eq "https") {
730     # oh dear, we have to wrap it into a connect request
731    
732     # maybe re-use $uauthority with patched port?
733     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
734 root 1.34 $state{handle}->push_read (line => $qr_nlnl, sub {
735 root 1.31 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
736 root 1.55 or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", @pseudo }));
737 root 1.31
738     if ($2 == 200) {
739     $rpath = $upath;
740     &$handle_actual_request;
741 root 1.3 } else {
742 root 1.31 %state = ();
743 root 1.55 $cb->(undef, { Status => $2, Reason => $3, @pseudo });
744 root 1.3 }
745 root 1.11 });
746 root 1.31 } else {
747     &$handle_actual_request;
748     }
749    
750 root 1.50 }, $arg{on_prepare} || sub { $timeout };
751 root 1.1 };
752    
753     defined wantarray && AnyEvent::Util::guard { %state = () }
754     }
755    
756 elmex 1.15 sub http_get($@) {
757 root 1.1 unshift @_, "GET";
758     &http_request
759     }
760    
761 elmex 1.15 sub http_head($@) {
762 root 1.4 unshift @_, "HEAD";
763     &http_request
764     }
765    
766 elmex 1.15 sub http_post($$@) {
767 root 1.22 my $url = shift;
768     unshift @_, "POST", $url, "body";
769 root 1.3 &http_request
770     }
771    
772 root 1.9 =back
773    
774 root 1.55 =head2 DNS CACHING
775    
776     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
777     the actual connection, which in turn uses AnyEvent::DNS to resolve
778     hostnames. The latter is a simple stub resolver and does no caching
779     on its own. If you want DNS caching, you currently have to provide
780     your own default resolver (by storing a suitable resolver object in
781     C<$AnyEvent::DNS::RESOLVER>).
782    
783 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
784 root 1.1
785     =over 4
786    
787 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
788    
789     Sets the default proxy server to use. The proxy-url must begin with a
790 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
791     otherwise.
792    
793     To clear an already-set proxy, use C<undef>.
794 root 1.2
795 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
796 root 1.1
797 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
798 root 1.1
799     =item $AnyEvent::HTTP::USERAGENT
800    
801     The default value for the C<User-Agent> header (the default is
802 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
803 root 1.1
804 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
805 root 1.1
806 root 1.47 The maximum number of concurrent connections to the same host (identified
807 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
808     are queued until previous connections are closed.
809 root 1.1
810 root 1.43 The default value for this is C<4>, and it is highly advisable to not
811     increase it.
812 root 1.3
813 root 1.14 =item $AnyEvent::HTTP::ACTIVE
814    
815     The number of active connections. This is not the number of currently
816     running requests, but the number of currently open and non-idle TCP
817     connections. This number of can be useful for load-leveling.
818    
819 root 1.1 =back
820    
821     =cut
822    
823 root 1.2 sub set_proxy($) {
824 root 1.52 if (length $_[0]) {
825     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
826     or Carp::croak "$_[0]: invalid proxy URL";
827     $PROXY = [$2, $3 || 3128, $1]
828     } else {
829     undef $PROXY;
830     }
831 root 1.2 }
832    
833     # initialise proxy from environment
834 root 1.52 eval {
835     set_proxy $ENV{http_proxy};
836     };
837 root 1.2
838 root 1.1 =head1 SEE ALSO
839    
840     L<AnyEvent>.
841    
842     =head1 AUTHOR
843    
844 root 1.18 Marc Lehmann <schmorp@schmorp.de>
845     http://home.schmorp.de/
846 root 1.1
847 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
848     testcases and bugreports.
849    
850 root 1.1 =cut
851    
852     1
853