ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.79
Committed: Sat Jan 1 20:01:07 2011 UTC (13 years, 4 months ago) by root
Branch: MAIN
Changes since 1.78: +9 -7 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.65 our $VERSION = '1.5';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
100     (and trailers) as second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
106     during the body phase of a request, then the original C<Status> and
107     C<Reason> values from the header are available as C<OrigStatus> and
108     C<OrigReason>.
109 root 1.55
110     The pseudo-header C<URL> contains the actual URL (which can differ from
111     the requested URL when following redirects - for example, you might get
112     an error that your URL scheme is not supported even though your URL is a
113     valid http URL because it redirected to an ftp URL, in which case you can
114     look at the URL pseudo header).
115    
116     The pseudo-header C<Redirect> only exists when the request was a result
117     of an internal redirect. In that case it is an array reference with
118     the C<($data, $headers)> from the redirect response. Note that this
119     response could in turn be the result of a redirect itself, and C<<
120     $headers->{Redirect}[1]{Redirect} >> will then contain the original
121     response, and so on.
122 root 1.20
123 root 1.32 If the server sends a header multiple times, then their contents will be
124     joined together with a comma (C<,>), as per the HTTP spec.
125 root 1.2
126     If an internal error occurs, such as not being able to resolve a hostname,
127 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
128     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
129     message. Currently the following status codes are used:
130    
131     =over 4
132    
133     =item 595 - errors during connection etsbalishment, proxy handshake.
134    
135     =item 596 - errors during TLS negotiation, request sending and header processing.
136    
137 root 1.78 =item 597 - errors during body receiving or processing.
138 root 1.77
139 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
140 root 1.77
141     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
142    
143     =back
144 root 1.2
145 root 1.6 A typical callback might look like this:
146    
147     sub {
148     my ($body, $hdr) = @_;
149    
150     if ($hdr->{Status} =~ /^2/) {
151     ... everything should be ok
152     } else {
153     print "error, $hdr->{Status} $hdr->{Reason}\n";
154     }
155     }
156    
157 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
158     include:
159    
160     =over 4
161    
162 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
163 root 1.1
164     Whether to recurse requests or not, e.g. on redirects, authentication
165 root 1.3 retries and so on, and how often to do so.
166 root 1.1
167     =item headers => hashref
168    
169 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
170     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
171 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
172     (this can be suppressed by using C<undef> for these headers in which case
173     they won't be sent at all).
174 root 1.1
175     =item timeout => $seconds
176    
177     The time-out to use for various stages - each connect attempt will reset
178 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
179     timeout.
180    
181     Default timeout is 5 minutes.
182 root 1.2
183     =item proxy => [$host, $port[, $scheme]] or undef
184    
185     Use the given http proxy for all requests. If not specified, then the
186     default proxy (as specified by C<$ENV{http_proxy}>) is used.
187    
188 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
189 root 1.2 HTTPS.
190 root 1.1
191 root 1.3 =item body => $string
192    
193 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
194 root 1.3 this module might offer more options).
195    
196 root 1.10 =item cookie_jar => $hash_ref
197    
198     Passing this parameter enables (simplified) cookie-processing, loosely
199     based on the original netscape specification.
200    
201     The C<$hash_ref> must be an (initially empty) hash reference which will
202 root 1.70 get updated automatically. It is possible to save the cookie jar to
203 root 1.10 persistent storage with something like JSON or Storable, but this is not
204 root 1.70 recommended, as session-only cookies might survive longer than expected.
205 root 1.10
206 root 1.70 Note that this cookie implementation is not meant to be complete. If
207     you want complete cookie management you have to do that on your
208     own. C<cookie_jar> is meant as a quick fix to get some cookie-using sites
209     working. Cookies are a privacy disaster, do not use them unless required
210     to.
211 root 1.10
212 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
213 root 1.70 headers will be set and handled by this module, otherwise they will be
214 root 1.69 left untouched.
215    
216 root 1.40 =item tls_ctx => $scheme | $tls_ctx
217    
218     Specifies the AnyEvent::TLS context to be used for https connections. This
219     parameter follows the same rules as the C<tls_ctx> parameter to
220     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
221     C<high> can be specified, which give you a predefined low-security (no
222     verification, highest compatibility) and high-security (CA and common-name
223     verification) TLS context.
224    
225     The default for this option is C<low>, which could be interpreted as "give
226     me the page, no matter what".
227    
228 root 1.51 =item on_prepare => $callback->($fh)
229    
230     In rare cases you need to "tune" the socket before it is used to
231     connect (for exmaple, to bind it on a given IP address). This parameter
232     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
233     and behaves exactly the same way (e.g. it has to provide a
234     timeout). See the description for the C<$prepare_cb> argument of
235     C<AnyEvent::Socket::tcp_connect> for details.
236    
237 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
238    
239     In even rarer cases you want total control over how AnyEvent::HTTP
240     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
241     to do this, but you can provide your own C<tcp_connect> function -
242 root 1.60 obviously, it has to follow the same calling conventions, except that it
243     may always return a connection guard object.
244 root 1.59
245     There are probably lots of weird uses for this function, starting from
246     tracing the hosts C<http_request> actually tries to connect, to (inexact
247     but fast) host => IP address caching or even socks protocol support.
248    
249 root 1.42 =item on_header => $callback->($headers)
250 root 1.41
251     When specified, this callback will be called with the header hash as soon
252     as headers have been successfully received from the remote server (not on
253     locally-generated errors).
254    
255     It has to return either true (in which case AnyEvent::HTTP will continue),
256     or false, in which case AnyEvent::HTTP will cancel the download (and call
257     the finish callback with an error code of C<598>).
258    
259     This callback is useful, among other things, to quickly reject unwanted
260     content, which, if it is supposed to be rare, can be faster than first
261     doing a C<HEAD> request.
262    
263 root 1.68 The downside is that cancelling the request makes it impossible to re-use
264     the connection. Also, the C<on_header> callback will not receive any
265     trailer (headers sent after the response body).
266    
267 root 1.42 Example: cancel the request unless the content-type is "text/html".
268 root 1.41
269 root 1.42 on_header => sub {
270     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
271     },
272 root 1.41
273 root 1.42 =item on_body => $callback->($partial_body, $headers)
274 root 1.41
275 root 1.42 When specified, all body data will be passed to this callback instead of
276     to the completion callback. The completion callback will get the empty
277     string instead of the body data.
278 root 1.41
279 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
280     or false, in which case AnyEvent::HTTP will cancel the download (and call
281     the completion callback with an error code of C<598>).
282    
283 root 1.68 The downside to cancelling the request is that it makes it impossible to
284     re-use the connection.
285    
286 root 1.42 This callback is useful when the data is too large to be held in memory
287     (so the callback writes it to a file) or when only some information should
288     be extracted, or when the body should be processed incrementally.
289 root 1.41
290     It is usually preferred over doing your own body handling via
291 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
292     only used to create a connection, C<want_body_handle> is the better
293     alternative, as it allows you to install your own event handler, reducing
294     resource usage.
295 root 1.41
296     =item want_body_handle => $enable
297    
298     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
299     changes considerably: after parsing the headers, and instead of
300     downloading the body (if any), the completion callback will be
301     called. Instead of the C<$body> argument containing the body data, the
302     callback will receive the L<AnyEvent::Handle> object associated with the
303     connection. In error cases, C<undef> will be passed. When there is no body
304     (e.g. status C<304>), the empty string will be passed.
305    
306     The handle object might or might not be in TLS mode, might be connected to
307     a proxy, be a persistent connection etc., and configured in unspecified
308     ways. The user is responsible for this handle (it will not be used by this
309     module anymore).
310    
311     This is useful with some push-type services, where, after the initial
312     headers, an interactive protocol is used (typical example would be the
313     push-style twitter API which starts a JSON/XML stream).
314    
315     If you think you need this, first have a look at C<on_body>, to see if
316 root 1.45 that doesn't solve your problem in a better way.
317 root 1.41
318 root 1.1 =back
319    
320 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
321     the response body.
322 root 1.9
323     http_request GET => "http://www.nethype.de/", sub {
324     my ($body, $hdr) = @_;
325     print "$body\n";
326     };
327    
328 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
329 root 1.9 timeout of 30 seconds.
330    
331     http_request
332     GET => "https://www.google.com",
333     timeout => 30,
334     sub {
335     my ($body, $hdr) = @_;
336     use Data::Dumper;
337     print Dumper $hdr;
338     }
339     ;
340 root 1.1
341 root 1.68 Example: do another simple HTTP GET request, but immediately try to
342 root 1.29 cancel it.
343    
344     my $request = http_request GET => "http://www.nethype.de/", sub {
345     my ($body, $hdr) = @_;
346     print "$body\n";
347     };
348    
349     undef $request;
350    
351 root 1.1 =cut
352    
353 root 1.12 sub _slot_schedule;
354 root 1.11 sub _slot_schedule($) {
355     my $host = shift;
356    
357     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
358     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
359 root 1.12 # somebody wants that slot
360 root 1.11 ++$CO_SLOT{$host}[0];
361 root 1.14 ++$ACTIVE;
362 root 1.11
363     $cb->(AnyEvent::Util::guard {
364 root 1.14 --$ACTIVE;
365 root 1.11 --$CO_SLOT{$host}[0];
366     _slot_schedule $host;
367     });
368     } else {
369     # nobody wants the slot, maybe we can forget about it
370     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
371     last;
372     }
373     }
374     }
375    
376     # wait for a free slot on host, call callback
377     sub _get_slot($$) {
378     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
379    
380     _slot_schedule $_[0];
381     }
382    
383 root 1.72 # extract cookies from jar
384 root 1.71 sub cookie_jar_extract($$$$) {
385     my ($jar, $uscheme, $uhost, $upath) = @_;
386    
387     %$jar = () if $jar->{version} != 1;
388    
389     my @cookies;
390    
391     while (my ($chost, $paths) = each %$jar) {
392     next unless ref $paths;
393    
394     if ($chost =~ /^\./) {
395     next unless $chost eq substr $uhost, -length $chost;
396     } elsif ($chost =~ /\./) {
397     next unless $chost eq $uhost;
398     } else {
399     next;
400     }
401    
402     while (my ($cpath, $cookies) = each %$paths) {
403     next unless $cpath eq substr $upath, 0, length $cpath;
404    
405     while (my ($cookie, $kv) = each %$cookies) {
406     next if $uscheme ne "https" && exists $kv->{secure};
407    
408     if (exists $kv->{expires}) {
409     if (AE::now > parse_date ($kv->{expires})) {
410     delete $cookies->{$cookie};
411     next;
412     }
413     }
414    
415     my $value = $kv->{value};
416    
417     if ($value =~ /[=;,[:space:]]/) {
418     $value =~ s/([\\"])/\\$1/g;
419     $value = "\"$value\"";
420     }
421    
422     push @cookies, "$cookie=$value";
423     }
424     }
425     }
426    
427     \@cookies
428     }
429    
430 root 1.72 # parse set_cookie header into jar
431 root 1.73 sub cookie_jar_set_cookie($$$) {
432     my ($jar, $set_cookie, $uhost) = @_;
433 root 1.72
434     for ($set_cookie) {
435     # parse NAME=VALUE
436     my @kv;
437    
438 root 1.79 # expires is not http-compliant in the original cookie-spec,
439     # we support the official date format and some extensions
440 root 1.72 while (
441     m{
442     \G\s*
443     (?:
444 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
445 root 1.72 | ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )
446     )
447     }gcxsi
448     ) {
449     my $name = $2;
450     my $value = $4;
451    
452     unless (defined $name) {
453     # expires
454     $name = "expires";
455     $value = $1;
456     } elsif (!defined $value) {
457     # quoted
458     $value = $3;
459     $value =~ s/\\(.)/$1/gs;
460     }
461    
462     push @kv, lc $name, $value;
463    
464     last unless /\G\s*;/gc;
465     }
466    
467     last unless @kv;
468    
469     my $name = shift @kv;
470     my %kv = (value => shift @kv, @kv);
471    
472     $kv{expires} ||= format_date (AE::now + $kv{"max-age"})
473     if exists $kv{"max-age"};
474    
475     my $cdom;
476     my $cpath = (delete $kv{path}) || "/";
477    
478     if (exists $kv{domain}) {
479     $cdom = delete $kv{domain};
480    
481     $cdom =~ s/^\.?/./; # make sure it starts with a "."
482    
483     next if $cdom =~ /\.$/;
484    
485     # this is not rfc-like and not netscape-like. go figure.
486     my $ndots = $cdom =~ y/.//;
487     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
488     } else {
489     $cdom = $uhost;
490     }
491    
492     # store it
493 root 1.73 $jar->{version} = 1;
494     $jar->{$cdom}{$cpath}{$name} = \%kv;
495 root 1.72
496     redo if /\G\s*,/gc;
497     }
498     }
499    
500 root 1.66 # continue to parse $_ for headers and place them into the arg
501     sub parse_hdr() {
502     my %hdr;
503    
504     # things seen, not parsed:
505     # p3pP="NON CUR OTPi OUR NOR UNI"
506    
507     $hdr{lc $1} .= ",$2"
508     while /\G
509     ([^:\000-\037]*):
510     [\011\040]*
511     ((?: [^\012]+ | \012[\011\040] )*)
512     \012
513     /gxc;
514    
515     /\G$/
516     or return;
517    
518     # remove the "," prefix we added to all headers above
519     substr $_, 0, 1, ""
520     for values %hdr;
521    
522     \%hdr
523     }
524    
525 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
526 root 1.34
527 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
528     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
529 root 1.40
530 elmex 1.15 sub http_request($$@) {
531 root 1.1 my $cb = pop;
532     my ($method, $url, %arg) = @_;
533    
534     my %hdr;
535    
536 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
537     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
538    
539 root 1.3 $method = uc $method;
540    
541 root 1.8 if (my $hdr = $arg{headers}) {
542 root 1.1 while (my ($k, $v) = each %$hdr) {
543     $hdr{lc $k} = $v;
544     }
545     }
546    
547 root 1.55 # pseudo headers for all subsequent responses
548     my @pseudo = (URL => $url);
549     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
550    
551 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
552 root 1.8
553 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
554 root 1.8 if $recurse < 0;
555    
556 root 1.2 my $proxy = $arg{proxy} || $PROXY;
557 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
558    
559 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
560 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
561 root 1.2
562 root 1.31 $uscheme = lc $uscheme;
563 root 1.1
564 root 1.31 my $uport = $uscheme eq "http" ? 80
565     : $uscheme eq "https" ? 443
566 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
567 root 1.13
568 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
569 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
570 root 1.10
571     my $uhost = $1;
572     $uport = $2 if defined $2;
573    
574 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
575     unless exists $hdr{host};
576 root 1.43
577 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
578 root 1.56 $upath .= $query if length $query;
579 root 1.10
580     $upath =~ s%^/?%/%;
581    
582     # cookie processing
583     if (my $jar = $arg{cookie_jar}) {
584 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
585 root 1.70
586 root 1.71 $hdr{cookie} = join "; ", @$cookies
587     if @$cookies;
588 root 1.10 }
589 root 1.1
590 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
591 root 1.2
592 root 1.10 if ($proxy) {
593 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
594 root 1.31
595 root 1.47 $rscheme = "http" unless defined $rscheme;
596    
597 root 1.31 # don't support https requests over https-proxy transport,
598 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
599 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
600 root 1.10 } else {
601 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
602 root 1.2 }
603    
604 root 1.47 # leave out fragment and query string, just a heuristic
605 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
606     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
607 root 1.41
608 root 1.53 $hdr{"content-length"} = length $arg{body}
609     if length $arg{body} || $method ne "GET";
610 root 1.1
611 root 1.68 $hdr{connection} = "close TE"; #1.1
612     $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
613 root 1.66
614 root 1.11 my %state = (connect_guard => 1);
615    
616     _get_slot $uhost, sub {
617     $state{slot_guard} = shift;
618 root 1.1
619 root 1.11 return unless $state{connect_guard};
620 root 1.1
621 root 1.77 my $ae_error = 595; # connecting
622    
623 root 1.64 my $connect_cb = sub {
624     $state{fh} = shift
625     or do {
626     my $err = "$!";
627     %state = ();
628 root 1.77 return $cb->(undef, { @pseudo, Status => $ae_error, Reason => $err });
629 root 1.64 };
630 root 1.44
631 root 1.64 return unless delete $state{connect_guard};
632 root 1.11
633 root 1.64 # get handle
634     $state{handle} = new AnyEvent::Handle
635     fh => $state{fh},
636     peername => $rhost,
637     tls_ctx => $arg{tls_ctx},
638     # these need to be reconfigured on keepalive handles
639     timeout => $timeout,
640     on_error => sub {
641     %state = ();
642 root 1.77 $cb->(undef, { @pseudo, Status => $ae_error, Reason => $_[2] });
643 root 1.64 },
644     on_eof => sub {
645     %state = ();
646 root 1.77 $cb->(undef, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" });
647 root 1.64 },
648     ;
649 root 1.11
650 root 1.64 # limit the number of persistent connections
651     # keepalive not yet supported
652 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
653     # ++$KA_COUNT{$_[1]};
654     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
655     # --$KA_COUNT{$_[1]}
656     # };
657     # $hdr{connection} = "keep-alive";
658     # }
659 root 1.1
660 root 1.64 $state{handle}->starttls ("connect") if $rscheme eq "https";
661    
662     # handle actual, non-tunneled, request
663     my $handle_actual_request = sub {
664 root 1.77 $ae_error = 596; # request phase
665    
666 root 1.64 $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
667    
668     # send request
669     $state{handle}->push_write (
670 root 1.66 "$method $rpath HTTP/1.1\015\012"
671 root 1.64 . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
672     . "\015\012"
673     . (delete $arg{body})
674     );
675    
676     # return if error occured during push_write()
677     return unless %state;
678    
679     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
680    
681     # status line and headers
682 root 1.68 $state{read_response} = sub {
683 root 1.64 for ("$_[1]") {
684     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
685    
686 root 1.76 /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
687 root 1.64 or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid server response" }));
688    
689 root 1.68 # 100 Continue handling
690     # should not happen as we don't send expect: 100-continue,
691     # but we handle it just in case.
692     # since we send the request body regardless, if we get an error
693     # we are out of-sync, which we currently do NOT handle correctly.
694     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
695     if $2 eq 100;
696    
697 root 1.64 push @pseudo,
698     HTTPVersion => $1,
699     Status => $2,
700     Reason => $3,
701     ;
702    
703 root 1.66 my $hdr = parse_hdr
704     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Garbled response headers" }));
705 root 1.64
706 root 1.66 %hdr = (%$hdr, @pseudo);
707 root 1.64 }
708    
709     # redirect handling
710     # microsoft and other shitheads don't give a shit for following standards,
711     # try to support some common forms of broken Location headers.
712     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
713     $hdr{location} =~ s/^\.\/+//;
714    
715     my $url = "$rscheme://$uhost:$uport";
716    
717     unless ($hdr{location} =~ s/^\///) {
718     $url .= $upath;
719     $url =~ s/\/[^\/]*$//;
720     }
721 root 1.59
722 root 1.64 $hdr{location} = "$url/$hdr{location}";
723     }
724 root 1.31
725 root 1.64 my $redirect;
726 root 1.41
727 root 1.64 if ($recurse) {
728     my $status = $hdr{Status};
729 root 1.59
730 root 1.64 # industry standard is to redirect POST as GET for
731 root 1.76 # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
732 root 1.64 # also, the UA should ask the user for 301 and 307 and POST,
733     # industry standard seems to be to simply follow.
734     # we go with the industry standard.
735     if ($status == 301 or $status == 302 or $status == 303) {
736     # HTTP/1.1 is unclear on how to mutate the method
737     $method = "GET" unless $method eq "HEAD";
738     $redirect = 1;
739     } elsif ($status == 307) {
740     $redirect = 1;
741 root 1.59 }
742 root 1.64 }
743 root 1.57
744 root 1.64 my $finish = sub { # ($data, $err_status, $err_reason[, $keepalive])
745 root 1.74 my $may_keep_alive = $_[3];
746 root 1.68
747 root 1.64 $state{handle}->destroy if $state{handle};
748     %state = ();
749 root 1.55
750 root 1.64 if (defined $_[1]) {
751     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
752     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
753 root 1.41 }
754    
755 root 1.64 # set-cookie processing
756     if ($arg{cookie_jar}) {
757 root 1.73 cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost;
758     }
759 root 1.31
760 root 1.64 if ($redirect && exists $hdr{location}) {
761     # we ignore any errors, as it is very common to receive
762     # Content-Length != 0 but no actual body
763     # we also access %hdr, as $_[1] might be an erro
764     http_request (
765     $method => $hdr{location},
766     %arg,
767     recurse => $recurse - 1,
768     Redirect => [$_[0], \%hdr],
769     $cb);
770     } else {
771     $cb->($_[0], \%hdr);
772     }
773     };
774    
775 root 1.77 $ae_error = 597; # body phase
776    
777 root 1.64 my $len = $hdr{"content-length"};
778    
779     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
780     $finish->(undef, 598 => "Request cancelled by on_header");
781     } elsif (
782     $hdr{Status} =~ /^(?:1..|204|205|304)$/
783     or $method eq "HEAD"
784     or (defined $len && !$len)
785     ) {
786     # no body
787     $finish->("", undef, undef, 1);
788     } else {
789 root 1.66 # body handling, many different code paths
790     # - no body expected
791     # - want_body_handle
792     # - te chunked
793     # - 2x length known (with or without on_body)
794     # - 2x length not known (with or without on_body)
795 root 1.64 if (!$redirect && $arg{want_body_handle}) {
796     $_[0]->on_eof (undef);
797     $_[0]->on_error (undef);
798     $_[0]->on_read (undef);
799    
800     $finish->(delete $state{handle});
801    
802 root 1.68 } elsif ($hdr{"transfer-encoding"} =~ /\bchunked\b/i) {
803     my $cl = 0;
804 root 1.66 my $body = undef;
805     my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
806    
807     my $read_chunk; $read_chunk = sub {
808     $_[1] =~ /^([0-9a-fA-F]+)/
809 root 1.77 or $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
810 root 1.66
811     my $len = hex $1;
812    
813     if ($len) {
814 root 1.68 $cl += $len;
815    
816     $_[0]->push_read (chunk => $len, sub {
817 root 1.66 $on_body->($_[1], \%hdr)
818     or return $finish->(undef, 598 => "Request cancelled by on_body");
819    
820     $_[0]->push_read (line => sub {
821     length $_[1]
822 root 1.77 and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
823 root 1.66 $_[0]->push_read (line => $read_chunk);
824     });
825     });
826     } else {
827 root 1.68 $hdr{"content-length"} ||= $cl;
828    
829 root 1.66 $_[0]->push_read (line => $qr_nlnl, sub {
830     if (length $_[1]) {
831     for ("$_[1]") {
832     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
833    
834     my $hdr = parse_hdr
835 root 1.77 or return $finish->(undef, $ae_error => "Garbled response trailers");
836 root 1.66
837     %hdr = (%hdr, %$hdr);
838     }
839     }
840    
841     $finish->($body, undef, undef, 1);
842     });
843     }
844     };
845    
846     $_[0]->push_read (line => $read_chunk);
847    
848 root 1.64 } elsif ($arg{on_body}) {
849     if ($len) {
850     $_[0]->on_read (sub {
851     $len -= length $_[0]{rbuf};
852    
853     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
854 root 1.66 or return $finish->(undef, 598 => "Request cancelled by on_body");
855 root 1.64
856     $len > 0
857     or $finish->("", undef, undef, 1);
858     });
859 root 1.59 } else {
860 root 1.64 $_[0]->on_eof (sub {
861     $finish->("");
862     });
863     $_[0]->on_read (sub {
864     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
865     or $finish->(undef, 598 => "Request cancelled by on_body");
866     });
867 root 1.11 }
868 root 1.64 } else {
869     $_[0]->on_eof (undef);
870 root 1.59
871 root 1.64 if ($len) {
872     $_[0]->on_read (sub {
873     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
874     if $len <= length $_[0]{rbuf};
875     });
876 root 1.59 } else {
877 root 1.64 $_[0]->on_error (sub {
878     ($! == Errno::EPIPE || !$!)
879     ? $finish->(delete $_[0]{rbuf})
880 root 1.77 : $finish->(undef, $ae_error => $_[2]);
881 root 1.64 });
882     $_[0]->on_read (sub { });
883 root 1.59 }
884     }
885 root 1.64 }
886 root 1.68 };
887    
888     $state{handle}->push_read (line => $qr_nlnl, $state{read_response});
889 root 1.64 };
890    
891     # now handle proxy-CONNECT method
892     if ($proxy && $uscheme eq "https") {
893     # oh dear, we have to wrap it into a connect request
894    
895     # maybe re-use $uauthority with patched port?
896     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
897     $state{handle}->push_read (line => $qr_nlnl, sub {
898     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
899     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" }));
900    
901     if ($2 == 200) {
902     $rpath = $upath;
903     &$handle_actual_request;
904     } else {
905     %state = ();
906     $cb->(undef, { @pseudo, Status => $2, Reason => $3 });
907     }
908     });
909     } else {
910     &$handle_actual_request;
911     }
912     };
913    
914     my $tcp_connect = $arg{tcp_connect}
915     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
916 root 1.57
917 root 1.64 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
918 root 1.31
919 root 1.1 };
920    
921     defined wantarray && AnyEvent::Util::guard { %state = () }
922     }
923    
924 elmex 1.15 sub http_get($@) {
925 root 1.1 unshift @_, "GET";
926     &http_request
927     }
928    
929 elmex 1.15 sub http_head($@) {
930 root 1.4 unshift @_, "HEAD";
931     &http_request
932     }
933    
934 elmex 1.15 sub http_post($$@) {
935 root 1.22 my $url = shift;
936     unshift @_, "POST", $url, "body";
937 root 1.3 &http_request
938     }
939    
940 root 1.9 =back
941    
942 root 1.55 =head2 DNS CACHING
943    
944     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
945     the actual connection, which in turn uses AnyEvent::DNS to resolve
946     hostnames. The latter is a simple stub resolver and does no caching
947     on its own. If you want DNS caching, you currently have to provide
948     your own default resolver (by storing a suitable resolver object in
949     C<$AnyEvent::DNS::RESOLVER>).
950    
951 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
952 root 1.1
953     =over 4
954    
955 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
956    
957     Sets the default proxy server to use. The proxy-url must begin with a
958 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
959     otherwise.
960    
961     To clear an already-set proxy, use C<undef>.
962 root 1.2
963 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
964    
965     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
966     Date (RFC 2616).
967    
968     =item $timestamp = AnyEvent::HTTP::parse_date $date
969    
970 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
971     bunch of minor variations of those, and returns the corresponding POSIX
972     timestamp, or C<undef> if the date cannot be parsed.
973 root 1.61
974 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
975 root 1.1
976 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
977 root 1.1
978     =item $AnyEvent::HTTP::USERAGENT
979    
980     The default value for the C<User-Agent> header (the default is
981 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
982 root 1.1
983 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
984 root 1.1
985 root 1.47 The maximum number of concurrent connections to the same host (identified
986 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
987     are queued until previous connections are closed.
988 root 1.1
989 root 1.43 The default value for this is C<4>, and it is highly advisable to not
990     increase it.
991 root 1.3
992 root 1.14 =item $AnyEvent::HTTP::ACTIVE
993    
994     The number of active connections. This is not the number of currently
995     running requests, but the number of currently open and non-idle TCP
996     connections. This number of can be useful for load-leveling.
997    
998 root 1.1 =back
999    
1000     =cut
1001    
1002 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1003     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1004    
1005     sub format_date($) {
1006     my ($time) = @_;
1007    
1008     # RFC 822/1123 format
1009     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1010    
1011     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1012     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1013     $H, $M, $S;
1014     }
1015    
1016     sub parse_date($) {
1017     my ($date) = @_;
1018    
1019     my ($d, $m, $y, $H, $M, $S);
1020    
1021 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1022 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1023     # cookie dates (with "-")
1024    
1025 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1026    
1027 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1028 root 1.61 # RFC 850
1029     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1030    
1031 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1032 root 1.61 # ISO C's asctime
1033     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1034     }
1035     # other formats fail in the loop below
1036    
1037     for (0..11) {
1038     if ($m eq $month[$_]) {
1039     require Time::Local;
1040     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
1041     }
1042     }
1043    
1044     undef
1045     }
1046    
1047 root 1.2 sub set_proxy($) {
1048 root 1.52 if (length $_[0]) {
1049     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
1050     or Carp::croak "$_[0]: invalid proxy URL";
1051     $PROXY = [$2, $3 || 3128, $1]
1052     } else {
1053     undef $PROXY;
1054     }
1055 root 1.2 }
1056    
1057     # initialise proxy from environment
1058 root 1.52 eval {
1059     set_proxy $ENV{http_proxy};
1060     };
1061 root 1.2
1062 root 1.60 =head2 SOCKS PROXIES
1063    
1064     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1065     compile your perl to support socks, or use an external program such as
1066     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1067     transparently.
1068    
1069     Alternatively, for AnyEvent::HTTP only, you can use your own
1070     C<tcp_connect> function that does the proxy handshake - here is an example
1071     that works with socks4a proxies:
1072    
1073     use Errno;
1074     use AnyEvent::Util;
1075     use AnyEvent::Socket;
1076     use AnyEvent::Handle;
1077    
1078     # host, port and username of/for your socks4a proxy
1079     my $socks_host = "10.0.0.23";
1080     my $socks_port = 9050;
1081     my $socks_user = "";
1082    
1083     sub socks4a_connect {
1084     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1085    
1086     my $hdl = new AnyEvent::Handle
1087     connect => [$socks_host, $socks_port],
1088     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1089     on_error => sub { $connect_cb->() },
1090     ;
1091    
1092     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1093    
1094     $hdl->push_read (chunk => 8, sub {
1095     my ($hdl, $chunk) = @_;
1096     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1097    
1098     if ($status == 0x5a) {
1099     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1100     } else {
1101     $! = Errno::ENXIO; $connect_cb->();
1102     }
1103     });
1104    
1105     $hdl
1106     }
1107    
1108     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1109     possibly after switching off other proxy types:
1110    
1111     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1112    
1113     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1114     my ($data, $headers) = @_;
1115     ...
1116     };
1117    
1118 root 1.1 =head1 SEE ALSO
1119    
1120     L<AnyEvent>.
1121    
1122     =head1 AUTHOR
1123    
1124 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1125     http://home.schmorp.de/
1126 root 1.1
1127 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1128     testcases and bugreports.
1129    
1130 root 1.1 =cut
1131    
1132     1
1133