ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.78
Committed: Sat Jan 1 19:32:41 2011 UTC (13 years, 4 months ago) by root
Branch: MAIN
Changes since 1.77: +2 -2 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44 root 1.41 use Errno ();
45 root 1.1
46 root 1.51 use AnyEvent 5.0 ();
47 root 1.1 use AnyEvent::Util ();
48     use AnyEvent::Handle ();
49    
50     use base Exporter::;
51    
52 root 1.65 our $VERSION = '1.5';
53 root 1.1
54 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
55 root 1.1
56 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
57 root 1.3 our $MAX_RECURSE = 10;
58 root 1.2 our $MAX_PERSISTENT = 8;
59     our $PERSISTENT_TIMEOUT = 2;
60     our $TIMEOUT = 300;
61 root 1.1
62     # changing these is evil
63 root 1.43 our $MAX_PERSISTENT_PER_HOST = 0;
64 root 1.11 our $MAX_PER_HOST = 4;
65 root 1.1
66 root 1.2 our $PROXY;
67 root 1.14 our $ACTIVE = 0;
68 root 1.2
69 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
70 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
71 root 1.1
72     =item http_get $url, key => value..., $cb->($data, $headers)
73    
74     Executes an HTTP-GET request. See the http_request function for details on
75 root 1.29 additional parameters and the return value.
76 root 1.1
77 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
78    
79 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
80     on additional parameters and the return value.
81 root 1.5
82     =item http_post $url, $body, key => value..., $cb->($data, $headers)
83 root 1.3
84 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
85 root 1.29 http_request function for details on additional parameters and the return
86     value.
87 root 1.3
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.29 When called in void context, nothing is returned. In other contexts,
94     C<http_request> returns a "cancellation guard" - you have to keep the
95     object at least alive until the callback get called. If the object gets
96 root 1.58 destroyed before the callback is called, the request will be cancelled.
97 root 1.29
98 root 1.42 The callback will be called with the response body data as first argument
99 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
100     (and trailers) as second argument.
101 root 1.2
102 root 1.7 All the headers in that hash are lowercased. In addition to the response
103 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
104     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
105 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
106     during the body phase of a request, then the original C<Status> and
107     C<Reason> values from the header are available as C<OrigStatus> and
108     C<OrigReason>.
109 root 1.55
110     The pseudo-header C<URL> contains the actual URL (which can differ from
111     the requested URL when following redirects - for example, you might get
112     an error that your URL scheme is not supported even though your URL is a
113     valid http URL because it redirected to an ftp URL, in which case you can
114     look at the URL pseudo header).
115    
116     The pseudo-header C<Redirect> only exists when the request was a result
117     of an internal redirect. In that case it is an array reference with
118     the C<($data, $headers)> from the redirect response. Note that this
119     response could in turn be the result of a redirect itself, and C<<
120     $headers->{Redirect}[1]{Redirect} >> will then contain the original
121     response, and so on.
122 root 1.20
123 root 1.32 If the server sends a header multiple times, then their contents will be
124     joined together with a comma (C<,>), as per the HTTP spec.
125 root 1.2
126     If an internal error occurs, such as not being able to resolve a hostname,
127 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
128     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
129     message. Currently the following status codes are used:
130    
131     =over 4
132    
133     =item 595 - errors during connection etsbalishment, proxy handshake.
134    
135     =item 596 - errors during TLS negotiation, request sending and header processing.
136    
137 root 1.78 =item 597 - errors during body receiving or processing.
138 root 1.77
139 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
140 root 1.77
141     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
142    
143     =back
144 root 1.2
145 root 1.6 A typical callback might look like this:
146    
147     sub {
148     my ($body, $hdr) = @_;
149    
150     if ($hdr->{Status} =~ /^2/) {
151     ... everything should be ok
152     } else {
153     print "error, $hdr->{Status} $hdr->{Reason}\n";
154     }
155     }
156    
157 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
158     include:
159    
160     =over 4
161    
162 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
163 root 1.1
164     Whether to recurse requests or not, e.g. on redirects, authentication
165 root 1.3 retries and so on, and how often to do so.
166 root 1.1
167     =item headers => hashref
168    
169 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
170     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
171 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
172     (this can be suppressed by using C<undef> for these headers in which case
173     they won't be sent at all).
174 root 1.1
175     =item timeout => $seconds
176    
177     The time-out to use for various stages - each connect attempt will reset
178 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
179     timeout.
180    
181     Default timeout is 5 minutes.
182 root 1.2
183     =item proxy => [$host, $port[, $scheme]] or undef
184    
185     Use the given http proxy for all requests. If not specified, then the
186     default proxy (as specified by C<$ENV{http_proxy}>) is used.
187    
188 root 1.47 C<$scheme> must be either missing, C<http> for HTTP or C<https> for
189 root 1.2 HTTPS.
190 root 1.1
191 root 1.3 =item body => $string
192    
193 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
194 root 1.3 this module might offer more options).
195    
196 root 1.10 =item cookie_jar => $hash_ref
197    
198     Passing this parameter enables (simplified) cookie-processing, loosely
199     based on the original netscape specification.
200    
201     The C<$hash_ref> must be an (initially empty) hash reference which will
202 root 1.70 get updated automatically. It is possible to save the cookie jar to
203 root 1.10 persistent storage with something like JSON or Storable, but this is not
204 root 1.70 recommended, as session-only cookies might survive longer than expected.
205 root 1.10
206 root 1.70 Note that this cookie implementation is not meant to be complete. If
207     you want complete cookie management you have to do that on your
208     own. C<cookie_jar> is meant as a quick fix to get some cookie-using sites
209     working. Cookies are a privacy disaster, do not use them unless required
210     to.
211 root 1.10
212 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
213 root 1.70 headers will be set and handled by this module, otherwise they will be
214 root 1.69 left untouched.
215    
216 root 1.40 =item tls_ctx => $scheme | $tls_ctx
217    
218     Specifies the AnyEvent::TLS context to be used for https connections. This
219     parameter follows the same rules as the C<tls_ctx> parameter to
220     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
221     C<high> can be specified, which give you a predefined low-security (no
222     verification, highest compatibility) and high-security (CA and common-name
223     verification) TLS context.
224    
225     The default for this option is C<low>, which could be interpreted as "give
226     me the page, no matter what".
227    
228 root 1.51 =item on_prepare => $callback->($fh)
229    
230     In rare cases you need to "tune" the socket before it is used to
231     connect (for exmaple, to bind it on a given IP address). This parameter
232     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
233     and behaves exactly the same way (e.g. it has to provide a
234     timeout). See the description for the C<$prepare_cb> argument of
235     C<AnyEvent::Socket::tcp_connect> for details.
236    
237 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
238    
239     In even rarer cases you want total control over how AnyEvent::HTTP
240     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
241     to do this, but you can provide your own C<tcp_connect> function -
242 root 1.60 obviously, it has to follow the same calling conventions, except that it
243     may always return a connection guard object.
244 root 1.59
245     There are probably lots of weird uses for this function, starting from
246     tracing the hosts C<http_request> actually tries to connect, to (inexact
247     but fast) host => IP address caching or even socks protocol support.
248    
249 root 1.42 =item on_header => $callback->($headers)
250 root 1.41
251     When specified, this callback will be called with the header hash as soon
252     as headers have been successfully received from the remote server (not on
253     locally-generated errors).
254    
255     It has to return either true (in which case AnyEvent::HTTP will continue),
256     or false, in which case AnyEvent::HTTP will cancel the download (and call
257     the finish callback with an error code of C<598>).
258    
259     This callback is useful, among other things, to quickly reject unwanted
260     content, which, if it is supposed to be rare, can be faster than first
261     doing a C<HEAD> request.
262    
263 root 1.68 The downside is that cancelling the request makes it impossible to re-use
264     the connection. Also, the C<on_header> callback will not receive any
265     trailer (headers sent after the response body).
266    
267 root 1.42 Example: cancel the request unless the content-type is "text/html".
268 root 1.41
269 root 1.42 on_header => sub {
270     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
271     },
272 root 1.41
273 root 1.42 =item on_body => $callback->($partial_body, $headers)
274 root 1.41
275 root 1.42 When specified, all body data will be passed to this callback instead of
276     to the completion callback. The completion callback will get the empty
277     string instead of the body data.
278 root 1.41
279 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
280     or false, in which case AnyEvent::HTTP will cancel the download (and call
281     the completion callback with an error code of C<598>).
282    
283 root 1.68 The downside to cancelling the request is that it makes it impossible to
284     re-use the connection.
285    
286 root 1.42 This callback is useful when the data is too large to be held in memory
287     (so the callback writes it to a file) or when only some information should
288     be extracted, or when the body should be processed incrementally.
289 root 1.41
290     It is usually preferred over doing your own body handling via
291 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
292     only used to create a connection, C<want_body_handle> is the better
293     alternative, as it allows you to install your own event handler, reducing
294     resource usage.
295 root 1.41
296     =item want_body_handle => $enable
297    
298     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
299     changes considerably: after parsing the headers, and instead of
300     downloading the body (if any), the completion callback will be
301     called. Instead of the C<$body> argument containing the body data, the
302     callback will receive the L<AnyEvent::Handle> object associated with the
303     connection. In error cases, C<undef> will be passed. When there is no body
304     (e.g. status C<304>), the empty string will be passed.
305    
306     The handle object might or might not be in TLS mode, might be connected to
307     a proxy, be a persistent connection etc., and configured in unspecified
308     ways. The user is responsible for this handle (it will not be used by this
309     module anymore).
310    
311     This is useful with some push-type services, where, after the initial
312     headers, an interactive protocol is used (typical example would be the
313     push-style twitter API which starts a JSON/XML stream).
314    
315     If you think you need this, first have a look at C<on_body>, to see if
316 root 1.45 that doesn't solve your problem in a better way.
317 root 1.41
318 root 1.1 =back
319    
320 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
321     the response body.
322 root 1.9
323     http_request GET => "http://www.nethype.de/", sub {
324     my ($body, $hdr) = @_;
325     print "$body\n";
326     };
327    
328 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
329 root 1.9 timeout of 30 seconds.
330    
331     http_request
332     GET => "https://www.google.com",
333     timeout => 30,
334     sub {
335     my ($body, $hdr) = @_;
336     use Data::Dumper;
337     print Dumper $hdr;
338     }
339     ;
340 root 1.1
341 root 1.68 Example: do another simple HTTP GET request, but immediately try to
342 root 1.29 cancel it.
343    
344     my $request = http_request GET => "http://www.nethype.de/", sub {
345     my ($body, $hdr) = @_;
346     print "$body\n";
347     };
348    
349     undef $request;
350    
351 root 1.1 =cut
352    
353 root 1.12 sub _slot_schedule;
354 root 1.11 sub _slot_schedule($) {
355     my $host = shift;
356    
357     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
358     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
359 root 1.12 # somebody wants that slot
360 root 1.11 ++$CO_SLOT{$host}[0];
361 root 1.14 ++$ACTIVE;
362 root 1.11
363     $cb->(AnyEvent::Util::guard {
364 root 1.14 --$ACTIVE;
365 root 1.11 --$CO_SLOT{$host}[0];
366     _slot_schedule $host;
367     });
368     } else {
369     # nobody wants the slot, maybe we can forget about it
370     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
371     last;
372     }
373     }
374     }
375    
376     # wait for a free slot on host, call callback
377     sub _get_slot($$) {
378     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
379    
380     _slot_schedule $_[0];
381     }
382    
383 root 1.72 # extract cookies from jar
384 root 1.71 sub cookie_jar_extract($$$$) {
385     my ($jar, $uscheme, $uhost, $upath) = @_;
386    
387     %$jar = () if $jar->{version} != 1;
388    
389     my @cookies;
390    
391     while (my ($chost, $paths) = each %$jar) {
392     next unless ref $paths;
393    
394     if ($chost =~ /^\./) {
395     next unless $chost eq substr $uhost, -length $chost;
396     } elsif ($chost =~ /\./) {
397     next unless $chost eq $uhost;
398     } else {
399     next;
400     }
401    
402     while (my ($cpath, $cookies) = each %$paths) {
403     next unless $cpath eq substr $upath, 0, length $cpath;
404    
405     while (my ($cookie, $kv) = each %$cookies) {
406     next if $uscheme ne "https" && exists $kv->{secure};
407    
408     if (exists $kv->{expires}) {
409     if (AE::now > parse_date ($kv->{expires})) {
410     delete $cookies->{$cookie};
411     next;
412     }
413     }
414    
415     my $value = $kv->{value};
416    
417     if ($value =~ /[=;,[:space:]]/) {
418     $value =~ s/([\\"])/\\$1/g;
419     $value = "\"$value\"";
420     }
421    
422     push @cookies, "$cookie=$value";
423     }
424     }
425     }
426    
427     \@cookies
428     }
429    
430 root 1.72 # parse set_cookie header into jar
431 root 1.73 sub cookie_jar_set_cookie($$$) {
432     my ($jar, $set_cookie, $uhost) = @_;
433 root 1.72
434     for ($set_cookie) {
435     # parse NAME=VALUE
436     my @kv;
437    
438     while (
439     m{
440     \G\s*
441     (?:
442     expires \s*=\s* ([A-Z][a-z][a-z],\ [^,;]+)
443     | ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )
444     )
445     }gcxsi
446     ) {
447     my $name = $2;
448     my $value = $4;
449    
450     unless (defined $name) {
451     # expires
452     $name = "expires";
453     $value = $1;
454     } elsif (!defined $value) {
455     # quoted
456     $value = $3;
457     $value =~ s/\\(.)/$1/gs;
458     }
459    
460     push @kv, lc $name, $value;
461    
462     last unless /\G\s*;/gc;
463     }
464    
465     last unless @kv;
466    
467     my $name = shift @kv;
468     my %kv = (value => shift @kv, @kv);
469    
470     $kv{expires} ||= format_date (AE::now + $kv{"max-age"})
471     if exists $kv{"max-age"};
472    
473     my $cdom;
474     my $cpath = (delete $kv{path}) || "/";
475    
476     if (exists $kv{domain}) {
477     $cdom = delete $kv{domain};
478    
479     $cdom =~ s/^\.?/./; # make sure it starts with a "."
480    
481     next if $cdom =~ /\.$/;
482    
483     # this is not rfc-like and not netscape-like. go figure.
484     my $ndots = $cdom =~ y/.//;
485     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
486     } else {
487     $cdom = $uhost;
488     }
489    
490     # store it
491 root 1.73 $jar->{version} = 1;
492     $jar->{$cdom}{$cpath}{$name} = \%kv;
493 root 1.72
494     redo if /\G\s*,/gc;
495     }
496     }
497    
498 root 1.66 # continue to parse $_ for headers and place them into the arg
499     sub parse_hdr() {
500     my %hdr;
501    
502     # things seen, not parsed:
503     # p3pP="NON CUR OTPi OUR NOR UNI"
504    
505     $hdr{lc $1} .= ",$2"
506     while /\G
507     ([^:\000-\037]*):
508     [\011\040]*
509     ((?: [^\012]+ | \012[\011\040] )*)
510     \012
511     /gxc;
512    
513     /\G$/
514     or return;
515    
516     # remove the "," prefix we added to all headers above
517     substr $_, 0, 1, ""
518     for values %hdr;
519    
520     \%hdr
521     }
522    
523 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
524 root 1.34
525 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
526     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
527 root 1.40
528 elmex 1.15 sub http_request($$@) {
529 root 1.1 my $cb = pop;
530     my ($method, $url, %arg) = @_;
531    
532     my %hdr;
533    
534 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
535     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
536    
537 root 1.3 $method = uc $method;
538    
539 root 1.8 if (my $hdr = $arg{headers}) {
540 root 1.1 while (my ($k, $v) = each %$hdr) {
541     $hdr{lc $k} = $v;
542     }
543     }
544    
545 root 1.55 # pseudo headers for all subsequent responses
546     my @pseudo = (URL => $url);
547     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
548    
549 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
550 root 1.8
551 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
552 root 1.8 if $recurse < 0;
553    
554 root 1.2 my $proxy = $arg{proxy} || $PROXY;
555 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
556    
557 root 1.31 my ($uscheme, $uauthority, $upath, $query, $fragment) =
558 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
559 root 1.2
560 root 1.31 $uscheme = lc $uscheme;
561 root 1.1
562 root 1.31 my $uport = $uscheme eq "http" ? 80
563     : $uscheme eq "https" ? 443
564 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
565 root 1.13
566 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
567 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
568 root 1.10
569     my $uhost = $1;
570     $uport = $2 if defined $2;
571    
572 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
573     unless exists $hdr{host};
574 root 1.43
575 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
576 root 1.56 $upath .= $query if length $query;
577 root 1.10
578     $upath =~ s%^/?%/%;
579    
580     # cookie processing
581     if (my $jar = $arg{cookie_jar}) {
582 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
583 root 1.70
584 root 1.71 $hdr{cookie} = join "; ", @$cookies
585     if @$cookies;
586 root 1.10 }
587 root 1.1
588 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
589 root 1.2
590 root 1.10 if ($proxy) {
591 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
592 root 1.31
593 root 1.47 $rscheme = "http" unless defined $rscheme;
594    
595 root 1.31 # don't support https requests over https-proxy transport,
596 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
597 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
598 root 1.10 } else {
599 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
600 root 1.2 }
601    
602 root 1.47 # leave out fragment and query string, just a heuristic
603 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
604     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
605 root 1.41
606 root 1.53 $hdr{"content-length"} = length $arg{body}
607     if length $arg{body} || $method ne "GET";
608 root 1.1
609 root 1.68 $hdr{connection} = "close TE"; #1.1
610     $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
611 root 1.66
612 root 1.11 my %state = (connect_guard => 1);
613    
614     _get_slot $uhost, sub {
615     $state{slot_guard} = shift;
616 root 1.1
617 root 1.11 return unless $state{connect_guard};
618 root 1.1
619 root 1.77 my $ae_error = 595; # connecting
620    
621 root 1.64 my $connect_cb = sub {
622     $state{fh} = shift
623     or do {
624     my $err = "$!";
625     %state = ();
626 root 1.77 return $cb->(undef, { @pseudo, Status => $ae_error, Reason => $err });
627 root 1.64 };
628 root 1.44
629 root 1.64 return unless delete $state{connect_guard};
630 root 1.11
631 root 1.64 # get handle
632     $state{handle} = new AnyEvent::Handle
633     fh => $state{fh},
634     peername => $rhost,
635     tls_ctx => $arg{tls_ctx},
636     # these need to be reconfigured on keepalive handles
637     timeout => $timeout,
638     on_error => sub {
639     %state = ();
640 root 1.77 $cb->(undef, { @pseudo, Status => $ae_error, Reason => $_[2] });
641 root 1.64 },
642     on_eof => sub {
643     %state = ();
644 root 1.77 $cb->(undef, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" });
645 root 1.64 },
646     ;
647 root 1.11
648 root 1.64 # limit the number of persistent connections
649     # keepalive not yet supported
650 root 1.56 # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
651     # ++$KA_COUNT{$_[1]};
652     # $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
653     # --$KA_COUNT{$_[1]}
654     # };
655     # $hdr{connection} = "keep-alive";
656     # }
657 root 1.1
658 root 1.64 $state{handle}->starttls ("connect") if $rscheme eq "https";
659    
660     # handle actual, non-tunneled, request
661     my $handle_actual_request = sub {
662 root 1.77 $ae_error = 596; # request phase
663    
664 root 1.64 $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
665    
666     # send request
667     $state{handle}->push_write (
668 root 1.66 "$method $rpath HTTP/1.1\015\012"
669 root 1.64 . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
670     . "\015\012"
671     . (delete $arg{body})
672     );
673    
674     # return if error occured during push_write()
675     return unless %state;
676    
677     %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use
678    
679     # status line and headers
680 root 1.68 $state{read_response} = sub {
681 root 1.64 for ("$_[1]") {
682     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
683    
684 root 1.76 /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
685 root 1.64 or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid server response" }));
686    
687 root 1.68 # 100 Continue handling
688     # should not happen as we don't send expect: 100-continue,
689     # but we handle it just in case.
690     # since we send the request body regardless, if we get an error
691     # we are out of-sync, which we currently do NOT handle correctly.
692     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
693     if $2 eq 100;
694    
695 root 1.64 push @pseudo,
696     HTTPVersion => $1,
697     Status => $2,
698     Reason => $3,
699     ;
700    
701 root 1.66 my $hdr = parse_hdr
702     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Garbled response headers" }));
703 root 1.64
704 root 1.66 %hdr = (%$hdr, @pseudo);
705 root 1.64 }
706    
707     # redirect handling
708     # microsoft and other shitheads don't give a shit for following standards,
709     # try to support some common forms of broken Location headers.
710     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
711     $hdr{location} =~ s/^\.\/+//;
712    
713     my $url = "$rscheme://$uhost:$uport";
714    
715     unless ($hdr{location} =~ s/^\///) {
716     $url .= $upath;
717     $url =~ s/\/[^\/]*$//;
718     }
719 root 1.59
720 root 1.64 $hdr{location} = "$url/$hdr{location}";
721     }
722 root 1.31
723 root 1.64 my $redirect;
724 root 1.41
725 root 1.64 if ($recurse) {
726     my $status = $hdr{Status};
727 root 1.59
728 root 1.64 # industry standard is to redirect POST as GET for
729 root 1.76 # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
730 root 1.64 # also, the UA should ask the user for 301 and 307 and POST,
731     # industry standard seems to be to simply follow.
732     # we go with the industry standard.
733     if ($status == 301 or $status == 302 or $status == 303) {
734     # HTTP/1.1 is unclear on how to mutate the method
735     $method = "GET" unless $method eq "HEAD";
736     $redirect = 1;
737     } elsif ($status == 307) {
738     $redirect = 1;
739 root 1.59 }
740 root 1.64 }
741 root 1.57
742 root 1.64 my $finish = sub { # ($data, $err_status, $err_reason[, $keepalive])
743 root 1.74 my $may_keep_alive = $_[3];
744 root 1.68
745 root 1.64 $state{handle}->destroy if $state{handle};
746     %state = ();
747 root 1.55
748 root 1.64 if (defined $_[1]) {
749     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
750     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
751 root 1.41 }
752    
753 root 1.64 # set-cookie processing
754     if ($arg{cookie_jar}) {
755 root 1.73 cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost;
756     }
757 root 1.31
758 root 1.64 if ($redirect && exists $hdr{location}) {
759     # we ignore any errors, as it is very common to receive
760     # Content-Length != 0 but no actual body
761     # we also access %hdr, as $_[1] might be an erro
762     http_request (
763     $method => $hdr{location},
764     %arg,
765     recurse => $recurse - 1,
766     Redirect => [$_[0], \%hdr],
767     $cb);
768     } else {
769     $cb->($_[0], \%hdr);
770     }
771     };
772    
773 root 1.77 $ae_error = 597; # body phase
774    
775 root 1.64 my $len = $hdr{"content-length"};
776    
777     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
778     $finish->(undef, 598 => "Request cancelled by on_header");
779     } elsif (
780     $hdr{Status} =~ /^(?:1..|204|205|304)$/
781     or $method eq "HEAD"
782     or (defined $len && !$len)
783     ) {
784     # no body
785     $finish->("", undef, undef, 1);
786     } else {
787 root 1.66 # body handling, many different code paths
788     # - no body expected
789     # - want_body_handle
790     # - te chunked
791     # - 2x length known (with or without on_body)
792     # - 2x length not known (with or without on_body)
793 root 1.64 if (!$redirect && $arg{want_body_handle}) {
794     $_[0]->on_eof (undef);
795     $_[0]->on_error (undef);
796     $_[0]->on_read (undef);
797    
798     $finish->(delete $state{handle});
799    
800 root 1.68 } elsif ($hdr{"transfer-encoding"} =~ /\bchunked\b/i) {
801     my $cl = 0;
802 root 1.66 my $body = undef;
803     my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
804    
805     my $read_chunk; $read_chunk = sub {
806     $_[1] =~ /^([0-9a-fA-F]+)/
807 root 1.77 or $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
808 root 1.66
809     my $len = hex $1;
810    
811     if ($len) {
812 root 1.68 $cl += $len;
813    
814     $_[0]->push_read (chunk => $len, sub {
815 root 1.66 $on_body->($_[1], \%hdr)
816     or return $finish->(undef, 598 => "Request cancelled by on_body");
817    
818     $_[0]->push_read (line => sub {
819     length $_[1]
820 root 1.77 and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
821 root 1.66 $_[0]->push_read (line => $read_chunk);
822     });
823     });
824     } else {
825 root 1.68 $hdr{"content-length"} ||= $cl;
826    
827 root 1.66 $_[0]->push_read (line => $qr_nlnl, sub {
828     if (length $_[1]) {
829     for ("$_[1]") {
830     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
831    
832     my $hdr = parse_hdr
833 root 1.77 or return $finish->(undef, $ae_error => "Garbled response trailers");
834 root 1.66
835     %hdr = (%hdr, %$hdr);
836     }
837     }
838    
839     $finish->($body, undef, undef, 1);
840     });
841     }
842     };
843    
844     $_[0]->push_read (line => $read_chunk);
845    
846 root 1.64 } elsif ($arg{on_body}) {
847     if ($len) {
848     $_[0]->on_read (sub {
849     $len -= length $_[0]{rbuf};
850    
851     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
852 root 1.66 or return $finish->(undef, 598 => "Request cancelled by on_body");
853 root 1.64
854     $len > 0
855     or $finish->("", undef, undef, 1);
856     });
857 root 1.59 } else {
858 root 1.64 $_[0]->on_eof (sub {
859     $finish->("");
860     });
861     $_[0]->on_read (sub {
862     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
863     or $finish->(undef, 598 => "Request cancelled by on_body");
864     });
865 root 1.11 }
866 root 1.64 } else {
867     $_[0]->on_eof (undef);
868 root 1.59
869 root 1.64 if ($len) {
870     $_[0]->on_read (sub {
871     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
872     if $len <= length $_[0]{rbuf};
873     });
874 root 1.59 } else {
875 root 1.64 $_[0]->on_error (sub {
876     ($! == Errno::EPIPE || !$!)
877     ? $finish->(delete $_[0]{rbuf})
878 root 1.77 : $finish->(undef, $ae_error => $_[2]);
879 root 1.64 });
880     $_[0]->on_read (sub { });
881 root 1.59 }
882     }
883 root 1.64 }
884 root 1.68 };
885    
886     $state{handle}->push_read (line => $qr_nlnl, $state{read_response});
887 root 1.64 };
888    
889     # now handle proxy-CONNECT method
890     if ($proxy && $uscheme eq "https") {
891     # oh dear, we have to wrap it into a connect request
892    
893     # maybe re-use $uauthority with patched port?
894     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
895     $state{handle}->push_read (line => $qr_nlnl, sub {
896     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
897     or return (%state = (), $cb->(undef, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" }));
898    
899     if ($2 == 200) {
900     $rpath = $upath;
901     &$handle_actual_request;
902     } else {
903     %state = ();
904     $cb->(undef, { @pseudo, Status => $2, Reason => $3 });
905     }
906     });
907     } else {
908     &$handle_actual_request;
909     }
910     };
911    
912     my $tcp_connect = $arg{tcp_connect}
913     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
914 root 1.57
915 root 1.64 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
916 root 1.31
917 root 1.1 };
918    
919     defined wantarray && AnyEvent::Util::guard { %state = () }
920     }
921    
922 elmex 1.15 sub http_get($@) {
923 root 1.1 unshift @_, "GET";
924     &http_request
925     }
926    
927 elmex 1.15 sub http_head($@) {
928 root 1.4 unshift @_, "HEAD";
929     &http_request
930     }
931    
932 elmex 1.15 sub http_post($$@) {
933 root 1.22 my $url = shift;
934     unshift @_, "POST", $url, "body";
935 root 1.3 &http_request
936     }
937    
938 root 1.9 =back
939    
940 root 1.55 =head2 DNS CACHING
941    
942     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
943     the actual connection, which in turn uses AnyEvent::DNS to resolve
944     hostnames. The latter is a simple stub resolver and does no caching
945     on its own. If you want DNS caching, you currently have to provide
946     your own default resolver (by storing a suitable resolver object in
947     C<$AnyEvent::DNS::RESOLVER>).
948    
949 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
950 root 1.1
951     =over 4
952    
953 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
954    
955     Sets the default proxy server to use. The proxy-url must begin with a
956 root 1.52 string of the form C<http://host:port> (optionally C<https:...>), croaks
957     otherwise.
958    
959     To clear an already-set proxy, use C<undef>.
960 root 1.2
961 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
962    
963     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
964     Date (RFC 2616).
965    
966     =item $timestamp = AnyEvent::HTTP::parse_date $date
967    
968 root 1.70 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) and
969     returns the corresponding POSIX timestamp, or C<undef> if the date cannot
970     be parsed.
971 root 1.61
972 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
973 root 1.1
974 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
975 root 1.1
976     =item $AnyEvent::HTTP::USERAGENT
977    
978     The default value for the C<User-Agent> header (the default is
979 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
980 root 1.1
981 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
982 root 1.1
983 root 1.47 The maximum number of concurrent connections to the same host (identified
984 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
985     are queued until previous connections are closed.
986 root 1.1
987 root 1.43 The default value for this is C<4>, and it is highly advisable to not
988     increase it.
989 root 1.3
990 root 1.14 =item $AnyEvent::HTTP::ACTIVE
991    
992     The number of active connections. This is not the number of currently
993     running requests, but the number of currently open and non-idle TCP
994     connections. This number of can be useful for load-leveling.
995    
996 root 1.1 =back
997    
998     =cut
999    
1000 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1001     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1002    
1003     sub format_date($) {
1004     my ($time) = @_;
1005    
1006     # RFC 822/1123 format
1007     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1008    
1009     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1010     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1011     $H, $M, $S;
1012     }
1013    
1014     sub parse_date($) {
1015     my ($date) = @_;
1016    
1017     my ($d, $m, $y, $H, $M, $S);
1018    
1019 root 1.70 if ($date =~ /^[A-Z][a-z][a-z], ([0-9][0-9])[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
1020     # RFC 822/1123, required by RFC 2616 (with " ")
1021     # cookie dates (with "-")
1022    
1023 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1024    
1025     } elsif ($date =~ /^[A-Z][a-z]+, ([0-9][0-9])-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) {
1026     # RFC 850
1027     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1028    
1029     } elsif ($date =~ /^[A-Z][a-z][a-z] ([A-Z][a-z][a-z]) ([0-9 ][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) ([0-9][0-9][0-9][0-9])$/) {
1030     # ISO C's asctime
1031     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1032     }
1033     # other formats fail in the loop below
1034    
1035     for (0..11) {
1036     if ($m eq $month[$_]) {
1037     require Time::Local;
1038     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
1039     }
1040     }
1041    
1042     undef
1043     }
1044    
1045 root 1.2 sub set_proxy($) {
1046 root 1.52 if (length $_[0]) {
1047     $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix
1048     or Carp::croak "$_[0]: invalid proxy URL";
1049     $PROXY = [$2, $3 || 3128, $1]
1050     } else {
1051     undef $PROXY;
1052     }
1053 root 1.2 }
1054    
1055     # initialise proxy from environment
1056 root 1.52 eval {
1057     set_proxy $ENV{http_proxy};
1058     };
1059 root 1.2
1060 root 1.60 =head2 SOCKS PROXIES
1061    
1062     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1063     compile your perl to support socks, or use an external program such as
1064     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1065     transparently.
1066    
1067     Alternatively, for AnyEvent::HTTP only, you can use your own
1068     C<tcp_connect> function that does the proxy handshake - here is an example
1069     that works with socks4a proxies:
1070    
1071     use Errno;
1072     use AnyEvent::Util;
1073     use AnyEvent::Socket;
1074     use AnyEvent::Handle;
1075    
1076     # host, port and username of/for your socks4a proxy
1077     my $socks_host = "10.0.0.23";
1078     my $socks_port = 9050;
1079     my $socks_user = "";
1080    
1081     sub socks4a_connect {
1082     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1083    
1084     my $hdl = new AnyEvent::Handle
1085     connect => [$socks_host, $socks_port],
1086     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1087     on_error => sub { $connect_cb->() },
1088     ;
1089    
1090     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1091    
1092     $hdl->push_read (chunk => 8, sub {
1093     my ($hdl, $chunk) = @_;
1094     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1095    
1096     if ($status == 0x5a) {
1097     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1098     } else {
1099     $! = Errno::ENXIO; $connect_cb->();
1100     }
1101     });
1102    
1103     $hdl
1104     }
1105    
1106     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1107     possibly after switching off other proxy types:
1108    
1109     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1110    
1111     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1112     my ($data, $headers) = @_;
1113     ...
1114     };
1115    
1116 root 1.1 =head1 SEE ALSO
1117    
1118     L<AnyEvent>.
1119    
1120     =head1 AUTHOR
1121    
1122 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1123     http://home.schmorp.de/
1124 root 1.1
1125 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1126     testcases and bugreports.
1127    
1128 root 1.1 =cut
1129    
1130     1
1131