ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.98
Committed: Mon Jan 24 20:03:24 2011 UTC (13 years, 3 months ago) by root
Branch: MAIN
Changes since 1.97: +1 -1 lines
Log Message:
Emmanuel Rodriguez

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.97 our $VERSION = '2.03';
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
95     (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128     =item 595 - errors during connection etsbalishment, proxy handshake.
129    
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159     Whether to recurse requests or not, e.g. on redirects, authentication
160 root 1.3 retries and so on, and how often to do so.
161 root 1.1
162     =item headers => hashref
163    
164 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
165     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
166 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
167     (this can be suppressed by using C<undef> for these headers in which case
168     they won't be sent at all).
169 root 1.1
170 root 1.90 You really should provide your own C<User-Agent:> header value that is
171     appropriate for your program - I wouldn't be surprised if the default
172     AnyEvent string gets blocked by webservers sooner or later.
173    
174 root 1.1 =item timeout => $seconds
175    
176     The time-out to use for various stages - each connect attempt will reset
177 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
178     timeout.
179    
180     Default timeout is 5 minutes.
181 root 1.2
182     =item proxy => [$host, $port[, $scheme]] or undef
183    
184     Use the given http proxy for all requests. If not specified, then the
185     default proxy (as specified by C<$ENV{http_proxy}>) is used.
186    
187 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
188 root 1.1
189 root 1.3 =item body => $string
190    
191 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
192 root 1.3 this module might offer more options).
193    
194 root 1.10 =item cookie_jar => $hash_ref
195    
196     Passing this parameter enables (simplified) cookie-processing, loosely
197     based on the original netscape specification.
198    
199 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
200     will get updated automatically. It is possible to save the cookie jar
201     to persistent storage with something like JSON or Storable - see the
202     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
203     expired or session-only cookies, and also for documentation on the format
204     of the cookie jar.
205 root 1.10
206 root 1.70 Note that this cookie implementation is not meant to be complete. If
207     you want complete cookie management you have to do that on your
208 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
209 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
210     to.
211 root 1.10
212 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
213 root 1.70 headers will be set and handled by this module, otherwise they will be
214 root 1.69 left untouched.
215    
216 root 1.40 =item tls_ctx => $scheme | $tls_ctx
217    
218     Specifies the AnyEvent::TLS context to be used for https connections. This
219     parameter follows the same rules as the C<tls_ctx> parameter to
220     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
221     C<high> can be specified, which give you a predefined low-security (no
222     verification, highest compatibility) and high-security (CA and common-name
223     verification) TLS context.
224    
225     The default for this option is C<low>, which could be interpreted as "give
226     me the page, no matter what".
227    
228 root 1.92 See also the C<sessionid> parameter.
229    
230     =item session => $string
231    
232     The module might reuse connections to the same host internally. Sometimes
233     (e.g. when using TLS), you do not want to reuse connections from other
234     sessions. This can be achieved by setting this parameter to some unique
235     ID (such as the address of an object storing your state data, or the TLS
236     context) - only connections using the same unique ID will be reused.
237    
238 root 1.51 =item on_prepare => $callback->($fh)
239    
240     In rare cases you need to "tune" the socket before it is used to
241     connect (for exmaple, to bind it on a given IP address). This parameter
242     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
243     and behaves exactly the same way (e.g. it has to provide a
244     timeout). See the description for the C<$prepare_cb> argument of
245     C<AnyEvent::Socket::tcp_connect> for details.
246    
247 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
248    
249     In even rarer cases you want total control over how AnyEvent::HTTP
250     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
251     to do this, but you can provide your own C<tcp_connect> function -
252 root 1.60 obviously, it has to follow the same calling conventions, except that it
253     may always return a connection guard object.
254 root 1.59
255     There are probably lots of weird uses for this function, starting from
256     tracing the hosts C<http_request> actually tries to connect, to (inexact
257     but fast) host => IP address caching or even socks protocol support.
258    
259 root 1.42 =item on_header => $callback->($headers)
260 root 1.41
261     When specified, this callback will be called with the header hash as soon
262     as headers have been successfully received from the remote server (not on
263     locally-generated errors).
264    
265     It has to return either true (in which case AnyEvent::HTTP will continue),
266     or false, in which case AnyEvent::HTTP will cancel the download (and call
267     the finish callback with an error code of C<598>).
268    
269     This callback is useful, among other things, to quickly reject unwanted
270     content, which, if it is supposed to be rare, can be faster than first
271     doing a C<HEAD> request.
272    
273 root 1.68 The downside is that cancelling the request makes it impossible to re-use
274     the connection. Also, the C<on_header> callback will not receive any
275     trailer (headers sent after the response body).
276    
277 root 1.42 Example: cancel the request unless the content-type is "text/html".
278 root 1.41
279 root 1.42 on_header => sub {
280     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
281     },
282 root 1.41
283 root 1.42 =item on_body => $callback->($partial_body, $headers)
284 root 1.41
285 root 1.42 When specified, all body data will be passed to this callback instead of
286     to the completion callback. The completion callback will get the empty
287     string instead of the body data.
288 root 1.41
289 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
290     or false, in which case AnyEvent::HTTP will cancel the download (and call
291     the completion callback with an error code of C<598>).
292    
293 root 1.68 The downside to cancelling the request is that it makes it impossible to
294     re-use the connection.
295    
296 root 1.42 This callback is useful when the data is too large to be held in memory
297     (so the callback writes it to a file) or when only some information should
298     be extracted, or when the body should be processed incrementally.
299 root 1.41
300     It is usually preferred over doing your own body handling via
301 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
302     only used to create a connection, C<want_body_handle> is the better
303     alternative, as it allows you to install your own event handler, reducing
304     resource usage.
305 root 1.41
306     =item want_body_handle => $enable
307    
308     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
309     changes considerably: after parsing the headers, and instead of
310     downloading the body (if any), the completion callback will be
311     called. Instead of the C<$body> argument containing the body data, the
312     callback will receive the L<AnyEvent::Handle> object associated with the
313     connection. In error cases, C<undef> will be passed. When there is no body
314     (e.g. status C<304>), the empty string will be passed.
315    
316 root 1.92 The handle object might or might not be in TLS mode, might be connected
317     to a proxy, be a persistent connection, use chunked transfer encoding
318     etc., and configured in unspecified ways. The user is responsible for this
319     handle (it will not be used by this module anymore).
320 root 1.41
321     This is useful with some push-type services, where, after the initial
322     headers, an interactive protocol is used (typical example would be the
323     push-style twitter API which starts a JSON/XML stream).
324    
325     If you think you need this, first have a look at C<on_body>, to see if
326 root 1.45 that doesn't solve your problem in a better way.
327 root 1.41
328 root 1.92 =item persistent => $boolean
329    
330     Try to create/reuse a persistent connection. When this flag is set
331     (default: true for idempotent requests, false for all others), then
332     C<http_request> tries to re-use an existing (previously-created)
333     persistent connection to the host and, failing that, tries to create a new
334     one.
335    
336     Requests failing in certain ways will be automatically retried once, which
337     is dangerous for non-idempotent requests, which is why it defaults to off
338     for them. The reason for this is because the bozos who designed HTTP/1.1
339     made it impossible to distinguish between a fatal error and a normal
340     connection timeout, so you never know whether there was a problem with
341     your request or not.
342    
343     When reusing an existent connection, many parameters (such as TLS context)
344     will be ignored. See the C<session> parameter for a workaround.
345    
346     =item keepalive => $boolean
347    
348     Only used when C<persistent> is also true. This parameter decides whether
349     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
350     (as opposed to only a HTTP/1.1 persistent connection).
351    
352     The default is true, except when using a proxy, in which case it defaults
353     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
354    
355     =item handle_params => { key => value ... }
356    
357     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
358     constructor that is called - not all requests will create a handle, and
359     sometimes more than one is created, so this parameter is only good for
360     setting hints.
361    
362     Example: set the maximum read size to 4096, to potentially conserve memory
363     at the cost of speed.
364    
365     handle_params => {
366     max_read_size => 4096,
367     },
368    
369 root 1.1 =back
370    
371 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
372     the response body.
373 root 1.9
374     http_request GET => "http://www.nethype.de/", sub {
375     my ($body, $hdr) = @_;
376     print "$body\n";
377     };
378    
379 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
380 root 1.9 timeout of 30 seconds.
381    
382     http_request
383     GET => "https://www.google.com",
384 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
385 root 1.9 timeout => 30,
386     sub {
387     my ($body, $hdr) = @_;
388     use Data::Dumper;
389     print Dumper $hdr;
390     }
391     ;
392 root 1.1
393 root 1.68 Example: do another simple HTTP GET request, but immediately try to
394 root 1.29 cancel it.
395    
396     my $request = http_request GET => "http://www.nethype.de/", sub {
397     my ($body, $hdr) = @_;
398     print "$body\n";
399     };
400    
401     undef $request;
402    
403 root 1.1 =cut
404    
405 root 1.92 #############################################################################
406     # wait queue/slots
407    
408 root 1.12 sub _slot_schedule;
409 root 1.11 sub _slot_schedule($) {
410     my $host = shift;
411    
412     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
413     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
414 root 1.12 # somebody wants that slot
415 root 1.11 ++$CO_SLOT{$host}[0];
416 root 1.14 ++$ACTIVE;
417 root 1.11
418     $cb->(AnyEvent::Util::guard {
419 root 1.14 --$ACTIVE;
420 root 1.11 --$CO_SLOT{$host}[0];
421     _slot_schedule $host;
422     });
423     } else {
424     # nobody wants the slot, maybe we can forget about it
425     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
426     last;
427     }
428     }
429     }
430    
431     # wait for a free slot on host, call callback
432     sub _get_slot($$) {
433     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
434    
435     _slot_schedule $_[0];
436     }
437    
438 root 1.80 #############################################################################
439 root 1.92 # cookie handling
440 root 1.80
441     # expire cookies
442     sub cookie_jar_expire($;$) {
443     my ($jar, $session_end) = @_;
444    
445     %$jar = () if $jar->{version} != 1;
446    
447     my $anow = AE::now;
448    
449     while (my ($chost, $paths) = each %$jar) {
450     next unless ref $paths;
451    
452     while (my ($cpath, $cookies) = each %$paths) {
453     while (my ($cookie, $kv) = each %$cookies) {
454     if (exists $kv->{_expires}) {
455     delete $cookies->{$cookie}
456     if $anow > $kv->{_expires};
457     } elsif ($session_end) {
458     delete $cookies->{$cookie};
459     }
460     }
461    
462     delete $paths->{$cpath}
463     unless %$cookies;
464     }
465    
466     delete $jar->{$chost}
467     unless %$paths;
468     }
469     }
470    
471 root 1.72 # extract cookies from jar
472 root 1.71 sub cookie_jar_extract($$$$) {
473 root 1.92 my ($jar, $scheme, $host, $path) = @_;
474 root 1.71
475     %$jar = () if $jar->{version} != 1;
476    
477     my @cookies;
478    
479     while (my ($chost, $paths) = each %$jar) {
480     next unless ref $paths;
481    
482     if ($chost =~ /^\./) {
483 root 1.92 next unless $chost eq substr $host, -length $chost;
484 root 1.71 } elsif ($chost =~ /\./) {
485 root 1.92 next unless $chost eq $host;
486 root 1.71 } else {
487     next;
488     }
489    
490     while (my ($cpath, $cookies) = each %$paths) {
491 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
492 root 1.71
493     while (my ($cookie, $kv) = each %$cookies) {
494 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
495 root 1.71
496 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
497     delete $cookies->{$cookie};
498     next;
499 root 1.71 }
500    
501     my $value = $kv->{value};
502    
503     if ($value =~ /[=;,[:space:]]/) {
504     $value =~ s/([\\"])/\\$1/g;
505     $value = "\"$value\"";
506     }
507    
508     push @cookies, "$cookie=$value";
509     }
510     }
511     }
512    
513     \@cookies
514     }
515    
516 root 1.72 # parse set_cookie header into jar
517 root 1.80 sub cookie_jar_set_cookie($$$$) {
518 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
519 root 1.80
520     my $anow = int AE::now;
521     my $snow; # server-now
522 root 1.72
523     for ($set_cookie) {
524     # parse NAME=VALUE
525     my @kv;
526    
527 root 1.79 # expires is not http-compliant in the original cookie-spec,
528     # we support the official date format and some extensions
529 root 1.72 while (
530     m{
531     \G\s*
532     (?:
533 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
534 root 1.82 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) ) )?
535 root 1.72 )
536     }gcxsi
537     ) {
538     my $name = $2;
539     my $value = $4;
540    
541 root 1.82 if (defined $1) {
542 root 1.72 # expires
543     $name = "expires";
544     $value = $1;
545 root 1.82 } elsif (defined $3) {
546 root 1.72 # quoted
547     $value = $3;
548     $value =~ s/\\(.)/$1/gs;
549     }
550    
551 root 1.95 push @kv, @kv ? lc $name : $name, $value;
552 root 1.72
553     last unless /\G\s*;/gc;
554     }
555    
556     last unless @kv;
557    
558     my $name = shift @kv;
559     my %kv = (value => shift @kv, @kv);
560    
561 root 1.80 if (exists $kv{"max-age"}) {
562     $kv{_expires} = $anow + delete $kv{"max-age"};
563     } elsif (exists $kv{expires}) {
564     $snow ||= parse_date ($date) || $anow;
565     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
566     } else {
567     delete $kv{_expires};
568     }
569 root 1.72
570     my $cdom;
571     my $cpath = (delete $kv{path}) || "/";
572    
573     if (exists $kv{domain}) {
574     $cdom = delete $kv{domain};
575    
576     $cdom =~ s/^\.?/./; # make sure it starts with a "."
577    
578     next if $cdom =~ /\.$/;
579    
580     # this is not rfc-like and not netscape-like. go figure.
581     my $ndots = $cdom =~ y/.//;
582     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
583     } else {
584 root 1.92 $cdom = $host;
585 root 1.72 }
586    
587     # store it
588 root 1.73 $jar->{version} = 1;
589 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
590 root 1.72
591     redo if /\G\s*,/gc;
592     }
593     }
594    
595 root 1.92 #############################################################################
596     # keepalive/persistent connection cache
597    
598     # fetch a connection from the keepalive cache
599     sub ka_fetch($) {
600     my $ka_key = shift;
601    
602     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
603     delete $KA_CACHE{$ka_key}
604     unless @{ $KA_CACHE{$ka_key} };
605    
606     $hdl
607     }
608    
609     sub ka_store($$) {
610     my ($ka_key, $hdl) = @_;
611    
612     my $kaa = $KA_CACHE{$ka_key} ||= [];
613    
614     my $destroy = sub {
615     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
616    
617     $hdl->destroy;
618    
619     @ka
620     ? $KA_CACHE{$ka_key} = \@ka
621     : delete $KA_CACHE{$ka_key};
622     };
623    
624     # on error etc., destroy
625     $hdl->on_error ($destroy);
626     $hdl->on_eof ($destroy);
627     $hdl->on_read ($destroy);
628     $hdl->timeout ($PERSISTENT_TIMEOUT);
629    
630     push @$kaa, $hdl;
631     shift @$kaa while @$kaa > $MAX_PER_HOST;
632     }
633    
634     #############################################################################
635     # utilities
636    
637 root 1.66 # continue to parse $_ for headers and place them into the arg
638 root 1.92 sub _parse_hdr() {
639 root 1.66 my %hdr;
640    
641     # things seen, not parsed:
642     # p3pP="NON CUR OTPi OUR NOR UNI"
643    
644     $hdr{lc $1} .= ",$2"
645     while /\G
646     ([^:\000-\037]*):
647     [\011\040]*
648     ((?: [^\012]+ | \012[\011\040] )*)
649     \012
650     /gxc;
651    
652     /\G$/
653     or return;
654    
655     # remove the "," prefix we added to all headers above
656     substr $_, 0, 1, ""
657     for values %hdr;
658    
659     \%hdr
660     }
661    
662 root 1.92 #############################################################################
663     # http_get
664    
665 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
666 root 1.34
667 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
668     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
669 root 1.40
670 root 1.92 # maybe it should just become a normal object :/
671    
672     sub _destroy_state(\%) {
673     my ($state) = @_;
674    
675     $state->{handle}->destroy if $state->{handle};
676     %$state = ();
677     }
678    
679     sub _error(\%$$) {
680     my ($state, $cb, $hdr) = @_;
681    
682     &_destroy_state ($state);
683    
684     $cb->(undef, $hdr);
685     ()
686     }
687    
688 elmex 1.15 sub http_request($$@) {
689 root 1.1 my $cb = pop;
690     my ($method, $url, %arg) = @_;
691    
692     my %hdr;
693    
694 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
695     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
696    
697 root 1.3 $method = uc $method;
698    
699 root 1.8 if (my $hdr = $arg{headers}) {
700 root 1.1 while (my ($k, $v) = each %$hdr) {
701     $hdr{lc $k} = $v;
702     }
703     }
704    
705 root 1.55 # pseudo headers for all subsequent responses
706     my @pseudo = (URL => $url);
707     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
708    
709 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
710 root 1.8
711 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
712 root 1.8 if $recurse < 0;
713    
714 root 1.2 my $proxy = $arg{proxy} || $PROXY;
715 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
716    
717 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
718 root 1.56 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?|;
719 root 1.2
720 root 1.31 $uscheme = lc $uscheme;
721 root 1.1
722 root 1.31 my $uport = $uscheme eq "http" ? 80
723     : $uscheme eq "https" ? 443
724 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
725 root 1.13
726 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
727 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
728 root 1.10
729 root 1.86 my $uhost = lc $1;
730 root 1.10 $uport = $2 if defined $2;
731    
732 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
733     unless exists $hdr{host};
734 root 1.43
735 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
736 root 1.56 $upath .= $query if length $query;
737 root 1.10
738     $upath =~ s%^/?%/%;
739    
740     # cookie processing
741     if (my $jar = $arg{cookie_jar}) {
742 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
743 root 1.70
744 root 1.71 $hdr{cookie} = join "; ", @$cookies
745     if @$cookies;
746 root 1.10 }
747 root 1.1
748 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
749 root 1.2
750 root 1.10 if ($proxy) {
751 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
752 root 1.31
753 root 1.47 $rscheme = "http" unless defined $rscheme;
754    
755 root 1.31 # don't support https requests over https-proxy transport,
756 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
757 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
758 root 1.86
759     $rhost = lc $rhost;
760     $rscheme = lc $rscheme;
761 root 1.10 } else {
762 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
763 root 1.2 }
764    
765 root 1.47 # leave out fragment and query string, just a heuristic
766 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
767     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
768 root 1.41
769 root 1.53 $hdr{"content-length"} = length $arg{body}
770     if length $arg{body} || $method ne "GET";
771 root 1.1
772 root 1.89 my $idempotent = $method =~ /^(?:GET|HEAD|PUT|DELETE|OPTIONS|TRACE)$/;
773    
774     # default value for keepalive is true iff the request is for an idempotent method
775 root 1.92 my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : $idempotent;
776     my $keepalive10 = exists $arg{keepalive10} ? $arg{keepalive10} : !$proxy;
777     my $keptalive; # true if this is actually a recycled connection
778    
779     # the key to use in the keepalive cache
780     my $ka_key = "$uhost\x00$arg{sessionid}";
781 root 1.89
782 root 1.92 $hdr{connection} = ($keepalive ? $keepalive10 ? "keep-alive " : "" : "close ") . "Te"; #1.1
783 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
784 root 1.66
785 root 1.11 my %state = (connect_guard => 1);
786    
787 root 1.88 my $ae_error = 595; # connecting
788 root 1.1
789 root 1.88 # handle actual, non-tunneled, request
790     my $handle_actual_request = sub {
791     $ae_error = 596; # request phase
792    
793 root 1.92 my $hdl = $state{handle};
794    
795     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
796 root 1.88
797     # send request
798 root 1.92 $hdl->push_write (
799 root 1.88 "$method $rpath HTTP/1.1\015\012"
800     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
801     . "\015\012"
802     . (delete $arg{body})
803     );
804    
805     # return if error occured during push_write()
806     return unless %state;
807    
808 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
809     %hdr = ();
810 root 1.88
811     # status line and headers
812     $state{read_response} = sub {
813 root 1.92 return unless %state;
814    
815 root 1.88 for ("$_[1]") {
816     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
817    
818     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
819 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
820 root 1.88
821     # 100 Continue handling
822     # should not happen as we don't send expect: 100-continue,
823     # but we handle it just in case.
824     # since we send the request body regardless, if we get an error
825     # we are out of-sync, which we currently do NOT handle correctly.
826     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
827     if $2 eq 100;
828    
829     push @pseudo,
830     HTTPVersion => $1,
831     Status => $2,
832     Reason => $3,
833     ;
834 root 1.1
835 root 1.92 my $hdr = _parse_hdr
836     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
837 root 1.77
838 root 1.88 %hdr = (%$hdr, @pseudo);
839     }
840 root 1.82
841 root 1.88 # redirect handling
842     # microsoft and other shitheads don't give a shit for following standards,
843     # try to support some common forms of broken Location headers.
844     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
845     $hdr{location} =~ s/^\.\/+//;
846    
847     my $url = "$rscheme://$uhost:$uport";
848    
849     unless ($hdr{location} =~ s/^\///) {
850     $url .= $upath;
851     $url =~ s/\/[^\/]*$//;
852 root 1.82 }
853    
854 root 1.88 $hdr{location} = "$url/$hdr{location}";
855     }
856 root 1.82
857 root 1.88 my $redirect;
858 root 1.82
859 root 1.88 if ($recurse) {
860     my $status = $hdr{Status};
861 root 1.82
862 root 1.88 # industry standard is to redirect POST as GET for
863     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
864     # also, the UA should ask the user for 301 and 307 and POST,
865     # industry standard seems to be to simply follow.
866     # we go with the industry standard.
867     if ($status == 301 or $status == 302 or $status == 303) {
868     # HTTP/1.1 is unclear on how to mutate the method
869     $method = "GET" unless $method eq "HEAD";
870     $redirect = 1;
871     } elsif ($status == 307) {
872     $redirect = 1;
873 root 1.82 }
874 root 1.88 }
875 root 1.82
876 root 1.88 my $finish = sub { # ($data, $err_status, $err_reason[, $keepalive])
877 root 1.92 if ($state{handle}) {
878     # handle keepalive
879     if (
880     $keepalive
881     && $_[3]
882     && ($hdr{HTTPVersion} < 1.1
883     ? $hdr{connection} =~ /\bkeep-?alive\b/i
884     : $hdr{connection} !~ /\bclose\b/i)
885     ) {
886     ka_store $ka_key, delete $state{handle};
887     } else {
888     # no keepalive, destroy the handle
889     $state{handle}->destroy;
890     }
891     }
892 root 1.82
893 root 1.88 %state = ();
894 root 1.82
895 root 1.88 if (defined $_[1]) {
896     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
897     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
898     }
899 root 1.82
900 root 1.88 # set-cookie processing
901     if ($arg{cookie_jar}) {
902     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
903     }
904 root 1.82
905 root 1.88 if ($redirect && exists $hdr{location}) {
906     # we ignore any errors, as it is very common to receive
907     # Content-Length != 0 but no actual body
908     # we also access %hdr, as $_[1] might be an erro
909     http_request (
910     $method => $hdr{location},
911     %arg,
912     recurse => $recurse - 1,
913     Redirect => [$_[0], \%hdr],
914 root 1.92 $cb
915     );
916 root 1.88 } else {
917     $cb->($_[0], \%hdr);
918     }
919     };
920 root 1.82
921 root 1.88 $ae_error = 597; # body phase
922 root 1.82
923 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
924    
925     my $len = $chunked ? undef : $hdr{"content-length"};
926 root 1.82
927 root 1.88 # body handling, many different code paths
928     # - no body expected
929     # - want_body_handle
930     # - te chunked
931     # - 2x length known (with or without on_body)
932     # - 2x length not known (with or without on_body)
933     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
934     $finish->(undef, 598 => "Request cancelled by on_header");
935     } elsif (
936     $hdr{Status} =~ /^(?:1..|204|205|304)$/
937     or $method eq "HEAD"
938     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
939     ) {
940     # no body
941     $finish->("", undef, undef, 1);
942    
943     } elsif (!$redirect && $arg{want_body_handle}) {
944     $_[0]->on_eof (undef);
945     $_[0]->on_error (undef);
946     $_[0]->on_read (undef);
947    
948     $finish->(delete $state{handle});
949    
950 root 1.91 } elsif ($chunked) {
951 root 1.88 my $cl = 0;
952 root 1.92 my $body = "";
953 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
954    
955     $state{read_chunk} = sub {
956     $_[1] =~ /^([0-9a-fA-F]+)/
957     or $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
958 root 1.82
959 root 1.88 my $len = hex $1;
960 root 1.82
961 root 1.88 if ($len) {
962     $cl += $len;
963 root 1.82
964 root 1.88 $_[0]->push_read (chunk => $len, sub {
965     $on_body->($_[1], \%hdr)
966     or return $finish->(undef, 598 => "Request cancelled by on_body");
967 root 1.82
968 root 1.88 $_[0]->push_read (line => sub {
969     length $_[1]
970     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
971     $_[0]->push_read (line => $state{read_chunk});
972 root 1.82 });
973 root 1.88 });
974     } else {
975     $hdr{"content-length"} ||= $cl;
976 root 1.84
977 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
978     if (length $_[1]) {
979     for ("$_[1]") {
980     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
981 root 1.84
982 root 1.92 my $hdr = _parse_hdr
983 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
984 root 1.84
985 root 1.88 %hdr = (%hdr, %$hdr);
986     }
987     }
988 root 1.84
989 root 1.88 $finish->($body, undef, undef, 1);
990 root 1.84 });
991     }
992 root 1.88 };
993    
994     $_[0]->push_read (line => $state{read_chunk});
995    
996     } elsif ($arg{on_body}) {
997     if (defined $len) {
998     $_[0]->on_read (sub {
999     $len -= length $_[0]{rbuf};
1000    
1001     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1002     or return $finish->(undef, 598 => "Request cancelled by on_body");
1003    
1004     $len > 0
1005     or $finish->("", undef, undef, 1);
1006     });
1007 root 1.84 } else {
1008 root 1.88 $_[0]->on_eof (sub {
1009     $finish->("");
1010     });
1011     $_[0]->on_read (sub {
1012     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1013     or $finish->(undef, 598 => "Request cancelled by on_body");
1014     });
1015     }
1016     } else {
1017     $_[0]->on_eof (undef);
1018 root 1.82
1019 root 1.88 if (defined $len) {
1020     $_[0]->on_read (sub {
1021     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1022     if $len <= length $_[0]{rbuf};
1023     });
1024     } else {
1025     $_[0]->on_error (sub {
1026     ($! == Errno::EPIPE || !$!)
1027     ? $finish->(delete $_[0]{rbuf})
1028     : $finish->(undef, $ae_error => $_[2]);
1029     });
1030     $_[0]->on_read (sub { });
1031 root 1.82 }
1032 root 1.88 }
1033     };
1034 root 1.82
1035 root 1.92 # if keepalive is enabled, then the server closing the connection
1036     # before a response can happen legally - we retry on idempotent methods.
1037     if ($keptalive && $idempotent) {
1038     my $old_eof = $hdl->{on_eof};
1039     $hdl->{on_eof} = sub {
1040     _destroy_state %state;
1041    
1042     http_request (
1043     $method => $url,
1044     %arg,
1045     keepalive => 0,
1046     $cb
1047     );
1048     };
1049     $hdl->on_read (sub {
1050     return unless %state;
1051    
1052     # as soon as we receive something, a connection close
1053     # once more becomes a hard error
1054     $hdl->{on_eof} = $old_eof;
1055     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1056     });
1057     } else {
1058     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1059     }
1060     };
1061    
1062     my $prepare_handle = sub {
1063     my ($hdl) = $state{handle};
1064    
1065     $hdl->timeout ($timeout);
1066     $hdl->on_error (sub {
1067     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1068     });
1069     $hdl->on_eof (sub {
1070     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1071     });
1072 root 1.88 };
1073 root 1.82
1074 root 1.92 # connected to proxy (or origin server)
1075 root 1.88 my $connect_cb = sub {
1076 root 1.92 my $fh = shift
1077     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1078 root 1.44
1079 root 1.88 return unless delete $state{connect_guard};
1080 root 1.11
1081 root 1.88 # get handle
1082     $state{handle} = new AnyEvent::Handle
1083 root 1.92 %{ $arg{handle_params} },
1084     fh => $fh,
1085     peername => $uhost,
1086 root 1.88 tls_ctx => $arg{tls_ctx},
1087     ;
1088 root 1.11
1089 root 1.92 $prepare_handle->();
1090 root 1.1
1091 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1092 root 1.88
1093     # now handle proxy-CONNECT method
1094     if ($proxy && $uscheme eq "https") {
1095     # oh dear, we have to wrap it into a connect request
1096    
1097     # maybe re-use $uauthority with patched port?
1098     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012\015\012");
1099     $state{handle}->push_read (line => $qr_nlnl, sub {
1100     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1101 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1102 root 1.88
1103     if ($2 == 200) {
1104     $rpath = $upath;
1105     $handle_actual_request->();
1106     } else {
1107 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1108 root 1.88 }
1109     });
1110     } else {
1111     $handle_actual_request->();
1112     }
1113     };
1114    
1115     _get_slot $uhost, sub {
1116     $state{slot_guard} = shift;
1117 root 1.64
1118 root 1.88 return unless $state{connect_guard};
1119 root 1.64
1120 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1121     # on a keepalive request (in theory, this should be a separate config option).
1122     if ($keepalive && $KA_CACHE{$ka_key}) {
1123     $keptalive = 1;
1124     $state{handle} = ka_fetch $ka_key;
1125     $prepare_handle->();
1126     $handle_actual_request->();
1127    
1128     } else {
1129     my $tcp_connect = $arg{tcp_connect}
1130     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1131 root 1.57
1132 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1133     }
1134 root 1.1 };
1135    
1136 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1137 root 1.1 }
1138    
1139 elmex 1.15 sub http_get($@) {
1140 root 1.1 unshift @_, "GET";
1141     &http_request
1142     }
1143    
1144 elmex 1.15 sub http_head($@) {
1145 root 1.4 unshift @_, "HEAD";
1146     &http_request
1147     }
1148    
1149 elmex 1.15 sub http_post($$@) {
1150 root 1.22 my $url = shift;
1151     unshift @_, "POST", $url, "body";
1152 root 1.3 &http_request
1153     }
1154    
1155 root 1.9 =back
1156    
1157 root 1.55 =head2 DNS CACHING
1158    
1159     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1160     the actual connection, which in turn uses AnyEvent::DNS to resolve
1161     hostnames. The latter is a simple stub resolver and does no caching
1162     on its own. If you want DNS caching, you currently have to provide
1163     your own default resolver (by storing a suitable resolver object in
1164 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1165 root 1.55
1166 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1167 root 1.1
1168     =over 4
1169    
1170 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1171    
1172     Sets the default proxy server to use. The proxy-url must begin with a
1173 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1174 root 1.52
1175     To clear an already-set proxy, use C<undef>.
1176 root 1.2
1177 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1178    
1179     Remove all cookies from the cookie jar that have been expired. If
1180     C<$session_end> is given and true, then additionally remove all session
1181     cookies.
1182    
1183     You should call this function (with a true C<$session_end>) before you
1184     save cookies to disk, and you should call this function after loading them
1185     again. If you have a long-running program you can additonally call this
1186     function from time to time.
1187    
1188     A cookie jar is initially an empty hash-reference that is managed by this
1189     module. It's format is subject to change, but currently it is like this:
1190    
1191     The key C<version> has to contain C<1>, otherwise the hash gets
1192     emptied. All other keys are hostnames or IP addresses pointing to
1193     hash-references. The key for these inner hash references is the
1194     server path for which this cookie is meant, and the values are again
1195     hash-references. The keys of those hash-references is the cookie name, and
1196     the value, you guessed it, is another hash-reference, this time with the
1197     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1198     which have been replaced by a C<_expires> key that contains the cookie
1199     expiry timestamp.
1200    
1201     Here is an example of a cookie jar with a single cookie, so you have a
1202     chance of understanding the above paragraph:
1203    
1204     {
1205     version => 1,
1206     "10.0.0.1" => {
1207     "/" => {
1208     "mythweb_id" => {
1209     _expires => 1293917923,
1210     value => "ooRung9dThee3ooyXooM1Ohm",
1211     },
1212     },
1213     },
1214     }
1215    
1216 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1217    
1218     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1219     Date (RFC 2616).
1220    
1221     =item $timestamp = AnyEvent::HTTP::parse_date $date
1222    
1223 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1224     bunch of minor variations of those, and returns the corresponding POSIX
1225     timestamp, or C<undef> if the date cannot be parsed.
1226 root 1.61
1227 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1228 root 1.1
1229 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1230 root 1.1
1231 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1232    
1233     The default timeout for conenction operations (default: C<300>).
1234    
1235 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1236    
1237     The default value for the C<User-Agent> header (the default is
1238 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1239 root 1.1
1240 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1241 root 1.1
1242 root 1.47 The maximum number of concurrent connections to the same host (identified
1243 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
1244 root 1.92 are queued until previous connections are closed. Both persistent and
1245     non-persistent connections are counted in this limit.
1246 root 1.1
1247 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1248 root 1.92 increase it much.
1249    
1250     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1251     connections, older browsers used 2, newers (such as firefox 3) typically
1252     use 6, and Opera uses 8 because like, they have the fastest browser and
1253     give a shit for everybody else on the planet.
1254    
1255     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1256    
1257     The time after which idle persistent conenctions get closed by
1258     AnyEvent::HTTP (default: C<3>).
1259 root 1.3
1260 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1261    
1262     The number of active connections. This is not the number of currently
1263     running requests, but the number of currently open and non-idle TCP
1264 root 1.92 connections. This number can be useful for load-leveling.
1265 root 1.14
1266 root 1.1 =back
1267    
1268     =cut
1269    
1270 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1271     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1272    
1273     sub format_date($) {
1274     my ($time) = @_;
1275    
1276     # RFC 822/1123 format
1277     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1278    
1279     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1280     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1281     $H, $M, $S;
1282     }
1283    
1284     sub parse_date($) {
1285     my ($date) = @_;
1286    
1287     my ($d, $m, $y, $H, $M, $S);
1288    
1289 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1290 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1291     # cookie dates (with "-")
1292    
1293 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1294    
1295 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1296 root 1.61 # RFC 850
1297     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1298    
1299 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1300 root 1.61 # ISO C's asctime
1301     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1302     }
1303     # other formats fail in the loop below
1304    
1305     for (0..11) {
1306     if ($m eq $month[$_]) {
1307     require Time::Local;
1308     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
1309     }
1310     }
1311    
1312     undef
1313     }
1314    
1315 root 1.2 sub set_proxy($) {
1316 root 1.52 if (length $_[0]) {
1317 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1318 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1319     $PROXY = [$2, $3 || 3128, $1]
1320     } else {
1321     undef $PROXY;
1322     }
1323 root 1.2 }
1324    
1325     # initialise proxy from environment
1326 root 1.52 eval {
1327     set_proxy $ENV{http_proxy};
1328     };
1329 root 1.2
1330 root 1.93 =head2 SHOWCASE
1331    
1332     This section contaisn some more elaborate "real-world" examples or code
1333     snippets.
1334    
1335     =head2 HTTP/1.1 FILE DOWNLOAD
1336    
1337 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1338 root 1.98 goes wrong and you want to resume.
1339 root 1.93
1340     Here is a function that initiates and resumes a download. It uses the
1341     last modified time to check for file content changes, and works with many
1342     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1343     on older servers.
1344    
1345     It calls the completion callback with either C<undef>, which means a
1346     nonretryable error occured, C<0> when the download was partial and should
1347     be retried, and C<1> if it was successful.
1348    
1349     use AnyEvent::HTTP;
1350    
1351     sub download($$$) {
1352     my ($url, $file, $cb) = @_;
1353    
1354     open my $fh, "+<", $file
1355     or die "$file: $!";
1356    
1357     my %hdr;
1358     my $ofs = 0;
1359    
1360     warn stat $fh;
1361     warn -s _;
1362     if (stat $fh and -s _) {
1363     $ofs = -s _;
1364     warn "-s is ", $ofs;#d#
1365     $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1366     $hdr{"range"} = "bytes=$ofs-";
1367     }
1368    
1369     http_get $url,
1370     headers => \%hdr,
1371     on_header => sub {
1372     my ($hdr) = @_;
1373    
1374     if ($hdr->{Status} == 200 && $ofs) {
1375     # resume failed
1376     truncate $fh, $ofs = 0;
1377     }
1378    
1379     sysseek $fh, $ofs, 0;
1380    
1381     1
1382     },
1383     on_body => sub {
1384     my ($data, $hdr) = @_;
1385    
1386     if ($hdr->{Status} =~ /^2/) {
1387     length $data == syswrite $fh, $data
1388     or return; # abort on write errors
1389     }
1390    
1391     1
1392     },
1393     sub {
1394     my (undef, $hdr) = @_;
1395    
1396     my $status = $hdr->{Status};
1397    
1398     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1399     utime $fh, $time, $time;
1400     }
1401    
1402     if ($status == 200 || $status == 206 || $status == 416) {
1403     # download ok || resume ok || file already fully downloaded
1404     $cb->(1, $hdr);
1405    
1406     } elsif ($status == 412) {
1407     # file has changed while resuming, delete and retry
1408     unlink $file;
1409     $cb->(0, $hdr);
1410    
1411     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1412     # retry later
1413     $cb->(0, $hdr);
1414    
1415     } else {
1416     $cb->(undef, $hdr);
1417     }
1418     }
1419     ;
1420     }
1421    
1422     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1423     if ($_[0]) {
1424     print "OK!\n";
1425     } elsif (defined $_[0]) {
1426     print "please retry later\n";
1427     } else {
1428     print "ERROR\n";
1429     }
1430     };
1431    
1432     =head3 SOCKS PROXIES
1433 root 1.60
1434     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1435     compile your perl to support socks, or use an external program such as
1436     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1437     transparently.
1438    
1439     Alternatively, for AnyEvent::HTTP only, you can use your own
1440     C<tcp_connect> function that does the proxy handshake - here is an example
1441     that works with socks4a proxies:
1442    
1443     use Errno;
1444     use AnyEvent::Util;
1445     use AnyEvent::Socket;
1446     use AnyEvent::Handle;
1447    
1448     # host, port and username of/for your socks4a proxy
1449     my $socks_host = "10.0.0.23";
1450     my $socks_port = 9050;
1451     my $socks_user = "";
1452    
1453     sub socks4a_connect {
1454     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1455    
1456     my $hdl = new AnyEvent::Handle
1457     connect => [$socks_host, $socks_port],
1458     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1459     on_error => sub { $connect_cb->() },
1460     ;
1461    
1462     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1463    
1464     $hdl->push_read (chunk => 8, sub {
1465     my ($hdl, $chunk) = @_;
1466     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1467    
1468     if ($status == 0x5a) {
1469     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1470     } else {
1471     $! = Errno::ENXIO; $connect_cb->();
1472     }
1473     });
1474    
1475     $hdl
1476     }
1477    
1478     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1479     possibly after switching off other proxy types:
1480    
1481     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1482    
1483     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1484     my ($data, $headers) = @_;
1485     ...
1486     };
1487    
1488 root 1.1 =head1 SEE ALSO
1489    
1490     L<AnyEvent>.
1491    
1492     =head1 AUTHOR
1493    
1494 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1495     http://home.schmorp.de/
1496 root 1.1
1497 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1498     testcases and bugreports.
1499    
1500 root 1.1 =cut
1501    
1502     1
1503