ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.109
Committed: Wed Jul 27 16:11:55 2011 UTC (12 years, 9 months ago) by root
Branch: MAIN
CVS Tags: rel-2_13
Changes since 1.108: +2 -2 lines
Log Message:
2.13

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.109 our $VERSION = '2.13';
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
95     (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128     =item 595 - errors during connection etsbalishment, proxy handshake.
129    
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159     Whether to recurse requests or not, e.g. on redirects, authentication
160 root 1.3 retries and so on, and how often to do so.
161 root 1.1
162     =item headers => hashref
163    
164 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
165     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
166 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
167     (this can be suppressed by using C<undef> for these headers in which case
168     they won't be sent at all).
169 root 1.1
170 root 1.90 You really should provide your own C<User-Agent:> header value that is
171     appropriate for your program - I wouldn't be surprised if the default
172     AnyEvent string gets blocked by webservers sooner or later.
173    
174 root 1.104 Also, make sure that your headers names and values do not contain any
175     embedded newlines.
176    
177 root 1.1 =item timeout => $seconds
178    
179     The time-out to use for various stages - each connect attempt will reset
180 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
181     timeout.
182    
183     Default timeout is 5 minutes.
184 root 1.2
185     =item proxy => [$host, $port[, $scheme]] or undef
186    
187 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
188     used.
189 root 1.2
190 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
191 root 1.1
192 root 1.102 If not specified, then the default proxy is used (see
193     C<AnyEvent::HTTP::set_proxy>).
194    
195 root 1.3 =item body => $string
196    
197 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
198 root 1.3 this module might offer more options).
199    
200 root 1.10 =item cookie_jar => $hash_ref
201    
202     Passing this parameter enables (simplified) cookie-processing, loosely
203     based on the original netscape specification.
204    
205 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
206     will get updated automatically. It is possible to save the cookie jar
207     to persistent storage with something like JSON or Storable - see the
208     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
209     expired or session-only cookies, and also for documentation on the format
210     of the cookie jar.
211 root 1.10
212 root 1.70 Note that this cookie implementation is not meant to be complete. If
213     you want complete cookie management you have to do that on your
214 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
215 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
216     to.
217 root 1.10
218 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
219 root 1.70 headers will be set and handled by this module, otherwise they will be
220 root 1.69 left untouched.
221    
222 root 1.40 =item tls_ctx => $scheme | $tls_ctx
223    
224     Specifies the AnyEvent::TLS context to be used for https connections. This
225     parameter follows the same rules as the C<tls_ctx> parameter to
226     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
227     C<high> can be specified, which give you a predefined low-security (no
228     verification, highest compatibility) and high-security (CA and common-name
229     verification) TLS context.
230    
231     The default for this option is C<low>, which could be interpreted as "give
232     me the page, no matter what".
233    
234 root 1.92 See also the C<sessionid> parameter.
235    
236     =item session => $string
237    
238     The module might reuse connections to the same host internally. Sometimes
239     (e.g. when using TLS), you do not want to reuse connections from other
240     sessions. This can be achieved by setting this parameter to some unique
241     ID (such as the address of an object storing your state data, or the TLS
242     context) - only connections using the same unique ID will be reused.
243    
244 root 1.51 =item on_prepare => $callback->($fh)
245    
246     In rare cases you need to "tune" the socket before it is used to
247     connect (for exmaple, to bind it on a given IP address). This parameter
248     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
249     and behaves exactly the same way (e.g. it has to provide a
250     timeout). See the description for the C<$prepare_cb> argument of
251     C<AnyEvent::Socket::tcp_connect> for details.
252    
253 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
254    
255     In even rarer cases you want total control over how AnyEvent::HTTP
256     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
257     to do this, but you can provide your own C<tcp_connect> function -
258 root 1.60 obviously, it has to follow the same calling conventions, except that it
259     may always return a connection guard object.
260 root 1.59
261     There are probably lots of weird uses for this function, starting from
262     tracing the hosts C<http_request> actually tries to connect, to (inexact
263     but fast) host => IP address caching or even socks protocol support.
264    
265 root 1.42 =item on_header => $callback->($headers)
266 root 1.41
267     When specified, this callback will be called with the header hash as soon
268     as headers have been successfully received from the remote server (not on
269     locally-generated errors).
270    
271     It has to return either true (in which case AnyEvent::HTTP will continue),
272     or false, in which case AnyEvent::HTTP will cancel the download (and call
273     the finish callback with an error code of C<598>).
274    
275     This callback is useful, among other things, to quickly reject unwanted
276     content, which, if it is supposed to be rare, can be faster than first
277     doing a C<HEAD> request.
278    
279 root 1.68 The downside is that cancelling the request makes it impossible to re-use
280     the connection. Also, the C<on_header> callback will not receive any
281     trailer (headers sent after the response body).
282    
283 root 1.42 Example: cancel the request unless the content-type is "text/html".
284 root 1.41
285 root 1.42 on_header => sub {
286     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
287     },
288 root 1.41
289 root 1.42 =item on_body => $callback->($partial_body, $headers)
290 root 1.41
291 root 1.42 When specified, all body data will be passed to this callback instead of
292     to the completion callback. The completion callback will get the empty
293     string instead of the body data.
294 root 1.41
295 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
296     or false, in which case AnyEvent::HTTP will cancel the download (and call
297     the completion callback with an error code of C<598>).
298    
299 root 1.68 The downside to cancelling the request is that it makes it impossible to
300     re-use the connection.
301    
302 root 1.42 This callback is useful when the data is too large to be held in memory
303     (so the callback writes it to a file) or when only some information should
304     be extracted, or when the body should be processed incrementally.
305 root 1.41
306     It is usually preferred over doing your own body handling via
307 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
308     only used to create a connection, C<want_body_handle> is the better
309     alternative, as it allows you to install your own event handler, reducing
310     resource usage.
311 root 1.41
312     =item want_body_handle => $enable
313    
314     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
315     changes considerably: after parsing the headers, and instead of
316     downloading the body (if any), the completion callback will be
317     called. Instead of the C<$body> argument containing the body data, the
318     callback will receive the L<AnyEvent::Handle> object associated with the
319     connection. In error cases, C<undef> will be passed. When there is no body
320     (e.g. status C<304>), the empty string will be passed.
321    
322 root 1.92 The handle object might or might not be in TLS mode, might be connected
323     to a proxy, be a persistent connection, use chunked transfer encoding
324     etc., and configured in unspecified ways. The user is responsible for this
325     handle (it will not be used by this module anymore).
326 root 1.41
327     This is useful with some push-type services, where, after the initial
328     headers, an interactive protocol is used (typical example would be the
329     push-style twitter API which starts a JSON/XML stream).
330    
331     If you think you need this, first have a look at C<on_body>, to see if
332 root 1.45 that doesn't solve your problem in a better way.
333 root 1.41
334 root 1.92 =item persistent => $boolean
335    
336     Try to create/reuse a persistent connection. When this flag is set
337     (default: true for idempotent requests, false for all others), then
338     C<http_request> tries to re-use an existing (previously-created)
339     persistent connection to the host and, failing that, tries to create a new
340     one.
341    
342     Requests failing in certain ways will be automatically retried once, which
343     is dangerous for non-idempotent requests, which is why it defaults to off
344     for them. The reason for this is because the bozos who designed HTTP/1.1
345     made it impossible to distinguish between a fatal error and a normal
346     connection timeout, so you never know whether there was a problem with
347     your request or not.
348    
349     When reusing an existent connection, many parameters (such as TLS context)
350     will be ignored. See the C<session> parameter for a workaround.
351    
352     =item keepalive => $boolean
353    
354     Only used when C<persistent> is also true. This parameter decides whether
355     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
356     (as opposed to only a HTTP/1.1 persistent connection).
357    
358     The default is true, except when using a proxy, in which case it defaults
359     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
360    
361     =item handle_params => { key => value ... }
362    
363     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
364     constructor that is called - not all requests will create a handle, and
365     sometimes more than one is created, so this parameter is only good for
366     setting hints.
367    
368     Example: set the maximum read size to 4096, to potentially conserve memory
369     at the cost of speed.
370    
371     handle_params => {
372     max_read_size => 4096,
373     },
374    
375 root 1.1 =back
376    
377 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
378     the response body.
379 root 1.9
380     http_request GET => "http://www.nethype.de/", sub {
381     my ($body, $hdr) = @_;
382     print "$body\n";
383     };
384    
385 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
386 root 1.9 timeout of 30 seconds.
387    
388     http_request
389 root 1.109 HEAD => "https://www.google.com",
390 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
391 root 1.9 timeout => 30,
392     sub {
393     my ($body, $hdr) = @_;
394     use Data::Dumper;
395     print Dumper $hdr;
396     }
397     ;
398 root 1.1
399 root 1.68 Example: do another simple HTTP GET request, but immediately try to
400 root 1.29 cancel it.
401    
402     my $request = http_request GET => "http://www.nethype.de/", sub {
403     my ($body, $hdr) = @_;
404     print "$body\n";
405     };
406    
407     undef $request;
408    
409 root 1.1 =cut
410    
411 root 1.92 #############################################################################
412     # wait queue/slots
413    
414 root 1.12 sub _slot_schedule;
415 root 1.11 sub _slot_schedule($) {
416     my $host = shift;
417    
418     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
419     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
420 root 1.12 # somebody wants that slot
421 root 1.11 ++$CO_SLOT{$host}[0];
422 root 1.14 ++$ACTIVE;
423 root 1.11
424     $cb->(AnyEvent::Util::guard {
425 root 1.14 --$ACTIVE;
426 root 1.11 --$CO_SLOT{$host}[0];
427     _slot_schedule $host;
428     });
429     } else {
430     # nobody wants the slot, maybe we can forget about it
431     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
432     last;
433     }
434     }
435     }
436    
437     # wait for a free slot on host, call callback
438     sub _get_slot($$) {
439     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
440    
441     _slot_schedule $_[0];
442     }
443    
444 root 1.80 #############################################################################
445 root 1.92 # cookie handling
446 root 1.80
447     # expire cookies
448     sub cookie_jar_expire($;$) {
449     my ($jar, $session_end) = @_;
450    
451     %$jar = () if $jar->{version} != 1;
452    
453     my $anow = AE::now;
454    
455     while (my ($chost, $paths) = each %$jar) {
456     next unless ref $paths;
457    
458     while (my ($cpath, $cookies) = each %$paths) {
459     while (my ($cookie, $kv) = each %$cookies) {
460     if (exists $kv->{_expires}) {
461     delete $cookies->{$cookie}
462     if $anow > $kv->{_expires};
463     } elsif ($session_end) {
464     delete $cookies->{$cookie};
465     }
466     }
467    
468     delete $paths->{$cpath}
469     unless %$cookies;
470     }
471    
472     delete $jar->{$chost}
473     unless %$paths;
474     }
475     }
476    
477 root 1.72 # extract cookies from jar
478 root 1.71 sub cookie_jar_extract($$$$) {
479 root 1.92 my ($jar, $scheme, $host, $path) = @_;
480 root 1.71
481     %$jar = () if $jar->{version} != 1;
482    
483     my @cookies;
484    
485     while (my ($chost, $paths) = each %$jar) {
486     next unless ref $paths;
487    
488     if ($chost =~ /^\./) {
489 root 1.92 next unless $chost eq substr $host, -length $chost;
490 root 1.71 } elsif ($chost =~ /\./) {
491 root 1.92 next unless $chost eq $host;
492 root 1.71 } else {
493     next;
494     }
495    
496     while (my ($cpath, $cookies) = each %$paths) {
497 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
498 root 1.71
499     while (my ($cookie, $kv) = each %$cookies) {
500 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
501 root 1.71
502 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
503     delete $cookies->{$cookie};
504     next;
505 root 1.71 }
506    
507     my $value = $kv->{value};
508    
509     if ($value =~ /[=;,[:space:]]/) {
510     $value =~ s/([\\"])/\\$1/g;
511     $value = "\"$value\"";
512     }
513    
514     push @cookies, "$cookie=$value";
515     }
516     }
517     }
518    
519     \@cookies
520     }
521    
522 root 1.72 # parse set_cookie header into jar
523 root 1.80 sub cookie_jar_set_cookie($$$$) {
524 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
525 root 1.80
526     my $anow = int AE::now;
527     my $snow; # server-now
528 root 1.72
529     for ($set_cookie) {
530     # parse NAME=VALUE
531     my @kv;
532    
533 root 1.79 # expires is not http-compliant in the original cookie-spec,
534     # we support the official date format and some extensions
535 root 1.72 while (
536     m{
537     \G\s*
538     (?:
539 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
540 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
541 root 1.72 )
542     }gcxsi
543     ) {
544     my $name = $2;
545     my $value = $4;
546    
547 root 1.82 if (defined $1) {
548 root 1.72 # expires
549     $name = "expires";
550     $value = $1;
551 root 1.82 } elsif (defined $3) {
552 root 1.72 # quoted
553     $value = $3;
554     $value =~ s/\\(.)/$1/gs;
555     }
556    
557 root 1.95 push @kv, @kv ? lc $name : $name, $value;
558 root 1.72
559     last unless /\G\s*;/gc;
560     }
561    
562     last unless @kv;
563    
564     my $name = shift @kv;
565     my %kv = (value => shift @kv, @kv);
566    
567 root 1.80 if (exists $kv{"max-age"}) {
568     $kv{_expires} = $anow + delete $kv{"max-age"};
569     } elsif (exists $kv{expires}) {
570     $snow ||= parse_date ($date) || $anow;
571     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
572     } else {
573     delete $kv{_expires};
574     }
575 root 1.72
576     my $cdom;
577     my $cpath = (delete $kv{path}) || "/";
578    
579     if (exists $kv{domain}) {
580     $cdom = delete $kv{domain};
581    
582     $cdom =~ s/^\.?/./; # make sure it starts with a "."
583    
584     next if $cdom =~ /\.$/;
585    
586     # this is not rfc-like and not netscape-like. go figure.
587     my $ndots = $cdom =~ y/.//;
588     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
589     } else {
590 root 1.92 $cdom = $host;
591 root 1.72 }
592    
593     # store it
594 root 1.73 $jar->{version} = 1;
595 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
596 root 1.72
597     redo if /\G\s*,/gc;
598     }
599     }
600    
601 root 1.92 #############################################################################
602     # keepalive/persistent connection cache
603    
604     # fetch a connection from the keepalive cache
605     sub ka_fetch($) {
606     my $ka_key = shift;
607    
608     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
609     delete $KA_CACHE{$ka_key}
610     unless @{ $KA_CACHE{$ka_key} };
611    
612     $hdl
613     }
614    
615     sub ka_store($$) {
616     my ($ka_key, $hdl) = @_;
617    
618     my $kaa = $KA_CACHE{$ka_key} ||= [];
619    
620     my $destroy = sub {
621     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
622    
623     $hdl->destroy;
624    
625     @ka
626     ? $KA_CACHE{$ka_key} = \@ka
627     : delete $KA_CACHE{$ka_key};
628     };
629    
630     # on error etc., destroy
631     $hdl->on_error ($destroy);
632     $hdl->on_eof ($destroy);
633     $hdl->on_read ($destroy);
634     $hdl->timeout ($PERSISTENT_TIMEOUT);
635    
636     push @$kaa, $hdl;
637     shift @$kaa while @$kaa > $MAX_PER_HOST;
638     }
639    
640     #############################################################################
641     # utilities
642    
643 root 1.66 # continue to parse $_ for headers and place them into the arg
644 root 1.92 sub _parse_hdr() {
645 root 1.66 my %hdr;
646    
647     # things seen, not parsed:
648     # p3pP="NON CUR OTPi OUR NOR UNI"
649    
650     $hdr{lc $1} .= ",$2"
651     while /\G
652     ([^:\000-\037]*):
653     [\011\040]*
654     ((?: [^\012]+ | \012[\011\040] )*)
655     \012
656     /gxc;
657    
658     /\G$/
659     or return;
660    
661     # remove the "," prefix we added to all headers above
662     substr $_, 0, 1, ""
663     for values %hdr;
664    
665     \%hdr
666     }
667    
668 root 1.92 #############################################################################
669     # http_get
670    
671 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
672 root 1.34
673 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
674     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
675 root 1.40
676 root 1.92 # maybe it should just become a normal object :/
677    
678     sub _destroy_state(\%) {
679     my ($state) = @_;
680    
681     $state->{handle}->destroy if $state->{handle};
682     %$state = ();
683     }
684    
685     sub _error(\%$$) {
686     my ($state, $cb, $hdr) = @_;
687    
688     &_destroy_state ($state);
689    
690     $cb->(undef, $hdr);
691     ()
692     }
693    
694 elmex 1.15 sub http_request($$@) {
695 root 1.1 my $cb = pop;
696     my ($method, $url, %arg) = @_;
697    
698     my %hdr;
699    
700 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
701     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
702    
703 root 1.3 $method = uc $method;
704    
705 root 1.8 if (my $hdr = $arg{headers}) {
706 root 1.1 while (my ($k, $v) = each %$hdr) {
707     $hdr{lc $k} = $v;
708     }
709     }
710    
711 root 1.55 # pseudo headers for all subsequent responses
712     my @pseudo = (URL => $url);
713     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
714    
715 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
716 root 1.8
717 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
718 root 1.8 if $recurse < 0;
719    
720 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
721 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
722    
723 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
724 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
725 root 1.2
726 root 1.31 $uscheme = lc $uscheme;
727 root 1.1
728 root 1.31 my $uport = $uscheme eq "http" ? 80
729     : $uscheme eq "https" ? 443
730 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
731 root 1.13
732 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
733 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
734 root 1.10
735 root 1.86 my $uhost = lc $1;
736 root 1.10 $uport = $2 if defined $2;
737    
738 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
739     unless exists $hdr{host};
740 root 1.43
741 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
742 root 1.56 $upath .= $query if length $query;
743 root 1.10
744     $upath =~ s%^/?%/%;
745    
746     # cookie processing
747     if (my $jar = $arg{cookie_jar}) {
748 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
749 root 1.70
750 root 1.71 $hdr{cookie} = join "; ", @$cookies
751     if @$cookies;
752 root 1.10 }
753 root 1.1
754 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
755 root 1.2
756 root 1.10 if ($proxy) {
757 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
758 root 1.31
759 root 1.47 $rscheme = "http" unless defined $rscheme;
760    
761 root 1.31 # don't support https requests over https-proxy transport,
762 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
763 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
764 root 1.86
765     $rhost = lc $rhost;
766     $rscheme = lc $rscheme;
767 root 1.10 } else {
768 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
769 root 1.2 }
770    
771 root 1.47 # leave out fragment and query string, just a heuristic
772 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
773     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
774 root 1.41
775 root 1.53 $hdr{"content-length"} = length $arg{body}
776     if length $arg{body} || $method ne "GET";
777 root 1.1
778 root 1.89 my $idempotent = $method =~ /^(?:GET|HEAD|PUT|DELETE|OPTIONS|TRACE)$/;
779    
780     # default value for keepalive is true iff the request is for an idempotent method
781 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
782     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
783     my $was_persistent; # true if this is actually a recycled connection
784 root 1.92
785     # the key to use in the keepalive cache
786 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
787 root 1.89
788 root 1.103 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive " : "" : "close ") . "Te"; #1.1
789 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
790 root 1.66
791 root 1.11 my %state = (connect_guard => 1);
792    
793 root 1.88 my $ae_error = 595; # connecting
794 root 1.1
795 root 1.88 # handle actual, non-tunneled, request
796     my $handle_actual_request = sub {
797     $ae_error = 596; # request phase
798    
799 root 1.92 my $hdl = $state{handle};
800    
801     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
802 root 1.88
803     # send request
804 root 1.92 $hdl->push_write (
805 root 1.88 "$method $rpath HTTP/1.1\015\012"
806     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
807     . "\015\012"
808     . (delete $arg{body})
809     );
810    
811     # return if error occured during push_write()
812     return unless %state;
813    
814 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
815     %hdr = ();
816 root 1.88
817     # status line and headers
818     $state{read_response} = sub {
819 root 1.92 return unless %state;
820    
821 root 1.88 for ("$_[1]") {
822     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
823    
824     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
825 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
826 root 1.88
827     # 100 Continue handling
828     # should not happen as we don't send expect: 100-continue,
829     # but we handle it just in case.
830     # since we send the request body regardless, if we get an error
831     # we are out of-sync, which we currently do NOT handle correctly.
832     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
833     if $2 eq 100;
834    
835     push @pseudo,
836     HTTPVersion => $1,
837     Status => $2,
838     Reason => $3,
839     ;
840 root 1.1
841 root 1.92 my $hdr = _parse_hdr
842     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
843 root 1.77
844 root 1.88 %hdr = (%$hdr, @pseudo);
845     }
846 root 1.82
847 root 1.88 # redirect handling
848     # microsoft and other shitheads don't give a shit for following standards,
849     # try to support some common forms of broken Location headers.
850     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
851     $hdr{location} =~ s/^\.\/+//;
852    
853     my $url = "$rscheme://$uhost:$uport";
854    
855     unless ($hdr{location} =~ s/^\///) {
856     $url .= $upath;
857     $url =~ s/\/[^\/]*$//;
858 root 1.82 }
859    
860 root 1.88 $hdr{location} = "$url/$hdr{location}";
861     }
862 root 1.82
863 root 1.88 my $redirect;
864 root 1.82
865 root 1.88 if ($recurse) {
866     my $status = $hdr{Status};
867 root 1.82
868 root 1.88 # industry standard is to redirect POST as GET for
869     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
870     # also, the UA should ask the user for 301 and 307 and POST,
871     # industry standard seems to be to simply follow.
872     # we go with the industry standard.
873     if ($status == 301 or $status == 302 or $status == 303) {
874     # HTTP/1.1 is unclear on how to mutate the method
875     $method = "GET" unless $method eq "HEAD";
876     $redirect = 1;
877     } elsif ($status == 307) {
878     $redirect = 1;
879 root 1.82 }
880 root 1.88 }
881 root 1.82
882 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
883 root 1.92 if ($state{handle}) {
884     # handle keepalive
885     if (
886 root 1.103 $persistent
887 root 1.92 && $_[3]
888     && ($hdr{HTTPVersion} < 1.1
889     ? $hdr{connection} =~ /\bkeep-?alive\b/i
890     : $hdr{connection} !~ /\bclose\b/i)
891     ) {
892     ka_store $ka_key, delete $state{handle};
893     } else {
894     # no keepalive, destroy the handle
895     $state{handle}->destroy;
896     }
897     }
898 root 1.82
899 root 1.88 %state = ();
900 root 1.82
901 root 1.88 if (defined $_[1]) {
902     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
903     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
904     }
905 root 1.82
906 root 1.88 # set-cookie processing
907     if ($arg{cookie_jar}) {
908     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
909     }
910 root 1.82
911 root 1.88 if ($redirect && exists $hdr{location}) {
912     # we ignore any errors, as it is very common to receive
913     # Content-Length != 0 but no actual body
914     # we also access %hdr, as $_[1] might be an erro
915 root 1.103 $state{recurse} =
916     http_request (
917     $method => $hdr{location},
918     %arg,
919     recurse => $recurse - 1,
920     Redirect => [$_[0], \%hdr],
921     sub {
922     %state = ();
923     &$cb
924     },
925     );
926 root 1.88 } else {
927     $cb->($_[0], \%hdr);
928     }
929     };
930 root 1.82
931 root 1.88 $ae_error = 597; # body phase
932 root 1.82
933 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
934    
935     my $len = $chunked ? undef : $hdr{"content-length"};
936 root 1.82
937 root 1.88 # body handling, many different code paths
938     # - no body expected
939     # - want_body_handle
940     # - te chunked
941     # - 2x length known (with or without on_body)
942     # - 2x length not known (with or without on_body)
943     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
944     $finish->(undef, 598 => "Request cancelled by on_header");
945     } elsif (
946     $hdr{Status} =~ /^(?:1..|204|205|304)$/
947     or $method eq "HEAD"
948     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
949     ) {
950     # no body
951     $finish->("", undef, undef, 1);
952    
953     } elsif (!$redirect && $arg{want_body_handle}) {
954     $_[0]->on_eof (undef);
955     $_[0]->on_error (undef);
956     $_[0]->on_read (undef);
957    
958     $finish->(delete $state{handle});
959    
960 root 1.91 } elsif ($chunked) {
961 root 1.88 my $cl = 0;
962 root 1.92 my $body = "";
963 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
964    
965     $state{read_chunk} = sub {
966     $_[1] =~ /^([0-9a-fA-F]+)/
967 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
968 root 1.82
969 root 1.88 my $len = hex $1;
970 root 1.82
971 root 1.88 if ($len) {
972     $cl += $len;
973 root 1.82
974 root 1.88 $_[0]->push_read (chunk => $len, sub {
975     $on_body->($_[1], \%hdr)
976     or return $finish->(undef, 598 => "Request cancelled by on_body");
977 root 1.82
978 root 1.88 $_[0]->push_read (line => sub {
979     length $_[1]
980     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
981     $_[0]->push_read (line => $state{read_chunk});
982 root 1.82 });
983 root 1.88 });
984     } else {
985     $hdr{"content-length"} ||= $cl;
986 root 1.84
987 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
988     if (length $_[1]) {
989     for ("$_[1]") {
990     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
991 root 1.84
992 root 1.92 my $hdr = _parse_hdr
993 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
994 root 1.84
995 root 1.88 %hdr = (%hdr, %$hdr);
996     }
997     }
998 root 1.84
999 root 1.88 $finish->($body, undef, undef, 1);
1000 root 1.84 });
1001     }
1002 root 1.88 };
1003    
1004     $_[0]->push_read (line => $state{read_chunk});
1005    
1006     } elsif ($arg{on_body}) {
1007     if (defined $len) {
1008     $_[0]->on_read (sub {
1009     $len -= length $_[0]{rbuf};
1010    
1011     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1012     or return $finish->(undef, 598 => "Request cancelled by on_body");
1013    
1014     $len > 0
1015     or $finish->("", undef, undef, 1);
1016     });
1017 root 1.84 } else {
1018 root 1.88 $_[0]->on_eof (sub {
1019     $finish->("");
1020     });
1021     $_[0]->on_read (sub {
1022     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1023     or $finish->(undef, 598 => "Request cancelled by on_body");
1024     });
1025     }
1026     } else {
1027     $_[0]->on_eof (undef);
1028 root 1.82
1029 root 1.88 if (defined $len) {
1030     $_[0]->on_read (sub {
1031     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1032     if $len <= length $_[0]{rbuf};
1033     });
1034     } else {
1035     $_[0]->on_error (sub {
1036     ($! == Errno::EPIPE || !$!)
1037     ? $finish->(delete $_[0]{rbuf})
1038     : $finish->(undef, $ae_error => $_[2]);
1039     });
1040     $_[0]->on_read (sub { });
1041 root 1.82 }
1042 root 1.88 }
1043     };
1044 root 1.82
1045 root 1.92 # if keepalive is enabled, then the server closing the connection
1046     # before a response can happen legally - we retry on idempotent methods.
1047 root 1.103 if ($was_persistent && $idempotent) {
1048 root 1.92 my $old_eof = $hdl->{on_eof};
1049     $hdl->{on_eof} = sub {
1050     _destroy_state %state;
1051    
1052 root 1.103 %state = ();
1053     $state{recurse} =
1054     http_request (
1055     $method => $url,
1056     %arg,
1057     keepalive => 0,
1058     sub {
1059     %state = ();
1060     &$cb
1061     }
1062     );
1063 root 1.92 };
1064     $hdl->on_read (sub {
1065     return unless %state;
1066    
1067     # as soon as we receive something, a connection close
1068     # once more becomes a hard error
1069     $hdl->{on_eof} = $old_eof;
1070     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1071     });
1072     } else {
1073     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1074     }
1075     };
1076    
1077     my $prepare_handle = sub {
1078     my ($hdl) = $state{handle};
1079    
1080     $hdl->on_error (sub {
1081     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1082     });
1083     $hdl->on_eof (sub {
1084     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1085     });
1086 root 1.103 $hdl->timeout_reset;
1087     $hdl->timeout ($timeout);
1088 root 1.88 };
1089 root 1.82
1090 root 1.92 # connected to proxy (or origin server)
1091 root 1.88 my $connect_cb = sub {
1092 root 1.92 my $fh = shift
1093     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1094 root 1.44
1095 root 1.88 return unless delete $state{connect_guard};
1096 root 1.11
1097 root 1.88 # get handle
1098     $state{handle} = new AnyEvent::Handle
1099 root 1.92 %{ $arg{handle_params} },
1100     fh => $fh,
1101     peername => $uhost,
1102 root 1.88 tls_ctx => $arg{tls_ctx},
1103     ;
1104 root 1.11
1105 root 1.92 $prepare_handle->();
1106 root 1.1
1107 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1108 root 1.88
1109     # now handle proxy-CONNECT method
1110     if ($proxy && $uscheme eq "https") {
1111     # oh dear, we have to wrap it into a connect request
1112    
1113     # maybe re-use $uauthority with patched port?
1114     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012\015\012");
1115     $state{handle}->push_read (line => $qr_nlnl, sub {
1116     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1117 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1118 root 1.88
1119     if ($2 == 200) {
1120     $rpath = $upath;
1121     $handle_actual_request->();
1122     } else {
1123 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1124 root 1.88 }
1125     });
1126     } else {
1127     $handle_actual_request->();
1128     }
1129     };
1130    
1131     _get_slot $uhost, sub {
1132     $state{slot_guard} = shift;
1133 root 1.64
1134 root 1.88 return unless $state{connect_guard};
1135 root 1.64
1136 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1137     # on a keepalive request (in theory, this should be a separate config option).
1138 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1139     $was_persistent = 1;
1140    
1141 root 1.92 $state{handle} = ka_fetch $ka_key;
1142 root 1.103 $state{handle}->destroyed
1143 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1144 root 1.92 $prepare_handle->();
1145 root 1.103 $state{handle}->destroyed
1146 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1147 root 1.92 $handle_actual_request->();
1148    
1149     } else {
1150     my $tcp_connect = $arg{tcp_connect}
1151     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1152 root 1.57
1153 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1154     }
1155 root 1.1 };
1156    
1157 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1158 root 1.1 }
1159    
1160 elmex 1.15 sub http_get($@) {
1161 root 1.1 unshift @_, "GET";
1162     &http_request
1163     }
1164    
1165 elmex 1.15 sub http_head($@) {
1166 root 1.4 unshift @_, "HEAD";
1167     &http_request
1168     }
1169    
1170 elmex 1.15 sub http_post($$@) {
1171 root 1.22 my $url = shift;
1172     unshift @_, "POST", $url, "body";
1173 root 1.3 &http_request
1174     }
1175    
1176 root 1.9 =back
1177    
1178 root 1.55 =head2 DNS CACHING
1179    
1180     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1181     the actual connection, which in turn uses AnyEvent::DNS to resolve
1182     hostnames. The latter is a simple stub resolver and does no caching
1183     on its own. If you want DNS caching, you currently have to provide
1184     your own default resolver (by storing a suitable resolver object in
1185 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1186 root 1.55
1187 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1188 root 1.1
1189     =over 4
1190    
1191 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1192    
1193     Sets the default proxy server to use. The proxy-url must begin with a
1194 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1195 root 1.52
1196     To clear an already-set proxy, use C<undef>.
1197 root 1.2
1198 root 1.102 When AnyEvent::HTTP is laoded for the first time it will query the
1199     default proxy from the operating system, currently by looking at
1200     C<$ENV{http_proxy>}.
1201    
1202 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1203    
1204     Remove all cookies from the cookie jar that have been expired. If
1205     C<$session_end> is given and true, then additionally remove all session
1206     cookies.
1207    
1208     You should call this function (with a true C<$session_end>) before you
1209     save cookies to disk, and you should call this function after loading them
1210     again. If you have a long-running program you can additonally call this
1211     function from time to time.
1212    
1213     A cookie jar is initially an empty hash-reference that is managed by this
1214     module. It's format is subject to change, but currently it is like this:
1215    
1216     The key C<version> has to contain C<1>, otherwise the hash gets
1217     emptied. All other keys are hostnames or IP addresses pointing to
1218     hash-references. The key for these inner hash references is the
1219     server path for which this cookie is meant, and the values are again
1220     hash-references. The keys of those hash-references is the cookie name, and
1221     the value, you guessed it, is another hash-reference, this time with the
1222     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1223     which have been replaced by a C<_expires> key that contains the cookie
1224     expiry timestamp.
1225    
1226     Here is an example of a cookie jar with a single cookie, so you have a
1227     chance of understanding the above paragraph:
1228    
1229     {
1230     version => 1,
1231     "10.0.0.1" => {
1232     "/" => {
1233     "mythweb_id" => {
1234     _expires => 1293917923,
1235     value => "ooRung9dThee3ooyXooM1Ohm",
1236     },
1237     },
1238     },
1239     }
1240    
1241 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1242    
1243     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1244     Date (RFC 2616).
1245    
1246     =item $timestamp = AnyEvent::HTTP::parse_date $date
1247    
1248 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1249     bunch of minor variations of those, and returns the corresponding POSIX
1250     timestamp, or C<undef> if the date cannot be parsed.
1251 root 1.61
1252 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1253 root 1.1
1254 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1255 root 1.1
1256 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1257    
1258     The default timeout for conenction operations (default: C<300>).
1259    
1260 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1261    
1262     The default value for the C<User-Agent> header (the default is
1263 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1264 root 1.1
1265 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1266 root 1.1
1267 root 1.47 The maximum number of concurrent connections to the same host (identified
1268 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
1269 root 1.92 are queued until previous connections are closed. Both persistent and
1270     non-persistent connections are counted in this limit.
1271 root 1.1
1272 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1273 root 1.92 increase it much.
1274    
1275     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1276     connections, older browsers used 2, newers (such as firefox 3) typically
1277     use 6, and Opera uses 8 because like, they have the fastest browser and
1278     give a shit for everybody else on the planet.
1279    
1280     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1281    
1282     The time after which idle persistent conenctions get closed by
1283     AnyEvent::HTTP (default: C<3>).
1284 root 1.3
1285 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1286    
1287     The number of active connections. This is not the number of currently
1288     running requests, but the number of currently open and non-idle TCP
1289 root 1.92 connections. This number can be useful for load-leveling.
1290 root 1.14
1291 root 1.1 =back
1292    
1293     =cut
1294    
1295 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1296     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1297    
1298     sub format_date($) {
1299     my ($time) = @_;
1300    
1301     # RFC 822/1123 format
1302     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1303    
1304     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1305     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1306     $H, $M, $S;
1307     }
1308    
1309     sub parse_date($) {
1310     my ($date) = @_;
1311    
1312     my ($d, $m, $y, $H, $M, $S);
1313    
1314 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1315 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1316     # cookie dates (with "-")
1317    
1318 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1319    
1320 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1321 root 1.61 # RFC 850
1322     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1323    
1324 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1325 root 1.61 # ISO C's asctime
1326     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1327     }
1328     # other formats fail in the loop below
1329    
1330     for (0..11) {
1331     if ($m eq $month[$_]) {
1332     require Time::Local;
1333     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
1334     }
1335     }
1336    
1337     undef
1338     }
1339    
1340 root 1.2 sub set_proxy($) {
1341 root 1.52 if (length $_[0]) {
1342 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1343 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1344     $PROXY = [$2, $3 || 3128, $1]
1345     } else {
1346     undef $PROXY;
1347     }
1348 root 1.2 }
1349    
1350     # initialise proxy from environment
1351 root 1.52 eval {
1352     set_proxy $ENV{http_proxy};
1353     };
1354 root 1.2
1355 root 1.93 =head2 SHOWCASE
1356    
1357     This section contaisn some more elaborate "real-world" examples or code
1358     snippets.
1359    
1360     =head2 HTTP/1.1 FILE DOWNLOAD
1361    
1362 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1363 root 1.98 goes wrong and you want to resume.
1364 root 1.93
1365     Here is a function that initiates and resumes a download. It uses the
1366     last modified time to check for file content changes, and works with many
1367     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1368     on older servers.
1369    
1370     It calls the completion callback with either C<undef>, which means a
1371     nonretryable error occured, C<0> when the download was partial and should
1372     be retried, and C<1> if it was successful.
1373    
1374     use AnyEvent::HTTP;
1375    
1376     sub download($$$) {
1377     my ($url, $file, $cb) = @_;
1378    
1379     open my $fh, "+<", $file
1380     or die "$file: $!";
1381    
1382     my %hdr;
1383     my $ofs = 0;
1384    
1385     warn stat $fh;
1386     warn -s _;
1387     if (stat $fh and -s _) {
1388     $ofs = -s _;
1389 root 1.107 warn "-s is ", $ofs;
1390 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1391     $hdr{"range"} = "bytes=$ofs-";
1392     }
1393    
1394     http_get $url,
1395     headers => \%hdr,
1396     on_header => sub {
1397     my ($hdr) = @_;
1398    
1399     if ($hdr->{Status} == 200 && $ofs) {
1400     # resume failed
1401     truncate $fh, $ofs = 0;
1402     }
1403    
1404     sysseek $fh, $ofs, 0;
1405    
1406     1
1407     },
1408     on_body => sub {
1409     my ($data, $hdr) = @_;
1410    
1411     if ($hdr->{Status} =~ /^2/) {
1412     length $data == syswrite $fh, $data
1413     or return; # abort on write errors
1414     }
1415    
1416     1
1417     },
1418     sub {
1419     my (undef, $hdr) = @_;
1420    
1421     my $status = $hdr->{Status};
1422    
1423     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1424     utime $fh, $time, $time;
1425     }
1426    
1427     if ($status == 200 || $status == 206 || $status == 416) {
1428     # download ok || resume ok || file already fully downloaded
1429     $cb->(1, $hdr);
1430    
1431     } elsif ($status == 412) {
1432     # file has changed while resuming, delete and retry
1433     unlink $file;
1434     $cb->(0, $hdr);
1435    
1436     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1437     # retry later
1438     $cb->(0, $hdr);
1439    
1440     } else {
1441     $cb->(undef, $hdr);
1442     }
1443     }
1444     ;
1445     }
1446    
1447     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1448     if ($_[0]) {
1449     print "OK!\n";
1450     } elsif (defined $_[0]) {
1451     print "please retry later\n";
1452     } else {
1453     print "ERROR\n";
1454     }
1455     };
1456    
1457     =head3 SOCKS PROXIES
1458 root 1.60
1459     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1460     compile your perl to support socks, or use an external program such as
1461     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1462     transparently.
1463    
1464     Alternatively, for AnyEvent::HTTP only, you can use your own
1465     C<tcp_connect> function that does the proxy handshake - here is an example
1466     that works with socks4a proxies:
1467    
1468     use Errno;
1469     use AnyEvent::Util;
1470     use AnyEvent::Socket;
1471     use AnyEvent::Handle;
1472    
1473     # host, port and username of/for your socks4a proxy
1474     my $socks_host = "10.0.0.23";
1475     my $socks_port = 9050;
1476     my $socks_user = "";
1477    
1478     sub socks4a_connect {
1479     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1480    
1481     my $hdl = new AnyEvent::Handle
1482     connect => [$socks_host, $socks_port],
1483     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1484     on_error => sub { $connect_cb->() },
1485     ;
1486    
1487     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1488    
1489     $hdl->push_read (chunk => 8, sub {
1490     my ($hdl, $chunk) = @_;
1491     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1492    
1493     if ($status == 0x5a) {
1494     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1495     } else {
1496     $! = Errno::ENXIO; $connect_cb->();
1497     }
1498     });
1499    
1500     $hdl
1501     }
1502    
1503     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1504     possibly after switching off other proxy types:
1505    
1506     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1507    
1508     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1509     my ($data, $headers) = @_;
1510     ...
1511     };
1512    
1513 root 1.1 =head1 SEE ALSO
1514    
1515     L<AnyEvent>.
1516    
1517     =head1 AUTHOR
1518    
1519 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1520     http://home.schmorp.de/
1521 root 1.1
1522 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1523     testcases and bugreports.
1524    
1525 root 1.1 =cut
1526    
1527     1
1528