ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.115
Committed: Fri Jan 18 22:55:39 2013 UTC (11 years, 4 months ago) by root
Branch: MAIN
Changes since 1.114: +3 -2 lines
Log Message:
typos

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.112 our $VERSION = '2.15';
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
95     (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128 root 1.114 =item 595 - errors during connection establishment, proxy handshake.
129 root 1.77
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
160     other retries and so on, and how often to do so.
161 root 1.1
162     =item headers => hashref
163    
164 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
165     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
166 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
167     (this can be suppressed by using C<undef> for these headers in which case
168     they won't be sent at all).
169 root 1.1
170 root 1.90 You really should provide your own C<User-Agent:> header value that is
171     appropriate for your program - I wouldn't be surprised if the default
172     AnyEvent string gets blocked by webservers sooner or later.
173    
174 root 1.104 Also, make sure that your headers names and values do not contain any
175     embedded newlines.
176    
177 root 1.1 =item timeout => $seconds
178    
179     The time-out to use for various stages - each connect attempt will reset
180 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
181     timeout.
182    
183     Default timeout is 5 minutes.
184 root 1.2
185     =item proxy => [$host, $port[, $scheme]] or undef
186    
187 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
188     used.
189 root 1.2
190 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
191 root 1.1
192 root 1.102 If not specified, then the default proxy is used (see
193     C<AnyEvent::HTTP::set_proxy>).
194    
195 root 1.3 =item body => $string
196    
197 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
198 root 1.3 this module might offer more options).
199    
200 root 1.10 =item cookie_jar => $hash_ref
201    
202     Passing this parameter enables (simplified) cookie-processing, loosely
203     based on the original netscape specification.
204    
205 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
206     will get updated automatically. It is possible to save the cookie jar
207     to persistent storage with something like JSON or Storable - see the
208     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
209     expired or session-only cookies, and also for documentation on the format
210     of the cookie jar.
211 root 1.10
212 root 1.70 Note that this cookie implementation is not meant to be complete. If
213     you want complete cookie management you have to do that on your
214 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
215 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
216     to.
217 root 1.10
218 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
219 root 1.70 headers will be set and handled by this module, otherwise they will be
220 root 1.69 left untouched.
221    
222 root 1.40 =item tls_ctx => $scheme | $tls_ctx
223    
224     Specifies the AnyEvent::TLS context to be used for https connections. This
225     parameter follows the same rules as the C<tls_ctx> parameter to
226     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
227     C<high> can be specified, which give you a predefined low-security (no
228     verification, highest compatibility) and high-security (CA and common-name
229     verification) TLS context.
230    
231     The default for this option is C<low>, which could be interpreted as "give
232     me the page, no matter what".
233    
234 root 1.92 See also the C<sessionid> parameter.
235    
236     =item session => $string
237    
238     The module might reuse connections to the same host internally. Sometimes
239     (e.g. when using TLS), you do not want to reuse connections from other
240     sessions. This can be achieved by setting this parameter to some unique
241     ID (such as the address of an object storing your state data, or the TLS
242     context) - only connections using the same unique ID will be reused.
243    
244 root 1.51 =item on_prepare => $callback->($fh)
245    
246     In rare cases you need to "tune" the socket before it is used to
247     connect (for exmaple, to bind it on a given IP address). This parameter
248     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
249     and behaves exactly the same way (e.g. it has to provide a
250     timeout). See the description for the C<$prepare_cb> argument of
251     C<AnyEvent::Socket::tcp_connect> for details.
252    
253 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
254    
255     In even rarer cases you want total control over how AnyEvent::HTTP
256     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
257     to do this, but you can provide your own C<tcp_connect> function -
258 root 1.60 obviously, it has to follow the same calling conventions, except that it
259     may always return a connection guard object.
260 root 1.59
261     There are probably lots of weird uses for this function, starting from
262     tracing the hosts C<http_request> actually tries to connect, to (inexact
263     but fast) host => IP address caching or even socks protocol support.
264    
265 root 1.42 =item on_header => $callback->($headers)
266 root 1.41
267     When specified, this callback will be called with the header hash as soon
268     as headers have been successfully received from the remote server (not on
269     locally-generated errors).
270    
271     It has to return either true (in which case AnyEvent::HTTP will continue),
272     or false, in which case AnyEvent::HTTP will cancel the download (and call
273     the finish callback with an error code of C<598>).
274    
275     This callback is useful, among other things, to quickly reject unwanted
276     content, which, if it is supposed to be rare, can be faster than first
277     doing a C<HEAD> request.
278    
279 root 1.68 The downside is that cancelling the request makes it impossible to re-use
280     the connection. Also, the C<on_header> callback will not receive any
281     trailer (headers sent after the response body).
282    
283 root 1.42 Example: cancel the request unless the content-type is "text/html".
284 root 1.41
285 root 1.42 on_header => sub {
286     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
287     },
288 root 1.41
289 root 1.42 =item on_body => $callback->($partial_body, $headers)
290 root 1.41
291 root 1.42 When specified, all body data will be passed to this callback instead of
292     to the completion callback. The completion callback will get the empty
293     string instead of the body data.
294 root 1.41
295 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
296     or false, in which case AnyEvent::HTTP will cancel the download (and call
297     the completion callback with an error code of C<598>).
298    
299 root 1.68 The downside to cancelling the request is that it makes it impossible to
300     re-use the connection.
301    
302 root 1.42 This callback is useful when the data is too large to be held in memory
303     (so the callback writes it to a file) or when only some information should
304     be extracted, or when the body should be processed incrementally.
305 root 1.41
306     It is usually preferred over doing your own body handling via
307 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
308     only used to create a connection, C<want_body_handle> is the better
309     alternative, as it allows you to install your own event handler, reducing
310     resource usage.
311 root 1.41
312     =item want_body_handle => $enable
313    
314     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
315     changes considerably: after parsing the headers, and instead of
316     downloading the body (if any), the completion callback will be
317     called. Instead of the C<$body> argument containing the body data, the
318     callback will receive the L<AnyEvent::Handle> object associated with the
319     connection. In error cases, C<undef> will be passed. When there is no body
320     (e.g. status C<304>), the empty string will be passed.
321    
322 root 1.92 The handle object might or might not be in TLS mode, might be connected
323     to a proxy, be a persistent connection, use chunked transfer encoding
324     etc., and configured in unspecified ways. The user is responsible for this
325     handle (it will not be used by this module anymore).
326 root 1.41
327     This is useful with some push-type services, where, after the initial
328     headers, an interactive protocol is used (typical example would be the
329     push-style twitter API which starts a JSON/XML stream).
330    
331     If you think you need this, first have a look at C<on_body>, to see if
332 root 1.45 that doesn't solve your problem in a better way.
333 root 1.41
334 root 1.92 =item persistent => $boolean
335    
336     Try to create/reuse a persistent connection. When this flag is set
337     (default: true for idempotent requests, false for all others), then
338     C<http_request> tries to re-use an existing (previously-created)
339     persistent connection to the host and, failing that, tries to create a new
340     one.
341    
342     Requests failing in certain ways will be automatically retried once, which
343     is dangerous for non-idempotent requests, which is why it defaults to off
344     for them. The reason for this is because the bozos who designed HTTP/1.1
345     made it impossible to distinguish between a fatal error and a normal
346     connection timeout, so you never know whether there was a problem with
347     your request or not.
348    
349     When reusing an existent connection, many parameters (such as TLS context)
350     will be ignored. See the C<session> parameter for a workaround.
351    
352     =item keepalive => $boolean
353    
354     Only used when C<persistent> is also true. This parameter decides whether
355     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
356     (as opposed to only a HTTP/1.1 persistent connection).
357    
358     The default is true, except when using a proxy, in which case it defaults
359     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
360    
361     =item handle_params => { key => value ... }
362    
363     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
364     constructor that is called - not all requests will create a handle, and
365     sometimes more than one is created, so this parameter is only good for
366     setting hints.
367    
368     Example: set the maximum read size to 4096, to potentially conserve memory
369     at the cost of speed.
370    
371     handle_params => {
372     max_read_size => 4096,
373     },
374    
375 root 1.1 =back
376    
377 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
378     the response body.
379 root 1.9
380     http_request GET => "http://www.nethype.de/", sub {
381     my ($body, $hdr) = @_;
382     print "$body\n";
383     };
384    
385 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
386 root 1.9 timeout of 30 seconds.
387    
388     http_request
389 root 1.109 HEAD => "https://www.google.com",
390 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
391 root 1.9 timeout => 30,
392     sub {
393     my ($body, $hdr) = @_;
394     use Data::Dumper;
395     print Dumper $hdr;
396     }
397     ;
398 root 1.1
399 root 1.68 Example: do another simple HTTP GET request, but immediately try to
400 root 1.29 cancel it.
401    
402     my $request = http_request GET => "http://www.nethype.de/", sub {
403     my ($body, $hdr) = @_;
404     print "$body\n";
405     };
406    
407     undef $request;
408    
409 root 1.1 =cut
410    
411 root 1.92 #############################################################################
412     # wait queue/slots
413    
414 root 1.12 sub _slot_schedule;
415 root 1.11 sub _slot_schedule($) {
416     my $host = shift;
417    
418     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
419     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
420 root 1.12 # somebody wants that slot
421 root 1.11 ++$CO_SLOT{$host}[0];
422 root 1.14 ++$ACTIVE;
423 root 1.11
424     $cb->(AnyEvent::Util::guard {
425 root 1.14 --$ACTIVE;
426 root 1.11 --$CO_SLOT{$host}[0];
427     _slot_schedule $host;
428     });
429     } else {
430     # nobody wants the slot, maybe we can forget about it
431     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
432     last;
433     }
434     }
435     }
436    
437     # wait for a free slot on host, call callback
438     sub _get_slot($$) {
439     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
440    
441     _slot_schedule $_[0];
442     }
443    
444 root 1.80 #############################################################################
445 root 1.92 # cookie handling
446 root 1.80
447     # expire cookies
448     sub cookie_jar_expire($;$) {
449     my ($jar, $session_end) = @_;
450    
451     %$jar = () if $jar->{version} != 1;
452    
453     my $anow = AE::now;
454    
455     while (my ($chost, $paths) = each %$jar) {
456     next unless ref $paths;
457    
458     while (my ($cpath, $cookies) = each %$paths) {
459     while (my ($cookie, $kv) = each %$cookies) {
460     if (exists $kv->{_expires}) {
461     delete $cookies->{$cookie}
462     if $anow > $kv->{_expires};
463     } elsif ($session_end) {
464     delete $cookies->{$cookie};
465     }
466     }
467    
468     delete $paths->{$cpath}
469     unless %$cookies;
470     }
471    
472     delete $jar->{$chost}
473     unless %$paths;
474     }
475     }
476    
477 root 1.72 # extract cookies from jar
478 root 1.71 sub cookie_jar_extract($$$$) {
479 root 1.92 my ($jar, $scheme, $host, $path) = @_;
480 root 1.71
481     %$jar = () if $jar->{version} != 1;
482    
483     my @cookies;
484    
485     while (my ($chost, $paths) = each %$jar) {
486     next unless ref $paths;
487    
488     if ($chost =~ /^\./) {
489 root 1.92 next unless $chost eq substr $host, -length $chost;
490 root 1.71 } elsif ($chost =~ /\./) {
491 root 1.92 next unless $chost eq $host;
492 root 1.71 } else {
493     next;
494     }
495    
496     while (my ($cpath, $cookies) = each %$paths) {
497 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
498 root 1.71
499     while (my ($cookie, $kv) = each %$cookies) {
500 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
501 root 1.71
502 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
503     delete $cookies->{$cookie};
504     next;
505 root 1.71 }
506    
507     my $value = $kv->{value};
508    
509     if ($value =~ /[=;,[:space:]]/) {
510     $value =~ s/([\\"])/\\$1/g;
511     $value = "\"$value\"";
512     }
513    
514     push @cookies, "$cookie=$value";
515     }
516     }
517     }
518    
519     \@cookies
520     }
521    
522 root 1.72 # parse set_cookie header into jar
523 root 1.80 sub cookie_jar_set_cookie($$$$) {
524 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
525 root 1.80
526     my $anow = int AE::now;
527     my $snow; # server-now
528 root 1.72
529     for ($set_cookie) {
530     # parse NAME=VALUE
531     my @kv;
532    
533 root 1.79 # expires is not http-compliant in the original cookie-spec,
534     # we support the official date format and some extensions
535 root 1.72 while (
536     m{
537     \G\s*
538     (?:
539 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
540 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
541 root 1.72 )
542     }gcxsi
543     ) {
544     my $name = $2;
545     my $value = $4;
546    
547 root 1.82 if (defined $1) {
548 root 1.72 # expires
549     $name = "expires";
550     $value = $1;
551 root 1.82 } elsif (defined $3) {
552 root 1.72 # quoted
553     $value = $3;
554     $value =~ s/\\(.)/$1/gs;
555     }
556    
557 root 1.95 push @kv, @kv ? lc $name : $name, $value;
558 root 1.72
559     last unless /\G\s*;/gc;
560     }
561    
562     last unless @kv;
563    
564     my $name = shift @kv;
565     my %kv = (value => shift @kv, @kv);
566    
567 root 1.80 if (exists $kv{"max-age"}) {
568     $kv{_expires} = $anow + delete $kv{"max-age"};
569     } elsif (exists $kv{expires}) {
570     $snow ||= parse_date ($date) || $anow;
571     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
572     } else {
573     delete $kv{_expires};
574     }
575 root 1.72
576     my $cdom;
577     my $cpath = (delete $kv{path}) || "/";
578    
579     if (exists $kv{domain}) {
580     $cdom = delete $kv{domain};
581    
582     $cdom =~ s/^\.?/./; # make sure it starts with a "."
583    
584     next if $cdom =~ /\.$/;
585    
586     # this is not rfc-like and not netscape-like. go figure.
587     my $ndots = $cdom =~ y/.//;
588     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
589     } else {
590 root 1.92 $cdom = $host;
591 root 1.72 }
592    
593     # store it
594 root 1.73 $jar->{version} = 1;
595 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
596 root 1.72
597     redo if /\G\s*,/gc;
598     }
599     }
600    
601 root 1.92 #############################################################################
602     # keepalive/persistent connection cache
603    
604     # fetch a connection from the keepalive cache
605     sub ka_fetch($) {
606     my $ka_key = shift;
607    
608     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
609     delete $KA_CACHE{$ka_key}
610     unless @{ $KA_CACHE{$ka_key} };
611    
612     $hdl
613     }
614    
615     sub ka_store($$) {
616     my ($ka_key, $hdl) = @_;
617    
618     my $kaa = $KA_CACHE{$ka_key} ||= [];
619    
620     my $destroy = sub {
621     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
622    
623     $hdl->destroy;
624    
625     @ka
626     ? $KA_CACHE{$ka_key} = \@ka
627     : delete $KA_CACHE{$ka_key};
628     };
629    
630     # on error etc., destroy
631     $hdl->on_error ($destroy);
632     $hdl->on_eof ($destroy);
633     $hdl->on_read ($destroy);
634     $hdl->timeout ($PERSISTENT_TIMEOUT);
635    
636     push @$kaa, $hdl;
637     shift @$kaa while @$kaa > $MAX_PER_HOST;
638     }
639    
640     #############################################################################
641     # utilities
642    
643 root 1.66 # continue to parse $_ for headers and place them into the arg
644 root 1.92 sub _parse_hdr() {
645 root 1.66 my %hdr;
646    
647     # things seen, not parsed:
648     # p3pP="NON CUR OTPi OUR NOR UNI"
649    
650     $hdr{lc $1} .= ",$2"
651     while /\G
652     ([^:\000-\037]*):
653     [\011\040]*
654     ((?: [^\012]+ | \012[\011\040] )*)
655     \012
656     /gxc;
657    
658     /\G$/
659     or return;
660    
661     # remove the "," prefix we added to all headers above
662     substr $_, 0, 1, ""
663     for values %hdr;
664    
665     \%hdr
666     }
667    
668 root 1.92 #############################################################################
669     # http_get
670    
671 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
672 root 1.34
673 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
674     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
675 root 1.40
676 root 1.92 # maybe it should just become a normal object :/
677    
678     sub _destroy_state(\%) {
679     my ($state) = @_;
680    
681     $state->{handle}->destroy if $state->{handle};
682     %$state = ();
683     }
684    
685     sub _error(\%$$) {
686     my ($state, $cb, $hdr) = @_;
687    
688     &_destroy_state ($state);
689    
690     $cb->(undef, $hdr);
691     ()
692     }
693    
694 elmex 1.15 sub http_request($$@) {
695 root 1.1 my $cb = pop;
696     my ($method, $url, %arg) = @_;
697    
698     my %hdr;
699    
700 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
701     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
702    
703 root 1.3 $method = uc $method;
704    
705 root 1.8 if (my $hdr = $arg{headers}) {
706 root 1.1 while (my ($k, $v) = each %$hdr) {
707     $hdr{lc $k} = $v;
708     }
709     }
710    
711 root 1.55 # pseudo headers for all subsequent responses
712     my @pseudo = (URL => $url);
713     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
714    
715 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
716 root 1.8
717 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
718 root 1.8 if $recurse < 0;
719    
720 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
721 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
722    
723 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
724 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
725 root 1.2
726 root 1.31 $uscheme = lc $uscheme;
727 root 1.1
728 root 1.31 my $uport = $uscheme eq "http" ? 80
729     : $uscheme eq "https" ? 443
730 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
731 root 1.13
732 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
733 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
734 root 1.10
735 root 1.86 my $uhost = lc $1;
736 root 1.10 $uport = $2 if defined $2;
737    
738 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
739     unless exists $hdr{host};
740 root 1.43
741 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
742 root 1.56 $upath .= $query if length $query;
743 root 1.10
744     $upath =~ s%^/?%/%;
745    
746     # cookie processing
747     if (my $jar = $arg{cookie_jar}) {
748 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
749 root 1.70
750 root 1.71 $hdr{cookie} = join "; ", @$cookies
751     if @$cookies;
752 root 1.10 }
753 root 1.1
754 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
755 root 1.2
756 root 1.10 if ($proxy) {
757 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
758 root 1.31
759 root 1.47 $rscheme = "http" unless defined $rscheme;
760    
761 root 1.31 # don't support https requests over https-proxy transport,
762 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
763 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
764 root 1.86
765     $rhost = lc $rhost;
766     $rscheme = lc $rscheme;
767 root 1.10 } else {
768 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
769 root 1.2 }
770    
771 root 1.47 # leave out fragment and query string, just a heuristic
772 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
773     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
774 root 1.41
775 root 1.53 $hdr{"content-length"} = length $arg{body}
776     if length $arg{body} || $method ne "GET";
777 root 1.1
778 root 1.89 my $idempotent = $method =~ /^(?:GET|HEAD|PUT|DELETE|OPTIONS|TRACE)$/;
779    
780     # default value for keepalive is true iff the request is for an idempotent method
781 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
782     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
783     my $was_persistent; # true if this is actually a recycled connection
784 root 1.92
785     # the key to use in the keepalive cache
786 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
787 root 1.89
788 root 1.103 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive " : "" : "close ") . "Te"; #1.1
789 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
790 root 1.66
791 root 1.11 my %state = (connect_guard => 1);
792    
793 root 1.88 my $ae_error = 595; # connecting
794 root 1.1
795 root 1.88 # handle actual, non-tunneled, request
796     my $handle_actual_request = sub {
797     $ae_error = 596; # request phase
798    
799 root 1.92 my $hdl = $state{handle};
800    
801     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
802 root 1.88
803     # send request
804 root 1.92 $hdl->push_write (
805 root 1.88 "$method $rpath HTTP/1.1\015\012"
806     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
807     . "\015\012"
808     . (delete $arg{body})
809     );
810    
811     # return if error occured during push_write()
812     return unless %state;
813    
814 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
815     %hdr = ();
816 root 1.88
817     # status line and headers
818     $state{read_response} = sub {
819 root 1.92 return unless %state;
820    
821 root 1.88 for ("$_[1]") {
822     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
823    
824     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
825 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
826 root 1.88
827     # 100 Continue handling
828     # should not happen as we don't send expect: 100-continue,
829     # but we handle it just in case.
830     # since we send the request body regardless, if we get an error
831     # we are out of-sync, which we currently do NOT handle correctly.
832     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
833     if $2 eq 100;
834    
835     push @pseudo,
836     HTTPVersion => $1,
837     Status => $2,
838     Reason => $3,
839     ;
840 root 1.1
841 root 1.92 my $hdr = _parse_hdr
842     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
843 root 1.77
844 root 1.88 %hdr = (%$hdr, @pseudo);
845     }
846 root 1.82
847 root 1.88 # redirect handling
848     # microsoft and other shitheads don't give a shit for following standards,
849     # try to support some common forms of broken Location headers.
850     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
851     $hdr{location} =~ s/^\.\/+//;
852    
853     my $url = "$rscheme://$uhost:$uport";
854    
855     unless ($hdr{location} =~ s/^\///) {
856     $url .= $upath;
857     $url =~ s/\/[^\/]*$//;
858 root 1.82 }
859    
860 root 1.88 $hdr{location} = "$url/$hdr{location}";
861     }
862 root 1.82
863 root 1.88 my $redirect;
864 root 1.82
865 root 1.88 if ($recurse) {
866     my $status = $hdr{Status};
867 root 1.82
868 root 1.88 # industry standard is to redirect POST as GET for
869     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
870     # also, the UA should ask the user for 301 and 307 and POST,
871     # industry standard seems to be to simply follow.
872     # we go with the industry standard.
873     if ($status == 301 or $status == 302 or $status == 303) {
874     # HTTP/1.1 is unclear on how to mutate the method
875     $method = "GET" unless $method eq "HEAD";
876     $redirect = 1;
877     } elsif ($status == 307) {
878     $redirect = 1;
879 root 1.82 }
880 root 1.88 }
881 root 1.82
882 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
883 root 1.92 if ($state{handle}) {
884     # handle keepalive
885     if (
886 root 1.103 $persistent
887 root 1.92 && $_[3]
888     && ($hdr{HTTPVersion} < 1.1
889     ? $hdr{connection} =~ /\bkeep-?alive\b/i
890     : $hdr{connection} !~ /\bclose\b/i)
891     ) {
892     ka_store $ka_key, delete $state{handle};
893     } else {
894     # no keepalive, destroy the handle
895     $state{handle}->destroy;
896     }
897     }
898 root 1.82
899 root 1.88 %state = ();
900 root 1.82
901 root 1.88 if (defined $_[1]) {
902     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
903     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
904     }
905 root 1.82
906 root 1.88 # set-cookie processing
907     if ($arg{cookie_jar}) {
908     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
909     }
910 root 1.82
911 root 1.88 if ($redirect && exists $hdr{location}) {
912     # we ignore any errors, as it is very common to receive
913     # Content-Length != 0 but no actual body
914     # we also access %hdr, as $_[1] might be an erro
915 root 1.103 $state{recurse} =
916     http_request (
917     $method => $hdr{location},
918     %arg,
919     recurse => $recurse - 1,
920     Redirect => [$_[0], \%hdr],
921     sub {
922     %state = ();
923     &$cb
924     },
925     );
926 root 1.88 } else {
927     $cb->($_[0], \%hdr);
928     }
929     };
930 root 1.82
931 root 1.88 $ae_error = 597; # body phase
932 root 1.82
933 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
934    
935     my $len = $chunked ? undef : $hdr{"content-length"};
936 root 1.82
937 root 1.88 # body handling, many different code paths
938     # - no body expected
939     # - want_body_handle
940     # - te chunked
941     # - 2x length known (with or without on_body)
942     # - 2x length not known (with or without on_body)
943     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
944     $finish->(undef, 598 => "Request cancelled by on_header");
945     } elsif (
946     $hdr{Status} =~ /^(?:1..|204|205|304)$/
947     or $method eq "HEAD"
948     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
949     ) {
950     # no body
951     $finish->("", undef, undef, 1);
952    
953     } elsif (!$redirect && $arg{want_body_handle}) {
954     $_[0]->on_eof (undef);
955     $_[0]->on_error (undef);
956     $_[0]->on_read (undef);
957    
958     $finish->(delete $state{handle});
959    
960 root 1.91 } elsif ($chunked) {
961 root 1.88 my $cl = 0;
962 root 1.92 my $body = "";
963 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
964    
965     $state{read_chunk} = sub {
966     $_[1] =~ /^([0-9a-fA-F]+)/
967 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
968 root 1.82
969 root 1.88 my $len = hex $1;
970 root 1.82
971 root 1.88 if ($len) {
972     $cl += $len;
973 root 1.82
974 root 1.88 $_[0]->push_read (chunk => $len, sub {
975     $on_body->($_[1], \%hdr)
976     or return $finish->(undef, 598 => "Request cancelled by on_body");
977 root 1.82
978 root 1.88 $_[0]->push_read (line => sub {
979     length $_[1]
980     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
981     $_[0]->push_read (line => $state{read_chunk});
982 root 1.82 });
983 root 1.88 });
984     } else {
985     $hdr{"content-length"} ||= $cl;
986 root 1.84
987 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
988     if (length $_[1]) {
989     for ("$_[1]") {
990     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
991 root 1.84
992 root 1.92 my $hdr = _parse_hdr
993 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
994 root 1.84
995 root 1.88 %hdr = (%hdr, %$hdr);
996     }
997     }
998 root 1.84
999 root 1.88 $finish->($body, undef, undef, 1);
1000 root 1.84 });
1001     }
1002 root 1.88 };
1003    
1004     $_[0]->push_read (line => $state{read_chunk});
1005    
1006     } elsif ($arg{on_body}) {
1007     if (defined $len) {
1008     $_[0]->on_read (sub {
1009     $len -= length $_[0]{rbuf};
1010    
1011     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1012     or return $finish->(undef, 598 => "Request cancelled by on_body");
1013    
1014     $len > 0
1015     or $finish->("", undef, undef, 1);
1016     });
1017 root 1.84 } else {
1018 root 1.88 $_[0]->on_eof (sub {
1019     $finish->("");
1020     });
1021     $_[0]->on_read (sub {
1022     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1023     or $finish->(undef, 598 => "Request cancelled by on_body");
1024     });
1025     }
1026     } else {
1027     $_[0]->on_eof (undef);
1028 root 1.82
1029 root 1.88 if (defined $len) {
1030     $_[0]->on_read (sub {
1031     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1032     if $len <= length $_[0]{rbuf};
1033     });
1034     } else {
1035     $_[0]->on_error (sub {
1036     ($! == Errno::EPIPE || !$!)
1037     ? $finish->(delete $_[0]{rbuf})
1038     : $finish->(undef, $ae_error => $_[2]);
1039     });
1040     $_[0]->on_read (sub { });
1041 root 1.82 }
1042 root 1.88 }
1043     };
1044 root 1.82
1045 root 1.92 # if keepalive is enabled, then the server closing the connection
1046     # before a response can happen legally - we retry on idempotent methods.
1047 root 1.103 if ($was_persistent && $idempotent) {
1048 root 1.92 my $old_eof = $hdl->{on_eof};
1049     $hdl->{on_eof} = sub {
1050     _destroy_state %state;
1051    
1052 root 1.103 %state = ();
1053     $state{recurse} =
1054     http_request (
1055 root 1.112 $method => $url,
1056 root 1.103 %arg,
1057 root 1.112 recurse => $recurse - 1,
1058 root 1.103 keepalive => 0,
1059     sub {
1060     %state = ();
1061     &$cb
1062     }
1063     );
1064 root 1.92 };
1065     $hdl->on_read (sub {
1066     return unless %state;
1067    
1068     # as soon as we receive something, a connection close
1069     # once more becomes a hard error
1070     $hdl->{on_eof} = $old_eof;
1071     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1072     });
1073     } else {
1074     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1075     }
1076     };
1077    
1078     my $prepare_handle = sub {
1079     my ($hdl) = $state{handle};
1080    
1081     $hdl->on_error (sub {
1082     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1083     });
1084     $hdl->on_eof (sub {
1085     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1086     });
1087 root 1.103 $hdl->timeout_reset;
1088     $hdl->timeout ($timeout);
1089 root 1.88 };
1090 root 1.82
1091 root 1.92 # connected to proxy (or origin server)
1092 root 1.88 my $connect_cb = sub {
1093 root 1.92 my $fh = shift
1094     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1095 root 1.44
1096 root 1.88 return unless delete $state{connect_guard};
1097 root 1.11
1098 root 1.88 # get handle
1099     $state{handle} = new AnyEvent::Handle
1100 root 1.92 %{ $arg{handle_params} },
1101     fh => $fh,
1102     peername => $uhost,
1103 root 1.88 tls_ctx => $arg{tls_ctx},
1104     ;
1105 root 1.11
1106 root 1.92 $prepare_handle->();
1107 root 1.1
1108 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1109 root 1.88
1110     # now handle proxy-CONNECT method
1111     if ($proxy && $uscheme eq "https") {
1112     # oh dear, we have to wrap it into a connect request
1113    
1114     # maybe re-use $uauthority with patched port?
1115     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012\015\012");
1116     $state{handle}->push_read (line => $qr_nlnl, sub {
1117     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1118 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1119 root 1.88
1120     if ($2 == 200) {
1121     $rpath = $upath;
1122     $handle_actual_request->();
1123     } else {
1124 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1125 root 1.88 }
1126     });
1127     } else {
1128     $handle_actual_request->();
1129     }
1130     };
1131    
1132     _get_slot $uhost, sub {
1133     $state{slot_guard} = shift;
1134 root 1.64
1135 root 1.88 return unless $state{connect_guard};
1136 root 1.64
1137 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1138     # on a keepalive request (in theory, this should be a separate config option).
1139 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1140     $was_persistent = 1;
1141    
1142 root 1.92 $state{handle} = ka_fetch $ka_key;
1143 root 1.103 $state{handle}->destroyed
1144 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1145 root 1.92 $prepare_handle->();
1146 root 1.103 $state{handle}->destroyed
1147 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1148 root 1.92 $handle_actual_request->();
1149    
1150     } else {
1151     my $tcp_connect = $arg{tcp_connect}
1152     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1153 root 1.57
1154 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1155     }
1156 root 1.1 };
1157    
1158 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1159 root 1.1 }
1160    
1161 elmex 1.15 sub http_get($@) {
1162 root 1.1 unshift @_, "GET";
1163     &http_request
1164     }
1165    
1166 elmex 1.15 sub http_head($@) {
1167 root 1.4 unshift @_, "HEAD";
1168     &http_request
1169     }
1170    
1171 elmex 1.15 sub http_post($$@) {
1172 root 1.22 my $url = shift;
1173     unshift @_, "POST", $url, "body";
1174 root 1.3 &http_request
1175     }
1176    
1177 root 1.9 =back
1178    
1179 root 1.55 =head2 DNS CACHING
1180    
1181     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1182     the actual connection, which in turn uses AnyEvent::DNS to resolve
1183     hostnames. The latter is a simple stub resolver and does no caching
1184     on its own. If you want DNS caching, you currently have to provide
1185     your own default resolver (by storing a suitable resolver object in
1186 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1187 root 1.55
1188 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1189 root 1.1
1190     =over 4
1191    
1192 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1193    
1194     Sets the default proxy server to use. The proxy-url must begin with a
1195 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1196 root 1.52
1197     To clear an already-set proxy, use C<undef>.
1198 root 1.2
1199 root 1.113 When AnyEvent::HTTP is loaded for the first time it will query the
1200 root 1.102 default proxy from the operating system, currently by looking at
1201     C<$ENV{http_proxy>}.
1202    
1203 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1204    
1205     Remove all cookies from the cookie jar that have been expired. If
1206     C<$session_end> is given and true, then additionally remove all session
1207     cookies.
1208    
1209     You should call this function (with a true C<$session_end>) before you
1210     save cookies to disk, and you should call this function after loading them
1211     again. If you have a long-running program you can additonally call this
1212     function from time to time.
1213    
1214     A cookie jar is initially an empty hash-reference that is managed by this
1215     module. It's format is subject to change, but currently it is like this:
1216    
1217     The key C<version> has to contain C<1>, otherwise the hash gets
1218     emptied. All other keys are hostnames or IP addresses pointing to
1219     hash-references. The key for these inner hash references is the
1220     server path for which this cookie is meant, and the values are again
1221 root 1.115 hash-references. Each key of those hash-references is a cookie name, and
1222 root 1.80 the value, you guessed it, is another hash-reference, this time with the
1223     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1224     which have been replaced by a C<_expires> key that contains the cookie
1225 root 1.115 expiry timestamp. Session cookies are indicated by not having an
1226     C<_expires> key.
1227 root 1.80
1228     Here is an example of a cookie jar with a single cookie, so you have a
1229     chance of understanding the above paragraph:
1230    
1231     {
1232     version => 1,
1233     "10.0.0.1" => {
1234     "/" => {
1235     "mythweb_id" => {
1236     _expires => 1293917923,
1237     value => "ooRung9dThee3ooyXooM1Ohm",
1238     },
1239     },
1240     },
1241     }
1242    
1243 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1244    
1245     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1246     Date (RFC 2616).
1247    
1248     =item $timestamp = AnyEvent::HTTP::parse_date $date
1249    
1250 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1251     bunch of minor variations of those, and returns the corresponding POSIX
1252     timestamp, or C<undef> if the date cannot be parsed.
1253 root 1.61
1254 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1255 root 1.1
1256 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1257 root 1.1
1258 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1259    
1260 root 1.110 The default timeout for connection operations (default: C<300>).
1261 root 1.92
1262 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1263    
1264     The default value for the C<User-Agent> header (the default is
1265 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1266 root 1.1
1267 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1268 root 1.1
1269 root 1.47 The maximum number of concurrent connections to the same host (identified
1270 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
1271 root 1.92 are queued until previous connections are closed. Both persistent and
1272     non-persistent connections are counted in this limit.
1273 root 1.1
1274 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1275 root 1.92 increase it much.
1276    
1277     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1278     connections, older browsers used 2, newers (such as firefox 3) typically
1279     use 6, and Opera uses 8 because like, they have the fastest browser and
1280     give a shit for everybody else on the planet.
1281    
1282     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1283    
1284     The time after which idle persistent conenctions get closed by
1285     AnyEvent::HTTP (default: C<3>).
1286 root 1.3
1287 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1288    
1289     The number of active connections. This is not the number of currently
1290     running requests, but the number of currently open and non-idle TCP
1291 root 1.92 connections. This number can be useful for load-leveling.
1292 root 1.14
1293 root 1.1 =back
1294    
1295     =cut
1296    
1297 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1298     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1299    
1300     sub format_date($) {
1301     my ($time) = @_;
1302    
1303     # RFC 822/1123 format
1304     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1305    
1306     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1307     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1308     $H, $M, $S;
1309     }
1310    
1311     sub parse_date($) {
1312     my ($date) = @_;
1313    
1314     my ($d, $m, $y, $H, $M, $S);
1315    
1316 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1317 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1318     # cookie dates (with "-")
1319    
1320 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1321    
1322 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1323 root 1.61 # RFC 850
1324     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1325    
1326 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1327 root 1.61 # ISO C's asctime
1328     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1329     }
1330     # other formats fail in the loop below
1331    
1332     for (0..11) {
1333     if ($m eq $month[$_]) {
1334     require Time::Local;
1335 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1336 root 1.61 }
1337     }
1338    
1339     undef
1340     }
1341    
1342 root 1.2 sub set_proxy($) {
1343 root 1.52 if (length $_[0]) {
1344 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1345 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1346     $PROXY = [$2, $3 || 3128, $1]
1347     } else {
1348     undef $PROXY;
1349     }
1350 root 1.2 }
1351    
1352     # initialise proxy from environment
1353 root 1.52 eval {
1354     set_proxy $ENV{http_proxy};
1355     };
1356 root 1.2
1357 root 1.93 =head2 SHOWCASE
1358    
1359     This section contaisn some more elaborate "real-world" examples or code
1360     snippets.
1361    
1362     =head2 HTTP/1.1 FILE DOWNLOAD
1363    
1364 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1365 root 1.98 goes wrong and you want to resume.
1366 root 1.93
1367     Here is a function that initiates and resumes a download. It uses the
1368     last modified time to check for file content changes, and works with many
1369     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1370     on older servers.
1371    
1372     It calls the completion callback with either C<undef>, which means a
1373     nonretryable error occured, C<0> when the download was partial and should
1374     be retried, and C<1> if it was successful.
1375    
1376     use AnyEvent::HTTP;
1377    
1378     sub download($$$) {
1379     my ($url, $file, $cb) = @_;
1380    
1381     open my $fh, "+<", $file
1382     or die "$file: $!";
1383    
1384     my %hdr;
1385     my $ofs = 0;
1386    
1387     warn stat $fh;
1388     warn -s _;
1389     if (stat $fh and -s _) {
1390     $ofs = -s _;
1391 root 1.107 warn "-s is ", $ofs;
1392 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1393     $hdr{"range"} = "bytes=$ofs-";
1394     }
1395    
1396     http_get $url,
1397     headers => \%hdr,
1398     on_header => sub {
1399     my ($hdr) = @_;
1400    
1401     if ($hdr->{Status} == 200 && $ofs) {
1402     # resume failed
1403     truncate $fh, $ofs = 0;
1404     }
1405    
1406     sysseek $fh, $ofs, 0;
1407    
1408     1
1409     },
1410     on_body => sub {
1411     my ($data, $hdr) = @_;
1412    
1413     if ($hdr->{Status} =~ /^2/) {
1414     length $data == syswrite $fh, $data
1415     or return; # abort on write errors
1416     }
1417    
1418     1
1419     },
1420     sub {
1421     my (undef, $hdr) = @_;
1422    
1423     my $status = $hdr->{Status};
1424    
1425     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1426     utime $fh, $time, $time;
1427     }
1428    
1429     if ($status == 200 || $status == 206 || $status == 416) {
1430     # download ok || resume ok || file already fully downloaded
1431     $cb->(1, $hdr);
1432    
1433     } elsif ($status == 412) {
1434     # file has changed while resuming, delete and retry
1435     unlink $file;
1436     $cb->(0, $hdr);
1437    
1438     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1439     # retry later
1440     $cb->(0, $hdr);
1441    
1442     } else {
1443     $cb->(undef, $hdr);
1444     }
1445     }
1446     ;
1447     }
1448    
1449     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1450     if ($_[0]) {
1451     print "OK!\n";
1452     } elsif (defined $_[0]) {
1453     print "please retry later\n";
1454     } else {
1455     print "ERROR\n";
1456     }
1457     };
1458    
1459     =head3 SOCKS PROXIES
1460 root 1.60
1461     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1462     compile your perl to support socks, or use an external program such as
1463     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1464     transparently.
1465    
1466     Alternatively, for AnyEvent::HTTP only, you can use your own
1467     C<tcp_connect> function that does the proxy handshake - here is an example
1468     that works with socks4a proxies:
1469    
1470     use Errno;
1471     use AnyEvent::Util;
1472     use AnyEvent::Socket;
1473     use AnyEvent::Handle;
1474    
1475     # host, port and username of/for your socks4a proxy
1476     my $socks_host = "10.0.0.23";
1477     my $socks_port = 9050;
1478     my $socks_user = "";
1479    
1480     sub socks4a_connect {
1481     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1482    
1483     my $hdl = new AnyEvent::Handle
1484     connect => [$socks_host, $socks_port],
1485     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1486     on_error => sub { $connect_cb->() },
1487     ;
1488    
1489     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1490    
1491     $hdl->push_read (chunk => 8, sub {
1492     my ($hdl, $chunk) = @_;
1493     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1494    
1495     if ($status == 0x5a) {
1496     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1497     } else {
1498     $! = Errno::ENXIO; $connect_cb->();
1499     }
1500     });
1501    
1502     $hdl
1503     }
1504    
1505     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1506     possibly after switching off other proxy types:
1507    
1508     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1509    
1510     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1511     my ($data, $headers) = @_;
1512     ...
1513     };
1514    
1515 root 1.1 =head1 SEE ALSO
1516    
1517     L<AnyEvent>.
1518    
1519     =head1 AUTHOR
1520    
1521 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1522     http://home.schmorp.de/
1523 root 1.1
1524 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1525     testcases and bugreports.
1526    
1527 root 1.1 =cut
1528    
1529     1
1530