ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.112
Committed: Wed Nov 14 22:22:24 2012 UTC (11 years, 6 months ago) by root
Branch: MAIN
CVS Tags: rel-2_15
Changes since 1.111: +5 -4 lines
Log Message:
2.15

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.112 our $VERSION = '2.15';
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
95     (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128     =item 595 - errors during connection etsbalishment, proxy handshake.
129    
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
160     other retries and so on, and how often to do so.
161 root 1.1
162     =item headers => hashref
163    
164 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
165     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
166 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
167     (this can be suppressed by using C<undef> for these headers in which case
168     they won't be sent at all).
169 root 1.1
170 root 1.90 You really should provide your own C<User-Agent:> header value that is
171     appropriate for your program - I wouldn't be surprised if the default
172     AnyEvent string gets blocked by webservers sooner or later.
173    
174 root 1.104 Also, make sure that your headers names and values do not contain any
175     embedded newlines.
176    
177 root 1.1 =item timeout => $seconds
178    
179     The time-out to use for various stages - each connect attempt will reset
180 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
181     timeout.
182    
183     Default timeout is 5 minutes.
184 root 1.2
185     =item proxy => [$host, $port[, $scheme]] or undef
186    
187 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
188     used.
189 root 1.2
190 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
191 root 1.1
192 root 1.102 If not specified, then the default proxy is used (see
193     C<AnyEvent::HTTP::set_proxy>).
194    
195 root 1.3 =item body => $string
196    
197 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
198 root 1.3 this module might offer more options).
199    
200 root 1.10 =item cookie_jar => $hash_ref
201    
202     Passing this parameter enables (simplified) cookie-processing, loosely
203     based on the original netscape specification.
204    
205 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
206     will get updated automatically. It is possible to save the cookie jar
207     to persistent storage with something like JSON or Storable - see the
208     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
209     expired or session-only cookies, and also for documentation on the format
210     of the cookie jar.
211 root 1.10
212 root 1.70 Note that this cookie implementation is not meant to be complete. If
213     you want complete cookie management you have to do that on your
214 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
215 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
216     to.
217 root 1.10
218 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
219 root 1.70 headers will be set and handled by this module, otherwise they will be
220 root 1.69 left untouched.
221    
222 root 1.40 =item tls_ctx => $scheme | $tls_ctx
223    
224     Specifies the AnyEvent::TLS context to be used for https connections. This
225     parameter follows the same rules as the C<tls_ctx> parameter to
226     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
227     C<high> can be specified, which give you a predefined low-security (no
228     verification, highest compatibility) and high-security (CA and common-name
229     verification) TLS context.
230    
231     The default for this option is C<low>, which could be interpreted as "give
232     me the page, no matter what".
233    
234 root 1.92 See also the C<sessionid> parameter.
235    
236     =item session => $string
237    
238     The module might reuse connections to the same host internally. Sometimes
239     (e.g. when using TLS), you do not want to reuse connections from other
240     sessions. This can be achieved by setting this parameter to some unique
241     ID (such as the address of an object storing your state data, or the TLS
242     context) - only connections using the same unique ID will be reused.
243    
244 root 1.51 =item on_prepare => $callback->($fh)
245    
246     In rare cases you need to "tune" the socket before it is used to
247     connect (for exmaple, to bind it on a given IP address). This parameter
248     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
249     and behaves exactly the same way (e.g. it has to provide a
250     timeout). See the description for the C<$prepare_cb> argument of
251     C<AnyEvent::Socket::tcp_connect> for details.
252    
253 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
254    
255     In even rarer cases you want total control over how AnyEvent::HTTP
256     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
257     to do this, but you can provide your own C<tcp_connect> function -
258 root 1.60 obviously, it has to follow the same calling conventions, except that it
259     may always return a connection guard object.
260 root 1.59
261     There are probably lots of weird uses for this function, starting from
262     tracing the hosts C<http_request> actually tries to connect, to (inexact
263     but fast) host => IP address caching or even socks protocol support.
264    
265 root 1.42 =item on_header => $callback->($headers)
266 root 1.41
267     When specified, this callback will be called with the header hash as soon
268     as headers have been successfully received from the remote server (not on
269     locally-generated errors).
270    
271     It has to return either true (in which case AnyEvent::HTTP will continue),
272     or false, in which case AnyEvent::HTTP will cancel the download (and call
273     the finish callback with an error code of C<598>).
274    
275     This callback is useful, among other things, to quickly reject unwanted
276     content, which, if it is supposed to be rare, can be faster than first
277     doing a C<HEAD> request.
278    
279 root 1.68 The downside is that cancelling the request makes it impossible to re-use
280     the connection. Also, the C<on_header> callback will not receive any
281     trailer (headers sent after the response body).
282    
283 root 1.42 Example: cancel the request unless the content-type is "text/html".
284 root 1.41
285 root 1.42 on_header => sub {
286     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
287     },
288 root 1.41
289 root 1.42 =item on_body => $callback->($partial_body, $headers)
290 root 1.41
291 root 1.42 When specified, all body data will be passed to this callback instead of
292     to the completion callback. The completion callback will get the empty
293     string instead of the body data.
294 root 1.41
295 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
296     or false, in which case AnyEvent::HTTP will cancel the download (and call
297     the completion callback with an error code of C<598>).
298    
299 root 1.68 The downside to cancelling the request is that it makes it impossible to
300     re-use the connection.
301    
302 root 1.42 This callback is useful when the data is too large to be held in memory
303     (so the callback writes it to a file) or when only some information should
304     be extracted, or when the body should be processed incrementally.
305 root 1.41
306     It is usually preferred over doing your own body handling via
307 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
308     only used to create a connection, C<want_body_handle> is the better
309     alternative, as it allows you to install your own event handler, reducing
310     resource usage.
311 root 1.41
312     =item want_body_handle => $enable
313    
314     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
315     changes considerably: after parsing the headers, and instead of
316     downloading the body (if any), the completion callback will be
317     called. Instead of the C<$body> argument containing the body data, the
318     callback will receive the L<AnyEvent::Handle> object associated with the
319     connection. In error cases, C<undef> will be passed. When there is no body
320     (e.g. status C<304>), the empty string will be passed.
321    
322 root 1.92 The handle object might or might not be in TLS mode, might be connected
323     to a proxy, be a persistent connection, use chunked transfer encoding
324     etc., and configured in unspecified ways. The user is responsible for this
325     handle (it will not be used by this module anymore).
326 root 1.41
327     This is useful with some push-type services, where, after the initial
328     headers, an interactive protocol is used (typical example would be the
329     push-style twitter API which starts a JSON/XML stream).
330    
331     If you think you need this, first have a look at C<on_body>, to see if
332 root 1.45 that doesn't solve your problem in a better way.
333 root 1.41
334 root 1.92 =item persistent => $boolean
335    
336     Try to create/reuse a persistent connection. When this flag is set
337     (default: true for idempotent requests, false for all others), then
338     C<http_request> tries to re-use an existing (previously-created)
339     persistent connection to the host and, failing that, tries to create a new
340     one.
341    
342     Requests failing in certain ways will be automatically retried once, which
343     is dangerous for non-idempotent requests, which is why it defaults to off
344     for them. The reason for this is because the bozos who designed HTTP/1.1
345     made it impossible to distinguish between a fatal error and a normal
346     connection timeout, so you never know whether there was a problem with
347     your request or not.
348    
349     When reusing an existent connection, many parameters (such as TLS context)
350     will be ignored. See the C<session> parameter for a workaround.
351    
352     =item keepalive => $boolean
353    
354     Only used when C<persistent> is also true. This parameter decides whether
355     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
356     (as opposed to only a HTTP/1.1 persistent connection).
357    
358     The default is true, except when using a proxy, in which case it defaults
359     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
360    
361     =item handle_params => { key => value ... }
362    
363     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
364     constructor that is called - not all requests will create a handle, and
365     sometimes more than one is created, so this parameter is only good for
366     setting hints.
367    
368     Example: set the maximum read size to 4096, to potentially conserve memory
369     at the cost of speed.
370    
371     handle_params => {
372     max_read_size => 4096,
373     },
374    
375 root 1.1 =back
376    
377 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
378     the response body.
379 root 1.9
380     http_request GET => "http://www.nethype.de/", sub {
381     my ($body, $hdr) = @_;
382     print "$body\n";
383     };
384    
385 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
386 root 1.9 timeout of 30 seconds.
387    
388     http_request
389 root 1.109 HEAD => "https://www.google.com",
390 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
391 root 1.9 timeout => 30,
392     sub {
393     my ($body, $hdr) = @_;
394     use Data::Dumper;
395     print Dumper $hdr;
396     }
397     ;
398 root 1.1
399 root 1.68 Example: do another simple HTTP GET request, but immediately try to
400 root 1.29 cancel it.
401    
402     my $request = http_request GET => "http://www.nethype.de/", sub {
403     my ($body, $hdr) = @_;
404     print "$body\n";
405     };
406    
407     undef $request;
408    
409 root 1.1 =cut
410    
411 root 1.92 #############################################################################
412     # wait queue/slots
413    
414 root 1.12 sub _slot_schedule;
415 root 1.11 sub _slot_schedule($) {
416     my $host = shift;
417    
418     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
419     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
420 root 1.12 # somebody wants that slot
421 root 1.11 ++$CO_SLOT{$host}[0];
422 root 1.14 ++$ACTIVE;
423 root 1.11
424     $cb->(AnyEvent::Util::guard {
425 root 1.14 --$ACTIVE;
426 root 1.11 --$CO_SLOT{$host}[0];
427     _slot_schedule $host;
428     });
429     } else {
430     # nobody wants the slot, maybe we can forget about it
431     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
432     last;
433     }
434     }
435     }
436    
437     # wait for a free slot on host, call callback
438     sub _get_slot($$) {
439     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
440    
441     _slot_schedule $_[0];
442     }
443    
444 root 1.80 #############################################################################
445 root 1.92 # cookie handling
446 root 1.80
447     # expire cookies
448     sub cookie_jar_expire($;$) {
449     my ($jar, $session_end) = @_;
450    
451     %$jar = () if $jar->{version} != 1;
452    
453     my $anow = AE::now;
454    
455     while (my ($chost, $paths) = each %$jar) {
456     next unless ref $paths;
457    
458     while (my ($cpath, $cookies) = each %$paths) {
459     while (my ($cookie, $kv) = each %$cookies) {
460     if (exists $kv->{_expires}) {
461     delete $cookies->{$cookie}
462     if $anow > $kv->{_expires};
463     } elsif ($session_end) {
464     delete $cookies->{$cookie};
465     }
466     }
467    
468     delete $paths->{$cpath}
469     unless %$cookies;
470     }
471    
472     delete $jar->{$chost}
473     unless %$paths;
474     }
475     }
476    
477 root 1.72 # extract cookies from jar
478 root 1.71 sub cookie_jar_extract($$$$) {
479 root 1.92 my ($jar, $scheme, $host, $path) = @_;
480 root 1.71
481     %$jar = () if $jar->{version} != 1;
482    
483     my @cookies;
484    
485     while (my ($chost, $paths) = each %$jar) {
486     next unless ref $paths;
487    
488     if ($chost =~ /^\./) {
489 root 1.92 next unless $chost eq substr $host, -length $chost;
490 root 1.71 } elsif ($chost =~ /\./) {
491 root 1.92 next unless $chost eq $host;
492 root 1.71 } else {
493     next;
494     }
495    
496     while (my ($cpath, $cookies) = each %$paths) {
497 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
498 root 1.71
499     while (my ($cookie, $kv) = each %$cookies) {
500 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
501 root 1.71
502 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
503     delete $cookies->{$cookie};
504     next;
505 root 1.71 }
506    
507     my $value = $kv->{value};
508    
509     if ($value =~ /[=;,[:space:]]/) {
510     $value =~ s/([\\"])/\\$1/g;
511     $value = "\"$value\"";
512     }
513    
514     push @cookies, "$cookie=$value";
515     }
516     }
517     }
518    
519     \@cookies
520     }
521    
522 root 1.72 # parse set_cookie header into jar
523 root 1.80 sub cookie_jar_set_cookie($$$$) {
524 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
525 root 1.80
526     my $anow = int AE::now;
527     my $snow; # server-now
528 root 1.72
529     for ($set_cookie) {
530     # parse NAME=VALUE
531     my @kv;
532    
533 root 1.79 # expires is not http-compliant in the original cookie-spec,
534     # we support the official date format and some extensions
535 root 1.72 while (
536     m{
537     \G\s*
538     (?:
539 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
540 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
541 root 1.72 )
542     }gcxsi
543     ) {
544     my $name = $2;
545     my $value = $4;
546    
547 root 1.82 if (defined $1) {
548 root 1.72 # expires
549     $name = "expires";
550     $value = $1;
551 root 1.82 } elsif (defined $3) {
552 root 1.72 # quoted
553     $value = $3;
554     $value =~ s/\\(.)/$1/gs;
555     }
556    
557 root 1.95 push @kv, @kv ? lc $name : $name, $value;
558 root 1.72
559     last unless /\G\s*;/gc;
560     }
561    
562     last unless @kv;
563    
564     my $name = shift @kv;
565     my %kv = (value => shift @kv, @kv);
566    
567 root 1.80 if (exists $kv{"max-age"}) {
568     $kv{_expires} = $anow + delete $kv{"max-age"};
569     } elsif (exists $kv{expires}) {
570     $snow ||= parse_date ($date) || $anow;
571     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
572     } else {
573     delete $kv{_expires};
574     }
575 root 1.72
576     my $cdom;
577     my $cpath = (delete $kv{path}) || "/";
578    
579     if (exists $kv{domain}) {
580     $cdom = delete $kv{domain};
581    
582     $cdom =~ s/^\.?/./; # make sure it starts with a "."
583    
584     next if $cdom =~ /\.$/;
585    
586     # this is not rfc-like and not netscape-like. go figure.
587     my $ndots = $cdom =~ y/.//;
588     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
589     } else {
590 root 1.92 $cdom = $host;
591 root 1.72 }
592    
593     # store it
594 root 1.73 $jar->{version} = 1;
595 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
596 root 1.72
597     redo if /\G\s*,/gc;
598     }
599     }
600    
601 root 1.92 #############################################################################
602     # keepalive/persistent connection cache
603    
604     # fetch a connection from the keepalive cache
605     sub ka_fetch($) {
606     my $ka_key = shift;
607    
608     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
609     delete $KA_CACHE{$ka_key}
610     unless @{ $KA_CACHE{$ka_key} };
611    
612     $hdl
613     }
614    
615     sub ka_store($$) {
616     my ($ka_key, $hdl) = @_;
617    
618     my $kaa = $KA_CACHE{$ka_key} ||= [];
619    
620     my $destroy = sub {
621     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
622    
623     $hdl->destroy;
624    
625     @ka
626     ? $KA_CACHE{$ka_key} = \@ka
627     : delete $KA_CACHE{$ka_key};
628     };
629    
630     # on error etc., destroy
631     $hdl->on_error ($destroy);
632     $hdl->on_eof ($destroy);
633     $hdl->on_read ($destroy);
634     $hdl->timeout ($PERSISTENT_TIMEOUT);
635    
636     push @$kaa, $hdl;
637     shift @$kaa while @$kaa > $MAX_PER_HOST;
638     }
639    
640     #############################################################################
641     # utilities
642    
643 root 1.66 # continue to parse $_ for headers and place them into the arg
644 root 1.92 sub _parse_hdr() {
645 root 1.66 my %hdr;
646    
647     # things seen, not parsed:
648     # p3pP="NON CUR OTPi OUR NOR UNI"
649    
650     $hdr{lc $1} .= ",$2"
651     while /\G
652     ([^:\000-\037]*):
653     [\011\040]*
654     ((?: [^\012]+ | \012[\011\040] )*)
655     \012
656     /gxc;
657    
658     /\G$/
659     or return;
660    
661     # remove the "," prefix we added to all headers above
662     substr $_, 0, 1, ""
663     for values %hdr;
664    
665     \%hdr
666     }
667    
668 root 1.92 #############################################################################
669     # http_get
670    
671 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
672 root 1.34
673 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
674     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
675 root 1.40
676 root 1.92 # maybe it should just become a normal object :/
677    
678     sub _destroy_state(\%) {
679     my ($state) = @_;
680    
681     $state->{handle}->destroy if $state->{handle};
682     %$state = ();
683     }
684    
685     sub _error(\%$$) {
686     my ($state, $cb, $hdr) = @_;
687    
688     &_destroy_state ($state);
689    
690     $cb->(undef, $hdr);
691     ()
692     }
693    
694 elmex 1.15 sub http_request($$@) {
695 root 1.1 my $cb = pop;
696     my ($method, $url, %arg) = @_;
697    
698     my %hdr;
699    
700 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
701     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
702    
703 root 1.3 $method = uc $method;
704    
705 root 1.8 if (my $hdr = $arg{headers}) {
706 root 1.1 while (my ($k, $v) = each %$hdr) {
707     $hdr{lc $k} = $v;
708     }
709     }
710    
711 root 1.55 # pseudo headers for all subsequent responses
712     my @pseudo = (URL => $url);
713     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
714    
715 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
716 root 1.8
717 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
718 root 1.8 if $recurse < 0;
719    
720 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
721 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
722    
723 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
724 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
725 root 1.2
726 root 1.31 $uscheme = lc $uscheme;
727 root 1.1
728 root 1.31 my $uport = $uscheme eq "http" ? 80
729     : $uscheme eq "https" ? 443
730 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
731 root 1.13
732 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
733 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
734 root 1.10
735 root 1.86 my $uhost = lc $1;
736 root 1.10 $uport = $2 if defined $2;
737    
738 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
739     unless exists $hdr{host};
740 root 1.43
741 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
742 root 1.56 $upath .= $query if length $query;
743 root 1.10
744     $upath =~ s%^/?%/%;
745    
746     # cookie processing
747     if (my $jar = $arg{cookie_jar}) {
748 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
749 root 1.70
750 root 1.71 $hdr{cookie} = join "; ", @$cookies
751     if @$cookies;
752 root 1.10 }
753 root 1.1
754 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
755 root 1.2
756 root 1.10 if ($proxy) {
757 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
758 root 1.31
759 root 1.47 $rscheme = "http" unless defined $rscheme;
760    
761 root 1.31 # don't support https requests over https-proxy transport,
762 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
763 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
764 root 1.86
765     $rhost = lc $rhost;
766     $rscheme = lc $rscheme;
767 root 1.10 } else {
768 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
769 root 1.2 }
770    
771 root 1.47 # leave out fragment and query string, just a heuristic
772 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
773     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
774 root 1.41
775 root 1.53 $hdr{"content-length"} = length $arg{body}
776     if length $arg{body} || $method ne "GET";
777 root 1.1
778 root 1.89 my $idempotent = $method =~ /^(?:GET|HEAD|PUT|DELETE|OPTIONS|TRACE)$/;
779    
780     # default value for keepalive is true iff the request is for an idempotent method
781 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
782     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
783     my $was_persistent; # true if this is actually a recycled connection
784 root 1.92
785     # the key to use in the keepalive cache
786 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
787 root 1.89
788 root 1.103 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive " : "" : "close ") . "Te"; #1.1
789 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
790 root 1.66
791 root 1.11 my %state = (connect_guard => 1);
792    
793 root 1.88 my $ae_error = 595; # connecting
794 root 1.1
795 root 1.88 # handle actual, non-tunneled, request
796     my $handle_actual_request = sub {
797     $ae_error = 596; # request phase
798    
799 root 1.92 my $hdl = $state{handle};
800    
801     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
802 root 1.88
803     # send request
804 root 1.92 $hdl->push_write (
805 root 1.88 "$method $rpath HTTP/1.1\015\012"
806     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
807     . "\015\012"
808     . (delete $arg{body})
809     );
810    
811     # return if error occured during push_write()
812     return unless %state;
813    
814 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
815     %hdr = ();
816 root 1.88
817     # status line and headers
818     $state{read_response} = sub {
819 root 1.92 return unless %state;
820    
821 root 1.88 for ("$_[1]") {
822     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
823    
824     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
825 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
826 root 1.88
827     # 100 Continue handling
828     # should not happen as we don't send expect: 100-continue,
829     # but we handle it just in case.
830     # since we send the request body regardless, if we get an error
831     # we are out of-sync, which we currently do NOT handle correctly.
832     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
833     if $2 eq 100;
834    
835     push @pseudo,
836     HTTPVersion => $1,
837     Status => $2,
838     Reason => $3,
839     ;
840 root 1.1
841 root 1.92 my $hdr = _parse_hdr
842     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
843 root 1.77
844 root 1.88 %hdr = (%$hdr, @pseudo);
845     }
846 root 1.82
847 root 1.88 # redirect handling
848     # microsoft and other shitheads don't give a shit for following standards,
849     # try to support some common forms of broken Location headers.
850     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
851     $hdr{location} =~ s/^\.\/+//;
852    
853     my $url = "$rscheme://$uhost:$uport";
854    
855     unless ($hdr{location} =~ s/^\///) {
856     $url .= $upath;
857     $url =~ s/\/[^\/]*$//;
858 root 1.82 }
859    
860 root 1.88 $hdr{location} = "$url/$hdr{location}";
861     }
862 root 1.82
863 root 1.88 my $redirect;
864 root 1.82
865 root 1.88 if ($recurse) {
866     my $status = $hdr{Status};
867 root 1.82
868 root 1.88 # industry standard is to redirect POST as GET for
869     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
870     # also, the UA should ask the user for 301 and 307 and POST,
871     # industry standard seems to be to simply follow.
872     # we go with the industry standard.
873     if ($status == 301 or $status == 302 or $status == 303) {
874     # HTTP/1.1 is unclear on how to mutate the method
875     $method = "GET" unless $method eq "HEAD";
876     $redirect = 1;
877     } elsif ($status == 307) {
878     $redirect = 1;
879 root 1.82 }
880 root 1.88 }
881 root 1.82
882 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
883 root 1.92 if ($state{handle}) {
884     # handle keepalive
885     if (
886 root 1.103 $persistent
887 root 1.92 && $_[3]
888     && ($hdr{HTTPVersion} < 1.1
889     ? $hdr{connection} =~ /\bkeep-?alive\b/i
890     : $hdr{connection} !~ /\bclose\b/i)
891     ) {
892     ka_store $ka_key, delete $state{handle};
893     } else {
894     # no keepalive, destroy the handle
895     $state{handle}->destroy;
896     }
897     }
898 root 1.82
899 root 1.88 %state = ();
900 root 1.82
901 root 1.88 if (defined $_[1]) {
902     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
903     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
904     }
905 root 1.82
906 root 1.88 # set-cookie processing
907     if ($arg{cookie_jar}) {
908     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
909     }
910 root 1.82
911 root 1.88 if ($redirect && exists $hdr{location}) {
912     # we ignore any errors, as it is very common to receive
913     # Content-Length != 0 but no actual body
914     # we also access %hdr, as $_[1] might be an erro
915 root 1.103 $state{recurse} =
916     http_request (
917     $method => $hdr{location},
918     %arg,
919     recurse => $recurse - 1,
920     Redirect => [$_[0], \%hdr],
921     sub {
922     %state = ();
923     &$cb
924     },
925     );
926 root 1.88 } else {
927     $cb->($_[0], \%hdr);
928     }
929     };
930 root 1.82
931 root 1.88 $ae_error = 597; # body phase
932 root 1.82
933 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
934    
935     my $len = $chunked ? undef : $hdr{"content-length"};
936 root 1.82
937 root 1.88 # body handling, many different code paths
938     # - no body expected
939     # - want_body_handle
940     # - te chunked
941     # - 2x length known (with or without on_body)
942     # - 2x length not known (with or without on_body)
943     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
944     $finish->(undef, 598 => "Request cancelled by on_header");
945     } elsif (
946     $hdr{Status} =~ /^(?:1..|204|205|304)$/
947     or $method eq "HEAD"
948     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
949     ) {
950     # no body
951     $finish->("", undef, undef, 1);
952    
953     } elsif (!$redirect && $arg{want_body_handle}) {
954     $_[0]->on_eof (undef);
955     $_[0]->on_error (undef);
956     $_[0]->on_read (undef);
957    
958     $finish->(delete $state{handle});
959    
960 root 1.91 } elsif ($chunked) {
961 root 1.88 my $cl = 0;
962 root 1.92 my $body = "";
963 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
964    
965     $state{read_chunk} = sub {
966     $_[1] =~ /^([0-9a-fA-F]+)/
967 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
968 root 1.82
969 root 1.88 my $len = hex $1;
970 root 1.82
971 root 1.88 if ($len) {
972     $cl += $len;
973 root 1.82
974 root 1.88 $_[0]->push_read (chunk => $len, sub {
975     $on_body->($_[1], \%hdr)
976     or return $finish->(undef, 598 => "Request cancelled by on_body");
977 root 1.82
978 root 1.88 $_[0]->push_read (line => sub {
979     length $_[1]
980     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
981     $_[0]->push_read (line => $state{read_chunk});
982 root 1.82 });
983 root 1.88 });
984     } else {
985     $hdr{"content-length"} ||= $cl;
986 root 1.84
987 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
988     if (length $_[1]) {
989     for ("$_[1]") {
990     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
991 root 1.84
992 root 1.92 my $hdr = _parse_hdr
993 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
994 root 1.84
995 root 1.88 %hdr = (%hdr, %$hdr);
996     }
997     }
998 root 1.84
999 root 1.88 $finish->($body, undef, undef, 1);
1000 root 1.84 });
1001     }
1002 root 1.88 };
1003    
1004     $_[0]->push_read (line => $state{read_chunk});
1005    
1006     } elsif ($arg{on_body}) {
1007     if (defined $len) {
1008     $_[0]->on_read (sub {
1009     $len -= length $_[0]{rbuf};
1010    
1011     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1012     or return $finish->(undef, 598 => "Request cancelled by on_body");
1013    
1014     $len > 0
1015     or $finish->("", undef, undef, 1);
1016     });
1017 root 1.84 } else {
1018 root 1.88 $_[0]->on_eof (sub {
1019     $finish->("");
1020     });
1021     $_[0]->on_read (sub {
1022     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1023     or $finish->(undef, 598 => "Request cancelled by on_body");
1024     });
1025     }
1026     } else {
1027     $_[0]->on_eof (undef);
1028 root 1.82
1029 root 1.88 if (defined $len) {
1030     $_[0]->on_read (sub {
1031     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1032     if $len <= length $_[0]{rbuf};
1033     });
1034     } else {
1035     $_[0]->on_error (sub {
1036     ($! == Errno::EPIPE || !$!)
1037     ? $finish->(delete $_[0]{rbuf})
1038     : $finish->(undef, $ae_error => $_[2]);
1039     });
1040     $_[0]->on_read (sub { });
1041 root 1.82 }
1042 root 1.88 }
1043     };
1044 root 1.82
1045 root 1.92 # if keepalive is enabled, then the server closing the connection
1046     # before a response can happen legally - we retry on idempotent methods.
1047 root 1.103 if ($was_persistent && $idempotent) {
1048 root 1.92 my $old_eof = $hdl->{on_eof};
1049     $hdl->{on_eof} = sub {
1050     _destroy_state %state;
1051    
1052 root 1.103 %state = ();
1053     $state{recurse} =
1054     http_request (
1055 root 1.112 $method => $url,
1056 root 1.103 %arg,
1057 root 1.112 recurse => $recurse - 1,
1058 root 1.103 keepalive => 0,
1059     sub {
1060     %state = ();
1061     &$cb
1062     }
1063     );
1064 root 1.92 };
1065     $hdl->on_read (sub {
1066     return unless %state;
1067    
1068     # as soon as we receive something, a connection close
1069     # once more becomes a hard error
1070     $hdl->{on_eof} = $old_eof;
1071     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1072     });
1073     } else {
1074     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1075     }
1076     };
1077    
1078     my $prepare_handle = sub {
1079     my ($hdl) = $state{handle};
1080    
1081     $hdl->on_error (sub {
1082     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1083     });
1084     $hdl->on_eof (sub {
1085     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1086     });
1087 root 1.103 $hdl->timeout_reset;
1088     $hdl->timeout ($timeout);
1089 root 1.88 };
1090 root 1.82
1091 root 1.92 # connected to proxy (or origin server)
1092 root 1.88 my $connect_cb = sub {
1093 root 1.92 my $fh = shift
1094     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1095 root 1.44
1096 root 1.88 return unless delete $state{connect_guard};
1097 root 1.11
1098 root 1.88 # get handle
1099     $state{handle} = new AnyEvent::Handle
1100 root 1.92 %{ $arg{handle_params} },
1101     fh => $fh,
1102     peername => $uhost,
1103 root 1.88 tls_ctx => $arg{tls_ctx},
1104     ;
1105 root 1.11
1106 root 1.92 $prepare_handle->();
1107 root 1.1
1108 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1109 root 1.88
1110     # now handle proxy-CONNECT method
1111     if ($proxy && $uscheme eq "https") {
1112     # oh dear, we have to wrap it into a connect request
1113    
1114     # maybe re-use $uauthority with patched port?
1115     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012\015\012");
1116     $state{handle}->push_read (line => $qr_nlnl, sub {
1117     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1118 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1119 root 1.88
1120     if ($2 == 200) {
1121     $rpath = $upath;
1122     $handle_actual_request->();
1123     } else {
1124 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1125 root 1.88 }
1126     });
1127     } else {
1128     $handle_actual_request->();
1129     }
1130     };
1131    
1132     _get_slot $uhost, sub {
1133     $state{slot_guard} = shift;
1134 root 1.64
1135 root 1.88 return unless $state{connect_guard};
1136 root 1.64
1137 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1138     # on a keepalive request (in theory, this should be a separate config option).
1139 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1140     $was_persistent = 1;
1141    
1142 root 1.92 $state{handle} = ka_fetch $ka_key;
1143 root 1.103 $state{handle}->destroyed
1144 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1145 root 1.92 $prepare_handle->();
1146 root 1.103 $state{handle}->destroyed
1147 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1148 root 1.92 $handle_actual_request->();
1149    
1150     } else {
1151     my $tcp_connect = $arg{tcp_connect}
1152     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1153 root 1.57
1154 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1155     }
1156 root 1.1 };
1157    
1158 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1159 root 1.1 }
1160    
1161 elmex 1.15 sub http_get($@) {
1162 root 1.1 unshift @_, "GET";
1163     &http_request
1164     }
1165    
1166 elmex 1.15 sub http_head($@) {
1167 root 1.4 unshift @_, "HEAD";
1168     &http_request
1169     }
1170    
1171 elmex 1.15 sub http_post($$@) {
1172 root 1.22 my $url = shift;
1173     unshift @_, "POST", $url, "body";
1174 root 1.3 &http_request
1175     }
1176    
1177 root 1.9 =back
1178    
1179 root 1.55 =head2 DNS CACHING
1180    
1181     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1182     the actual connection, which in turn uses AnyEvent::DNS to resolve
1183     hostnames. The latter is a simple stub resolver and does no caching
1184     on its own. If you want DNS caching, you currently have to provide
1185     your own default resolver (by storing a suitable resolver object in
1186 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1187 root 1.55
1188 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1189 root 1.1
1190     =over 4
1191    
1192 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1193    
1194     Sets the default proxy server to use. The proxy-url must begin with a
1195 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1196 root 1.52
1197     To clear an already-set proxy, use C<undef>.
1198 root 1.2
1199 root 1.102 When AnyEvent::HTTP is laoded for the first time it will query the
1200     default proxy from the operating system, currently by looking at
1201     C<$ENV{http_proxy>}.
1202    
1203 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1204    
1205     Remove all cookies from the cookie jar that have been expired. If
1206     C<$session_end> is given and true, then additionally remove all session
1207     cookies.
1208    
1209     You should call this function (with a true C<$session_end>) before you
1210     save cookies to disk, and you should call this function after loading them
1211     again. If you have a long-running program you can additonally call this
1212     function from time to time.
1213    
1214     A cookie jar is initially an empty hash-reference that is managed by this
1215     module. It's format is subject to change, but currently it is like this:
1216    
1217     The key C<version> has to contain C<1>, otherwise the hash gets
1218     emptied. All other keys are hostnames or IP addresses pointing to
1219     hash-references. The key for these inner hash references is the
1220     server path for which this cookie is meant, and the values are again
1221     hash-references. The keys of those hash-references is the cookie name, and
1222     the value, you guessed it, is another hash-reference, this time with the
1223     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1224     which have been replaced by a C<_expires> key that contains the cookie
1225     expiry timestamp.
1226    
1227     Here is an example of a cookie jar with a single cookie, so you have a
1228     chance of understanding the above paragraph:
1229    
1230     {
1231     version => 1,
1232     "10.0.0.1" => {
1233     "/" => {
1234     "mythweb_id" => {
1235     _expires => 1293917923,
1236     value => "ooRung9dThee3ooyXooM1Ohm",
1237     },
1238     },
1239     },
1240     }
1241    
1242 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1243    
1244     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1245     Date (RFC 2616).
1246    
1247     =item $timestamp = AnyEvent::HTTP::parse_date $date
1248    
1249 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1250     bunch of minor variations of those, and returns the corresponding POSIX
1251     timestamp, or C<undef> if the date cannot be parsed.
1252 root 1.61
1253 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1254 root 1.1
1255 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1256 root 1.1
1257 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1258    
1259 root 1.110 The default timeout for connection operations (default: C<300>).
1260 root 1.92
1261 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1262    
1263     The default value for the C<User-Agent> header (the default is
1264 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1265 root 1.1
1266 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1267 root 1.1
1268 root 1.47 The maximum number of concurrent connections to the same host (identified
1269 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
1270 root 1.92 are queued until previous connections are closed. Both persistent and
1271     non-persistent connections are counted in this limit.
1272 root 1.1
1273 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1274 root 1.92 increase it much.
1275    
1276     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1277     connections, older browsers used 2, newers (such as firefox 3) typically
1278     use 6, and Opera uses 8 because like, they have the fastest browser and
1279     give a shit for everybody else on the planet.
1280    
1281     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1282    
1283     The time after which idle persistent conenctions get closed by
1284     AnyEvent::HTTP (default: C<3>).
1285 root 1.3
1286 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1287    
1288     The number of active connections. This is not the number of currently
1289     running requests, but the number of currently open and non-idle TCP
1290 root 1.92 connections. This number can be useful for load-leveling.
1291 root 1.14
1292 root 1.1 =back
1293    
1294     =cut
1295    
1296 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1297     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1298    
1299     sub format_date($) {
1300     my ($time) = @_;
1301    
1302     # RFC 822/1123 format
1303     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1304    
1305     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1306     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1307     $H, $M, $S;
1308     }
1309    
1310     sub parse_date($) {
1311     my ($date) = @_;
1312    
1313     my ($d, $m, $y, $H, $M, $S);
1314    
1315 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1316 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1317     # cookie dates (with "-")
1318    
1319 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1320    
1321 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1322 root 1.61 # RFC 850
1323     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1324    
1325 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1326 root 1.61 # ISO C's asctime
1327     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1328     }
1329     # other formats fail in the loop below
1330    
1331     for (0..11) {
1332     if ($m eq $month[$_]) {
1333     require Time::Local;
1334 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1335 root 1.61 }
1336     }
1337    
1338     undef
1339     }
1340    
1341 root 1.2 sub set_proxy($) {
1342 root 1.52 if (length $_[0]) {
1343 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1344 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1345     $PROXY = [$2, $3 || 3128, $1]
1346     } else {
1347     undef $PROXY;
1348     }
1349 root 1.2 }
1350    
1351     # initialise proxy from environment
1352 root 1.52 eval {
1353     set_proxy $ENV{http_proxy};
1354     };
1355 root 1.2
1356 root 1.93 =head2 SHOWCASE
1357    
1358     This section contaisn some more elaborate "real-world" examples or code
1359     snippets.
1360    
1361     =head2 HTTP/1.1 FILE DOWNLOAD
1362    
1363 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1364 root 1.98 goes wrong and you want to resume.
1365 root 1.93
1366     Here is a function that initiates and resumes a download. It uses the
1367     last modified time to check for file content changes, and works with many
1368     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1369     on older servers.
1370    
1371     It calls the completion callback with either C<undef>, which means a
1372     nonretryable error occured, C<0> when the download was partial and should
1373     be retried, and C<1> if it was successful.
1374    
1375     use AnyEvent::HTTP;
1376    
1377     sub download($$$) {
1378     my ($url, $file, $cb) = @_;
1379    
1380     open my $fh, "+<", $file
1381     or die "$file: $!";
1382    
1383     my %hdr;
1384     my $ofs = 0;
1385    
1386     warn stat $fh;
1387     warn -s _;
1388     if (stat $fh and -s _) {
1389     $ofs = -s _;
1390 root 1.107 warn "-s is ", $ofs;
1391 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1392     $hdr{"range"} = "bytes=$ofs-";
1393     }
1394    
1395     http_get $url,
1396     headers => \%hdr,
1397     on_header => sub {
1398     my ($hdr) = @_;
1399    
1400     if ($hdr->{Status} == 200 && $ofs) {
1401     # resume failed
1402     truncate $fh, $ofs = 0;
1403     }
1404    
1405     sysseek $fh, $ofs, 0;
1406    
1407     1
1408     },
1409     on_body => sub {
1410     my ($data, $hdr) = @_;
1411    
1412     if ($hdr->{Status} =~ /^2/) {
1413     length $data == syswrite $fh, $data
1414     or return; # abort on write errors
1415     }
1416    
1417     1
1418     },
1419     sub {
1420     my (undef, $hdr) = @_;
1421    
1422     my $status = $hdr->{Status};
1423    
1424     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1425     utime $fh, $time, $time;
1426     }
1427    
1428     if ($status == 200 || $status == 206 || $status == 416) {
1429     # download ok || resume ok || file already fully downloaded
1430     $cb->(1, $hdr);
1431    
1432     } elsif ($status == 412) {
1433     # file has changed while resuming, delete and retry
1434     unlink $file;
1435     $cb->(0, $hdr);
1436    
1437     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1438     # retry later
1439     $cb->(0, $hdr);
1440    
1441     } else {
1442     $cb->(undef, $hdr);
1443     }
1444     }
1445     ;
1446     }
1447    
1448     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1449     if ($_[0]) {
1450     print "OK!\n";
1451     } elsif (defined $_[0]) {
1452     print "please retry later\n";
1453     } else {
1454     print "ERROR\n";
1455     }
1456     };
1457    
1458     =head3 SOCKS PROXIES
1459 root 1.60
1460     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1461     compile your perl to support socks, or use an external program such as
1462     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1463     transparently.
1464    
1465     Alternatively, for AnyEvent::HTTP only, you can use your own
1466     C<tcp_connect> function that does the proxy handshake - here is an example
1467     that works with socks4a proxies:
1468    
1469     use Errno;
1470     use AnyEvent::Util;
1471     use AnyEvent::Socket;
1472     use AnyEvent::Handle;
1473    
1474     # host, port and username of/for your socks4a proxy
1475     my $socks_host = "10.0.0.23";
1476     my $socks_port = 9050;
1477     my $socks_user = "";
1478    
1479     sub socks4a_connect {
1480     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1481    
1482     my $hdl = new AnyEvent::Handle
1483     connect => [$socks_host, $socks_port],
1484     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1485     on_error => sub { $connect_cb->() },
1486     ;
1487    
1488     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1489    
1490     $hdl->push_read (chunk => 8, sub {
1491     my ($hdl, $chunk) = @_;
1492     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1493    
1494     if ($status == 0x5a) {
1495     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1496     } else {
1497     $! = Errno::ENXIO; $connect_cb->();
1498     }
1499     });
1500    
1501     $hdl
1502     }
1503    
1504     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1505     possibly after switching off other proxy types:
1506    
1507     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1508    
1509     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1510     my ($data, $headers) = @_;
1511     ...
1512     };
1513    
1514 root 1.1 =head1 SEE ALSO
1515    
1516     L<AnyEvent>.
1517    
1518     =head1 AUTHOR
1519    
1520 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1521     http://home.schmorp.de/
1522 root 1.1
1523 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1524     testcases and bugreports.
1525    
1526 root 1.1 =cut
1527    
1528     1
1529