ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.117
Committed: Mon Sep 9 21:41:43 2013 UTC (10 years, 8 months ago) by root
Branch: MAIN
Changes since 1.116: +9 -9 lines
Log Message:
David Steinbrunner

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.112 our $VERSION = '2.15';
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.117 (or C<undef> if an error occurred), and a hash-ref with response headers
95 root 1.68 (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128 root 1.114 =item 595 - errors during connection establishment, proxy handshake.
129 root 1.77
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
160     other retries and so on, and how often to do so.
161 root 1.1
162     =item headers => hashref
163    
164 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
165     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
166 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
167     (this can be suppressed by using C<undef> for these headers in which case
168     they won't be sent at all).
169 root 1.1
170 root 1.90 You really should provide your own C<User-Agent:> header value that is
171     appropriate for your program - I wouldn't be surprised if the default
172     AnyEvent string gets blocked by webservers sooner or later.
173    
174 root 1.104 Also, make sure that your headers names and values do not contain any
175     embedded newlines.
176    
177 root 1.1 =item timeout => $seconds
178    
179     The time-out to use for various stages - each connect attempt will reset
180 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
181     timeout.
182    
183     Default timeout is 5 minutes.
184 root 1.2
185     =item proxy => [$host, $port[, $scheme]] or undef
186    
187 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
188     used.
189 root 1.2
190 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
191 root 1.1
192 root 1.102 If not specified, then the default proxy is used (see
193     C<AnyEvent::HTTP::set_proxy>).
194    
195 root 1.3 =item body => $string
196    
197 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
198 root 1.3 this module might offer more options).
199    
200 root 1.10 =item cookie_jar => $hash_ref
201    
202     Passing this parameter enables (simplified) cookie-processing, loosely
203     based on the original netscape specification.
204    
205 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
206     will get updated automatically. It is possible to save the cookie jar
207     to persistent storage with something like JSON or Storable - see the
208     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
209     expired or session-only cookies, and also for documentation on the format
210     of the cookie jar.
211 root 1.10
212 root 1.70 Note that this cookie implementation is not meant to be complete. If
213     you want complete cookie management you have to do that on your
214 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
215 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
216     to.
217 root 1.10
218 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
219 root 1.70 headers will be set and handled by this module, otherwise they will be
220 root 1.69 left untouched.
221    
222 root 1.40 =item tls_ctx => $scheme | $tls_ctx
223    
224     Specifies the AnyEvent::TLS context to be used for https connections. This
225     parameter follows the same rules as the C<tls_ctx> parameter to
226     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
227     C<high> can be specified, which give you a predefined low-security (no
228     verification, highest compatibility) and high-security (CA and common-name
229     verification) TLS context.
230    
231     The default for this option is C<low>, which could be interpreted as "give
232     me the page, no matter what".
233    
234 root 1.92 See also the C<sessionid> parameter.
235    
236     =item session => $string
237    
238     The module might reuse connections to the same host internally. Sometimes
239     (e.g. when using TLS), you do not want to reuse connections from other
240     sessions. This can be achieved by setting this parameter to some unique
241     ID (such as the address of an object storing your state data, or the TLS
242     context) - only connections using the same unique ID will be reused.
243    
244 root 1.51 =item on_prepare => $callback->($fh)
245    
246     In rare cases you need to "tune" the socket before it is used to
247 root 1.117 connect (for example, to bind it on a given IP address). This parameter
248 root 1.51 overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
249     and behaves exactly the same way (e.g. it has to provide a
250     timeout). See the description for the C<$prepare_cb> argument of
251     C<AnyEvent::Socket::tcp_connect> for details.
252    
253 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
254    
255     In even rarer cases you want total control over how AnyEvent::HTTP
256     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
257     to do this, but you can provide your own C<tcp_connect> function -
258 root 1.60 obviously, it has to follow the same calling conventions, except that it
259     may always return a connection guard object.
260 root 1.59
261     There are probably lots of weird uses for this function, starting from
262     tracing the hosts C<http_request> actually tries to connect, to (inexact
263     but fast) host => IP address caching or even socks protocol support.
264    
265 root 1.42 =item on_header => $callback->($headers)
266 root 1.41
267     When specified, this callback will be called with the header hash as soon
268     as headers have been successfully received from the remote server (not on
269     locally-generated errors).
270    
271     It has to return either true (in which case AnyEvent::HTTP will continue),
272     or false, in which case AnyEvent::HTTP will cancel the download (and call
273     the finish callback with an error code of C<598>).
274    
275     This callback is useful, among other things, to quickly reject unwanted
276     content, which, if it is supposed to be rare, can be faster than first
277     doing a C<HEAD> request.
278    
279 root 1.68 The downside is that cancelling the request makes it impossible to re-use
280     the connection. Also, the C<on_header> callback will not receive any
281     trailer (headers sent after the response body).
282    
283 root 1.42 Example: cancel the request unless the content-type is "text/html".
284 root 1.41
285 root 1.42 on_header => sub {
286     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
287     },
288 root 1.41
289 root 1.42 =item on_body => $callback->($partial_body, $headers)
290 root 1.41
291 root 1.42 When specified, all body data will be passed to this callback instead of
292     to the completion callback. The completion callback will get the empty
293     string instead of the body data.
294 root 1.41
295 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
296     or false, in which case AnyEvent::HTTP will cancel the download (and call
297     the completion callback with an error code of C<598>).
298    
299 root 1.68 The downside to cancelling the request is that it makes it impossible to
300     re-use the connection.
301    
302 root 1.42 This callback is useful when the data is too large to be held in memory
303     (so the callback writes it to a file) or when only some information should
304     be extracted, or when the body should be processed incrementally.
305 root 1.41
306     It is usually preferred over doing your own body handling via
307 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
308     only used to create a connection, C<want_body_handle> is the better
309     alternative, as it allows you to install your own event handler, reducing
310     resource usage.
311 root 1.41
312     =item want_body_handle => $enable
313    
314     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
315     changes considerably: after parsing the headers, and instead of
316     downloading the body (if any), the completion callback will be
317     called. Instead of the C<$body> argument containing the body data, the
318     callback will receive the L<AnyEvent::Handle> object associated with the
319     connection. In error cases, C<undef> will be passed. When there is no body
320     (e.g. status C<304>), the empty string will be passed.
321    
322 root 1.92 The handle object might or might not be in TLS mode, might be connected
323     to a proxy, be a persistent connection, use chunked transfer encoding
324     etc., and configured in unspecified ways. The user is responsible for this
325     handle (it will not be used by this module anymore).
326 root 1.41
327     This is useful with some push-type services, where, after the initial
328     headers, an interactive protocol is used (typical example would be the
329     push-style twitter API which starts a JSON/XML stream).
330    
331     If you think you need this, first have a look at C<on_body>, to see if
332 root 1.45 that doesn't solve your problem in a better way.
333 root 1.41
334 root 1.92 =item persistent => $boolean
335    
336     Try to create/reuse a persistent connection. When this flag is set
337     (default: true for idempotent requests, false for all others), then
338     C<http_request> tries to re-use an existing (previously-created)
339     persistent connection to the host and, failing that, tries to create a new
340     one.
341    
342     Requests failing in certain ways will be automatically retried once, which
343     is dangerous for non-idempotent requests, which is why it defaults to off
344     for them. The reason for this is because the bozos who designed HTTP/1.1
345     made it impossible to distinguish between a fatal error and a normal
346     connection timeout, so you never know whether there was a problem with
347     your request or not.
348    
349     When reusing an existent connection, many parameters (such as TLS context)
350     will be ignored. See the C<session> parameter for a workaround.
351    
352     =item keepalive => $boolean
353    
354     Only used when C<persistent> is also true. This parameter decides whether
355     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
356     (as opposed to only a HTTP/1.1 persistent connection).
357    
358     The default is true, except when using a proxy, in which case it defaults
359     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
360    
361     =item handle_params => { key => value ... }
362    
363     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
364     constructor that is called - not all requests will create a handle, and
365     sometimes more than one is created, so this parameter is only good for
366     setting hints.
367    
368     Example: set the maximum read size to 4096, to potentially conserve memory
369     at the cost of speed.
370    
371     handle_params => {
372     max_read_size => 4096,
373     },
374    
375 root 1.1 =back
376    
377 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
378     the response body.
379 root 1.9
380     http_request GET => "http://www.nethype.de/", sub {
381     my ($body, $hdr) = @_;
382     print "$body\n";
383     };
384    
385 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
386 root 1.9 timeout of 30 seconds.
387    
388     http_request
389 root 1.109 HEAD => "https://www.google.com",
390 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
391 root 1.9 timeout => 30,
392     sub {
393     my ($body, $hdr) = @_;
394     use Data::Dumper;
395     print Dumper $hdr;
396     }
397     ;
398 root 1.1
399 root 1.68 Example: do another simple HTTP GET request, but immediately try to
400 root 1.29 cancel it.
401    
402     my $request = http_request GET => "http://www.nethype.de/", sub {
403     my ($body, $hdr) = @_;
404     print "$body\n";
405     };
406    
407     undef $request;
408    
409 root 1.1 =cut
410    
411 root 1.92 #############################################################################
412     # wait queue/slots
413    
414 root 1.12 sub _slot_schedule;
415 root 1.11 sub _slot_schedule($) {
416     my $host = shift;
417    
418     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
419     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
420 root 1.12 # somebody wants that slot
421 root 1.11 ++$CO_SLOT{$host}[0];
422 root 1.14 ++$ACTIVE;
423 root 1.11
424     $cb->(AnyEvent::Util::guard {
425 root 1.14 --$ACTIVE;
426 root 1.11 --$CO_SLOT{$host}[0];
427     _slot_schedule $host;
428     });
429     } else {
430     # nobody wants the slot, maybe we can forget about it
431     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
432     last;
433     }
434     }
435     }
436    
437     # wait for a free slot on host, call callback
438     sub _get_slot($$) {
439     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
440    
441     _slot_schedule $_[0];
442     }
443    
444 root 1.80 #############################################################################
445 root 1.92 # cookie handling
446 root 1.80
447     # expire cookies
448     sub cookie_jar_expire($;$) {
449     my ($jar, $session_end) = @_;
450    
451     %$jar = () if $jar->{version} != 1;
452    
453     my $anow = AE::now;
454    
455     while (my ($chost, $paths) = each %$jar) {
456     next unless ref $paths;
457    
458     while (my ($cpath, $cookies) = each %$paths) {
459     while (my ($cookie, $kv) = each %$cookies) {
460     if (exists $kv->{_expires}) {
461     delete $cookies->{$cookie}
462     if $anow > $kv->{_expires};
463     } elsif ($session_end) {
464     delete $cookies->{$cookie};
465     }
466     }
467    
468     delete $paths->{$cpath}
469     unless %$cookies;
470     }
471    
472     delete $jar->{$chost}
473     unless %$paths;
474     }
475     }
476    
477 root 1.72 # extract cookies from jar
478 root 1.71 sub cookie_jar_extract($$$$) {
479 root 1.92 my ($jar, $scheme, $host, $path) = @_;
480 root 1.71
481     %$jar = () if $jar->{version} != 1;
482    
483     my @cookies;
484    
485     while (my ($chost, $paths) = each %$jar) {
486     next unless ref $paths;
487    
488     if ($chost =~ /^\./) {
489 root 1.92 next unless $chost eq substr $host, -length $chost;
490 root 1.71 } elsif ($chost =~ /\./) {
491 root 1.92 next unless $chost eq $host;
492 root 1.71 } else {
493     next;
494     }
495    
496     while (my ($cpath, $cookies) = each %$paths) {
497 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
498 root 1.71
499     while (my ($cookie, $kv) = each %$cookies) {
500 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
501 root 1.71
502 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
503     delete $cookies->{$cookie};
504     next;
505 root 1.71 }
506    
507     my $value = $kv->{value};
508    
509     if ($value =~ /[=;,[:space:]]/) {
510     $value =~ s/([\\"])/\\$1/g;
511     $value = "\"$value\"";
512     }
513    
514     push @cookies, "$cookie=$value";
515     }
516     }
517     }
518    
519     \@cookies
520     }
521    
522 root 1.72 # parse set_cookie header into jar
523 root 1.80 sub cookie_jar_set_cookie($$$$) {
524 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
525 root 1.80
526     my $anow = int AE::now;
527     my $snow; # server-now
528 root 1.72
529     for ($set_cookie) {
530     # parse NAME=VALUE
531     my @kv;
532    
533 root 1.79 # expires is not http-compliant in the original cookie-spec,
534     # we support the official date format and some extensions
535 root 1.72 while (
536     m{
537     \G\s*
538     (?:
539 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
540 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
541 root 1.72 )
542     }gcxsi
543     ) {
544     my $name = $2;
545     my $value = $4;
546    
547 root 1.82 if (defined $1) {
548 root 1.72 # expires
549     $name = "expires";
550     $value = $1;
551 root 1.82 } elsif (defined $3) {
552 root 1.72 # quoted
553     $value = $3;
554     $value =~ s/\\(.)/$1/gs;
555     }
556    
557 root 1.95 push @kv, @kv ? lc $name : $name, $value;
558 root 1.72
559     last unless /\G\s*;/gc;
560     }
561    
562     last unless @kv;
563    
564     my $name = shift @kv;
565     my %kv = (value => shift @kv, @kv);
566    
567 root 1.80 if (exists $kv{"max-age"}) {
568     $kv{_expires} = $anow + delete $kv{"max-age"};
569     } elsif (exists $kv{expires}) {
570     $snow ||= parse_date ($date) || $anow;
571     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
572     } else {
573     delete $kv{_expires};
574     }
575 root 1.72
576     my $cdom;
577     my $cpath = (delete $kv{path}) || "/";
578    
579     if (exists $kv{domain}) {
580     $cdom = delete $kv{domain};
581    
582     $cdom =~ s/^\.?/./; # make sure it starts with a "."
583    
584     next if $cdom =~ /\.$/;
585    
586     # this is not rfc-like and not netscape-like. go figure.
587     my $ndots = $cdom =~ y/.//;
588     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
589     } else {
590 root 1.92 $cdom = $host;
591 root 1.72 }
592    
593     # store it
594 root 1.73 $jar->{version} = 1;
595 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
596 root 1.72
597     redo if /\G\s*,/gc;
598     }
599     }
600    
601 root 1.92 #############################################################################
602     # keepalive/persistent connection cache
603    
604     # fetch a connection from the keepalive cache
605     sub ka_fetch($) {
606     my $ka_key = shift;
607    
608     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
609     delete $KA_CACHE{$ka_key}
610     unless @{ $KA_CACHE{$ka_key} };
611    
612     $hdl
613     }
614    
615     sub ka_store($$) {
616     my ($ka_key, $hdl) = @_;
617    
618     my $kaa = $KA_CACHE{$ka_key} ||= [];
619    
620     my $destroy = sub {
621     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
622    
623     $hdl->destroy;
624    
625     @ka
626     ? $KA_CACHE{$ka_key} = \@ka
627     : delete $KA_CACHE{$ka_key};
628     };
629    
630     # on error etc., destroy
631     $hdl->on_error ($destroy);
632     $hdl->on_eof ($destroy);
633     $hdl->on_read ($destroy);
634     $hdl->timeout ($PERSISTENT_TIMEOUT);
635    
636     push @$kaa, $hdl;
637     shift @$kaa while @$kaa > $MAX_PER_HOST;
638     }
639    
640     #############################################################################
641     # utilities
642    
643 root 1.66 # continue to parse $_ for headers and place them into the arg
644 root 1.92 sub _parse_hdr() {
645 root 1.66 my %hdr;
646    
647     # things seen, not parsed:
648     # p3pP="NON CUR OTPi OUR NOR UNI"
649    
650     $hdr{lc $1} .= ",$2"
651     while /\G
652     ([^:\000-\037]*):
653     [\011\040]*
654     ((?: [^\012]+ | \012[\011\040] )*)
655     \012
656     /gxc;
657    
658     /\G$/
659     or return;
660    
661     # remove the "," prefix we added to all headers above
662     substr $_, 0, 1, ""
663     for values %hdr;
664    
665     \%hdr
666     }
667    
668 root 1.92 #############################################################################
669     # http_get
670    
671 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
672 root 1.34
673 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
674     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
675 root 1.40
676 root 1.92 # maybe it should just become a normal object :/
677    
678     sub _destroy_state(\%) {
679     my ($state) = @_;
680    
681     $state->{handle}->destroy if $state->{handle};
682     %$state = ();
683     }
684    
685     sub _error(\%$$) {
686     my ($state, $cb, $hdr) = @_;
687    
688     &_destroy_state ($state);
689    
690     $cb->(undef, $hdr);
691     ()
692     }
693    
694 root 1.116 our %IDEMPOTENT = (
695     DELETE => 1,
696     GET => 1,
697     HEAD => 1,
698     OPTIONS => 1,
699     PUT => 1,
700     TRACE => 1,
701    
702     ACL => 1,
703     "BASELINE-CONTROL" => 1,
704     BIND => 1,
705     CHECKIN => 1,
706     CHECKOUT => 1,
707     COPY => 1,
708     LABEL => 1,
709     LINK => 1,
710     MERGE => 1,
711     MKACTIVITY => 1,
712     MKCALENDAR => 1,
713     MKCOL => 1,
714     MKREDIRECTREF => 1,
715     MKWORKSPACE => 1,
716     MOVE => 1,
717     ORDERPATCH => 1,
718     PROPFIND => 1,
719     PROPPATCH => 1,
720     REBIND => 1,
721     REPORT => 1,
722     SEARCH => 1,
723     UNBIND => 1,
724     UNCHECKOUT => 1,
725     UNLINK => 1,
726     UNLOCK => 1,
727     UPDATE => 1,
728     UPDATEREDIRECTREF => 1,
729     "VERSION-CONTROL" => 1,
730     );
731    
732 elmex 1.15 sub http_request($$@) {
733 root 1.1 my $cb = pop;
734     my ($method, $url, %arg) = @_;
735    
736     my %hdr;
737    
738 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
739     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
740    
741 root 1.3 $method = uc $method;
742    
743 root 1.8 if (my $hdr = $arg{headers}) {
744 root 1.1 while (my ($k, $v) = each %$hdr) {
745     $hdr{lc $k} = $v;
746     }
747     }
748    
749 root 1.55 # pseudo headers for all subsequent responses
750     my @pseudo = (URL => $url);
751     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
752    
753 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
754 root 1.8
755 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
756 root 1.8 if $recurse < 0;
757    
758 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
759 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
760    
761 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
762 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
763 root 1.2
764 root 1.31 $uscheme = lc $uscheme;
765 root 1.1
766 root 1.31 my $uport = $uscheme eq "http" ? 80
767     : $uscheme eq "https" ? 443
768 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
769 root 1.13
770 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
771 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
772 root 1.10
773 root 1.86 my $uhost = lc $1;
774 root 1.10 $uport = $2 if defined $2;
775    
776 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
777     unless exists $hdr{host};
778 root 1.43
779 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
780 root 1.56 $upath .= $query if length $query;
781 root 1.10
782     $upath =~ s%^/?%/%;
783    
784     # cookie processing
785     if (my $jar = $arg{cookie_jar}) {
786 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
787 root 1.70
788 root 1.71 $hdr{cookie} = join "; ", @$cookies
789     if @$cookies;
790 root 1.10 }
791 root 1.1
792 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
793 root 1.2
794 root 1.10 if ($proxy) {
795 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
796 root 1.31
797 root 1.47 $rscheme = "http" unless defined $rscheme;
798    
799 root 1.31 # don't support https requests over https-proxy transport,
800 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
801 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
802 root 1.86
803     $rhost = lc $rhost;
804     $rscheme = lc $rscheme;
805 root 1.10 } else {
806 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
807 root 1.2 }
808    
809 root 1.47 # leave out fragment and query string, just a heuristic
810 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
811     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
812 root 1.41
813 root 1.53 $hdr{"content-length"} = length $arg{body}
814     if length $arg{body} || $method ne "GET";
815 root 1.1
816 root 1.116 my $idempotent = $IDEMPOTENT{$method};
817 root 1.89
818     # default value for keepalive is true iff the request is for an idempotent method
819 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
820     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
821     my $was_persistent; # true if this is actually a recycled connection
822 root 1.92
823     # the key to use in the keepalive cache
824 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
825 root 1.89
826 root 1.103 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive " : "" : "close ") . "Te"; #1.1
827 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
828 root 1.66
829 root 1.11 my %state = (connect_guard => 1);
830    
831 root 1.88 my $ae_error = 595; # connecting
832 root 1.1
833 root 1.88 # handle actual, non-tunneled, request
834     my $handle_actual_request = sub {
835     $ae_error = 596; # request phase
836    
837 root 1.92 my $hdl = $state{handle};
838    
839     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
840 root 1.88
841     # send request
842 root 1.92 $hdl->push_write (
843 root 1.88 "$method $rpath HTTP/1.1\015\012"
844     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
845     . "\015\012"
846     . (delete $arg{body})
847     );
848    
849 root 1.117 # return if error occurred during push_write()
850 root 1.88 return unless %state;
851    
852 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
853     %hdr = ();
854 root 1.88
855     # status line and headers
856     $state{read_response} = sub {
857 root 1.92 return unless %state;
858    
859 root 1.88 for ("$_[1]") {
860     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
861    
862     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
863 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
864 root 1.88
865     # 100 Continue handling
866     # should not happen as we don't send expect: 100-continue,
867     # but we handle it just in case.
868     # since we send the request body regardless, if we get an error
869     # we are out of-sync, which we currently do NOT handle correctly.
870     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
871     if $2 eq 100;
872    
873     push @pseudo,
874     HTTPVersion => $1,
875     Status => $2,
876     Reason => $3,
877     ;
878 root 1.1
879 root 1.92 my $hdr = _parse_hdr
880     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
881 root 1.77
882 root 1.88 %hdr = (%$hdr, @pseudo);
883     }
884 root 1.82
885 root 1.88 # redirect handling
886     # microsoft and other shitheads don't give a shit for following standards,
887     # try to support some common forms of broken Location headers.
888     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
889     $hdr{location} =~ s/^\.\/+//;
890    
891     my $url = "$rscheme://$uhost:$uport";
892    
893     unless ($hdr{location} =~ s/^\///) {
894     $url .= $upath;
895     $url =~ s/\/[^\/]*$//;
896 root 1.82 }
897    
898 root 1.88 $hdr{location} = "$url/$hdr{location}";
899     }
900 root 1.82
901 root 1.88 my $redirect;
902 root 1.82
903 root 1.88 if ($recurse) {
904     my $status = $hdr{Status};
905 root 1.82
906 root 1.88 # industry standard is to redirect POST as GET for
907     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
908     # also, the UA should ask the user for 301 and 307 and POST,
909     # industry standard seems to be to simply follow.
910     # we go with the industry standard.
911     if ($status == 301 or $status == 302 or $status == 303) {
912     # HTTP/1.1 is unclear on how to mutate the method
913     $method = "GET" unless $method eq "HEAD";
914     $redirect = 1;
915     } elsif ($status == 307) {
916     $redirect = 1;
917 root 1.82 }
918 root 1.88 }
919 root 1.82
920 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
921 root 1.92 if ($state{handle}) {
922     # handle keepalive
923     if (
924 root 1.103 $persistent
925 root 1.92 && $_[3]
926     && ($hdr{HTTPVersion} < 1.1
927     ? $hdr{connection} =~ /\bkeep-?alive\b/i
928     : $hdr{connection} !~ /\bclose\b/i)
929     ) {
930     ka_store $ka_key, delete $state{handle};
931     } else {
932     # no keepalive, destroy the handle
933     $state{handle}->destroy;
934     }
935     }
936 root 1.82
937 root 1.88 %state = ();
938 root 1.82
939 root 1.88 if (defined $_[1]) {
940     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
941     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
942     }
943 root 1.82
944 root 1.88 # set-cookie processing
945     if ($arg{cookie_jar}) {
946     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
947     }
948 root 1.82
949 root 1.88 if ($redirect && exists $hdr{location}) {
950     # we ignore any errors, as it is very common to receive
951     # Content-Length != 0 but no actual body
952     # we also access %hdr, as $_[1] might be an erro
953 root 1.103 $state{recurse} =
954     http_request (
955     $method => $hdr{location},
956     %arg,
957     recurse => $recurse - 1,
958     Redirect => [$_[0], \%hdr],
959     sub {
960     %state = ();
961     &$cb
962     },
963     );
964 root 1.88 } else {
965     $cb->($_[0], \%hdr);
966     }
967     };
968 root 1.82
969 root 1.88 $ae_error = 597; # body phase
970 root 1.82
971 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
972    
973     my $len = $chunked ? undef : $hdr{"content-length"};
974 root 1.82
975 root 1.88 # body handling, many different code paths
976     # - no body expected
977     # - want_body_handle
978     # - te chunked
979     # - 2x length known (with or without on_body)
980     # - 2x length not known (with or without on_body)
981     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
982     $finish->(undef, 598 => "Request cancelled by on_header");
983     } elsif (
984     $hdr{Status} =~ /^(?:1..|204|205|304)$/
985     or $method eq "HEAD"
986     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
987     ) {
988     # no body
989     $finish->("", undef, undef, 1);
990    
991     } elsif (!$redirect && $arg{want_body_handle}) {
992     $_[0]->on_eof (undef);
993     $_[0]->on_error (undef);
994     $_[0]->on_read (undef);
995    
996     $finish->(delete $state{handle});
997    
998 root 1.91 } elsif ($chunked) {
999 root 1.88 my $cl = 0;
1000 root 1.92 my $body = "";
1001 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
1002    
1003     $state{read_chunk} = sub {
1004     $_[1] =~ /^([0-9a-fA-F]+)/
1005 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1006 root 1.82
1007 root 1.88 my $len = hex $1;
1008 root 1.82
1009 root 1.88 if ($len) {
1010     $cl += $len;
1011 root 1.82
1012 root 1.88 $_[0]->push_read (chunk => $len, sub {
1013     $on_body->($_[1], \%hdr)
1014     or return $finish->(undef, 598 => "Request cancelled by on_body");
1015 root 1.82
1016 root 1.88 $_[0]->push_read (line => sub {
1017     length $_[1]
1018     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1019     $_[0]->push_read (line => $state{read_chunk});
1020 root 1.82 });
1021 root 1.88 });
1022     } else {
1023     $hdr{"content-length"} ||= $cl;
1024 root 1.84
1025 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
1026     if (length $_[1]) {
1027     for ("$_[1]") {
1028     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
1029 root 1.84
1030 root 1.92 my $hdr = _parse_hdr
1031 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
1032 root 1.84
1033 root 1.88 %hdr = (%hdr, %$hdr);
1034     }
1035     }
1036 root 1.84
1037 root 1.88 $finish->($body, undef, undef, 1);
1038 root 1.84 });
1039     }
1040 root 1.88 };
1041    
1042     $_[0]->push_read (line => $state{read_chunk});
1043    
1044     } elsif ($arg{on_body}) {
1045     if (defined $len) {
1046     $_[0]->on_read (sub {
1047     $len -= length $_[0]{rbuf};
1048    
1049     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1050     or return $finish->(undef, 598 => "Request cancelled by on_body");
1051    
1052     $len > 0
1053     or $finish->("", undef, undef, 1);
1054     });
1055 root 1.84 } else {
1056 root 1.88 $_[0]->on_eof (sub {
1057     $finish->("");
1058     });
1059     $_[0]->on_read (sub {
1060     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1061     or $finish->(undef, 598 => "Request cancelled by on_body");
1062     });
1063     }
1064     } else {
1065     $_[0]->on_eof (undef);
1066 root 1.82
1067 root 1.88 if (defined $len) {
1068     $_[0]->on_read (sub {
1069     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1070     if $len <= length $_[0]{rbuf};
1071     });
1072     } else {
1073     $_[0]->on_error (sub {
1074     ($! == Errno::EPIPE || !$!)
1075     ? $finish->(delete $_[0]{rbuf})
1076     : $finish->(undef, $ae_error => $_[2]);
1077     });
1078     $_[0]->on_read (sub { });
1079 root 1.82 }
1080 root 1.88 }
1081     };
1082 root 1.82
1083 root 1.92 # if keepalive is enabled, then the server closing the connection
1084     # before a response can happen legally - we retry on idempotent methods.
1085 root 1.103 if ($was_persistent && $idempotent) {
1086 root 1.92 my $old_eof = $hdl->{on_eof};
1087     $hdl->{on_eof} = sub {
1088     _destroy_state %state;
1089    
1090 root 1.103 %state = ();
1091     $state{recurse} =
1092     http_request (
1093 root 1.112 $method => $url,
1094 root 1.103 %arg,
1095 root 1.112 recurse => $recurse - 1,
1096 root 1.103 keepalive => 0,
1097     sub {
1098     %state = ();
1099     &$cb
1100     }
1101     );
1102 root 1.92 };
1103     $hdl->on_read (sub {
1104     return unless %state;
1105    
1106     # as soon as we receive something, a connection close
1107     # once more becomes a hard error
1108     $hdl->{on_eof} = $old_eof;
1109     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1110     });
1111     } else {
1112     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1113     }
1114     };
1115    
1116     my $prepare_handle = sub {
1117     my ($hdl) = $state{handle};
1118    
1119     $hdl->on_error (sub {
1120     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1121     });
1122     $hdl->on_eof (sub {
1123     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1124     });
1125 root 1.103 $hdl->timeout_reset;
1126     $hdl->timeout ($timeout);
1127 root 1.88 };
1128 root 1.82
1129 root 1.92 # connected to proxy (or origin server)
1130 root 1.88 my $connect_cb = sub {
1131 root 1.92 my $fh = shift
1132     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1133 root 1.44
1134 root 1.88 return unless delete $state{connect_guard};
1135 root 1.11
1136 root 1.88 # get handle
1137     $state{handle} = new AnyEvent::Handle
1138 root 1.92 %{ $arg{handle_params} },
1139     fh => $fh,
1140     peername => $uhost,
1141 root 1.88 tls_ctx => $arg{tls_ctx},
1142     ;
1143 root 1.11
1144 root 1.92 $prepare_handle->();
1145 root 1.1
1146 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1147 root 1.88
1148     # now handle proxy-CONNECT method
1149     if ($proxy && $uscheme eq "https") {
1150     # oh dear, we have to wrap it into a connect request
1151    
1152     # maybe re-use $uauthority with patched port?
1153     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012\015\012");
1154     $state{handle}->push_read (line => $qr_nlnl, sub {
1155     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1156 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1157 root 1.88
1158     if ($2 == 200) {
1159     $rpath = $upath;
1160     $handle_actual_request->();
1161     } else {
1162 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1163 root 1.88 }
1164     });
1165     } else {
1166     $handle_actual_request->();
1167     }
1168     };
1169    
1170     _get_slot $uhost, sub {
1171     $state{slot_guard} = shift;
1172 root 1.64
1173 root 1.88 return unless $state{connect_guard};
1174 root 1.64
1175 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1176     # on a keepalive request (in theory, this should be a separate config option).
1177 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1178     $was_persistent = 1;
1179    
1180 root 1.92 $state{handle} = ka_fetch $ka_key;
1181 root 1.103 $state{handle}->destroyed
1182 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1183 root 1.92 $prepare_handle->();
1184 root 1.103 $state{handle}->destroyed
1185 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1186 root 1.92 $handle_actual_request->();
1187    
1188     } else {
1189     my $tcp_connect = $arg{tcp_connect}
1190     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1191 root 1.57
1192 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1193     }
1194 root 1.1 };
1195    
1196 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1197 root 1.1 }
1198    
1199 elmex 1.15 sub http_get($@) {
1200 root 1.1 unshift @_, "GET";
1201     &http_request
1202     }
1203    
1204 elmex 1.15 sub http_head($@) {
1205 root 1.4 unshift @_, "HEAD";
1206     &http_request
1207     }
1208    
1209 elmex 1.15 sub http_post($$@) {
1210 root 1.22 my $url = shift;
1211     unshift @_, "POST", $url, "body";
1212 root 1.3 &http_request
1213     }
1214    
1215 root 1.9 =back
1216    
1217 root 1.55 =head2 DNS CACHING
1218    
1219     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1220     the actual connection, which in turn uses AnyEvent::DNS to resolve
1221     hostnames. The latter is a simple stub resolver and does no caching
1222     on its own. If you want DNS caching, you currently have to provide
1223     your own default resolver (by storing a suitable resolver object in
1224 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1225 root 1.55
1226 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1227 root 1.1
1228     =over 4
1229    
1230 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1231    
1232     Sets the default proxy server to use. The proxy-url must begin with a
1233 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1234 root 1.52
1235     To clear an already-set proxy, use C<undef>.
1236 root 1.2
1237 root 1.113 When AnyEvent::HTTP is loaded for the first time it will query the
1238 root 1.102 default proxy from the operating system, currently by looking at
1239     C<$ENV{http_proxy>}.
1240    
1241 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1242    
1243     Remove all cookies from the cookie jar that have been expired. If
1244     C<$session_end> is given and true, then additionally remove all session
1245     cookies.
1246    
1247     You should call this function (with a true C<$session_end>) before you
1248     save cookies to disk, and you should call this function after loading them
1249 root 1.117 again. If you have a long-running program you can additionally call this
1250 root 1.80 function from time to time.
1251    
1252     A cookie jar is initially an empty hash-reference that is managed by this
1253     module. It's format is subject to change, but currently it is like this:
1254    
1255     The key C<version> has to contain C<1>, otherwise the hash gets
1256     emptied. All other keys are hostnames or IP addresses pointing to
1257     hash-references. The key for these inner hash references is the
1258     server path for which this cookie is meant, and the values are again
1259 root 1.115 hash-references. Each key of those hash-references is a cookie name, and
1260 root 1.80 the value, you guessed it, is another hash-reference, this time with the
1261     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1262     which have been replaced by a C<_expires> key that contains the cookie
1263 root 1.115 expiry timestamp. Session cookies are indicated by not having an
1264     C<_expires> key.
1265 root 1.80
1266     Here is an example of a cookie jar with a single cookie, so you have a
1267     chance of understanding the above paragraph:
1268    
1269     {
1270     version => 1,
1271     "10.0.0.1" => {
1272     "/" => {
1273     "mythweb_id" => {
1274     _expires => 1293917923,
1275     value => "ooRung9dThee3ooyXooM1Ohm",
1276     },
1277     },
1278     },
1279     }
1280    
1281 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1282    
1283     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1284     Date (RFC 2616).
1285    
1286     =item $timestamp = AnyEvent::HTTP::parse_date $date
1287    
1288 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1289     bunch of minor variations of those, and returns the corresponding POSIX
1290     timestamp, or C<undef> if the date cannot be parsed.
1291 root 1.61
1292 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1293 root 1.1
1294 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1295 root 1.1
1296 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1297    
1298 root 1.110 The default timeout for connection operations (default: C<300>).
1299 root 1.92
1300 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1301    
1302     The default value for the C<User-Agent> header (the default is
1303 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1304 root 1.1
1305 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1306 root 1.1
1307 root 1.47 The maximum number of concurrent connections to the same host (identified
1308 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
1309 root 1.92 are queued until previous connections are closed. Both persistent and
1310     non-persistent connections are counted in this limit.
1311 root 1.1
1312 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1313 root 1.92 increase it much.
1314    
1315     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1316 root 1.117 connections, older browsers used 2, newer ones (such as firefox 3)
1317     typically use 6, and Opera uses 8 because like, they have the fastest
1318     browser and give a shit for everybody else on the planet.
1319 root 1.92
1320     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1321    
1322 root 1.116 The time after which idle persistent connections get closed by
1323 root 1.92 AnyEvent::HTTP (default: C<3>).
1324 root 1.3
1325 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1326    
1327     The number of active connections. This is not the number of currently
1328     running requests, but the number of currently open and non-idle TCP
1329 root 1.92 connections. This number can be useful for load-leveling.
1330 root 1.14
1331 root 1.1 =back
1332    
1333     =cut
1334    
1335 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1336     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1337    
1338     sub format_date($) {
1339     my ($time) = @_;
1340    
1341     # RFC 822/1123 format
1342     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1343    
1344     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1345     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1346     $H, $M, $S;
1347     }
1348    
1349     sub parse_date($) {
1350     my ($date) = @_;
1351    
1352     my ($d, $m, $y, $H, $M, $S);
1353    
1354 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1355 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1356     # cookie dates (with "-")
1357    
1358 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1359    
1360 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1361 root 1.61 # RFC 850
1362     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1363    
1364 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1365 root 1.61 # ISO C's asctime
1366     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1367     }
1368     # other formats fail in the loop below
1369    
1370     for (0..11) {
1371     if ($m eq $month[$_]) {
1372     require Time::Local;
1373 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1374 root 1.61 }
1375     }
1376    
1377     undef
1378     }
1379    
1380 root 1.2 sub set_proxy($) {
1381 root 1.52 if (length $_[0]) {
1382 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1383 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1384     $PROXY = [$2, $3 || 3128, $1]
1385     } else {
1386     undef $PROXY;
1387     }
1388 root 1.2 }
1389    
1390     # initialise proxy from environment
1391 root 1.52 eval {
1392     set_proxy $ENV{http_proxy};
1393     };
1394 root 1.2
1395 root 1.93 =head2 SHOWCASE
1396    
1397 root 1.117 This section contains some more elaborate "real-world" examples or code
1398 root 1.93 snippets.
1399    
1400     =head2 HTTP/1.1 FILE DOWNLOAD
1401    
1402 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1403 root 1.98 goes wrong and you want to resume.
1404 root 1.93
1405     Here is a function that initiates and resumes a download. It uses the
1406     last modified time to check for file content changes, and works with many
1407     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1408     on older servers.
1409    
1410     It calls the completion callback with either C<undef>, which means a
1411 root 1.117 nonretryable error occurred, C<0> when the download was partial and should
1412 root 1.93 be retried, and C<1> if it was successful.
1413    
1414     use AnyEvent::HTTP;
1415    
1416     sub download($$$) {
1417     my ($url, $file, $cb) = @_;
1418    
1419     open my $fh, "+<", $file
1420     or die "$file: $!";
1421    
1422     my %hdr;
1423     my $ofs = 0;
1424    
1425     warn stat $fh;
1426     warn -s _;
1427     if (stat $fh and -s _) {
1428     $ofs = -s _;
1429 root 1.107 warn "-s is ", $ofs;
1430 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1431     $hdr{"range"} = "bytes=$ofs-";
1432     }
1433    
1434     http_get $url,
1435     headers => \%hdr,
1436     on_header => sub {
1437     my ($hdr) = @_;
1438    
1439     if ($hdr->{Status} == 200 && $ofs) {
1440     # resume failed
1441     truncate $fh, $ofs = 0;
1442     }
1443    
1444     sysseek $fh, $ofs, 0;
1445    
1446     1
1447     },
1448     on_body => sub {
1449     my ($data, $hdr) = @_;
1450    
1451     if ($hdr->{Status} =~ /^2/) {
1452     length $data == syswrite $fh, $data
1453     or return; # abort on write errors
1454     }
1455    
1456     1
1457     },
1458     sub {
1459     my (undef, $hdr) = @_;
1460    
1461     my $status = $hdr->{Status};
1462    
1463     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1464     utime $fh, $time, $time;
1465     }
1466    
1467     if ($status == 200 || $status == 206 || $status == 416) {
1468     # download ok || resume ok || file already fully downloaded
1469     $cb->(1, $hdr);
1470    
1471     } elsif ($status == 412) {
1472     # file has changed while resuming, delete and retry
1473     unlink $file;
1474     $cb->(0, $hdr);
1475    
1476     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1477     # retry later
1478     $cb->(0, $hdr);
1479    
1480     } else {
1481     $cb->(undef, $hdr);
1482     }
1483     }
1484     ;
1485     }
1486    
1487     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1488     if ($_[0]) {
1489     print "OK!\n";
1490     } elsif (defined $_[0]) {
1491     print "please retry later\n";
1492     } else {
1493     print "ERROR\n";
1494     }
1495     };
1496    
1497     =head3 SOCKS PROXIES
1498 root 1.60
1499     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1500     compile your perl to support socks, or use an external program such as
1501     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1502     transparently.
1503    
1504     Alternatively, for AnyEvent::HTTP only, you can use your own
1505     C<tcp_connect> function that does the proxy handshake - here is an example
1506     that works with socks4a proxies:
1507    
1508     use Errno;
1509     use AnyEvent::Util;
1510     use AnyEvent::Socket;
1511     use AnyEvent::Handle;
1512    
1513     # host, port and username of/for your socks4a proxy
1514     my $socks_host = "10.0.0.23";
1515     my $socks_port = 9050;
1516     my $socks_user = "";
1517    
1518     sub socks4a_connect {
1519     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1520    
1521     my $hdl = new AnyEvent::Handle
1522     connect => [$socks_host, $socks_port],
1523     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1524     on_error => sub { $connect_cb->() },
1525     ;
1526    
1527     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1528    
1529     $hdl->push_read (chunk => 8, sub {
1530     my ($hdl, $chunk) = @_;
1531     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1532    
1533     if ($status == 0x5a) {
1534     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1535     } else {
1536     $! = Errno::ENXIO; $connect_cb->();
1537     }
1538     });
1539    
1540     $hdl
1541     }
1542    
1543     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1544     possibly after switching off other proxy types:
1545    
1546     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1547    
1548     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1549     my ($data, $headers) = @_;
1550     ...
1551     };
1552    
1553 root 1.1 =head1 SEE ALSO
1554    
1555     L<AnyEvent>.
1556    
1557     =head1 AUTHOR
1558    
1559 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1560     http://home.schmorp.de/
1561 root 1.1
1562 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1563     testcases and bugreports.
1564    
1565 root 1.1 =cut
1566    
1567     1
1568