ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.103
Committed: Thu Feb 24 12:13:11 2011 UTC (13 years, 2 months ago) by root
Branch: MAIN
CVS Tags: rel-2_1
Changes since 1.102: +41 -24 lines
Log Message:
2.1

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.103 our $VERSION = '2.1';
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
95     (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128     =item 595 - errors during connection etsbalishment, proxy handshake.
129    
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159     Whether to recurse requests or not, e.g. on redirects, authentication
160 root 1.3 retries and so on, and how often to do so.
161 root 1.1
162     =item headers => hashref
163    
164 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
165     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
166 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
167     (this can be suppressed by using C<undef> for these headers in which case
168     they won't be sent at all).
169 root 1.1
170 root 1.90 You really should provide your own C<User-Agent:> header value that is
171     appropriate for your program - I wouldn't be surprised if the default
172     AnyEvent string gets blocked by webservers sooner or later.
173    
174 root 1.1 =item timeout => $seconds
175    
176     The time-out to use for various stages - each connect attempt will reset
177 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
178     timeout.
179    
180     Default timeout is 5 minutes.
181 root 1.2
182     =item proxy => [$host, $port[, $scheme]] or undef
183    
184 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
185     used.
186 root 1.2
187 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
188 root 1.1
189 root 1.102 If not specified, then the default proxy is used (see
190     C<AnyEvent::HTTP::set_proxy>).
191    
192 root 1.3 =item body => $string
193    
194 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
195 root 1.3 this module might offer more options).
196    
197 root 1.10 =item cookie_jar => $hash_ref
198    
199     Passing this parameter enables (simplified) cookie-processing, loosely
200     based on the original netscape specification.
201    
202 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
203     will get updated automatically. It is possible to save the cookie jar
204     to persistent storage with something like JSON or Storable - see the
205     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
206     expired or session-only cookies, and also for documentation on the format
207     of the cookie jar.
208 root 1.10
209 root 1.70 Note that this cookie implementation is not meant to be complete. If
210     you want complete cookie management you have to do that on your
211 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
212 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
213     to.
214 root 1.10
215 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
216 root 1.70 headers will be set and handled by this module, otherwise they will be
217 root 1.69 left untouched.
218    
219 root 1.40 =item tls_ctx => $scheme | $tls_ctx
220    
221     Specifies the AnyEvent::TLS context to be used for https connections. This
222     parameter follows the same rules as the C<tls_ctx> parameter to
223     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
224     C<high> can be specified, which give you a predefined low-security (no
225     verification, highest compatibility) and high-security (CA and common-name
226     verification) TLS context.
227    
228     The default for this option is C<low>, which could be interpreted as "give
229     me the page, no matter what".
230    
231 root 1.92 See also the C<sessionid> parameter.
232    
233     =item session => $string
234    
235     The module might reuse connections to the same host internally. Sometimes
236     (e.g. when using TLS), you do not want to reuse connections from other
237     sessions. This can be achieved by setting this parameter to some unique
238     ID (such as the address of an object storing your state data, or the TLS
239     context) - only connections using the same unique ID will be reused.
240    
241 root 1.51 =item on_prepare => $callback->($fh)
242    
243     In rare cases you need to "tune" the socket before it is used to
244     connect (for exmaple, to bind it on a given IP address). This parameter
245     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
246     and behaves exactly the same way (e.g. it has to provide a
247     timeout). See the description for the C<$prepare_cb> argument of
248     C<AnyEvent::Socket::tcp_connect> for details.
249    
250 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
251    
252     In even rarer cases you want total control over how AnyEvent::HTTP
253     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
254     to do this, but you can provide your own C<tcp_connect> function -
255 root 1.60 obviously, it has to follow the same calling conventions, except that it
256     may always return a connection guard object.
257 root 1.59
258     There are probably lots of weird uses for this function, starting from
259     tracing the hosts C<http_request> actually tries to connect, to (inexact
260     but fast) host => IP address caching or even socks protocol support.
261    
262 root 1.42 =item on_header => $callback->($headers)
263 root 1.41
264     When specified, this callback will be called with the header hash as soon
265     as headers have been successfully received from the remote server (not on
266     locally-generated errors).
267    
268     It has to return either true (in which case AnyEvent::HTTP will continue),
269     or false, in which case AnyEvent::HTTP will cancel the download (and call
270     the finish callback with an error code of C<598>).
271    
272     This callback is useful, among other things, to quickly reject unwanted
273     content, which, if it is supposed to be rare, can be faster than first
274     doing a C<HEAD> request.
275    
276 root 1.68 The downside is that cancelling the request makes it impossible to re-use
277     the connection. Also, the C<on_header> callback will not receive any
278     trailer (headers sent after the response body).
279    
280 root 1.42 Example: cancel the request unless the content-type is "text/html".
281 root 1.41
282 root 1.42 on_header => sub {
283     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
284     },
285 root 1.41
286 root 1.42 =item on_body => $callback->($partial_body, $headers)
287 root 1.41
288 root 1.42 When specified, all body data will be passed to this callback instead of
289     to the completion callback. The completion callback will get the empty
290     string instead of the body data.
291 root 1.41
292 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
293     or false, in which case AnyEvent::HTTP will cancel the download (and call
294     the completion callback with an error code of C<598>).
295    
296 root 1.68 The downside to cancelling the request is that it makes it impossible to
297     re-use the connection.
298    
299 root 1.42 This callback is useful when the data is too large to be held in memory
300     (so the callback writes it to a file) or when only some information should
301     be extracted, or when the body should be processed incrementally.
302 root 1.41
303     It is usually preferred over doing your own body handling via
304 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
305     only used to create a connection, C<want_body_handle> is the better
306     alternative, as it allows you to install your own event handler, reducing
307     resource usage.
308 root 1.41
309     =item want_body_handle => $enable
310    
311     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
312     changes considerably: after parsing the headers, and instead of
313     downloading the body (if any), the completion callback will be
314     called. Instead of the C<$body> argument containing the body data, the
315     callback will receive the L<AnyEvent::Handle> object associated with the
316     connection. In error cases, C<undef> will be passed. When there is no body
317     (e.g. status C<304>), the empty string will be passed.
318    
319 root 1.92 The handle object might or might not be in TLS mode, might be connected
320     to a proxy, be a persistent connection, use chunked transfer encoding
321     etc., and configured in unspecified ways. The user is responsible for this
322     handle (it will not be used by this module anymore).
323 root 1.41
324     This is useful with some push-type services, where, after the initial
325     headers, an interactive protocol is used (typical example would be the
326     push-style twitter API which starts a JSON/XML stream).
327    
328     If you think you need this, first have a look at C<on_body>, to see if
329 root 1.45 that doesn't solve your problem in a better way.
330 root 1.41
331 root 1.92 =item persistent => $boolean
332    
333     Try to create/reuse a persistent connection. When this flag is set
334     (default: true for idempotent requests, false for all others), then
335     C<http_request> tries to re-use an existing (previously-created)
336     persistent connection to the host and, failing that, tries to create a new
337     one.
338    
339     Requests failing in certain ways will be automatically retried once, which
340     is dangerous for non-idempotent requests, which is why it defaults to off
341     for them. The reason for this is because the bozos who designed HTTP/1.1
342     made it impossible to distinguish between a fatal error and a normal
343     connection timeout, so you never know whether there was a problem with
344     your request or not.
345    
346     When reusing an existent connection, many parameters (such as TLS context)
347     will be ignored. See the C<session> parameter for a workaround.
348    
349     =item keepalive => $boolean
350    
351     Only used when C<persistent> is also true. This parameter decides whether
352     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
353     (as opposed to only a HTTP/1.1 persistent connection).
354    
355     The default is true, except when using a proxy, in which case it defaults
356     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
357    
358     =item handle_params => { key => value ... }
359    
360     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
361     constructor that is called - not all requests will create a handle, and
362     sometimes more than one is created, so this parameter is only good for
363     setting hints.
364    
365     Example: set the maximum read size to 4096, to potentially conserve memory
366     at the cost of speed.
367    
368     handle_params => {
369     max_read_size => 4096,
370     },
371    
372 root 1.1 =back
373    
374 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
375     the response body.
376 root 1.9
377     http_request GET => "http://www.nethype.de/", sub {
378     my ($body, $hdr) = @_;
379     print "$body\n";
380     };
381    
382 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
383 root 1.9 timeout of 30 seconds.
384    
385     http_request
386     GET => "https://www.google.com",
387 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
388 root 1.9 timeout => 30,
389     sub {
390     my ($body, $hdr) = @_;
391     use Data::Dumper;
392     print Dumper $hdr;
393     }
394     ;
395 root 1.1
396 root 1.68 Example: do another simple HTTP GET request, but immediately try to
397 root 1.29 cancel it.
398    
399     my $request = http_request GET => "http://www.nethype.de/", sub {
400     my ($body, $hdr) = @_;
401     print "$body\n";
402     };
403    
404     undef $request;
405    
406 root 1.1 =cut
407    
408 root 1.92 #############################################################################
409     # wait queue/slots
410    
411 root 1.12 sub _slot_schedule;
412 root 1.11 sub _slot_schedule($) {
413     my $host = shift;
414    
415     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
416     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
417 root 1.12 # somebody wants that slot
418 root 1.11 ++$CO_SLOT{$host}[0];
419 root 1.14 ++$ACTIVE;
420 root 1.11
421     $cb->(AnyEvent::Util::guard {
422 root 1.14 --$ACTIVE;
423 root 1.11 --$CO_SLOT{$host}[0];
424     _slot_schedule $host;
425     });
426     } else {
427     # nobody wants the slot, maybe we can forget about it
428     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
429     last;
430     }
431     }
432     }
433    
434     # wait for a free slot on host, call callback
435     sub _get_slot($$) {
436     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
437    
438     _slot_schedule $_[0];
439     }
440    
441 root 1.80 #############################################################################
442 root 1.92 # cookie handling
443 root 1.80
444     # expire cookies
445     sub cookie_jar_expire($;$) {
446     my ($jar, $session_end) = @_;
447    
448     %$jar = () if $jar->{version} != 1;
449    
450     my $anow = AE::now;
451    
452     while (my ($chost, $paths) = each %$jar) {
453     next unless ref $paths;
454    
455     while (my ($cpath, $cookies) = each %$paths) {
456     while (my ($cookie, $kv) = each %$cookies) {
457     if (exists $kv->{_expires}) {
458     delete $cookies->{$cookie}
459     if $anow > $kv->{_expires};
460     } elsif ($session_end) {
461     delete $cookies->{$cookie};
462     }
463     }
464    
465     delete $paths->{$cpath}
466     unless %$cookies;
467     }
468    
469     delete $jar->{$chost}
470     unless %$paths;
471     }
472     }
473    
474 root 1.72 # extract cookies from jar
475 root 1.71 sub cookie_jar_extract($$$$) {
476 root 1.92 my ($jar, $scheme, $host, $path) = @_;
477 root 1.71
478     %$jar = () if $jar->{version} != 1;
479    
480     my @cookies;
481    
482     while (my ($chost, $paths) = each %$jar) {
483     next unless ref $paths;
484    
485     if ($chost =~ /^\./) {
486 root 1.92 next unless $chost eq substr $host, -length $chost;
487 root 1.71 } elsif ($chost =~ /\./) {
488 root 1.92 next unless $chost eq $host;
489 root 1.71 } else {
490     next;
491     }
492    
493     while (my ($cpath, $cookies) = each %$paths) {
494 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
495 root 1.71
496     while (my ($cookie, $kv) = each %$cookies) {
497 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
498 root 1.71
499 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
500     delete $cookies->{$cookie};
501     next;
502 root 1.71 }
503    
504     my $value = $kv->{value};
505    
506     if ($value =~ /[=;,[:space:]]/) {
507     $value =~ s/([\\"])/\\$1/g;
508     $value = "\"$value\"";
509     }
510    
511     push @cookies, "$cookie=$value";
512     }
513     }
514     }
515    
516     \@cookies
517     }
518    
519 root 1.72 # parse set_cookie header into jar
520 root 1.80 sub cookie_jar_set_cookie($$$$) {
521 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
522 root 1.80
523     my $anow = int AE::now;
524     my $snow; # server-now
525 root 1.72
526     for ($set_cookie) {
527     # parse NAME=VALUE
528     my @kv;
529    
530 root 1.79 # expires is not http-compliant in the original cookie-spec,
531     # we support the official date format and some extensions
532 root 1.72 while (
533     m{
534     \G\s*
535     (?:
536 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
537 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
538 root 1.72 )
539     }gcxsi
540     ) {
541     my $name = $2;
542     my $value = $4;
543    
544 root 1.82 if (defined $1) {
545 root 1.72 # expires
546     $name = "expires";
547     $value = $1;
548 root 1.82 } elsif (defined $3) {
549 root 1.72 # quoted
550     $value = $3;
551     $value =~ s/\\(.)/$1/gs;
552     }
553    
554 root 1.95 push @kv, @kv ? lc $name : $name, $value;
555 root 1.72
556     last unless /\G\s*;/gc;
557     }
558    
559     last unless @kv;
560    
561     my $name = shift @kv;
562     my %kv = (value => shift @kv, @kv);
563    
564 root 1.80 if (exists $kv{"max-age"}) {
565     $kv{_expires} = $anow + delete $kv{"max-age"};
566     } elsif (exists $kv{expires}) {
567     $snow ||= parse_date ($date) || $anow;
568     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
569     } else {
570     delete $kv{_expires};
571     }
572 root 1.72
573     my $cdom;
574     my $cpath = (delete $kv{path}) || "/";
575    
576     if (exists $kv{domain}) {
577     $cdom = delete $kv{domain};
578    
579     $cdom =~ s/^\.?/./; # make sure it starts with a "."
580    
581     next if $cdom =~ /\.$/;
582    
583     # this is not rfc-like and not netscape-like. go figure.
584     my $ndots = $cdom =~ y/.//;
585     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
586     } else {
587 root 1.92 $cdom = $host;
588 root 1.72 }
589    
590     # store it
591 root 1.73 $jar->{version} = 1;
592 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
593 root 1.72
594     redo if /\G\s*,/gc;
595     }
596     }
597    
598 root 1.92 #############################################################################
599     # keepalive/persistent connection cache
600    
601     # fetch a connection from the keepalive cache
602     sub ka_fetch($) {
603     my $ka_key = shift;
604    
605     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
606     delete $KA_CACHE{$ka_key}
607     unless @{ $KA_CACHE{$ka_key} };
608    
609     $hdl
610     }
611    
612     sub ka_store($$) {
613     my ($ka_key, $hdl) = @_;
614    
615     my $kaa = $KA_CACHE{$ka_key} ||= [];
616    
617     my $destroy = sub {
618     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
619    
620     $hdl->destroy;
621    
622     @ka
623     ? $KA_CACHE{$ka_key} = \@ka
624     : delete $KA_CACHE{$ka_key};
625     };
626    
627     # on error etc., destroy
628     $hdl->on_error ($destroy);
629     $hdl->on_eof ($destroy);
630     $hdl->on_read ($destroy);
631     $hdl->timeout ($PERSISTENT_TIMEOUT);
632    
633     push @$kaa, $hdl;
634     shift @$kaa while @$kaa > $MAX_PER_HOST;
635     }
636    
637     #############################################################################
638     # utilities
639    
640 root 1.66 # continue to parse $_ for headers and place them into the arg
641 root 1.92 sub _parse_hdr() {
642 root 1.66 my %hdr;
643    
644     # things seen, not parsed:
645     # p3pP="NON CUR OTPi OUR NOR UNI"
646    
647     $hdr{lc $1} .= ",$2"
648     while /\G
649     ([^:\000-\037]*):
650     [\011\040]*
651     ((?: [^\012]+ | \012[\011\040] )*)
652     \012
653     /gxc;
654    
655     /\G$/
656     or return;
657    
658     # remove the "," prefix we added to all headers above
659     substr $_, 0, 1, ""
660     for values %hdr;
661    
662     \%hdr
663     }
664    
665 root 1.92 #############################################################################
666     # http_get
667    
668 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
669 root 1.34
670 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
671     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
672 root 1.40
673 root 1.92 # maybe it should just become a normal object :/
674    
675     sub _destroy_state(\%) {
676     my ($state) = @_;
677    
678     $state->{handle}->destroy if $state->{handle};
679     %$state = ();
680     }
681    
682     sub _error(\%$$) {
683     my ($state, $cb, $hdr) = @_;
684    
685     &_destroy_state ($state);
686    
687     $cb->(undef, $hdr);
688     ()
689     }
690    
691 elmex 1.15 sub http_request($$@) {
692 root 1.1 my $cb = pop;
693     my ($method, $url, %arg) = @_;
694    
695     my %hdr;
696    
697 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
698     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
699    
700 root 1.3 $method = uc $method;
701    
702 root 1.8 if (my $hdr = $arg{headers}) {
703 root 1.1 while (my ($k, $v) = each %$hdr) {
704     $hdr{lc $k} = $v;
705     }
706     }
707    
708 root 1.55 # pseudo headers for all subsequent responses
709     my @pseudo = (URL => $url);
710     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
711    
712 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
713 root 1.8
714 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
715 root 1.8 if $recurse < 0;
716    
717 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
718 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
719    
720 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
721 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
722 root 1.2
723 root 1.31 $uscheme = lc $uscheme;
724 root 1.1
725 root 1.31 my $uport = $uscheme eq "http" ? 80
726     : $uscheme eq "https" ? 443
727 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
728 root 1.13
729 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
730 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
731 root 1.10
732 root 1.86 my $uhost = lc $1;
733 root 1.10 $uport = $2 if defined $2;
734    
735 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
736     unless exists $hdr{host};
737 root 1.43
738 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
739 root 1.56 $upath .= $query if length $query;
740 root 1.10
741     $upath =~ s%^/?%/%;
742    
743     # cookie processing
744     if (my $jar = $arg{cookie_jar}) {
745 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
746 root 1.70
747 root 1.71 $hdr{cookie} = join "; ", @$cookies
748     if @$cookies;
749 root 1.10 }
750 root 1.1
751 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
752 root 1.2
753 root 1.10 if ($proxy) {
754 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
755 root 1.31
756 root 1.47 $rscheme = "http" unless defined $rscheme;
757    
758 root 1.31 # don't support https requests over https-proxy transport,
759 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
760 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
761 root 1.86
762     $rhost = lc $rhost;
763     $rscheme = lc $rscheme;
764 root 1.10 } else {
765 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
766 root 1.2 }
767    
768 root 1.47 # leave out fragment and query string, just a heuristic
769 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
770     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
771 root 1.41
772 root 1.53 $hdr{"content-length"} = length $arg{body}
773     if length $arg{body} || $method ne "GET";
774 root 1.1
775 root 1.89 my $idempotent = $method =~ /^(?:GET|HEAD|PUT|DELETE|OPTIONS|TRACE)$/;
776    
777     # default value for keepalive is true iff the request is for an idempotent method
778 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
779     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
780     my $was_persistent; # true if this is actually a recycled connection
781 root 1.92
782     # the key to use in the keepalive cache
783     my $ka_key = "$uhost\x00$arg{sessionid}";
784 root 1.89
785 root 1.103 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive " : "" : "close ") . "Te"; #1.1
786 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
787 root 1.66
788 root 1.11 my %state = (connect_guard => 1);
789    
790 root 1.88 my $ae_error = 595; # connecting
791 root 1.1
792 root 1.88 # handle actual, non-tunneled, request
793     my $handle_actual_request = sub {
794     $ae_error = 596; # request phase
795    
796 root 1.92 my $hdl = $state{handle};
797    
798     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
799 root 1.88
800     # send request
801 root 1.92 $hdl->push_write (
802 root 1.88 "$method $rpath HTTP/1.1\015\012"
803     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
804     . "\015\012"
805     . (delete $arg{body})
806     );
807    
808     # return if error occured during push_write()
809     return unless %state;
810    
811 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
812     %hdr = ();
813 root 1.88
814     # status line and headers
815     $state{read_response} = sub {
816 root 1.92 return unless %state;
817    
818 root 1.88 for ("$_[1]") {
819     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
820    
821     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
822 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
823 root 1.88
824     # 100 Continue handling
825     # should not happen as we don't send expect: 100-continue,
826     # but we handle it just in case.
827     # since we send the request body regardless, if we get an error
828     # we are out of-sync, which we currently do NOT handle correctly.
829     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
830     if $2 eq 100;
831    
832     push @pseudo,
833     HTTPVersion => $1,
834     Status => $2,
835     Reason => $3,
836     ;
837 root 1.1
838 root 1.92 my $hdr = _parse_hdr
839     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
840 root 1.77
841 root 1.88 %hdr = (%$hdr, @pseudo);
842     }
843 root 1.82
844 root 1.88 # redirect handling
845     # microsoft and other shitheads don't give a shit for following standards,
846     # try to support some common forms of broken Location headers.
847     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
848     $hdr{location} =~ s/^\.\/+//;
849    
850     my $url = "$rscheme://$uhost:$uport";
851    
852     unless ($hdr{location} =~ s/^\///) {
853     $url .= $upath;
854     $url =~ s/\/[^\/]*$//;
855 root 1.82 }
856    
857 root 1.88 $hdr{location} = "$url/$hdr{location}";
858     }
859 root 1.82
860 root 1.88 my $redirect;
861 root 1.82
862 root 1.88 if ($recurse) {
863     my $status = $hdr{Status};
864 root 1.82
865 root 1.88 # industry standard is to redirect POST as GET for
866     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
867     # also, the UA should ask the user for 301 and 307 and POST,
868     # industry standard seems to be to simply follow.
869     # we go with the industry standard.
870     if ($status == 301 or $status == 302 or $status == 303) {
871     # HTTP/1.1 is unclear on how to mutate the method
872     $method = "GET" unless $method eq "HEAD";
873     $redirect = 1;
874     } elsif ($status == 307) {
875     $redirect = 1;
876 root 1.82 }
877 root 1.88 }
878 root 1.82
879 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
880 root 1.92 if ($state{handle}) {
881     # handle keepalive
882     if (
883 root 1.103 $persistent
884 root 1.92 && $_[3]
885     && ($hdr{HTTPVersion} < 1.1
886     ? $hdr{connection} =~ /\bkeep-?alive\b/i
887     : $hdr{connection} !~ /\bclose\b/i)
888     ) {
889     ka_store $ka_key, delete $state{handle};
890     } else {
891     # no keepalive, destroy the handle
892     $state{handle}->destroy;
893     }
894     }
895 root 1.82
896 root 1.88 %state = ();
897 root 1.82
898 root 1.88 if (defined $_[1]) {
899     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
900     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
901     }
902 root 1.82
903 root 1.88 # set-cookie processing
904     if ($arg{cookie_jar}) {
905     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
906     }
907 root 1.82
908 root 1.88 if ($redirect && exists $hdr{location}) {
909     # we ignore any errors, as it is very common to receive
910     # Content-Length != 0 but no actual body
911     # we also access %hdr, as $_[1] might be an erro
912 root 1.103 $state{recurse} =
913     http_request (
914     $method => $hdr{location},
915     %arg,
916     recurse => $recurse - 1,
917     Redirect => [$_[0], \%hdr],
918     sub {
919     %state = ();
920     &$cb
921     },
922     );
923 root 1.88 } else {
924     $cb->($_[0], \%hdr);
925     }
926     };
927 root 1.82
928 root 1.88 $ae_error = 597; # body phase
929 root 1.82
930 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
931    
932     my $len = $chunked ? undef : $hdr{"content-length"};
933 root 1.82
934 root 1.88 # body handling, many different code paths
935     # - no body expected
936     # - want_body_handle
937     # - te chunked
938     # - 2x length known (with or without on_body)
939     # - 2x length not known (with or without on_body)
940     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
941     $finish->(undef, 598 => "Request cancelled by on_header");
942     } elsif (
943     $hdr{Status} =~ /^(?:1..|204|205|304)$/
944     or $method eq "HEAD"
945     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
946     ) {
947     # no body
948     $finish->("", undef, undef, 1);
949    
950     } elsif (!$redirect && $arg{want_body_handle}) {
951     $_[0]->on_eof (undef);
952     $_[0]->on_error (undef);
953     $_[0]->on_read (undef);
954    
955     $finish->(delete $state{handle});
956    
957 root 1.91 } elsif ($chunked) {
958 root 1.88 my $cl = 0;
959 root 1.92 my $body = "";
960 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
961    
962     $state{read_chunk} = sub {
963     $_[1] =~ /^([0-9a-fA-F]+)/
964     or $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
965 root 1.82
966 root 1.88 my $len = hex $1;
967 root 1.82
968 root 1.88 if ($len) {
969     $cl += $len;
970 root 1.82
971 root 1.88 $_[0]->push_read (chunk => $len, sub {
972     $on_body->($_[1], \%hdr)
973     or return $finish->(undef, 598 => "Request cancelled by on_body");
974 root 1.82
975 root 1.88 $_[0]->push_read (line => sub {
976     length $_[1]
977     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
978     $_[0]->push_read (line => $state{read_chunk});
979 root 1.82 });
980 root 1.88 });
981     } else {
982     $hdr{"content-length"} ||= $cl;
983 root 1.84
984 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
985     if (length $_[1]) {
986     for ("$_[1]") {
987     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
988 root 1.84
989 root 1.92 my $hdr = _parse_hdr
990 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
991 root 1.84
992 root 1.88 %hdr = (%hdr, %$hdr);
993     }
994     }
995 root 1.84
996 root 1.88 $finish->($body, undef, undef, 1);
997 root 1.84 });
998     }
999 root 1.88 };
1000    
1001     $_[0]->push_read (line => $state{read_chunk});
1002    
1003     } elsif ($arg{on_body}) {
1004     if (defined $len) {
1005     $_[0]->on_read (sub {
1006     $len -= length $_[0]{rbuf};
1007    
1008     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1009     or return $finish->(undef, 598 => "Request cancelled by on_body");
1010    
1011     $len > 0
1012     or $finish->("", undef, undef, 1);
1013     });
1014 root 1.84 } else {
1015 root 1.88 $_[0]->on_eof (sub {
1016     $finish->("");
1017     });
1018     $_[0]->on_read (sub {
1019     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1020     or $finish->(undef, 598 => "Request cancelled by on_body");
1021     });
1022     }
1023     } else {
1024     $_[0]->on_eof (undef);
1025 root 1.82
1026 root 1.88 if (defined $len) {
1027     $_[0]->on_read (sub {
1028     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1029     if $len <= length $_[0]{rbuf};
1030     });
1031     } else {
1032     $_[0]->on_error (sub {
1033     ($! == Errno::EPIPE || !$!)
1034     ? $finish->(delete $_[0]{rbuf})
1035     : $finish->(undef, $ae_error => $_[2]);
1036     });
1037     $_[0]->on_read (sub { });
1038 root 1.82 }
1039 root 1.88 }
1040     };
1041 root 1.82
1042 root 1.92 # if keepalive is enabled, then the server closing the connection
1043     # before a response can happen legally - we retry on idempotent methods.
1044 root 1.103 if ($was_persistent && $idempotent) {
1045 root 1.92 my $old_eof = $hdl->{on_eof};
1046     $hdl->{on_eof} = sub {
1047     _destroy_state %state;
1048    
1049 root 1.103 %state = ();
1050     $state{recurse} =
1051     http_request (
1052     $method => $url,
1053     %arg,
1054     keepalive => 0,
1055     sub {
1056     %state = ();
1057     &$cb
1058     }
1059     );
1060 root 1.92 };
1061     $hdl->on_read (sub {
1062     return unless %state;
1063    
1064     # as soon as we receive something, a connection close
1065     # once more becomes a hard error
1066     $hdl->{on_eof} = $old_eof;
1067     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1068     });
1069     } else {
1070     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1071     }
1072     };
1073    
1074     my $prepare_handle = sub {
1075     my ($hdl) = $state{handle};
1076    
1077     $hdl->on_error (sub {
1078     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1079     });
1080     $hdl->on_eof (sub {
1081     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1082     });
1083 root 1.103 $hdl->timeout_reset;
1084     $hdl->timeout ($timeout);
1085 root 1.88 };
1086 root 1.82
1087 root 1.92 # connected to proxy (or origin server)
1088 root 1.88 my $connect_cb = sub {
1089 root 1.92 my $fh = shift
1090     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1091 root 1.44
1092 root 1.88 return unless delete $state{connect_guard};
1093 root 1.11
1094 root 1.88 # get handle
1095     $state{handle} = new AnyEvent::Handle
1096 root 1.92 %{ $arg{handle_params} },
1097     fh => $fh,
1098     peername => $uhost,
1099 root 1.88 tls_ctx => $arg{tls_ctx},
1100     ;
1101 root 1.11
1102 root 1.92 $prepare_handle->();
1103 root 1.1
1104 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1105 root 1.88
1106     # now handle proxy-CONNECT method
1107     if ($proxy && $uscheme eq "https") {
1108     # oh dear, we have to wrap it into a connect request
1109    
1110     # maybe re-use $uauthority with patched port?
1111     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012\015\012");
1112     $state{handle}->push_read (line => $qr_nlnl, sub {
1113     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1114 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1115 root 1.88
1116     if ($2 == 200) {
1117     $rpath = $upath;
1118     $handle_actual_request->();
1119     } else {
1120 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1121 root 1.88 }
1122     });
1123     } else {
1124     $handle_actual_request->();
1125     }
1126     };
1127    
1128     _get_slot $uhost, sub {
1129     $state{slot_guard} = shift;
1130 root 1.64
1131 root 1.88 return unless $state{connect_guard};
1132 root 1.64
1133 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1134     # on a keepalive request (in theory, this should be a separate config option).
1135 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1136     $was_persistent = 1;
1137    
1138 root 1.92 $state{handle} = ka_fetch $ka_key;
1139 root 1.103 $state{handle}->destroyed
1140     and die "got a destructed habndle. pah\n";#d#
1141 root 1.92 $prepare_handle->();
1142 root 1.103 $state{handle}->destroyed
1143     and die "got a destructed habndle. pa2\n";#d#
1144 root 1.92 $handle_actual_request->();
1145 root 1.103 $state{handle}->destroyed
1146     and die "got a destructed habndle. pa3\n";#d#
1147 root 1.92
1148     } else {
1149     my $tcp_connect = $arg{tcp_connect}
1150     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1151 root 1.57
1152 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1153     }
1154 root 1.1 };
1155    
1156 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1157 root 1.1 }
1158    
1159 elmex 1.15 sub http_get($@) {
1160 root 1.1 unshift @_, "GET";
1161     &http_request
1162     }
1163    
1164 elmex 1.15 sub http_head($@) {
1165 root 1.4 unshift @_, "HEAD";
1166     &http_request
1167     }
1168    
1169 elmex 1.15 sub http_post($$@) {
1170 root 1.22 my $url = shift;
1171     unshift @_, "POST", $url, "body";
1172 root 1.3 &http_request
1173     }
1174    
1175 root 1.9 =back
1176    
1177 root 1.55 =head2 DNS CACHING
1178    
1179     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1180     the actual connection, which in turn uses AnyEvent::DNS to resolve
1181     hostnames. The latter is a simple stub resolver and does no caching
1182     on its own. If you want DNS caching, you currently have to provide
1183     your own default resolver (by storing a suitable resolver object in
1184 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1185 root 1.55
1186 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1187 root 1.1
1188     =over 4
1189    
1190 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1191    
1192     Sets the default proxy server to use. The proxy-url must begin with a
1193 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1194 root 1.52
1195     To clear an already-set proxy, use C<undef>.
1196 root 1.2
1197 root 1.102 When AnyEvent::HTTP is laoded for the first time it will query the
1198     default proxy from the operating system, currently by looking at
1199     C<$ENV{http_proxy>}.
1200    
1201 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1202    
1203     Remove all cookies from the cookie jar that have been expired. If
1204     C<$session_end> is given and true, then additionally remove all session
1205     cookies.
1206    
1207     You should call this function (with a true C<$session_end>) before you
1208     save cookies to disk, and you should call this function after loading them
1209     again. If you have a long-running program you can additonally call this
1210     function from time to time.
1211    
1212     A cookie jar is initially an empty hash-reference that is managed by this
1213     module. It's format is subject to change, but currently it is like this:
1214    
1215     The key C<version> has to contain C<1>, otherwise the hash gets
1216     emptied. All other keys are hostnames or IP addresses pointing to
1217     hash-references. The key for these inner hash references is the
1218     server path for which this cookie is meant, and the values are again
1219     hash-references. The keys of those hash-references is the cookie name, and
1220     the value, you guessed it, is another hash-reference, this time with the
1221     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1222     which have been replaced by a C<_expires> key that contains the cookie
1223     expiry timestamp.
1224    
1225     Here is an example of a cookie jar with a single cookie, so you have a
1226     chance of understanding the above paragraph:
1227    
1228     {
1229     version => 1,
1230     "10.0.0.1" => {
1231     "/" => {
1232     "mythweb_id" => {
1233     _expires => 1293917923,
1234     value => "ooRung9dThee3ooyXooM1Ohm",
1235     },
1236     },
1237     },
1238     }
1239    
1240 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1241    
1242     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1243     Date (RFC 2616).
1244    
1245     =item $timestamp = AnyEvent::HTTP::parse_date $date
1246    
1247 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1248     bunch of minor variations of those, and returns the corresponding POSIX
1249     timestamp, or C<undef> if the date cannot be parsed.
1250 root 1.61
1251 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1252 root 1.1
1253 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1254 root 1.1
1255 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1256    
1257     The default timeout for conenction operations (default: C<300>).
1258    
1259 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1260    
1261     The default value for the C<User-Agent> header (the default is
1262 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1263 root 1.1
1264 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1265 root 1.1
1266 root 1.47 The maximum number of concurrent connections to the same host (identified
1267 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
1268 root 1.92 are queued until previous connections are closed. Both persistent and
1269     non-persistent connections are counted in this limit.
1270 root 1.1
1271 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1272 root 1.92 increase it much.
1273    
1274     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1275     connections, older browsers used 2, newers (such as firefox 3) typically
1276     use 6, and Opera uses 8 because like, they have the fastest browser and
1277     give a shit for everybody else on the planet.
1278    
1279     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1280    
1281     The time after which idle persistent conenctions get closed by
1282     AnyEvent::HTTP (default: C<3>).
1283 root 1.3
1284 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1285    
1286     The number of active connections. This is not the number of currently
1287     running requests, but the number of currently open and non-idle TCP
1288 root 1.92 connections. This number can be useful for load-leveling.
1289 root 1.14
1290 root 1.1 =back
1291    
1292     =cut
1293    
1294 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1295     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1296    
1297     sub format_date($) {
1298     my ($time) = @_;
1299    
1300     # RFC 822/1123 format
1301     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1302    
1303     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1304     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1305     $H, $M, $S;
1306     }
1307    
1308     sub parse_date($) {
1309     my ($date) = @_;
1310    
1311     my ($d, $m, $y, $H, $M, $S);
1312    
1313 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1314 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1315     # cookie dates (with "-")
1316    
1317 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1318    
1319 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1320 root 1.61 # RFC 850
1321     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1322    
1323 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1324 root 1.61 # ISO C's asctime
1325     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1326     }
1327     # other formats fail in the loop below
1328    
1329     for (0..11) {
1330     if ($m eq $month[$_]) {
1331     require Time::Local;
1332     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
1333     }
1334     }
1335    
1336     undef
1337     }
1338    
1339 root 1.2 sub set_proxy($) {
1340 root 1.52 if (length $_[0]) {
1341 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1342 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1343     $PROXY = [$2, $3 || 3128, $1]
1344     } else {
1345     undef $PROXY;
1346     }
1347 root 1.2 }
1348    
1349     # initialise proxy from environment
1350 root 1.52 eval {
1351     set_proxy $ENV{http_proxy};
1352     };
1353 root 1.2
1354 root 1.93 =head2 SHOWCASE
1355    
1356     This section contaisn some more elaborate "real-world" examples or code
1357     snippets.
1358    
1359     =head2 HTTP/1.1 FILE DOWNLOAD
1360    
1361 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1362 root 1.98 goes wrong and you want to resume.
1363 root 1.93
1364     Here is a function that initiates and resumes a download. It uses the
1365     last modified time to check for file content changes, and works with many
1366     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1367     on older servers.
1368    
1369     It calls the completion callback with either C<undef>, which means a
1370     nonretryable error occured, C<0> when the download was partial and should
1371     be retried, and C<1> if it was successful.
1372    
1373     use AnyEvent::HTTP;
1374    
1375     sub download($$$) {
1376     my ($url, $file, $cb) = @_;
1377    
1378     open my $fh, "+<", $file
1379     or die "$file: $!";
1380    
1381     my %hdr;
1382     my $ofs = 0;
1383    
1384     warn stat $fh;
1385     warn -s _;
1386     if (stat $fh and -s _) {
1387     $ofs = -s _;
1388     warn "-s is ", $ofs;#d#
1389     $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1390     $hdr{"range"} = "bytes=$ofs-";
1391     }
1392    
1393     http_get $url,
1394     headers => \%hdr,
1395     on_header => sub {
1396     my ($hdr) = @_;
1397    
1398     if ($hdr->{Status} == 200 && $ofs) {
1399     # resume failed
1400     truncate $fh, $ofs = 0;
1401     }
1402    
1403     sysseek $fh, $ofs, 0;
1404    
1405     1
1406     },
1407     on_body => sub {
1408     my ($data, $hdr) = @_;
1409    
1410     if ($hdr->{Status} =~ /^2/) {
1411     length $data == syswrite $fh, $data
1412     or return; # abort on write errors
1413     }
1414    
1415     1
1416     },
1417     sub {
1418     my (undef, $hdr) = @_;
1419    
1420     my $status = $hdr->{Status};
1421    
1422     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1423     utime $fh, $time, $time;
1424     }
1425    
1426     if ($status == 200 || $status == 206 || $status == 416) {
1427     # download ok || resume ok || file already fully downloaded
1428     $cb->(1, $hdr);
1429    
1430     } elsif ($status == 412) {
1431     # file has changed while resuming, delete and retry
1432     unlink $file;
1433     $cb->(0, $hdr);
1434    
1435     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1436     # retry later
1437     $cb->(0, $hdr);
1438    
1439     } else {
1440     $cb->(undef, $hdr);
1441     }
1442     }
1443     ;
1444     }
1445    
1446     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1447     if ($_[0]) {
1448     print "OK!\n";
1449     } elsif (defined $_[0]) {
1450     print "please retry later\n";
1451     } else {
1452     print "ERROR\n";
1453     }
1454     };
1455    
1456     =head3 SOCKS PROXIES
1457 root 1.60
1458     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1459     compile your perl to support socks, or use an external program such as
1460     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1461     transparently.
1462    
1463     Alternatively, for AnyEvent::HTTP only, you can use your own
1464     C<tcp_connect> function that does the proxy handshake - here is an example
1465     that works with socks4a proxies:
1466    
1467     use Errno;
1468     use AnyEvent::Util;
1469     use AnyEvent::Socket;
1470     use AnyEvent::Handle;
1471    
1472     # host, port and username of/for your socks4a proxy
1473     my $socks_host = "10.0.0.23";
1474     my $socks_port = 9050;
1475     my $socks_user = "";
1476    
1477     sub socks4a_connect {
1478     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1479    
1480     my $hdl = new AnyEvent::Handle
1481     connect => [$socks_host, $socks_port],
1482     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1483     on_error => sub { $connect_cb->() },
1484     ;
1485    
1486     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1487    
1488     $hdl->push_read (chunk => 8, sub {
1489     my ($hdl, $chunk) = @_;
1490     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1491    
1492     if ($status == 0x5a) {
1493     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1494     } else {
1495     $! = Errno::ENXIO; $connect_cb->();
1496     }
1497     });
1498    
1499     $hdl
1500     }
1501    
1502     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1503     possibly after switching off other proxy types:
1504    
1505     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1506    
1507     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1508     my ($data, $headers) = @_;
1509     ...
1510     };
1511    
1512 root 1.1 =head1 SEE ALSO
1513    
1514     L<AnyEvent>.
1515    
1516     =head1 AUTHOR
1517    
1518 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1519     http://home.schmorp.de/
1520 root 1.1
1521 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1522     testcases and bugreports.
1523    
1524 root 1.1 =cut
1525    
1526     1
1527