ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.102
Committed: Sat Feb 19 06:46:14 2011 UTC (13 years, 3 months ago) by root
Branch: MAIN
CVS Tags: rel-2_04
Changes since 1.101: +11 -4 lines
Log Message:
2.04

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.102 our $VERSION = '2.04';
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.68 (or C<undef> if an error occured), and a hash-ref with response headers
95     (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128     =item 595 - errors during connection etsbalishment, proxy handshake.
129    
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159     Whether to recurse requests or not, e.g. on redirects, authentication
160 root 1.3 retries and so on, and how often to do so.
161 root 1.1
162     =item headers => hashref
163    
164 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
165     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
166 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
167     (this can be suppressed by using C<undef> for these headers in which case
168     they won't be sent at all).
169 root 1.1
170 root 1.90 You really should provide your own C<User-Agent:> header value that is
171     appropriate for your program - I wouldn't be surprised if the default
172     AnyEvent string gets blocked by webservers sooner or later.
173    
174 root 1.1 =item timeout => $seconds
175    
176     The time-out to use for various stages - each connect attempt will reset
177 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
178     timeout.
179    
180     Default timeout is 5 minutes.
181 root 1.2
182     =item proxy => [$host, $port[, $scheme]] or undef
183    
184 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
185     used.
186 root 1.2
187 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
188 root 1.1
189 root 1.102 If not specified, then the default proxy is used (see
190     C<AnyEvent::HTTP::set_proxy>).
191    
192 root 1.3 =item body => $string
193    
194 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
195 root 1.3 this module might offer more options).
196    
197 root 1.10 =item cookie_jar => $hash_ref
198    
199     Passing this parameter enables (simplified) cookie-processing, loosely
200     based on the original netscape specification.
201    
202 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
203     will get updated automatically. It is possible to save the cookie jar
204     to persistent storage with something like JSON or Storable - see the
205     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
206     expired or session-only cookies, and also for documentation on the format
207     of the cookie jar.
208 root 1.10
209 root 1.70 Note that this cookie implementation is not meant to be complete. If
210     you want complete cookie management you have to do that on your
211 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
212 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
213     to.
214 root 1.10
215 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
216 root 1.70 headers will be set and handled by this module, otherwise they will be
217 root 1.69 left untouched.
218    
219 root 1.40 =item tls_ctx => $scheme | $tls_ctx
220    
221     Specifies the AnyEvent::TLS context to be used for https connections. This
222     parameter follows the same rules as the C<tls_ctx> parameter to
223     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
224     C<high> can be specified, which give you a predefined low-security (no
225     verification, highest compatibility) and high-security (CA and common-name
226     verification) TLS context.
227    
228     The default for this option is C<low>, which could be interpreted as "give
229     me the page, no matter what".
230    
231 root 1.92 See also the C<sessionid> parameter.
232    
233     =item session => $string
234    
235     The module might reuse connections to the same host internally. Sometimes
236     (e.g. when using TLS), you do not want to reuse connections from other
237     sessions. This can be achieved by setting this parameter to some unique
238     ID (such as the address of an object storing your state data, or the TLS
239     context) - only connections using the same unique ID will be reused.
240    
241 root 1.51 =item on_prepare => $callback->($fh)
242    
243     In rare cases you need to "tune" the socket before it is used to
244     connect (for exmaple, to bind it on a given IP address). This parameter
245     overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
246     and behaves exactly the same way (e.g. it has to provide a
247     timeout). See the description for the C<$prepare_cb> argument of
248     C<AnyEvent::Socket::tcp_connect> for details.
249    
250 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
251    
252     In even rarer cases you want total control over how AnyEvent::HTTP
253     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
254     to do this, but you can provide your own C<tcp_connect> function -
255 root 1.60 obviously, it has to follow the same calling conventions, except that it
256     may always return a connection guard object.
257 root 1.59
258     There are probably lots of weird uses for this function, starting from
259     tracing the hosts C<http_request> actually tries to connect, to (inexact
260     but fast) host => IP address caching or even socks protocol support.
261    
262 root 1.42 =item on_header => $callback->($headers)
263 root 1.41
264     When specified, this callback will be called with the header hash as soon
265     as headers have been successfully received from the remote server (not on
266     locally-generated errors).
267    
268     It has to return either true (in which case AnyEvent::HTTP will continue),
269     or false, in which case AnyEvent::HTTP will cancel the download (and call
270     the finish callback with an error code of C<598>).
271    
272     This callback is useful, among other things, to quickly reject unwanted
273     content, which, if it is supposed to be rare, can be faster than first
274     doing a C<HEAD> request.
275    
276 root 1.68 The downside is that cancelling the request makes it impossible to re-use
277     the connection. Also, the C<on_header> callback will not receive any
278     trailer (headers sent after the response body).
279    
280 root 1.42 Example: cancel the request unless the content-type is "text/html".
281 root 1.41
282 root 1.42 on_header => sub {
283     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
284     },
285 root 1.41
286 root 1.42 =item on_body => $callback->($partial_body, $headers)
287 root 1.41
288 root 1.42 When specified, all body data will be passed to this callback instead of
289     to the completion callback. The completion callback will get the empty
290     string instead of the body data.
291 root 1.41
292 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
293     or false, in which case AnyEvent::HTTP will cancel the download (and call
294     the completion callback with an error code of C<598>).
295    
296 root 1.68 The downside to cancelling the request is that it makes it impossible to
297     re-use the connection.
298    
299 root 1.42 This callback is useful when the data is too large to be held in memory
300     (so the callback writes it to a file) or when only some information should
301     be extracted, or when the body should be processed incrementally.
302 root 1.41
303     It is usually preferred over doing your own body handling via
304 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
305     only used to create a connection, C<want_body_handle> is the better
306     alternative, as it allows you to install your own event handler, reducing
307     resource usage.
308 root 1.41
309     =item want_body_handle => $enable
310    
311     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
312     changes considerably: after parsing the headers, and instead of
313     downloading the body (if any), the completion callback will be
314     called. Instead of the C<$body> argument containing the body data, the
315     callback will receive the L<AnyEvent::Handle> object associated with the
316     connection. In error cases, C<undef> will be passed. When there is no body
317     (e.g. status C<304>), the empty string will be passed.
318    
319 root 1.92 The handle object might or might not be in TLS mode, might be connected
320     to a proxy, be a persistent connection, use chunked transfer encoding
321     etc., and configured in unspecified ways. The user is responsible for this
322     handle (it will not be used by this module anymore).
323 root 1.41
324     This is useful with some push-type services, where, after the initial
325     headers, an interactive protocol is used (typical example would be the
326     push-style twitter API which starts a JSON/XML stream).
327    
328     If you think you need this, first have a look at C<on_body>, to see if
329 root 1.45 that doesn't solve your problem in a better way.
330 root 1.41
331 root 1.92 =item persistent => $boolean
332    
333     Try to create/reuse a persistent connection. When this flag is set
334     (default: true for idempotent requests, false for all others), then
335     C<http_request> tries to re-use an existing (previously-created)
336     persistent connection to the host and, failing that, tries to create a new
337     one.
338    
339     Requests failing in certain ways will be automatically retried once, which
340     is dangerous for non-idempotent requests, which is why it defaults to off
341     for them. The reason for this is because the bozos who designed HTTP/1.1
342     made it impossible to distinguish between a fatal error and a normal
343     connection timeout, so you never know whether there was a problem with
344     your request or not.
345    
346     When reusing an existent connection, many parameters (such as TLS context)
347     will be ignored. See the C<session> parameter for a workaround.
348    
349     =item keepalive => $boolean
350    
351     Only used when C<persistent> is also true. This parameter decides whether
352     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
353     (as opposed to only a HTTP/1.1 persistent connection).
354    
355     The default is true, except when using a proxy, in which case it defaults
356     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
357    
358     =item handle_params => { key => value ... }
359    
360     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
361     constructor that is called - not all requests will create a handle, and
362     sometimes more than one is created, so this parameter is only good for
363     setting hints.
364    
365     Example: set the maximum read size to 4096, to potentially conserve memory
366     at the cost of speed.
367    
368     handle_params => {
369     max_read_size => 4096,
370     },
371    
372 root 1.1 =back
373    
374 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
375     the response body.
376 root 1.9
377     http_request GET => "http://www.nethype.de/", sub {
378     my ($body, $hdr) = @_;
379     print "$body\n";
380     };
381    
382 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
383 root 1.9 timeout of 30 seconds.
384    
385     http_request
386     GET => "https://www.google.com",
387 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
388 root 1.9 timeout => 30,
389     sub {
390     my ($body, $hdr) = @_;
391     use Data::Dumper;
392     print Dumper $hdr;
393     }
394     ;
395 root 1.1
396 root 1.68 Example: do another simple HTTP GET request, but immediately try to
397 root 1.29 cancel it.
398    
399     my $request = http_request GET => "http://www.nethype.de/", sub {
400     my ($body, $hdr) = @_;
401     print "$body\n";
402     };
403    
404     undef $request;
405    
406 root 1.1 =cut
407    
408 root 1.92 #############################################################################
409     # wait queue/slots
410    
411 root 1.12 sub _slot_schedule;
412 root 1.11 sub _slot_schedule($) {
413     my $host = shift;
414    
415     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
416     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
417 root 1.12 # somebody wants that slot
418 root 1.11 ++$CO_SLOT{$host}[0];
419 root 1.14 ++$ACTIVE;
420 root 1.11
421     $cb->(AnyEvent::Util::guard {
422 root 1.14 --$ACTIVE;
423 root 1.11 --$CO_SLOT{$host}[0];
424     _slot_schedule $host;
425     });
426     } else {
427     # nobody wants the slot, maybe we can forget about it
428     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
429     last;
430     }
431     }
432     }
433    
434     # wait for a free slot on host, call callback
435     sub _get_slot($$) {
436     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
437    
438     _slot_schedule $_[0];
439     }
440    
441 root 1.80 #############################################################################
442 root 1.92 # cookie handling
443 root 1.80
444     # expire cookies
445     sub cookie_jar_expire($;$) {
446     my ($jar, $session_end) = @_;
447    
448     %$jar = () if $jar->{version} != 1;
449    
450     my $anow = AE::now;
451    
452     while (my ($chost, $paths) = each %$jar) {
453     next unless ref $paths;
454    
455     while (my ($cpath, $cookies) = each %$paths) {
456     while (my ($cookie, $kv) = each %$cookies) {
457     if (exists $kv->{_expires}) {
458     delete $cookies->{$cookie}
459     if $anow > $kv->{_expires};
460     } elsif ($session_end) {
461     delete $cookies->{$cookie};
462     }
463     }
464    
465     delete $paths->{$cpath}
466     unless %$cookies;
467     }
468    
469     delete $jar->{$chost}
470     unless %$paths;
471     }
472     }
473    
474 root 1.72 # extract cookies from jar
475 root 1.71 sub cookie_jar_extract($$$$) {
476 root 1.92 my ($jar, $scheme, $host, $path) = @_;
477 root 1.71
478     %$jar = () if $jar->{version} != 1;
479    
480     my @cookies;
481    
482     while (my ($chost, $paths) = each %$jar) {
483     next unless ref $paths;
484    
485     if ($chost =~ /^\./) {
486 root 1.92 next unless $chost eq substr $host, -length $chost;
487 root 1.71 } elsif ($chost =~ /\./) {
488 root 1.92 next unless $chost eq $host;
489 root 1.71 } else {
490     next;
491     }
492    
493     while (my ($cpath, $cookies) = each %$paths) {
494 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
495 root 1.71
496     while (my ($cookie, $kv) = each %$cookies) {
497 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
498 root 1.71
499 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
500     delete $cookies->{$cookie};
501     next;
502 root 1.71 }
503    
504     my $value = $kv->{value};
505    
506     if ($value =~ /[=;,[:space:]]/) {
507     $value =~ s/([\\"])/\\$1/g;
508     $value = "\"$value\"";
509     }
510    
511     push @cookies, "$cookie=$value";
512     }
513     }
514     }
515    
516     \@cookies
517     }
518    
519 root 1.72 # parse set_cookie header into jar
520 root 1.80 sub cookie_jar_set_cookie($$$$) {
521 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
522 root 1.80
523     my $anow = int AE::now;
524     my $snow; # server-now
525 root 1.72
526     for ($set_cookie) {
527     # parse NAME=VALUE
528     my @kv;
529    
530 root 1.79 # expires is not http-compliant in the original cookie-spec,
531     # we support the official date format and some extensions
532 root 1.72 while (
533     m{
534     \G\s*
535     (?:
536 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
537 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
538 root 1.72 )
539     }gcxsi
540     ) {
541     my $name = $2;
542     my $value = $4;
543    
544 root 1.82 if (defined $1) {
545 root 1.72 # expires
546     $name = "expires";
547     $value = $1;
548 root 1.82 } elsif (defined $3) {
549 root 1.72 # quoted
550     $value = $3;
551     $value =~ s/\\(.)/$1/gs;
552     }
553    
554 root 1.95 push @kv, @kv ? lc $name : $name, $value;
555 root 1.72
556     last unless /\G\s*;/gc;
557     }
558    
559     last unless @kv;
560    
561     my $name = shift @kv;
562     my %kv = (value => shift @kv, @kv);
563    
564 root 1.80 if (exists $kv{"max-age"}) {
565     $kv{_expires} = $anow + delete $kv{"max-age"};
566     } elsif (exists $kv{expires}) {
567     $snow ||= parse_date ($date) || $anow;
568     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
569     } else {
570     delete $kv{_expires};
571     }
572 root 1.72
573     my $cdom;
574     my $cpath = (delete $kv{path}) || "/";
575    
576     if (exists $kv{domain}) {
577     $cdom = delete $kv{domain};
578    
579     $cdom =~ s/^\.?/./; # make sure it starts with a "."
580    
581     next if $cdom =~ /\.$/;
582    
583     # this is not rfc-like and not netscape-like. go figure.
584     my $ndots = $cdom =~ y/.//;
585     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
586     } else {
587 root 1.92 $cdom = $host;
588 root 1.72 }
589    
590     # store it
591 root 1.73 $jar->{version} = 1;
592 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
593 root 1.72
594     redo if /\G\s*,/gc;
595     }
596     }
597    
598 root 1.92 #############################################################################
599     # keepalive/persistent connection cache
600    
601     # fetch a connection from the keepalive cache
602     sub ka_fetch($) {
603     my $ka_key = shift;
604    
605     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
606     delete $KA_CACHE{$ka_key}
607     unless @{ $KA_CACHE{$ka_key} };
608    
609     $hdl
610     }
611    
612     sub ka_store($$) {
613     my ($ka_key, $hdl) = @_;
614    
615     my $kaa = $KA_CACHE{$ka_key} ||= [];
616    
617     my $destroy = sub {
618     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
619    
620     $hdl->destroy;
621    
622     @ka
623     ? $KA_CACHE{$ka_key} = \@ka
624     : delete $KA_CACHE{$ka_key};
625     };
626    
627     # on error etc., destroy
628     $hdl->on_error ($destroy);
629     $hdl->on_eof ($destroy);
630     $hdl->on_read ($destroy);
631     $hdl->timeout ($PERSISTENT_TIMEOUT);
632    
633     push @$kaa, $hdl;
634     shift @$kaa while @$kaa > $MAX_PER_HOST;
635     }
636    
637     #############################################################################
638     # utilities
639    
640 root 1.66 # continue to parse $_ for headers and place them into the arg
641 root 1.92 sub _parse_hdr() {
642 root 1.66 my %hdr;
643    
644     # things seen, not parsed:
645     # p3pP="NON CUR OTPi OUR NOR UNI"
646    
647     $hdr{lc $1} .= ",$2"
648     while /\G
649     ([^:\000-\037]*):
650     [\011\040]*
651     ((?: [^\012]+ | \012[\011\040] )*)
652     \012
653     /gxc;
654    
655     /\G$/
656     or return;
657    
658     # remove the "," prefix we added to all headers above
659     substr $_, 0, 1, ""
660     for values %hdr;
661    
662     \%hdr
663     }
664    
665 root 1.92 #############################################################################
666     # http_get
667    
668 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
669 root 1.34
670 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
671     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
672 root 1.40
673 root 1.92 # maybe it should just become a normal object :/
674    
675     sub _destroy_state(\%) {
676     my ($state) = @_;
677    
678     $state->{handle}->destroy if $state->{handle};
679     %$state = ();
680     }
681    
682     sub _error(\%$$) {
683     my ($state, $cb, $hdr) = @_;
684    
685     &_destroy_state ($state);
686    
687     $cb->(undef, $hdr);
688     ()
689     }
690    
691 elmex 1.15 sub http_request($$@) {
692 root 1.1 my $cb = pop;
693     my ($method, $url, %arg) = @_;
694    
695     my %hdr;
696    
697 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
698     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
699    
700 root 1.3 $method = uc $method;
701    
702 root 1.8 if (my $hdr = $arg{headers}) {
703 root 1.1 while (my ($k, $v) = each %$hdr) {
704     $hdr{lc $k} = $v;
705     }
706     }
707    
708 root 1.55 # pseudo headers for all subsequent responses
709     my @pseudo = (URL => $url);
710     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
711    
712 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
713 root 1.8
714 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
715 root 1.8 if $recurse < 0;
716    
717 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
718 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
719    
720 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
721 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
722 root 1.2
723 root 1.31 $uscheme = lc $uscheme;
724 root 1.1
725 root 1.31 my $uport = $uscheme eq "http" ? 80
726     : $uscheme eq "https" ? 443
727 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
728 root 1.13
729 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
730 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
731 root 1.10
732 root 1.86 my $uhost = lc $1;
733 root 1.10 $uport = $2 if defined $2;
734    
735 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
736     unless exists $hdr{host};
737 root 1.43
738 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
739 root 1.56 $upath .= $query if length $query;
740 root 1.10
741     $upath =~ s%^/?%/%;
742    
743     # cookie processing
744     if (my $jar = $arg{cookie_jar}) {
745 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
746 root 1.70
747 root 1.71 $hdr{cookie} = join "; ", @$cookies
748     if @$cookies;
749 root 1.10 }
750 root 1.1
751 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
752 root 1.2
753 root 1.10 if ($proxy) {
754 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
755 root 1.31
756 root 1.47 $rscheme = "http" unless defined $rscheme;
757    
758 root 1.31 # don't support https requests over https-proxy transport,
759 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
760 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
761 root 1.86
762     $rhost = lc $rhost;
763     $rscheme = lc $rscheme;
764 root 1.10 } else {
765 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
766 root 1.2 }
767    
768 root 1.47 # leave out fragment and query string, just a heuristic
769 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
770     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
771 root 1.41
772 root 1.53 $hdr{"content-length"} = length $arg{body}
773     if length $arg{body} || $method ne "GET";
774 root 1.1
775 root 1.89 my $idempotent = $method =~ /^(?:GET|HEAD|PUT|DELETE|OPTIONS|TRACE)$/;
776    
777     # default value for keepalive is true iff the request is for an idempotent method
778 root 1.92 my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : $idempotent;
779     my $keepalive10 = exists $arg{keepalive10} ? $arg{keepalive10} : !$proxy;
780     my $keptalive; # true if this is actually a recycled connection
781    
782     # the key to use in the keepalive cache
783     my $ka_key = "$uhost\x00$arg{sessionid}";
784 root 1.89
785 root 1.92 $hdr{connection} = ($keepalive ? $keepalive10 ? "keep-alive " : "" : "close ") . "Te"; #1.1
786 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
787 root 1.66
788 root 1.11 my %state = (connect_guard => 1);
789    
790 root 1.88 my $ae_error = 595; # connecting
791 root 1.1
792 root 1.88 # handle actual, non-tunneled, request
793     my $handle_actual_request = sub {
794     $ae_error = 596; # request phase
795    
796 root 1.92 my $hdl = $state{handle};
797    
798     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
799 root 1.88
800     # send request
801 root 1.92 $hdl->push_write (
802 root 1.88 "$method $rpath HTTP/1.1\015\012"
803     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
804     . "\015\012"
805     . (delete $arg{body})
806     );
807    
808     # return if error occured during push_write()
809     return unless %state;
810    
811 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
812     %hdr = ();
813 root 1.88
814     # status line and headers
815     $state{read_response} = sub {
816 root 1.92 return unless %state;
817    
818 root 1.88 for ("$_[1]") {
819     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
820    
821     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
822 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
823 root 1.88
824     # 100 Continue handling
825     # should not happen as we don't send expect: 100-continue,
826     # but we handle it just in case.
827     # since we send the request body regardless, if we get an error
828     # we are out of-sync, which we currently do NOT handle correctly.
829     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
830     if $2 eq 100;
831    
832     push @pseudo,
833     HTTPVersion => $1,
834     Status => $2,
835     Reason => $3,
836     ;
837 root 1.1
838 root 1.92 my $hdr = _parse_hdr
839     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
840 root 1.77
841 root 1.88 %hdr = (%$hdr, @pseudo);
842     }
843 root 1.82
844 root 1.88 # redirect handling
845     # microsoft and other shitheads don't give a shit for following standards,
846     # try to support some common forms of broken Location headers.
847     if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
848     $hdr{location} =~ s/^\.\/+//;
849    
850     my $url = "$rscheme://$uhost:$uport";
851    
852     unless ($hdr{location} =~ s/^\///) {
853     $url .= $upath;
854     $url =~ s/\/[^\/]*$//;
855 root 1.82 }
856    
857 root 1.88 $hdr{location} = "$url/$hdr{location}";
858     }
859 root 1.82
860 root 1.88 my $redirect;
861 root 1.82
862 root 1.88 if ($recurse) {
863     my $status = $hdr{Status};
864 root 1.82
865 root 1.88 # industry standard is to redirect POST as GET for
866     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
867     # also, the UA should ask the user for 301 and 307 and POST,
868     # industry standard seems to be to simply follow.
869     # we go with the industry standard.
870     if ($status == 301 or $status == 302 or $status == 303) {
871     # HTTP/1.1 is unclear on how to mutate the method
872     $method = "GET" unless $method eq "HEAD";
873     $redirect = 1;
874     } elsif ($status == 307) {
875     $redirect = 1;
876 root 1.82 }
877 root 1.88 }
878 root 1.82
879 root 1.88 my $finish = sub { # ($data, $err_status, $err_reason[, $keepalive])
880 root 1.92 if ($state{handle}) {
881     # handle keepalive
882     if (
883     $keepalive
884     && $_[3]
885     && ($hdr{HTTPVersion} < 1.1
886     ? $hdr{connection} =~ /\bkeep-?alive\b/i
887     : $hdr{connection} !~ /\bclose\b/i)
888     ) {
889     ka_store $ka_key, delete $state{handle};
890     } else {
891     # no keepalive, destroy the handle
892     $state{handle}->destroy;
893     }
894     }
895 root 1.82
896 root 1.88 %state = ();
897 root 1.82
898 root 1.88 if (defined $_[1]) {
899     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
900     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
901     }
902 root 1.82
903 root 1.88 # set-cookie processing
904     if ($arg{cookie_jar}) {
905     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
906     }
907 root 1.82
908 root 1.88 if ($redirect && exists $hdr{location}) {
909     # we ignore any errors, as it is very common to receive
910     # Content-Length != 0 but no actual body
911     # we also access %hdr, as $_[1] might be an erro
912     http_request (
913     $method => $hdr{location},
914     %arg,
915     recurse => $recurse - 1,
916     Redirect => [$_[0], \%hdr],
917 root 1.92 $cb
918     );
919 root 1.88 } else {
920     $cb->($_[0], \%hdr);
921     }
922     };
923 root 1.82
924 root 1.88 $ae_error = 597; # body phase
925 root 1.82
926 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
927    
928     my $len = $chunked ? undef : $hdr{"content-length"};
929 root 1.82
930 root 1.88 # body handling, many different code paths
931     # - no body expected
932     # - want_body_handle
933     # - te chunked
934     # - 2x length known (with or without on_body)
935     # - 2x length not known (with or without on_body)
936     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
937     $finish->(undef, 598 => "Request cancelled by on_header");
938     } elsif (
939     $hdr{Status} =~ /^(?:1..|204|205|304)$/
940     or $method eq "HEAD"
941     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
942     ) {
943     # no body
944     $finish->("", undef, undef, 1);
945    
946     } elsif (!$redirect && $arg{want_body_handle}) {
947     $_[0]->on_eof (undef);
948     $_[0]->on_error (undef);
949     $_[0]->on_read (undef);
950    
951     $finish->(delete $state{handle});
952    
953 root 1.91 } elsif ($chunked) {
954 root 1.88 my $cl = 0;
955 root 1.92 my $body = "";
956 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
957    
958     $state{read_chunk} = sub {
959     $_[1] =~ /^([0-9a-fA-F]+)/
960     or $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
961 root 1.82
962 root 1.88 my $len = hex $1;
963 root 1.82
964 root 1.88 if ($len) {
965     $cl += $len;
966 root 1.82
967 root 1.88 $_[0]->push_read (chunk => $len, sub {
968     $on_body->($_[1], \%hdr)
969     or return $finish->(undef, 598 => "Request cancelled by on_body");
970 root 1.82
971 root 1.88 $_[0]->push_read (line => sub {
972     length $_[1]
973     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
974     $_[0]->push_read (line => $state{read_chunk});
975 root 1.82 });
976 root 1.88 });
977     } else {
978     $hdr{"content-length"} ||= $cl;
979 root 1.84
980 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
981     if (length $_[1]) {
982     for ("$_[1]") {
983     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
984 root 1.84
985 root 1.92 my $hdr = _parse_hdr
986 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
987 root 1.84
988 root 1.88 %hdr = (%hdr, %$hdr);
989     }
990     }
991 root 1.84
992 root 1.88 $finish->($body, undef, undef, 1);
993 root 1.84 });
994     }
995 root 1.88 };
996    
997     $_[0]->push_read (line => $state{read_chunk});
998    
999     } elsif ($arg{on_body}) {
1000     if (defined $len) {
1001     $_[0]->on_read (sub {
1002     $len -= length $_[0]{rbuf};
1003    
1004     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1005     or return $finish->(undef, 598 => "Request cancelled by on_body");
1006    
1007     $len > 0
1008     or $finish->("", undef, undef, 1);
1009     });
1010 root 1.84 } else {
1011 root 1.88 $_[0]->on_eof (sub {
1012     $finish->("");
1013     });
1014     $_[0]->on_read (sub {
1015     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1016     or $finish->(undef, 598 => "Request cancelled by on_body");
1017     });
1018     }
1019     } else {
1020     $_[0]->on_eof (undef);
1021 root 1.82
1022 root 1.88 if (defined $len) {
1023     $_[0]->on_read (sub {
1024     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1025     if $len <= length $_[0]{rbuf};
1026     });
1027     } else {
1028     $_[0]->on_error (sub {
1029     ($! == Errno::EPIPE || !$!)
1030     ? $finish->(delete $_[0]{rbuf})
1031     : $finish->(undef, $ae_error => $_[2]);
1032     });
1033     $_[0]->on_read (sub { });
1034 root 1.82 }
1035 root 1.88 }
1036     };
1037 root 1.82
1038 root 1.92 # if keepalive is enabled, then the server closing the connection
1039     # before a response can happen legally - we retry on idempotent methods.
1040     if ($keptalive && $idempotent) {
1041     my $old_eof = $hdl->{on_eof};
1042     $hdl->{on_eof} = sub {
1043     _destroy_state %state;
1044    
1045     http_request (
1046     $method => $url,
1047     %arg,
1048     keepalive => 0,
1049     $cb
1050     );
1051     };
1052     $hdl->on_read (sub {
1053     return unless %state;
1054    
1055     # as soon as we receive something, a connection close
1056     # once more becomes a hard error
1057     $hdl->{on_eof} = $old_eof;
1058     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1059     });
1060     } else {
1061     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1062     }
1063     };
1064    
1065     my $prepare_handle = sub {
1066     my ($hdl) = $state{handle};
1067    
1068     $hdl->timeout ($timeout);
1069     $hdl->on_error (sub {
1070     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1071     });
1072     $hdl->on_eof (sub {
1073     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1074     });
1075 root 1.88 };
1076 root 1.82
1077 root 1.92 # connected to proxy (or origin server)
1078 root 1.88 my $connect_cb = sub {
1079 root 1.92 my $fh = shift
1080     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1081 root 1.44
1082 root 1.88 return unless delete $state{connect_guard};
1083 root 1.11
1084 root 1.88 # get handle
1085     $state{handle} = new AnyEvent::Handle
1086 root 1.92 %{ $arg{handle_params} },
1087     fh => $fh,
1088     peername => $uhost,
1089 root 1.88 tls_ctx => $arg{tls_ctx},
1090     ;
1091 root 1.11
1092 root 1.92 $prepare_handle->();
1093 root 1.1
1094 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1095 root 1.88
1096     # now handle proxy-CONNECT method
1097     if ($proxy && $uscheme eq "https") {
1098     # oh dear, we have to wrap it into a connect request
1099    
1100     # maybe re-use $uauthority with patched port?
1101     $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012\015\012");
1102     $state{handle}->push_read (line => $qr_nlnl, sub {
1103     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1104 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1105 root 1.88
1106     if ($2 == 200) {
1107     $rpath = $upath;
1108     $handle_actual_request->();
1109     } else {
1110 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1111 root 1.88 }
1112     });
1113     } else {
1114     $handle_actual_request->();
1115     }
1116     };
1117    
1118     _get_slot $uhost, sub {
1119     $state{slot_guard} = shift;
1120 root 1.64
1121 root 1.88 return unless $state{connect_guard};
1122 root 1.64
1123 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1124     # on a keepalive request (in theory, this should be a separate config option).
1125     if ($keepalive && $KA_CACHE{$ka_key}) {
1126     $keptalive = 1;
1127     $state{handle} = ka_fetch $ka_key;
1128     $prepare_handle->();
1129     $handle_actual_request->();
1130    
1131     } else {
1132     my $tcp_connect = $arg{tcp_connect}
1133     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1134 root 1.57
1135 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1136     }
1137 root 1.1 };
1138    
1139 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1140 root 1.1 }
1141    
1142 elmex 1.15 sub http_get($@) {
1143 root 1.1 unshift @_, "GET";
1144     &http_request
1145     }
1146    
1147 elmex 1.15 sub http_head($@) {
1148 root 1.4 unshift @_, "HEAD";
1149     &http_request
1150     }
1151    
1152 elmex 1.15 sub http_post($$@) {
1153 root 1.22 my $url = shift;
1154     unshift @_, "POST", $url, "body";
1155 root 1.3 &http_request
1156     }
1157    
1158 root 1.9 =back
1159    
1160 root 1.55 =head2 DNS CACHING
1161    
1162     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1163     the actual connection, which in turn uses AnyEvent::DNS to resolve
1164     hostnames. The latter is a simple stub resolver and does no caching
1165     on its own. If you want DNS caching, you currently have to provide
1166     your own default resolver (by storing a suitable resolver object in
1167 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1168 root 1.55
1169 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1170 root 1.1
1171     =over 4
1172    
1173 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1174    
1175     Sets the default proxy server to use. The proxy-url must begin with a
1176 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1177 root 1.52
1178     To clear an already-set proxy, use C<undef>.
1179 root 1.2
1180 root 1.102 When AnyEvent::HTTP is laoded for the first time it will query the
1181     default proxy from the operating system, currently by looking at
1182     C<$ENV{http_proxy>}.
1183    
1184 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1185    
1186     Remove all cookies from the cookie jar that have been expired. If
1187     C<$session_end> is given and true, then additionally remove all session
1188     cookies.
1189    
1190     You should call this function (with a true C<$session_end>) before you
1191     save cookies to disk, and you should call this function after loading them
1192     again. If you have a long-running program you can additonally call this
1193     function from time to time.
1194    
1195     A cookie jar is initially an empty hash-reference that is managed by this
1196     module. It's format is subject to change, but currently it is like this:
1197    
1198     The key C<version> has to contain C<1>, otherwise the hash gets
1199     emptied. All other keys are hostnames or IP addresses pointing to
1200     hash-references. The key for these inner hash references is the
1201     server path for which this cookie is meant, and the values are again
1202     hash-references. The keys of those hash-references is the cookie name, and
1203     the value, you guessed it, is another hash-reference, this time with the
1204     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1205     which have been replaced by a C<_expires> key that contains the cookie
1206     expiry timestamp.
1207    
1208     Here is an example of a cookie jar with a single cookie, so you have a
1209     chance of understanding the above paragraph:
1210    
1211     {
1212     version => 1,
1213     "10.0.0.1" => {
1214     "/" => {
1215     "mythweb_id" => {
1216     _expires => 1293917923,
1217     value => "ooRung9dThee3ooyXooM1Ohm",
1218     },
1219     },
1220     },
1221     }
1222    
1223 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1224    
1225     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1226     Date (RFC 2616).
1227    
1228     =item $timestamp = AnyEvent::HTTP::parse_date $date
1229    
1230 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1231     bunch of minor variations of those, and returns the corresponding POSIX
1232     timestamp, or C<undef> if the date cannot be parsed.
1233 root 1.61
1234 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1235 root 1.1
1236 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1237 root 1.1
1238 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1239    
1240     The default timeout for conenction operations (default: C<300>).
1241    
1242 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1243    
1244     The default value for the C<User-Agent> header (the default is
1245 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1246 root 1.1
1247 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1248 root 1.1
1249 root 1.47 The maximum number of concurrent connections to the same host (identified
1250 root 1.43 by the hostname). If the limit is exceeded, then the additional requests
1251 root 1.92 are queued until previous connections are closed. Both persistent and
1252     non-persistent connections are counted in this limit.
1253 root 1.1
1254 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1255 root 1.92 increase it much.
1256    
1257     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1258     connections, older browsers used 2, newers (such as firefox 3) typically
1259     use 6, and Opera uses 8 because like, they have the fastest browser and
1260     give a shit for everybody else on the planet.
1261    
1262     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1263    
1264     The time after which idle persistent conenctions get closed by
1265     AnyEvent::HTTP (default: C<3>).
1266 root 1.3
1267 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1268    
1269     The number of active connections. This is not the number of currently
1270     running requests, but the number of currently open and non-idle TCP
1271 root 1.92 connections. This number can be useful for load-leveling.
1272 root 1.14
1273 root 1.1 =back
1274    
1275     =cut
1276    
1277 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1278     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1279    
1280     sub format_date($) {
1281     my ($time) = @_;
1282    
1283     # RFC 822/1123 format
1284     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1285    
1286     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1287     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1288     $H, $M, $S;
1289     }
1290    
1291     sub parse_date($) {
1292     my ($date) = @_;
1293    
1294     my ($d, $m, $y, $H, $M, $S);
1295    
1296 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1297 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1298     # cookie dates (with "-")
1299    
1300 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1301    
1302 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1303 root 1.61 # RFC 850
1304     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1305    
1306 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1307 root 1.61 # ISO C's asctime
1308     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1309     }
1310     # other formats fail in the loop below
1311    
1312     for (0..11) {
1313     if ($m eq $month[$_]) {
1314     require Time::Local;
1315     return Time::Local::timegm ($S, $M, $H, $d, $_, $y);
1316     }
1317     }
1318    
1319     undef
1320     }
1321    
1322 root 1.2 sub set_proxy($) {
1323 root 1.52 if (length $_[0]) {
1324 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1325 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1326     $PROXY = [$2, $3 || 3128, $1]
1327     } else {
1328     undef $PROXY;
1329     }
1330 root 1.2 }
1331    
1332     # initialise proxy from environment
1333 root 1.52 eval {
1334     set_proxy $ENV{http_proxy};
1335     };
1336 root 1.2
1337 root 1.93 =head2 SHOWCASE
1338    
1339     This section contaisn some more elaborate "real-world" examples or code
1340     snippets.
1341    
1342     =head2 HTTP/1.1 FILE DOWNLOAD
1343    
1344 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1345 root 1.98 goes wrong and you want to resume.
1346 root 1.93
1347     Here is a function that initiates and resumes a download. It uses the
1348     last modified time to check for file content changes, and works with many
1349     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1350     on older servers.
1351    
1352     It calls the completion callback with either C<undef>, which means a
1353     nonretryable error occured, C<0> when the download was partial and should
1354     be retried, and C<1> if it was successful.
1355    
1356     use AnyEvent::HTTP;
1357    
1358     sub download($$$) {
1359     my ($url, $file, $cb) = @_;
1360    
1361     open my $fh, "+<", $file
1362     or die "$file: $!";
1363    
1364     my %hdr;
1365     my $ofs = 0;
1366    
1367     warn stat $fh;
1368     warn -s _;
1369     if (stat $fh and -s _) {
1370     $ofs = -s _;
1371     warn "-s is ", $ofs;#d#
1372     $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1373     $hdr{"range"} = "bytes=$ofs-";
1374     }
1375    
1376     http_get $url,
1377     headers => \%hdr,
1378     on_header => sub {
1379     my ($hdr) = @_;
1380    
1381     if ($hdr->{Status} == 200 && $ofs) {
1382     # resume failed
1383     truncate $fh, $ofs = 0;
1384     }
1385    
1386     sysseek $fh, $ofs, 0;
1387    
1388     1
1389     },
1390     on_body => sub {
1391     my ($data, $hdr) = @_;
1392    
1393     if ($hdr->{Status} =~ /^2/) {
1394     length $data == syswrite $fh, $data
1395     or return; # abort on write errors
1396     }
1397    
1398     1
1399     },
1400     sub {
1401     my (undef, $hdr) = @_;
1402    
1403     my $status = $hdr->{Status};
1404    
1405     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1406     utime $fh, $time, $time;
1407     }
1408    
1409     if ($status == 200 || $status == 206 || $status == 416) {
1410     # download ok || resume ok || file already fully downloaded
1411     $cb->(1, $hdr);
1412    
1413     } elsif ($status == 412) {
1414     # file has changed while resuming, delete and retry
1415     unlink $file;
1416     $cb->(0, $hdr);
1417    
1418     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1419     # retry later
1420     $cb->(0, $hdr);
1421    
1422     } else {
1423     $cb->(undef, $hdr);
1424     }
1425     }
1426     ;
1427     }
1428    
1429     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1430     if ($_[0]) {
1431     print "OK!\n";
1432     } elsif (defined $_[0]) {
1433     print "please retry later\n";
1434     } else {
1435     print "ERROR\n";
1436     }
1437     };
1438    
1439     =head3 SOCKS PROXIES
1440 root 1.60
1441     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1442     compile your perl to support socks, or use an external program such as
1443     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1444     transparently.
1445    
1446     Alternatively, for AnyEvent::HTTP only, you can use your own
1447     C<tcp_connect> function that does the proxy handshake - here is an example
1448     that works with socks4a proxies:
1449    
1450     use Errno;
1451     use AnyEvent::Util;
1452     use AnyEvent::Socket;
1453     use AnyEvent::Handle;
1454    
1455     # host, port and username of/for your socks4a proxy
1456     my $socks_host = "10.0.0.23";
1457     my $socks_port = 9050;
1458     my $socks_user = "";
1459    
1460     sub socks4a_connect {
1461     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1462    
1463     my $hdl = new AnyEvent::Handle
1464     connect => [$socks_host, $socks_port],
1465     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1466     on_error => sub { $connect_cb->() },
1467     ;
1468    
1469     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1470    
1471     $hdl->push_read (chunk => 8, sub {
1472     my ($hdl, $chunk) = @_;
1473     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1474    
1475     if ($status == 0x5a) {
1476     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1477     } else {
1478     $! = Errno::ENXIO; $connect_cb->();
1479     }
1480     });
1481    
1482     $hdl
1483     }
1484    
1485     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1486     possibly after switching off other proxy types:
1487    
1488     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1489    
1490     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1491     my ($data, $headers) = @_;
1492     ...
1493     };
1494    
1495 root 1.1 =head1 SEE ALSO
1496    
1497     L<AnyEvent>.
1498    
1499     =head1 AUTHOR
1500    
1501 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1502     http://home.schmorp.de/
1503 root 1.1
1504 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1505     testcases and bugreports.
1506    
1507 root 1.1 =cut
1508    
1509     1
1510