ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.122
Committed: Fri May 8 17:28:39 2015 UTC (9 years ago) by root
Branch: MAIN
Changes since 1.121: +12 -3 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.120 our $VERSION = 2.21;
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.117 (or C<undef> if an error occurred), and a hash-ref with response headers
95 root 1.68 (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128 root 1.114 =item 595 - errors during connection establishment, proxy handshake.
129 root 1.77
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
160     other retries and so on, and how often to do so.
161 root 1.1
162 root 1.119 Only redirects to http and https URLs are supported. While most common
163     redirection forms are handled entirely within this module, some require
164     the use of the optional L<URI> module. If it is required but missing, then
165     the request will fail with an error.
166    
167 root 1.1 =item headers => hashref
168    
169 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
170     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
171 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
172     (this can be suppressed by using C<undef> for these headers in which case
173     they won't be sent at all).
174 root 1.1
175 root 1.90 You really should provide your own C<User-Agent:> header value that is
176     appropriate for your program - I wouldn't be surprised if the default
177     AnyEvent string gets blocked by webservers sooner or later.
178    
179 root 1.104 Also, make sure that your headers names and values do not contain any
180     embedded newlines.
181    
182 root 1.1 =item timeout => $seconds
183    
184     The time-out to use for various stages - each connect attempt will reset
185 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
186     timeout.
187    
188     Default timeout is 5 minutes.
189 root 1.2
190     =item proxy => [$host, $port[, $scheme]] or undef
191    
192 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
193     used.
194 root 1.2
195 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
196 root 1.1
197 root 1.102 If not specified, then the default proxy is used (see
198     C<AnyEvent::HTTP::set_proxy>).
199    
200 root 1.3 =item body => $string
201    
202 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
203 root 1.3 this module might offer more options).
204    
205 root 1.10 =item cookie_jar => $hash_ref
206    
207     Passing this parameter enables (simplified) cookie-processing, loosely
208     based on the original netscape specification.
209    
210 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
211     will get updated automatically. It is possible to save the cookie jar
212     to persistent storage with something like JSON or Storable - see the
213     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
214     expired or session-only cookies, and also for documentation on the format
215     of the cookie jar.
216 root 1.10
217 root 1.70 Note that this cookie implementation is not meant to be complete. If
218     you want complete cookie management you have to do that on your
219 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
220 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
221     to.
222 root 1.10
223 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
224 root 1.70 headers will be set and handled by this module, otherwise they will be
225 root 1.69 left untouched.
226    
227 root 1.40 =item tls_ctx => $scheme | $tls_ctx
228    
229     Specifies the AnyEvent::TLS context to be used for https connections. This
230     parameter follows the same rules as the C<tls_ctx> parameter to
231     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
232     C<high> can be specified, which give you a predefined low-security (no
233     verification, highest compatibility) and high-security (CA and common-name
234     verification) TLS context.
235    
236     The default for this option is C<low>, which could be interpreted as "give
237     me the page, no matter what".
238    
239 root 1.92 See also the C<sessionid> parameter.
240    
241     =item session => $string
242    
243     The module might reuse connections to the same host internally. Sometimes
244     (e.g. when using TLS), you do not want to reuse connections from other
245     sessions. This can be achieved by setting this parameter to some unique
246     ID (such as the address of an object storing your state data, or the TLS
247     context) - only connections using the same unique ID will be reused.
248    
249 root 1.51 =item on_prepare => $callback->($fh)
250    
251     In rare cases you need to "tune" the socket before it is used to
252 root 1.117 connect (for example, to bind it on a given IP address). This parameter
253 root 1.51 overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
254     and behaves exactly the same way (e.g. it has to provide a
255     timeout). See the description for the C<$prepare_cb> argument of
256     C<AnyEvent::Socket::tcp_connect> for details.
257    
258 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
259    
260     In even rarer cases you want total control over how AnyEvent::HTTP
261     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
262     to do this, but you can provide your own C<tcp_connect> function -
263 root 1.60 obviously, it has to follow the same calling conventions, except that it
264     may always return a connection guard object.
265 root 1.59
266     There are probably lots of weird uses for this function, starting from
267     tracing the hosts C<http_request> actually tries to connect, to (inexact
268     but fast) host => IP address caching or even socks protocol support.
269    
270 root 1.42 =item on_header => $callback->($headers)
271 root 1.41
272     When specified, this callback will be called with the header hash as soon
273     as headers have been successfully received from the remote server (not on
274     locally-generated errors).
275    
276     It has to return either true (in which case AnyEvent::HTTP will continue),
277     or false, in which case AnyEvent::HTTP will cancel the download (and call
278     the finish callback with an error code of C<598>).
279    
280     This callback is useful, among other things, to quickly reject unwanted
281     content, which, if it is supposed to be rare, can be faster than first
282     doing a C<HEAD> request.
283    
284 root 1.68 The downside is that cancelling the request makes it impossible to re-use
285     the connection. Also, the C<on_header> callback will not receive any
286     trailer (headers sent after the response body).
287    
288 root 1.42 Example: cancel the request unless the content-type is "text/html".
289 root 1.41
290 root 1.42 on_header => sub {
291     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
292     },
293 root 1.41
294 root 1.42 =item on_body => $callback->($partial_body, $headers)
295 root 1.41
296 root 1.42 When specified, all body data will be passed to this callback instead of
297     to the completion callback. The completion callback will get the empty
298     string instead of the body data.
299 root 1.41
300 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
301     or false, in which case AnyEvent::HTTP will cancel the download (and call
302     the completion callback with an error code of C<598>).
303    
304 root 1.68 The downside to cancelling the request is that it makes it impossible to
305     re-use the connection.
306    
307 root 1.42 This callback is useful when the data is too large to be held in memory
308     (so the callback writes it to a file) or when only some information should
309     be extracted, or when the body should be processed incrementally.
310 root 1.41
311     It is usually preferred over doing your own body handling via
312 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
313     only used to create a connection, C<want_body_handle> is the better
314     alternative, as it allows you to install your own event handler, reducing
315     resource usage.
316 root 1.41
317     =item want_body_handle => $enable
318    
319     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
320     changes considerably: after parsing the headers, and instead of
321     downloading the body (if any), the completion callback will be
322     called. Instead of the C<$body> argument containing the body data, the
323     callback will receive the L<AnyEvent::Handle> object associated with the
324     connection. In error cases, C<undef> will be passed. When there is no body
325     (e.g. status C<304>), the empty string will be passed.
326    
327 root 1.92 The handle object might or might not be in TLS mode, might be connected
328     to a proxy, be a persistent connection, use chunked transfer encoding
329     etc., and configured in unspecified ways. The user is responsible for this
330     handle (it will not be used by this module anymore).
331 root 1.41
332     This is useful with some push-type services, where, after the initial
333     headers, an interactive protocol is used (typical example would be the
334     push-style twitter API which starts a JSON/XML stream).
335    
336     If you think you need this, first have a look at C<on_body>, to see if
337 root 1.45 that doesn't solve your problem in a better way.
338 root 1.41
339 root 1.92 =item persistent => $boolean
340    
341     Try to create/reuse a persistent connection. When this flag is set
342     (default: true for idempotent requests, false for all others), then
343     C<http_request> tries to re-use an existing (previously-created)
344     persistent connection to the host and, failing that, tries to create a new
345     one.
346    
347     Requests failing in certain ways will be automatically retried once, which
348     is dangerous for non-idempotent requests, which is why it defaults to off
349     for them. The reason for this is because the bozos who designed HTTP/1.1
350     made it impossible to distinguish between a fatal error and a normal
351     connection timeout, so you never know whether there was a problem with
352     your request or not.
353    
354     When reusing an existent connection, many parameters (such as TLS context)
355     will be ignored. See the C<session> parameter for a workaround.
356    
357     =item keepalive => $boolean
358    
359     Only used when C<persistent> is also true. This parameter decides whether
360     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
361     (as opposed to only a HTTP/1.1 persistent connection).
362    
363     The default is true, except when using a proxy, in which case it defaults
364     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
365    
366     =item handle_params => { key => value ... }
367    
368     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
369     constructor that is called - not all requests will create a handle, and
370     sometimes more than one is created, so this parameter is only good for
371     setting hints.
372    
373     Example: set the maximum read size to 4096, to potentially conserve memory
374     at the cost of speed.
375    
376     handle_params => {
377     max_read_size => 4096,
378     },
379    
380 root 1.1 =back
381    
382 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
383     the response body.
384 root 1.9
385     http_request GET => "http://www.nethype.de/", sub {
386     my ($body, $hdr) = @_;
387     print "$body\n";
388     };
389    
390 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
391 root 1.9 timeout of 30 seconds.
392    
393     http_request
394 root 1.109 HEAD => "https://www.google.com",
395 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
396 root 1.9 timeout => 30,
397     sub {
398     my ($body, $hdr) = @_;
399     use Data::Dumper;
400     print Dumper $hdr;
401     }
402     ;
403 root 1.1
404 root 1.68 Example: do another simple HTTP GET request, but immediately try to
405 root 1.29 cancel it.
406    
407     my $request = http_request GET => "http://www.nethype.de/", sub {
408     my ($body, $hdr) = @_;
409     print "$body\n";
410     };
411    
412     undef $request;
413    
414 root 1.1 =cut
415    
416 root 1.92 #############################################################################
417     # wait queue/slots
418    
419 root 1.12 sub _slot_schedule;
420 root 1.11 sub _slot_schedule($) {
421     my $host = shift;
422    
423     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
424     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
425 root 1.12 # somebody wants that slot
426 root 1.11 ++$CO_SLOT{$host}[0];
427 root 1.14 ++$ACTIVE;
428 root 1.11
429     $cb->(AnyEvent::Util::guard {
430 root 1.14 --$ACTIVE;
431 root 1.11 --$CO_SLOT{$host}[0];
432     _slot_schedule $host;
433     });
434     } else {
435     # nobody wants the slot, maybe we can forget about it
436     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
437     last;
438     }
439     }
440     }
441    
442     # wait for a free slot on host, call callback
443     sub _get_slot($$) {
444     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
445    
446     _slot_schedule $_[0];
447     }
448    
449 root 1.80 #############################################################################
450 root 1.92 # cookie handling
451 root 1.80
452     # expire cookies
453     sub cookie_jar_expire($;$) {
454     my ($jar, $session_end) = @_;
455    
456     %$jar = () if $jar->{version} != 1;
457    
458     my $anow = AE::now;
459    
460     while (my ($chost, $paths) = each %$jar) {
461     next unless ref $paths;
462    
463     while (my ($cpath, $cookies) = each %$paths) {
464     while (my ($cookie, $kv) = each %$cookies) {
465     if (exists $kv->{_expires}) {
466     delete $cookies->{$cookie}
467     if $anow > $kv->{_expires};
468     } elsif ($session_end) {
469     delete $cookies->{$cookie};
470     }
471     }
472    
473     delete $paths->{$cpath}
474     unless %$cookies;
475     }
476    
477     delete $jar->{$chost}
478     unless %$paths;
479     }
480     }
481    
482 root 1.72 # extract cookies from jar
483 root 1.71 sub cookie_jar_extract($$$$) {
484 root 1.92 my ($jar, $scheme, $host, $path) = @_;
485 root 1.71
486     %$jar = () if $jar->{version} != 1;
487    
488     my @cookies;
489    
490     while (my ($chost, $paths) = each %$jar) {
491     next unless ref $paths;
492    
493     if ($chost =~ /^\./) {
494 root 1.92 next unless $chost eq substr $host, -length $chost;
495 root 1.71 } elsif ($chost =~ /\./) {
496 root 1.92 next unless $chost eq $host;
497 root 1.71 } else {
498     next;
499     }
500    
501     while (my ($cpath, $cookies) = each %$paths) {
502 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
503 root 1.71
504     while (my ($cookie, $kv) = each %$cookies) {
505 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
506 root 1.71
507 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
508     delete $cookies->{$cookie};
509     next;
510 root 1.71 }
511    
512     my $value = $kv->{value};
513    
514     if ($value =~ /[=;,[:space:]]/) {
515     $value =~ s/([\\"])/\\$1/g;
516     $value = "\"$value\"";
517     }
518    
519     push @cookies, "$cookie=$value";
520     }
521     }
522     }
523    
524     \@cookies
525     }
526    
527 root 1.72 # parse set_cookie header into jar
528 root 1.80 sub cookie_jar_set_cookie($$$$) {
529 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
530 root 1.80
531     my $anow = int AE::now;
532     my $snow; # server-now
533 root 1.72
534     for ($set_cookie) {
535     # parse NAME=VALUE
536     my @kv;
537    
538 root 1.79 # expires is not http-compliant in the original cookie-spec,
539     # we support the official date format and some extensions
540 root 1.72 while (
541     m{
542     \G\s*
543     (?:
544 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
545 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
546 root 1.72 )
547     }gcxsi
548     ) {
549     my $name = $2;
550     my $value = $4;
551    
552 root 1.82 if (defined $1) {
553 root 1.72 # expires
554     $name = "expires";
555     $value = $1;
556 root 1.82 } elsif (defined $3) {
557 root 1.72 # quoted
558     $value = $3;
559     $value =~ s/\\(.)/$1/gs;
560     }
561    
562 root 1.95 push @kv, @kv ? lc $name : $name, $value;
563 root 1.72
564     last unless /\G\s*;/gc;
565     }
566    
567     last unless @kv;
568    
569     my $name = shift @kv;
570     my %kv = (value => shift @kv, @kv);
571    
572 root 1.80 if (exists $kv{"max-age"}) {
573     $kv{_expires} = $anow + delete $kv{"max-age"};
574     } elsif (exists $kv{expires}) {
575     $snow ||= parse_date ($date) || $anow;
576     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
577     } else {
578     delete $kv{_expires};
579     }
580 root 1.72
581     my $cdom;
582     my $cpath = (delete $kv{path}) || "/";
583    
584     if (exists $kv{domain}) {
585     $cdom = delete $kv{domain};
586    
587     $cdom =~ s/^\.?/./; # make sure it starts with a "."
588    
589     next if $cdom =~ /\.$/;
590    
591     # this is not rfc-like and not netscape-like. go figure.
592     my $ndots = $cdom =~ y/.//;
593     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
594     } else {
595 root 1.92 $cdom = $host;
596 root 1.72 }
597    
598     # store it
599 root 1.73 $jar->{version} = 1;
600 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
601 root 1.72
602     redo if /\G\s*,/gc;
603     }
604     }
605    
606 root 1.92 #############################################################################
607     # keepalive/persistent connection cache
608    
609     # fetch a connection from the keepalive cache
610     sub ka_fetch($) {
611     my $ka_key = shift;
612    
613     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
614     delete $KA_CACHE{$ka_key}
615     unless @{ $KA_CACHE{$ka_key} };
616    
617     $hdl
618     }
619    
620     sub ka_store($$) {
621     my ($ka_key, $hdl) = @_;
622    
623     my $kaa = $KA_CACHE{$ka_key} ||= [];
624    
625     my $destroy = sub {
626     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
627    
628     $hdl->destroy;
629    
630     @ka
631     ? $KA_CACHE{$ka_key} = \@ka
632     : delete $KA_CACHE{$ka_key};
633     };
634    
635     # on error etc., destroy
636     $hdl->on_error ($destroy);
637     $hdl->on_eof ($destroy);
638     $hdl->on_read ($destroy);
639     $hdl->timeout ($PERSISTENT_TIMEOUT);
640    
641     push @$kaa, $hdl;
642     shift @$kaa while @$kaa > $MAX_PER_HOST;
643     }
644    
645     #############################################################################
646     # utilities
647    
648 root 1.66 # continue to parse $_ for headers and place them into the arg
649 root 1.92 sub _parse_hdr() {
650 root 1.66 my %hdr;
651    
652     # things seen, not parsed:
653     # p3pP="NON CUR OTPi OUR NOR UNI"
654    
655     $hdr{lc $1} .= ",$2"
656     while /\G
657     ([^:\000-\037]*):
658     [\011\040]*
659     ((?: [^\012]+ | \012[\011\040] )*)
660     \012
661     /gxc;
662    
663     /\G$/
664     or return;
665    
666     # remove the "," prefix we added to all headers above
667     substr $_, 0, 1, ""
668     for values %hdr;
669    
670     \%hdr
671     }
672    
673 root 1.92 #############################################################################
674     # http_get
675    
676 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
677 root 1.34
678 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
679     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
680 root 1.40
681 root 1.92 # maybe it should just become a normal object :/
682    
683     sub _destroy_state(\%) {
684     my ($state) = @_;
685    
686     $state->{handle}->destroy if $state->{handle};
687     %$state = ();
688     }
689    
690     sub _error(\%$$) {
691     my ($state, $cb, $hdr) = @_;
692    
693     &_destroy_state ($state);
694    
695     $cb->(undef, $hdr);
696     ()
697     }
698    
699 root 1.116 our %IDEMPOTENT = (
700     DELETE => 1,
701     GET => 1,
702     HEAD => 1,
703     OPTIONS => 1,
704     PUT => 1,
705     TRACE => 1,
706    
707     ACL => 1,
708     "BASELINE-CONTROL" => 1,
709     BIND => 1,
710     CHECKIN => 1,
711     CHECKOUT => 1,
712     COPY => 1,
713     LABEL => 1,
714     LINK => 1,
715     MERGE => 1,
716     MKACTIVITY => 1,
717     MKCALENDAR => 1,
718     MKCOL => 1,
719     MKREDIRECTREF => 1,
720     MKWORKSPACE => 1,
721     MOVE => 1,
722     ORDERPATCH => 1,
723     PROPFIND => 1,
724     PROPPATCH => 1,
725     REBIND => 1,
726     REPORT => 1,
727     SEARCH => 1,
728     UNBIND => 1,
729     UNCHECKOUT => 1,
730     UNLINK => 1,
731     UNLOCK => 1,
732     UPDATE => 1,
733     UPDATEREDIRECTREF => 1,
734     "VERSION-CONTROL" => 1,
735     );
736    
737 elmex 1.15 sub http_request($$@) {
738 root 1.1 my $cb = pop;
739     my ($method, $url, %arg) = @_;
740    
741     my %hdr;
742    
743 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
744     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
745    
746 root 1.3 $method = uc $method;
747    
748 root 1.8 if (my $hdr = $arg{headers}) {
749 root 1.1 while (my ($k, $v) = each %$hdr) {
750     $hdr{lc $k} = $v;
751     }
752     }
753    
754 root 1.55 # pseudo headers for all subsequent responses
755     my @pseudo = (URL => $url);
756     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
757    
758 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
759 root 1.8
760 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
761 root 1.8 if $recurse < 0;
762    
763 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
764 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
765    
766 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
767 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
768 root 1.2
769 root 1.31 $uscheme = lc $uscheme;
770 root 1.1
771 root 1.31 my $uport = $uscheme eq "http" ? 80
772     : $uscheme eq "https" ? 443
773 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
774 root 1.13
775 root 1.31 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
776 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
777 root 1.10
778 root 1.86 my $uhost = lc $1;
779 root 1.10 $uport = $2 if defined $2;
780    
781 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
782     unless exists $hdr{host};
783 root 1.43
784 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
785 root 1.56 $upath .= $query if length $query;
786 root 1.10
787     $upath =~ s%^/?%/%;
788    
789     # cookie processing
790     if (my $jar = $arg{cookie_jar}) {
791 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
792 root 1.70
793 root 1.71 $hdr{cookie} = join "; ", @$cookies
794     if @$cookies;
795 root 1.10 }
796 root 1.1
797 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
798 root 1.2
799 root 1.10 if ($proxy) {
800 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
801 root 1.31
802 root 1.47 $rscheme = "http" unless defined $rscheme;
803    
804 root 1.31 # don't support https requests over https-proxy transport,
805 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
806 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
807 root 1.86
808     $rhost = lc $rhost;
809     $rscheme = lc $rscheme;
810 root 1.10 } else {
811 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
812 root 1.2 }
813    
814 root 1.47 # leave out fragment and query string, just a heuristic
815 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
816     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
817 root 1.41
818 root 1.53 $hdr{"content-length"} = length $arg{body}
819     if length $arg{body} || $method ne "GET";
820 root 1.1
821 root 1.116 my $idempotent = $IDEMPOTENT{$method};
822 root 1.89
823     # default value for keepalive is true iff the request is for an idempotent method
824 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
825     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
826     my $was_persistent; # true if this is actually a recycled connection
827 root 1.92
828     # the key to use in the keepalive cache
829 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
830 root 1.89
831 root 1.118 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive, " : "" : "close, ") . "Te"; #1.1
832 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
833 root 1.66
834 root 1.11 my %state = (connect_guard => 1);
835    
836 root 1.88 my $ae_error = 595; # connecting
837 root 1.1
838 root 1.88 # handle actual, non-tunneled, request
839     my $handle_actual_request = sub {
840     $ae_error = 596; # request phase
841    
842 root 1.92 my $hdl = $state{handle};
843    
844     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
845 root 1.88
846     # send request
847 root 1.92 $hdl->push_write (
848 root 1.88 "$method $rpath HTTP/1.1\015\012"
849     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
850     . "\015\012"
851 root 1.120 . $arg{body}
852 root 1.88 );
853    
854 root 1.117 # return if error occurred during push_write()
855 root 1.88 return unless %state;
856    
857 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
858     %hdr = ();
859 root 1.88
860     # status line and headers
861     $state{read_response} = sub {
862 root 1.92 return unless %state;
863    
864 root 1.88 for ("$_[1]") {
865     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
866    
867     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
868 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
869 root 1.88
870     # 100 Continue handling
871     # should not happen as we don't send expect: 100-continue,
872     # but we handle it just in case.
873     # since we send the request body regardless, if we get an error
874     # we are out of-sync, which we currently do NOT handle correctly.
875     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
876     if $2 eq 100;
877    
878     push @pseudo,
879     HTTPVersion => $1,
880     Status => $2,
881     Reason => $3,
882     ;
883 root 1.1
884 root 1.92 my $hdr = _parse_hdr
885     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
886 root 1.77
887 root 1.88 %hdr = (%$hdr, @pseudo);
888     }
889 root 1.82
890 root 1.88 # redirect handling
891 root 1.119 # relative uri handling forced by microsoft and other shitheads.
892     # we give our best and fall back to URI if available.
893     if (exists $hdr{location}) {
894     my $loc = $hdr{location};
895    
896     if ($loc =~ m%^//%) { # //
897     $loc = "$rscheme:$loc";
898    
899     } elsif ($loc eq "") {
900     $loc = $url;
901    
902     } elsif ($loc !~ /^(?: $ | [^:\/?\#]+ : )/x) { # anything "simple"
903     $loc =~ s/^\.\/+//;
904    
905     if ($loc !~ m%^[.?#]%) {
906     my $prefix = "$rscheme://$uhost:$uport";
907    
908     unless ($loc =~ s/^\///) {
909     $prefix .= $upath;
910     $prefix =~ s/\/[^\/]*$//;
911     }
912    
913     $loc = "$prefix/$loc";
914    
915     } elsif (eval { require URI }) { # uri
916     $loc = URI->new_abs ($loc, $url)->as_string;
917    
918     } else {
919     return _error %state, $cb, { @pseudo, Status => 599, Reason => "Cannot parse Location (URI module missing)" };
920     #$hdr{Status} = 599;
921     #$hdr{Reason} = "Unparsable Redirect (URI module missing)";
922     #$recurse = 0;
923     }
924 root 1.82 }
925    
926 root 1.119 $hdr{location} = $loc;
927 root 1.88 }
928 root 1.82
929 root 1.88 my $redirect;
930 root 1.82
931 root 1.88 if ($recurse) {
932     my $status = $hdr{Status};
933 root 1.82
934 root 1.88 # industry standard is to redirect POST as GET for
935     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
936     # also, the UA should ask the user for 301 and 307 and POST,
937     # industry standard seems to be to simply follow.
938 root 1.119 # we go with the industry standard. 308 is defined
939 root 1.121 # by rfc7538
940 root 1.88 if ($status == 301 or $status == 302 or $status == 303) {
941 root 1.122 $redirect = 1;
942 root 1.88 # HTTP/1.1 is unclear on how to mutate the method
943 root 1.122 unless ($method eq "HEAD") {
944     $method = "GET";
945     delete $arg{body};
946     }
947 root 1.119 } elsif ($status == 307 or $status == 308) {
948 root 1.88 $redirect = 1;
949 root 1.82 }
950 root 1.88 }
951 root 1.82
952 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
953 root 1.92 if ($state{handle}) {
954     # handle keepalive
955     if (
956 root 1.103 $persistent
957 root 1.92 && $_[3]
958     && ($hdr{HTTPVersion} < 1.1
959     ? $hdr{connection} =~ /\bkeep-?alive\b/i
960     : $hdr{connection} !~ /\bclose\b/i)
961     ) {
962     ka_store $ka_key, delete $state{handle};
963     } else {
964     # no keepalive, destroy the handle
965     $state{handle}->destroy;
966     }
967     }
968 root 1.82
969 root 1.88 %state = ();
970 root 1.82
971 root 1.88 if (defined $_[1]) {
972     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
973     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
974     }
975 root 1.82
976 root 1.88 # set-cookie processing
977     if ($arg{cookie_jar}) {
978     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
979     }
980 root 1.82
981 root 1.88 if ($redirect && exists $hdr{location}) {
982     # we ignore any errors, as it is very common to receive
983     # Content-Length != 0 but no actual body
984     # we also access %hdr, as $_[1] might be an erro
985 root 1.103 $state{recurse} =
986     http_request (
987     $method => $hdr{location},
988     %arg,
989     recurse => $recurse - 1,
990     Redirect => [$_[0], \%hdr],
991     sub {
992     %state = ();
993     &$cb
994     },
995     );
996 root 1.88 } else {
997     $cb->($_[0], \%hdr);
998     }
999     };
1000 root 1.82
1001 root 1.88 $ae_error = 597; # body phase
1002 root 1.82
1003 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
1004    
1005     my $len = $chunked ? undef : $hdr{"content-length"};
1006 root 1.82
1007 root 1.88 # body handling, many different code paths
1008     # - no body expected
1009     # - want_body_handle
1010     # - te chunked
1011     # - 2x length known (with or without on_body)
1012     # - 2x length not known (with or without on_body)
1013     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
1014     $finish->(undef, 598 => "Request cancelled by on_header");
1015     } elsif (
1016     $hdr{Status} =~ /^(?:1..|204|205|304)$/
1017     or $method eq "HEAD"
1018     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
1019     ) {
1020     # no body
1021     $finish->("", undef, undef, 1);
1022    
1023     } elsif (!$redirect && $arg{want_body_handle}) {
1024     $_[0]->on_eof (undef);
1025     $_[0]->on_error (undef);
1026     $_[0]->on_read (undef);
1027    
1028     $finish->(delete $state{handle});
1029    
1030 root 1.91 } elsif ($chunked) {
1031 root 1.88 my $cl = 0;
1032 root 1.92 my $body = "";
1033 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
1034    
1035     $state{read_chunk} = sub {
1036     $_[1] =~ /^([0-9a-fA-F]+)/
1037 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1038 root 1.82
1039 root 1.88 my $len = hex $1;
1040 root 1.82
1041 root 1.88 if ($len) {
1042     $cl += $len;
1043 root 1.82
1044 root 1.88 $_[0]->push_read (chunk => $len, sub {
1045     $on_body->($_[1], \%hdr)
1046     or return $finish->(undef, 598 => "Request cancelled by on_body");
1047 root 1.82
1048 root 1.88 $_[0]->push_read (line => sub {
1049     length $_[1]
1050     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1051     $_[0]->push_read (line => $state{read_chunk});
1052 root 1.82 });
1053 root 1.88 });
1054     } else {
1055     $hdr{"content-length"} ||= $cl;
1056 root 1.84
1057 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
1058     if (length $_[1]) {
1059     for ("$_[1]") {
1060     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
1061 root 1.84
1062 root 1.92 my $hdr = _parse_hdr
1063 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
1064 root 1.84
1065 root 1.88 %hdr = (%hdr, %$hdr);
1066     }
1067     }
1068 root 1.84
1069 root 1.88 $finish->($body, undef, undef, 1);
1070 root 1.84 });
1071     }
1072 root 1.88 };
1073    
1074     $_[0]->push_read (line => $state{read_chunk});
1075    
1076     } elsif ($arg{on_body}) {
1077     if (defined $len) {
1078     $_[0]->on_read (sub {
1079     $len -= length $_[0]{rbuf};
1080    
1081     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1082     or return $finish->(undef, 598 => "Request cancelled by on_body");
1083    
1084     $len > 0
1085     or $finish->("", undef, undef, 1);
1086     });
1087 root 1.84 } else {
1088 root 1.88 $_[0]->on_eof (sub {
1089     $finish->("");
1090     });
1091     $_[0]->on_read (sub {
1092     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1093     or $finish->(undef, 598 => "Request cancelled by on_body");
1094     });
1095     }
1096     } else {
1097     $_[0]->on_eof (undef);
1098 root 1.82
1099 root 1.88 if (defined $len) {
1100     $_[0]->on_read (sub {
1101     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1102     if $len <= length $_[0]{rbuf};
1103     });
1104     } else {
1105     $_[0]->on_error (sub {
1106     ($! == Errno::EPIPE || !$!)
1107     ? $finish->(delete $_[0]{rbuf})
1108     : $finish->(undef, $ae_error => $_[2]);
1109     });
1110     $_[0]->on_read (sub { });
1111 root 1.82 }
1112 root 1.88 }
1113     };
1114 root 1.82
1115 root 1.92 # if keepalive is enabled, then the server closing the connection
1116     # before a response can happen legally - we retry on idempotent methods.
1117 root 1.103 if ($was_persistent && $idempotent) {
1118 root 1.92 my $old_eof = $hdl->{on_eof};
1119     $hdl->{on_eof} = sub {
1120     _destroy_state %state;
1121    
1122 root 1.103 %state = ();
1123     $state{recurse} =
1124     http_request (
1125 root 1.112 $method => $url,
1126 root 1.103 %arg,
1127 root 1.112 recurse => $recurse - 1,
1128 root 1.103 keepalive => 0,
1129     sub {
1130     %state = ();
1131     &$cb
1132     }
1133     );
1134 root 1.92 };
1135     $hdl->on_read (sub {
1136     return unless %state;
1137    
1138     # as soon as we receive something, a connection close
1139     # once more becomes a hard error
1140     $hdl->{on_eof} = $old_eof;
1141     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1142     });
1143     } else {
1144     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1145     }
1146     };
1147    
1148     my $prepare_handle = sub {
1149     my ($hdl) = $state{handle};
1150    
1151     $hdl->on_error (sub {
1152     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1153     });
1154     $hdl->on_eof (sub {
1155     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1156     });
1157 root 1.103 $hdl->timeout_reset;
1158     $hdl->timeout ($timeout);
1159 root 1.88 };
1160 root 1.82
1161 root 1.92 # connected to proxy (or origin server)
1162 root 1.88 my $connect_cb = sub {
1163 root 1.92 my $fh = shift
1164     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1165 root 1.44
1166 root 1.88 return unless delete $state{connect_guard};
1167 root 1.11
1168 root 1.88 # get handle
1169     $state{handle} = new AnyEvent::Handle
1170 root 1.92 %{ $arg{handle_params} },
1171     fh => $fh,
1172     peername => $uhost,
1173 root 1.88 tls_ctx => $arg{tls_ctx},
1174     ;
1175 root 1.11
1176 root 1.92 $prepare_handle->();
1177 root 1.1
1178 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1179 root 1.88
1180     # now handle proxy-CONNECT method
1181     if ($proxy && $uscheme eq "https") {
1182     # oh dear, we have to wrap it into a connect request
1183    
1184 root 1.122 my $auth = exists $hdr{"proxy-authorization"}
1185     ? "proxy-authorization: " . (delete $hdr{"proxy-authorization"}) . "\015\012"
1186     : "";
1187    
1188 root 1.88 # maybe re-use $uauthority with patched port?
1189 root 1.122 $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012$auth\015\012");
1190 root 1.88 $state{handle}->push_read (line => $qr_nlnl, sub {
1191     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1192 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1193 root 1.88
1194     if ($2 == 200) {
1195     $rpath = $upath;
1196     $handle_actual_request->();
1197     } else {
1198 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1199 root 1.88 }
1200     });
1201     } else {
1202 root 1.122 delete $hdr{"proxy-authorization"} unless $proxy;
1203    
1204 root 1.88 $handle_actual_request->();
1205     }
1206     };
1207    
1208     _get_slot $uhost, sub {
1209     $state{slot_guard} = shift;
1210 root 1.64
1211 root 1.88 return unless $state{connect_guard};
1212 root 1.64
1213 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1214     # on a keepalive request (in theory, this should be a separate config option).
1215 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1216     $was_persistent = 1;
1217    
1218 root 1.92 $state{handle} = ka_fetch $ka_key;
1219 root 1.103 $state{handle}->destroyed
1220 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1221 root 1.92 $prepare_handle->();
1222 root 1.103 $state{handle}->destroyed
1223 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1224 root 1.92 $handle_actual_request->();
1225    
1226     } else {
1227     my $tcp_connect = $arg{tcp_connect}
1228     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1229 root 1.57
1230 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1231     }
1232 root 1.1 };
1233    
1234 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1235 root 1.1 }
1236    
1237 elmex 1.15 sub http_get($@) {
1238 root 1.1 unshift @_, "GET";
1239     &http_request
1240     }
1241    
1242 elmex 1.15 sub http_head($@) {
1243 root 1.4 unshift @_, "HEAD";
1244     &http_request
1245     }
1246    
1247 elmex 1.15 sub http_post($$@) {
1248 root 1.22 my $url = shift;
1249     unshift @_, "POST", $url, "body";
1250 root 1.3 &http_request
1251     }
1252    
1253 root 1.9 =back
1254    
1255 root 1.55 =head2 DNS CACHING
1256    
1257     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1258     the actual connection, which in turn uses AnyEvent::DNS to resolve
1259     hostnames. The latter is a simple stub resolver and does no caching
1260     on its own. If you want DNS caching, you currently have to provide
1261     your own default resolver (by storing a suitable resolver object in
1262 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1263 root 1.55
1264 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1265 root 1.1
1266     =over 4
1267    
1268 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1269    
1270     Sets the default proxy server to use. The proxy-url must begin with a
1271 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1272 root 1.52
1273     To clear an already-set proxy, use C<undef>.
1274 root 1.2
1275 root 1.113 When AnyEvent::HTTP is loaded for the first time it will query the
1276 root 1.102 default proxy from the operating system, currently by looking at
1277     C<$ENV{http_proxy>}.
1278    
1279 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1280    
1281     Remove all cookies from the cookie jar that have been expired. If
1282     C<$session_end> is given and true, then additionally remove all session
1283     cookies.
1284    
1285     You should call this function (with a true C<$session_end>) before you
1286     save cookies to disk, and you should call this function after loading them
1287 root 1.117 again. If you have a long-running program you can additionally call this
1288 root 1.80 function from time to time.
1289    
1290     A cookie jar is initially an empty hash-reference that is managed by this
1291 root 1.121 module. Its format is subject to change, but currently it is as follows:
1292 root 1.80
1293     The key C<version> has to contain C<1>, otherwise the hash gets
1294     emptied. All other keys are hostnames or IP addresses pointing to
1295     hash-references. The key for these inner hash references is the
1296     server path for which this cookie is meant, and the values are again
1297 root 1.115 hash-references. Each key of those hash-references is a cookie name, and
1298 root 1.80 the value, you guessed it, is another hash-reference, this time with the
1299     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1300     which have been replaced by a C<_expires> key that contains the cookie
1301 root 1.115 expiry timestamp. Session cookies are indicated by not having an
1302     C<_expires> key.
1303 root 1.80
1304     Here is an example of a cookie jar with a single cookie, so you have a
1305     chance of understanding the above paragraph:
1306    
1307     {
1308     version => 1,
1309     "10.0.0.1" => {
1310     "/" => {
1311     "mythweb_id" => {
1312     _expires => 1293917923,
1313     value => "ooRung9dThee3ooyXooM1Ohm",
1314     },
1315     },
1316     },
1317     }
1318    
1319 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1320    
1321     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1322     Date (RFC 2616).
1323    
1324     =item $timestamp = AnyEvent::HTTP::parse_date $date
1325    
1326 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1327     bunch of minor variations of those, and returns the corresponding POSIX
1328     timestamp, or C<undef> if the date cannot be parsed.
1329 root 1.61
1330 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1331 root 1.1
1332 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1333 root 1.1
1334 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1335    
1336 root 1.110 The default timeout for connection operations (default: C<300>).
1337 root 1.92
1338 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1339    
1340     The default value for the C<User-Agent> header (the default is
1341 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1342 root 1.1
1343 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1344 root 1.1
1345 root 1.47 The maximum number of concurrent connections to the same host (identified
1346 root 1.121 by the hostname). If the limit is exceeded, then additional requests
1347 root 1.92 are queued until previous connections are closed. Both persistent and
1348     non-persistent connections are counted in this limit.
1349 root 1.1
1350 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1351 root 1.92 increase it much.
1352    
1353     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1354 root 1.117 connections, older browsers used 2, newer ones (such as firefox 3)
1355     typically use 6, and Opera uses 8 because like, they have the fastest
1356     browser and give a shit for everybody else on the planet.
1357 root 1.92
1358     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1359    
1360 root 1.116 The time after which idle persistent connections get closed by
1361 root 1.92 AnyEvent::HTTP (default: C<3>).
1362 root 1.3
1363 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1364    
1365     The number of active connections. This is not the number of currently
1366     running requests, but the number of currently open and non-idle TCP
1367 root 1.92 connections. This number can be useful for load-leveling.
1368 root 1.14
1369 root 1.1 =back
1370    
1371     =cut
1372    
1373 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1374     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1375    
1376     sub format_date($) {
1377     my ($time) = @_;
1378    
1379     # RFC 822/1123 format
1380     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1381    
1382     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1383     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1384     $H, $M, $S;
1385     }
1386    
1387     sub parse_date($) {
1388     my ($date) = @_;
1389    
1390     my ($d, $m, $y, $H, $M, $S);
1391    
1392 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1393 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1394     # cookie dates (with "-")
1395    
1396 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1397    
1398 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1399 root 1.61 # RFC 850
1400     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1401    
1402 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1403 root 1.61 # ISO C's asctime
1404     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1405     }
1406     # other formats fail in the loop below
1407    
1408     for (0..11) {
1409     if ($m eq $month[$_]) {
1410     require Time::Local;
1411 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1412 root 1.61 }
1413     }
1414    
1415     undef
1416     }
1417    
1418 root 1.2 sub set_proxy($) {
1419 root 1.52 if (length $_[0]) {
1420 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1421 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1422     $PROXY = [$2, $3 || 3128, $1]
1423     } else {
1424     undef $PROXY;
1425     }
1426 root 1.2 }
1427    
1428     # initialise proxy from environment
1429 root 1.52 eval {
1430     set_proxy $ENV{http_proxy};
1431     };
1432 root 1.2
1433 root 1.93 =head2 SHOWCASE
1434    
1435 root 1.117 This section contains some more elaborate "real-world" examples or code
1436 root 1.93 snippets.
1437    
1438     =head2 HTTP/1.1 FILE DOWNLOAD
1439    
1440 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1441 root 1.98 goes wrong and you want to resume.
1442 root 1.93
1443     Here is a function that initiates and resumes a download. It uses the
1444     last modified time to check for file content changes, and works with many
1445     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1446     on older servers.
1447    
1448     It calls the completion callback with either C<undef>, which means a
1449 root 1.117 nonretryable error occurred, C<0> when the download was partial and should
1450 root 1.93 be retried, and C<1> if it was successful.
1451    
1452     use AnyEvent::HTTP;
1453    
1454     sub download($$$) {
1455     my ($url, $file, $cb) = @_;
1456    
1457     open my $fh, "+<", $file
1458     or die "$file: $!";
1459    
1460     my %hdr;
1461     my $ofs = 0;
1462    
1463     warn stat $fh;
1464     warn -s _;
1465     if (stat $fh and -s _) {
1466     $ofs = -s _;
1467 root 1.107 warn "-s is ", $ofs;
1468 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1469     $hdr{"range"} = "bytes=$ofs-";
1470     }
1471    
1472     http_get $url,
1473     headers => \%hdr,
1474     on_header => sub {
1475     my ($hdr) = @_;
1476    
1477     if ($hdr->{Status} == 200 && $ofs) {
1478     # resume failed
1479     truncate $fh, $ofs = 0;
1480     }
1481    
1482     sysseek $fh, $ofs, 0;
1483    
1484     1
1485     },
1486     on_body => sub {
1487     my ($data, $hdr) = @_;
1488    
1489     if ($hdr->{Status} =~ /^2/) {
1490     length $data == syswrite $fh, $data
1491     or return; # abort on write errors
1492     }
1493    
1494     1
1495     },
1496     sub {
1497     my (undef, $hdr) = @_;
1498    
1499     my $status = $hdr->{Status};
1500    
1501     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1502     utime $fh, $time, $time;
1503     }
1504    
1505     if ($status == 200 || $status == 206 || $status == 416) {
1506     # download ok || resume ok || file already fully downloaded
1507     $cb->(1, $hdr);
1508    
1509     } elsif ($status == 412) {
1510     # file has changed while resuming, delete and retry
1511     unlink $file;
1512     $cb->(0, $hdr);
1513    
1514     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1515     # retry later
1516     $cb->(0, $hdr);
1517    
1518     } else {
1519     $cb->(undef, $hdr);
1520     }
1521     }
1522     ;
1523     }
1524    
1525     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1526     if ($_[0]) {
1527     print "OK!\n";
1528     } elsif (defined $_[0]) {
1529     print "please retry later\n";
1530     } else {
1531     print "ERROR\n";
1532     }
1533     };
1534    
1535     =head3 SOCKS PROXIES
1536 root 1.60
1537     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1538     compile your perl to support socks, or use an external program such as
1539     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1540     transparently.
1541    
1542     Alternatively, for AnyEvent::HTTP only, you can use your own
1543     C<tcp_connect> function that does the proxy handshake - here is an example
1544     that works with socks4a proxies:
1545    
1546     use Errno;
1547     use AnyEvent::Util;
1548     use AnyEvent::Socket;
1549     use AnyEvent::Handle;
1550    
1551     # host, port and username of/for your socks4a proxy
1552     my $socks_host = "10.0.0.23";
1553     my $socks_port = 9050;
1554     my $socks_user = "";
1555    
1556     sub socks4a_connect {
1557     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1558    
1559     my $hdl = new AnyEvent::Handle
1560     connect => [$socks_host, $socks_port],
1561     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1562     on_error => sub { $connect_cb->() },
1563     ;
1564    
1565     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1566    
1567     $hdl->push_read (chunk => 8, sub {
1568     my ($hdl, $chunk) = @_;
1569     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1570    
1571     if ($status == 0x5a) {
1572     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1573     } else {
1574     $! = Errno::ENXIO; $connect_cb->();
1575     }
1576     });
1577    
1578     $hdl
1579     }
1580    
1581     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1582     possibly after switching off other proxy types:
1583    
1584     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1585    
1586     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1587     my ($data, $headers) = @_;
1588     ...
1589     };
1590    
1591 root 1.1 =head1 SEE ALSO
1592    
1593     L<AnyEvent>.
1594    
1595     =head1 AUTHOR
1596    
1597 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1598     http://home.schmorp.de/
1599 root 1.1
1600 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1601     testcases and bugreports.
1602    
1603 root 1.1 =cut
1604    
1605     1
1606