ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.131
Committed: Thu Aug 30 17:04:28 2018 UTC (5 years, 8 months ago) by root
Branch: MAIN
Changes since 1.130: +2 -2 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.130 our $VERSION = 2.24;
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.117 (or C<undef> if an error occurred), and a hash-ref with response headers
95 root 1.68 (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128 root 1.114 =item 595 - errors during connection establishment, proxy handshake.
129 root 1.77
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
160     other retries and so on, and how often to do so.
161 root 1.1
162 root 1.119 Only redirects to http and https URLs are supported. While most common
163     redirection forms are handled entirely within this module, some require
164     the use of the optional L<URI> module. If it is required but missing, then
165     the request will fail with an error.
166    
167 root 1.1 =item headers => hashref
168    
169 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
170     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
171 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
172     (this can be suppressed by using C<undef> for these headers in which case
173     they won't be sent at all).
174 root 1.1
175 root 1.90 You really should provide your own C<User-Agent:> header value that is
176     appropriate for your program - I wouldn't be surprised if the default
177     AnyEvent string gets blocked by webservers sooner or later.
178    
179 root 1.104 Also, make sure that your headers names and values do not contain any
180     embedded newlines.
181    
182 root 1.1 =item timeout => $seconds
183    
184     The time-out to use for various stages - each connect attempt will reset
185 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
186     timeout.
187    
188     Default timeout is 5 minutes.
189 root 1.2
190     =item proxy => [$host, $port[, $scheme]] or undef
191    
192 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
193     used.
194 root 1.2
195 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
196 root 1.1
197 root 1.102 If not specified, then the default proxy is used (see
198     C<AnyEvent::HTTP::set_proxy>).
199    
200 root 1.123 Currently, if your proxy requires authorization, you have to specify an
201     appropriate "Proxy-Authorization" header in every request.
202    
203 root 1.3 =item body => $string
204    
205 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
206 root 1.3 this module might offer more options).
207    
208 root 1.10 =item cookie_jar => $hash_ref
209    
210     Passing this parameter enables (simplified) cookie-processing, loosely
211     based on the original netscape specification.
212    
213 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
214     will get updated automatically. It is possible to save the cookie jar
215     to persistent storage with something like JSON or Storable - see the
216     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
217     expired or session-only cookies, and also for documentation on the format
218     of the cookie jar.
219 root 1.10
220 root 1.70 Note that this cookie implementation is not meant to be complete. If
221     you want complete cookie management you have to do that on your
222 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
223 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
224     to.
225 root 1.10
226 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
227 root 1.70 headers will be set and handled by this module, otherwise they will be
228 root 1.69 left untouched.
229    
230 root 1.40 =item tls_ctx => $scheme | $tls_ctx
231    
232     Specifies the AnyEvent::TLS context to be used for https connections. This
233     parameter follows the same rules as the C<tls_ctx> parameter to
234     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
235     C<high> can be specified, which give you a predefined low-security (no
236     verification, highest compatibility) and high-security (CA and common-name
237     verification) TLS context.
238    
239     The default for this option is C<low>, which could be interpreted as "give
240     me the page, no matter what".
241    
242 root 1.92 See also the C<sessionid> parameter.
243    
244     =item session => $string
245    
246     The module might reuse connections to the same host internally. Sometimes
247     (e.g. when using TLS), you do not want to reuse connections from other
248     sessions. This can be achieved by setting this parameter to some unique
249     ID (such as the address of an object storing your state data, or the TLS
250     context) - only connections using the same unique ID will be reused.
251    
252 root 1.51 =item on_prepare => $callback->($fh)
253    
254     In rare cases you need to "tune" the socket before it is used to
255 root 1.117 connect (for example, to bind it on a given IP address). This parameter
256 root 1.51 overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
257     and behaves exactly the same way (e.g. it has to provide a
258     timeout). See the description for the C<$prepare_cb> argument of
259     C<AnyEvent::Socket::tcp_connect> for details.
260    
261 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
262    
263     In even rarer cases you want total control over how AnyEvent::HTTP
264     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
265     to do this, but you can provide your own C<tcp_connect> function -
266 root 1.60 obviously, it has to follow the same calling conventions, except that it
267     may always return a connection guard object.
268 root 1.59
269     There are probably lots of weird uses for this function, starting from
270     tracing the hosts C<http_request> actually tries to connect, to (inexact
271     but fast) host => IP address caching or even socks protocol support.
272    
273 root 1.42 =item on_header => $callback->($headers)
274 root 1.41
275     When specified, this callback will be called with the header hash as soon
276     as headers have been successfully received from the remote server (not on
277     locally-generated errors).
278    
279     It has to return either true (in which case AnyEvent::HTTP will continue),
280     or false, in which case AnyEvent::HTTP will cancel the download (and call
281     the finish callback with an error code of C<598>).
282    
283     This callback is useful, among other things, to quickly reject unwanted
284     content, which, if it is supposed to be rare, can be faster than first
285     doing a C<HEAD> request.
286    
287 root 1.68 The downside is that cancelling the request makes it impossible to re-use
288     the connection. Also, the C<on_header> callback will not receive any
289     trailer (headers sent after the response body).
290    
291 root 1.42 Example: cancel the request unless the content-type is "text/html".
292 root 1.41
293 root 1.42 on_header => sub {
294     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
295     },
296 root 1.41
297 root 1.42 =item on_body => $callback->($partial_body, $headers)
298 root 1.41
299 root 1.42 When specified, all body data will be passed to this callback instead of
300     to the completion callback. The completion callback will get the empty
301     string instead of the body data.
302 root 1.41
303 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
304     or false, in which case AnyEvent::HTTP will cancel the download (and call
305     the completion callback with an error code of C<598>).
306    
307 root 1.68 The downside to cancelling the request is that it makes it impossible to
308     re-use the connection.
309    
310 root 1.42 This callback is useful when the data is too large to be held in memory
311     (so the callback writes it to a file) or when only some information should
312     be extracted, or when the body should be processed incrementally.
313 root 1.41
314     It is usually preferred over doing your own body handling via
315 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
316     only used to create a connection, C<want_body_handle> is the better
317     alternative, as it allows you to install your own event handler, reducing
318     resource usage.
319 root 1.41
320     =item want_body_handle => $enable
321    
322     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
323     changes considerably: after parsing the headers, and instead of
324     downloading the body (if any), the completion callback will be
325     called. Instead of the C<$body> argument containing the body data, the
326     callback will receive the L<AnyEvent::Handle> object associated with the
327     connection. In error cases, C<undef> will be passed. When there is no body
328     (e.g. status C<304>), the empty string will be passed.
329    
330 root 1.92 The handle object might or might not be in TLS mode, might be connected
331     to a proxy, be a persistent connection, use chunked transfer encoding
332     etc., and configured in unspecified ways. The user is responsible for this
333     handle (it will not be used by this module anymore).
334 root 1.41
335     This is useful with some push-type services, where, after the initial
336     headers, an interactive protocol is used (typical example would be the
337     push-style twitter API which starts a JSON/XML stream).
338    
339     If you think you need this, first have a look at C<on_body>, to see if
340 root 1.45 that doesn't solve your problem in a better way.
341 root 1.41
342 root 1.92 =item persistent => $boolean
343    
344     Try to create/reuse a persistent connection. When this flag is set
345     (default: true for idempotent requests, false for all others), then
346     C<http_request> tries to re-use an existing (previously-created)
347     persistent connection to the host and, failing that, tries to create a new
348     one.
349    
350     Requests failing in certain ways will be automatically retried once, which
351     is dangerous for non-idempotent requests, which is why it defaults to off
352     for them. The reason for this is because the bozos who designed HTTP/1.1
353     made it impossible to distinguish between a fatal error and a normal
354     connection timeout, so you never know whether there was a problem with
355     your request or not.
356    
357     When reusing an existent connection, many parameters (such as TLS context)
358     will be ignored. See the C<session> parameter for a workaround.
359    
360     =item keepalive => $boolean
361    
362     Only used when C<persistent> is also true. This parameter decides whether
363     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
364     (as opposed to only a HTTP/1.1 persistent connection).
365    
366     The default is true, except when using a proxy, in which case it defaults
367     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
368    
369     =item handle_params => { key => value ... }
370    
371     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
372     constructor that is called - not all requests will create a handle, and
373     sometimes more than one is created, so this parameter is only good for
374     setting hints.
375    
376     Example: set the maximum read size to 4096, to potentially conserve memory
377     at the cost of speed.
378    
379     handle_params => {
380     max_read_size => 4096,
381     },
382    
383 root 1.1 =back
384    
385 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
386     the response body.
387 root 1.9
388     http_request GET => "http://www.nethype.de/", sub {
389     my ($body, $hdr) = @_;
390     print "$body\n";
391     };
392    
393 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
394 root 1.9 timeout of 30 seconds.
395    
396     http_request
397 root 1.109 HEAD => "https://www.google.com",
398 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
399 root 1.9 timeout => 30,
400     sub {
401     my ($body, $hdr) = @_;
402     use Data::Dumper;
403     print Dumper $hdr;
404     }
405     ;
406 root 1.1
407 root 1.68 Example: do another simple HTTP GET request, but immediately try to
408 root 1.29 cancel it.
409    
410     my $request = http_request GET => "http://www.nethype.de/", sub {
411     my ($body, $hdr) = @_;
412     print "$body\n";
413     };
414    
415     undef $request;
416    
417 root 1.1 =cut
418    
419 root 1.92 #############################################################################
420     # wait queue/slots
421    
422 root 1.12 sub _slot_schedule;
423 root 1.11 sub _slot_schedule($) {
424     my $host = shift;
425    
426     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
427     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
428 root 1.12 # somebody wants that slot
429 root 1.11 ++$CO_SLOT{$host}[0];
430 root 1.14 ++$ACTIVE;
431 root 1.11
432     $cb->(AnyEvent::Util::guard {
433 root 1.14 --$ACTIVE;
434 root 1.11 --$CO_SLOT{$host}[0];
435     _slot_schedule $host;
436     });
437     } else {
438     # nobody wants the slot, maybe we can forget about it
439     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
440     last;
441     }
442     }
443     }
444    
445     # wait for a free slot on host, call callback
446     sub _get_slot($$) {
447     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
448    
449     _slot_schedule $_[0];
450     }
451    
452 root 1.80 #############################################################################
453 root 1.92 # cookie handling
454 root 1.80
455     # expire cookies
456     sub cookie_jar_expire($;$) {
457     my ($jar, $session_end) = @_;
458    
459 root 1.130 %$jar = () if $jar->{version} != 2;
460 root 1.80
461     my $anow = AE::now;
462    
463     while (my ($chost, $paths) = each %$jar) {
464     next unless ref $paths;
465    
466     while (my ($cpath, $cookies) = each %$paths) {
467     while (my ($cookie, $kv) = each %$cookies) {
468     if (exists $kv->{_expires}) {
469     delete $cookies->{$cookie}
470     if $anow > $kv->{_expires};
471     } elsif ($session_end) {
472     delete $cookies->{$cookie};
473     }
474     }
475    
476     delete $paths->{$cpath}
477     unless %$cookies;
478     }
479    
480     delete $jar->{$chost}
481     unless %$paths;
482     }
483     }
484    
485 root 1.72 # extract cookies from jar
486 root 1.71 sub cookie_jar_extract($$$$) {
487 root 1.92 my ($jar, $scheme, $host, $path) = @_;
488 root 1.71
489 root 1.130 %$jar = () if $jar->{version} != 2;
490    
491     $host = AnyEvent::Util::idn_to_ascii $host
492     if $host =~ /[^\x00-\x7f]/;
493 root 1.71
494     my @cookies;
495    
496     while (my ($chost, $paths) = each %$jar) {
497     next unless ref $paths;
498    
499 root 1.130 # exact match or suffix including . match
500     $chost eq $host or ".$chost" eq substr $host, -1 - length $chost
501     or next;
502 root 1.71
503     while (my ($cpath, $cookies) = each %$paths) {
504 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
505 root 1.71
506     while (my ($cookie, $kv) = each %$cookies) {
507 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
508 root 1.71
509 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
510     delete $cookies->{$cookie};
511     next;
512 root 1.71 }
513    
514     my $value = $kv->{value};
515    
516     if ($value =~ /[=;,[:space:]]/) {
517     $value =~ s/([\\"])/\\$1/g;
518     $value = "\"$value\"";
519     }
520    
521     push @cookies, "$cookie=$value";
522     }
523     }
524     }
525    
526     \@cookies
527     }
528    
529 root 1.72 # parse set_cookie header into jar
530 root 1.80 sub cookie_jar_set_cookie($$$$) {
531 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
532 root 1.80
533 root 1.130 %$jar = () if $jar->{version} != 2;
534    
535 root 1.80 my $anow = int AE::now;
536     my $snow; # server-now
537 root 1.72
538     for ($set_cookie) {
539     # parse NAME=VALUE
540     my @kv;
541    
542 root 1.79 # expires is not http-compliant in the original cookie-spec,
543     # we support the official date format and some extensions
544 root 1.72 while (
545     m{
546     \G\s*
547     (?:
548 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
549 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
550 root 1.72 )
551     }gcxsi
552     ) {
553     my $name = $2;
554     my $value = $4;
555    
556 root 1.82 if (defined $1) {
557 root 1.72 # expires
558     $name = "expires";
559     $value = $1;
560 root 1.82 } elsif (defined $3) {
561 root 1.72 # quoted
562     $value = $3;
563     $value =~ s/\\(.)/$1/gs;
564     }
565    
566 root 1.95 push @kv, @kv ? lc $name : $name, $value;
567 root 1.72
568     last unless /\G\s*;/gc;
569     }
570    
571     last unless @kv;
572    
573     my $name = shift @kv;
574     my %kv = (value => shift @kv, @kv);
575    
576 root 1.80 if (exists $kv{"max-age"}) {
577     $kv{_expires} = $anow + delete $kv{"max-age"};
578     } elsif (exists $kv{expires}) {
579     $snow ||= parse_date ($date) || $anow;
580     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
581     } else {
582     delete $kv{_expires};
583     }
584 root 1.72
585     my $cdom;
586     my $cpath = (delete $kv{path}) || "/";
587    
588     if (exists $kv{domain}) {
589 root 1.130 $cdom = $kv{domain};
590 root 1.72
591     $cdom =~ s/^\.?/./; # make sure it starts with a "."
592    
593     next if $cdom =~ /\.$/;
594    
595     # this is not rfc-like and not netscape-like. go figure.
596     my $ndots = $cdom =~ y/.//;
597     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
598 root 1.130
599     $cdom = substr $cdom, 1; # remove initial .
600 root 1.72 } else {
601 root 1.92 $cdom = $host;
602 root 1.72 }
603    
604     # store it
605 root 1.130 $jar->{version} = 2;
606 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
607 root 1.72
608     redo if /\G\s*,/gc;
609     }
610     }
611    
612 root 1.92 #############################################################################
613     # keepalive/persistent connection cache
614    
615     # fetch a connection from the keepalive cache
616     sub ka_fetch($) {
617     my $ka_key = shift;
618    
619     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
620     delete $KA_CACHE{$ka_key}
621     unless @{ $KA_CACHE{$ka_key} };
622    
623     $hdl
624     }
625    
626     sub ka_store($$) {
627     my ($ka_key, $hdl) = @_;
628    
629     my $kaa = $KA_CACHE{$ka_key} ||= [];
630    
631     my $destroy = sub {
632     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
633    
634     $hdl->destroy;
635    
636     @ka
637     ? $KA_CACHE{$ka_key} = \@ka
638     : delete $KA_CACHE{$ka_key};
639     };
640    
641     # on error etc., destroy
642     $hdl->on_error ($destroy);
643     $hdl->on_eof ($destroy);
644     $hdl->on_read ($destroy);
645     $hdl->timeout ($PERSISTENT_TIMEOUT);
646    
647     push @$kaa, $hdl;
648     shift @$kaa while @$kaa > $MAX_PER_HOST;
649     }
650    
651     #############################################################################
652     # utilities
653    
654 root 1.66 # continue to parse $_ for headers and place them into the arg
655 root 1.92 sub _parse_hdr() {
656 root 1.66 my %hdr;
657    
658     # things seen, not parsed:
659     # p3pP="NON CUR OTPi OUR NOR UNI"
660    
661     $hdr{lc $1} .= ",$2"
662     while /\G
663     ([^:\000-\037]*):
664     [\011\040]*
665     ((?: [^\012]+ | \012[\011\040] )*)
666     \012
667     /gxc;
668    
669     /\G$/
670     or return;
671    
672     # remove the "," prefix we added to all headers above
673     substr $_, 0, 1, ""
674     for values %hdr;
675    
676     \%hdr
677     }
678    
679 root 1.92 #############################################################################
680     # http_get
681    
682 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
683 root 1.34
684 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
685     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
686 root 1.40
687 root 1.92 # maybe it should just become a normal object :/
688    
689     sub _destroy_state(\%) {
690     my ($state) = @_;
691    
692     $state->{handle}->destroy if $state->{handle};
693     %$state = ();
694     }
695    
696     sub _error(\%$$) {
697     my ($state, $cb, $hdr) = @_;
698    
699     &_destroy_state ($state);
700    
701     $cb->(undef, $hdr);
702     ()
703     }
704    
705 root 1.116 our %IDEMPOTENT = (
706     DELETE => 1,
707     GET => 1,
708     HEAD => 1,
709     OPTIONS => 1,
710     PUT => 1,
711     TRACE => 1,
712    
713     ACL => 1,
714     "BASELINE-CONTROL" => 1,
715     BIND => 1,
716     CHECKIN => 1,
717     CHECKOUT => 1,
718     COPY => 1,
719     LABEL => 1,
720     LINK => 1,
721     MERGE => 1,
722     MKACTIVITY => 1,
723     MKCALENDAR => 1,
724     MKCOL => 1,
725     MKREDIRECTREF => 1,
726     MKWORKSPACE => 1,
727     MOVE => 1,
728     ORDERPATCH => 1,
729     PROPFIND => 1,
730     PROPPATCH => 1,
731     REBIND => 1,
732     REPORT => 1,
733     SEARCH => 1,
734     UNBIND => 1,
735     UNCHECKOUT => 1,
736     UNLINK => 1,
737     UNLOCK => 1,
738     UPDATE => 1,
739     UPDATEREDIRECTREF => 1,
740     "VERSION-CONTROL" => 1,
741     );
742    
743 elmex 1.15 sub http_request($$@) {
744 root 1.1 my $cb = pop;
745     my ($method, $url, %arg) = @_;
746    
747     my %hdr;
748    
749 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
750     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
751    
752 root 1.3 $method = uc $method;
753    
754 root 1.8 if (my $hdr = $arg{headers}) {
755 root 1.1 while (my ($k, $v) = each %$hdr) {
756     $hdr{lc $k} = $v;
757     }
758     }
759    
760 root 1.55 # pseudo headers for all subsequent responses
761     my @pseudo = (URL => $url);
762     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
763    
764 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
765 root 1.8
766 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
767 root 1.8 if $recurse < 0;
768    
769 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
770 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
771    
772 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
773 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
774 root 1.2
775 root 1.31 $uscheme = lc $uscheme;
776 root 1.1
777 root 1.31 my $uport = $uscheme eq "http" ? 80
778     : $uscheme eq "https" ? 443
779 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
780 root 1.13
781 root 1.124 $uauthority =~ /^(?: .*\@ )? ([^\@]+?) (?: : (\d+) )?$/x
782 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
783 root 1.10
784 root 1.86 my $uhost = lc $1;
785 root 1.10 $uport = $2 if defined $2;
786    
787 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
788     unless exists $hdr{host};
789 root 1.43
790 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
791 root 1.56 $upath .= $query if length $query;
792 root 1.10
793     $upath =~ s%^/?%/%;
794    
795     # cookie processing
796     if (my $jar = $arg{cookie_jar}) {
797 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
798 root 1.70
799 root 1.71 $hdr{cookie} = join "; ", @$cookies
800     if @$cookies;
801 root 1.10 }
802 root 1.1
803 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
804 root 1.2
805 root 1.10 if ($proxy) {
806 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
807 root 1.31
808 root 1.47 $rscheme = "http" unless defined $rscheme;
809    
810 root 1.31 # don't support https requests over https-proxy transport,
811 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
812 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
813 root 1.86
814     $rhost = lc $rhost;
815     $rscheme = lc $rscheme;
816 root 1.10 } else {
817 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
818 root 1.2 }
819    
820 root 1.47 # leave out fragment and query string, just a heuristic
821 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
822     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
823 root 1.41
824 root 1.53 $hdr{"content-length"} = length $arg{body}
825     if length $arg{body} || $method ne "GET";
826 root 1.1
827 root 1.116 my $idempotent = $IDEMPOTENT{$method};
828 root 1.89
829     # default value for keepalive is true iff the request is for an idempotent method
830 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
831     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
832     my $was_persistent; # true if this is actually a recycled connection
833 root 1.92
834     # the key to use in the keepalive cache
835 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
836 root 1.89
837 root 1.118 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive, " : "" : "close, ") . "Te"; #1.1
838 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
839 root 1.66
840 root 1.11 my %state = (connect_guard => 1);
841    
842 root 1.88 my $ae_error = 595; # connecting
843 root 1.1
844 root 1.88 # handle actual, non-tunneled, request
845     my $handle_actual_request = sub {
846     $ae_error = 596; # request phase
847    
848 root 1.92 my $hdl = $state{handle};
849    
850     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
851 root 1.88
852     # send request
853 root 1.92 $hdl->push_write (
854 root 1.88 "$method $rpath HTTP/1.1\015\012"
855     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
856     . "\015\012"
857 root 1.120 . $arg{body}
858 root 1.88 );
859    
860 root 1.117 # return if error occurred during push_write()
861 root 1.88 return unless %state;
862    
863 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
864     %hdr = ();
865 root 1.88
866     # status line and headers
867     $state{read_response} = sub {
868 root 1.92 return unless %state;
869    
870 root 1.88 for ("$_[1]") {
871     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
872    
873     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
874 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
875 root 1.88
876     # 100 Continue handling
877     # should not happen as we don't send expect: 100-continue,
878     # but we handle it just in case.
879     # since we send the request body regardless, if we get an error
880     # we are out of-sync, which we currently do NOT handle correctly.
881     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
882     if $2 eq 100;
883    
884     push @pseudo,
885     HTTPVersion => $1,
886     Status => $2,
887     Reason => $3,
888     ;
889 root 1.1
890 root 1.92 my $hdr = _parse_hdr
891     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
892 root 1.77
893 root 1.88 %hdr = (%$hdr, @pseudo);
894     }
895 root 1.82
896 root 1.88 # redirect handling
897 root 1.119 # relative uri handling forced by microsoft and other shitheads.
898     # we give our best and fall back to URI if available.
899     if (exists $hdr{location}) {
900     my $loc = $hdr{location};
901    
902     if ($loc =~ m%^//%) { # //
903 root 1.126 $loc = "$uscheme:$loc";
904 root 1.119
905     } elsif ($loc eq "") {
906     $loc = $url;
907    
908     } elsif ($loc !~ /^(?: $ | [^:\/?\#]+ : )/x) { # anything "simple"
909     $loc =~ s/^\.\/+//;
910    
911     if ($loc !~ m%^[.?#]%) {
912 root 1.129 my $prefix = "$uscheme://$uauthority";
913 root 1.119
914     unless ($loc =~ s/^\///) {
915     $prefix .= $upath;
916     $prefix =~ s/\/[^\/]*$//;
917     }
918    
919     $loc = "$prefix/$loc";
920    
921     } elsif (eval { require URI }) { # uri
922     $loc = URI->new_abs ($loc, $url)->as_string;
923    
924     } else {
925     return _error %state, $cb, { @pseudo, Status => 599, Reason => "Cannot parse Location (URI module missing)" };
926     #$hdr{Status} = 599;
927     #$hdr{Reason} = "Unparsable Redirect (URI module missing)";
928     #$recurse = 0;
929     }
930 root 1.82 }
931    
932 root 1.119 $hdr{location} = $loc;
933 root 1.88 }
934 root 1.82
935 root 1.88 my $redirect;
936 root 1.82
937 root 1.88 if ($recurse) {
938     my $status = $hdr{Status};
939 root 1.82
940 root 1.88 # industry standard is to redirect POST as GET for
941     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
942     # also, the UA should ask the user for 301 and 307 and POST,
943     # industry standard seems to be to simply follow.
944 root 1.119 # we go with the industry standard. 308 is defined
945 root 1.121 # by rfc7538
946 root 1.88 if ($status == 301 or $status == 302 or $status == 303) {
947 root 1.122 $redirect = 1;
948 root 1.88 # HTTP/1.1 is unclear on how to mutate the method
949 root 1.122 unless ($method eq "HEAD") {
950     $method = "GET";
951     delete $arg{body};
952     }
953 root 1.119 } elsif ($status == 307 or $status == 308) {
954 root 1.88 $redirect = 1;
955 root 1.82 }
956 root 1.88 }
957 root 1.82
958 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
959 root 1.92 if ($state{handle}) {
960     # handle keepalive
961     if (
962 root 1.103 $persistent
963 root 1.92 && $_[3]
964     && ($hdr{HTTPVersion} < 1.1
965     ? $hdr{connection} =~ /\bkeep-?alive\b/i
966     : $hdr{connection} !~ /\bclose\b/i)
967     ) {
968     ka_store $ka_key, delete $state{handle};
969     } else {
970     # no keepalive, destroy the handle
971     $state{handle}->destroy;
972     }
973     }
974 root 1.82
975 root 1.88 %state = ();
976 root 1.82
977 root 1.88 if (defined $_[1]) {
978     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
979     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
980     }
981 root 1.82
982 root 1.88 # set-cookie processing
983     if ($arg{cookie_jar}) {
984     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
985     }
986 root 1.82
987 root 1.88 if ($redirect && exists $hdr{location}) {
988     # we ignore any errors, as it is very common to receive
989     # Content-Length != 0 but no actual body
990     # we also access %hdr, as $_[1] might be an erro
991 root 1.103 $state{recurse} =
992     http_request (
993     $method => $hdr{location},
994     %arg,
995     recurse => $recurse - 1,
996     Redirect => [$_[0], \%hdr],
997     sub {
998     %state = ();
999     &$cb
1000     },
1001     );
1002 root 1.88 } else {
1003     $cb->($_[0], \%hdr);
1004     }
1005     };
1006 root 1.82
1007 root 1.88 $ae_error = 597; # body phase
1008 root 1.82
1009 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
1010    
1011     my $len = $chunked ? undef : $hdr{"content-length"};
1012 root 1.82
1013 root 1.88 # body handling, many different code paths
1014     # - no body expected
1015     # - want_body_handle
1016     # - te chunked
1017     # - 2x length known (with or without on_body)
1018     # - 2x length not known (with or without on_body)
1019     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
1020     $finish->(undef, 598 => "Request cancelled by on_header");
1021     } elsif (
1022     $hdr{Status} =~ /^(?:1..|204|205|304)$/
1023     or $method eq "HEAD"
1024     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
1025     ) {
1026     # no body
1027     $finish->("", undef, undef, 1);
1028    
1029     } elsif (!$redirect && $arg{want_body_handle}) {
1030     $_[0]->on_eof (undef);
1031     $_[0]->on_error (undef);
1032     $_[0]->on_read (undef);
1033    
1034     $finish->(delete $state{handle});
1035    
1036 root 1.91 } elsif ($chunked) {
1037 root 1.88 my $cl = 0;
1038 root 1.92 my $body = "";
1039 root 1.128 my $on_body = (!$redirect && $arg{on_body}) || sub { $body .= shift; 1 };
1040 root 1.88
1041     $state{read_chunk} = sub {
1042     $_[1] =~ /^([0-9a-fA-F]+)/
1043 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1044 root 1.82
1045 root 1.88 my $len = hex $1;
1046 root 1.82
1047 root 1.88 if ($len) {
1048     $cl += $len;
1049 root 1.82
1050 root 1.88 $_[0]->push_read (chunk => $len, sub {
1051     $on_body->($_[1], \%hdr)
1052     or return $finish->(undef, 598 => "Request cancelled by on_body");
1053 root 1.82
1054 root 1.88 $_[0]->push_read (line => sub {
1055     length $_[1]
1056     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1057     $_[0]->push_read (line => $state{read_chunk});
1058 root 1.82 });
1059 root 1.88 });
1060     } else {
1061     $hdr{"content-length"} ||= $cl;
1062 root 1.84
1063 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
1064     if (length $_[1]) {
1065     for ("$_[1]") {
1066     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
1067 root 1.84
1068 root 1.92 my $hdr = _parse_hdr
1069 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
1070 root 1.84
1071 root 1.88 %hdr = (%hdr, %$hdr);
1072     }
1073     }
1074 root 1.84
1075 root 1.88 $finish->($body, undef, undef, 1);
1076 root 1.84 });
1077     }
1078 root 1.88 };
1079    
1080     $_[0]->push_read (line => $state{read_chunk});
1081    
1082 root 1.128 } elsif (!$redirect && $arg{on_body}) {
1083 root 1.88 if (defined $len) {
1084     $_[0]->on_read (sub {
1085     $len -= length $_[0]{rbuf};
1086    
1087     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1088     or return $finish->(undef, 598 => "Request cancelled by on_body");
1089    
1090     $len > 0
1091     or $finish->("", undef, undef, 1);
1092     });
1093 root 1.84 } else {
1094 root 1.88 $_[0]->on_eof (sub {
1095     $finish->("");
1096     });
1097     $_[0]->on_read (sub {
1098     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1099     or $finish->(undef, 598 => "Request cancelled by on_body");
1100     });
1101     }
1102     } else {
1103     $_[0]->on_eof (undef);
1104 root 1.82
1105 root 1.88 if (defined $len) {
1106     $_[0]->on_read (sub {
1107     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1108     if $len <= length $_[0]{rbuf};
1109     });
1110     } else {
1111     $_[0]->on_error (sub {
1112     ($! == Errno::EPIPE || !$!)
1113     ? $finish->(delete $_[0]{rbuf})
1114     : $finish->(undef, $ae_error => $_[2]);
1115     });
1116     $_[0]->on_read (sub { });
1117 root 1.82 }
1118 root 1.88 }
1119     };
1120 root 1.82
1121 root 1.92 # if keepalive is enabled, then the server closing the connection
1122     # before a response can happen legally - we retry on idempotent methods.
1123 root 1.103 if ($was_persistent && $idempotent) {
1124 root 1.92 my $old_eof = $hdl->{on_eof};
1125     $hdl->{on_eof} = sub {
1126     _destroy_state %state;
1127    
1128 root 1.103 %state = ();
1129     $state{recurse} =
1130     http_request (
1131 root 1.124 $method => $url,
1132 root 1.103 %arg,
1133 root 1.124 recurse => $recurse - 1,
1134     persistent => 0,
1135 root 1.103 sub {
1136     %state = ();
1137     &$cb
1138     }
1139     );
1140 root 1.92 };
1141     $hdl->on_read (sub {
1142     return unless %state;
1143    
1144     # as soon as we receive something, a connection close
1145     # once more becomes a hard error
1146     $hdl->{on_eof} = $old_eof;
1147     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1148     });
1149     } else {
1150     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1151     }
1152     };
1153    
1154     my $prepare_handle = sub {
1155     my ($hdl) = $state{handle};
1156    
1157     $hdl->on_error (sub {
1158     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1159     });
1160     $hdl->on_eof (sub {
1161     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1162     });
1163 root 1.103 $hdl->timeout_reset;
1164     $hdl->timeout ($timeout);
1165 root 1.88 };
1166 root 1.82
1167 root 1.92 # connected to proxy (or origin server)
1168 root 1.88 my $connect_cb = sub {
1169 root 1.92 my $fh = shift
1170     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1171 root 1.44
1172 root 1.88 return unless delete $state{connect_guard};
1173 root 1.11
1174 root 1.88 # get handle
1175     $state{handle} = new AnyEvent::Handle
1176 root 1.92 %{ $arg{handle_params} },
1177     fh => $fh,
1178     peername => $uhost,
1179 root 1.88 tls_ctx => $arg{tls_ctx},
1180     ;
1181 root 1.11
1182 root 1.92 $prepare_handle->();
1183 root 1.1
1184 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1185 root 1.88
1186     # now handle proxy-CONNECT method
1187     if ($proxy && $uscheme eq "https") {
1188     # oh dear, we have to wrap it into a connect request
1189    
1190 root 1.122 my $auth = exists $hdr{"proxy-authorization"}
1191     ? "proxy-authorization: " . (delete $hdr{"proxy-authorization"}) . "\015\012"
1192     : "";
1193    
1194 root 1.88 # maybe re-use $uauthority with patched port?
1195 root 1.122 $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012$auth\015\012");
1196 root 1.88 $state{handle}->push_read (line => $qr_nlnl, sub {
1197     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1198 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1199 root 1.88
1200     if ($2 == 200) {
1201     $rpath = $upath;
1202     $handle_actual_request->();
1203     } else {
1204 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1205 root 1.88 }
1206     });
1207     } else {
1208 root 1.122 delete $hdr{"proxy-authorization"} unless $proxy;
1209    
1210 root 1.88 $handle_actual_request->();
1211     }
1212     };
1213    
1214     _get_slot $uhost, sub {
1215     $state{slot_guard} = shift;
1216 root 1.64
1217 root 1.88 return unless $state{connect_guard};
1218 root 1.64
1219 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1220     # on a keepalive request (in theory, this should be a separate config option).
1221 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1222     $was_persistent = 1;
1223    
1224 root 1.92 $state{handle} = ka_fetch $ka_key;
1225 root 1.129 # $state{handle}->destroyed
1226     # and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1227 root 1.92 $prepare_handle->();
1228 root 1.129 # $state{handle}->destroyed
1229     # and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1230 root 1.92 $handle_actual_request->();
1231    
1232     } else {
1233     my $tcp_connect = $arg{tcp_connect}
1234     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1235 root 1.57
1236 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1237     }
1238 root 1.1 };
1239    
1240 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1241 root 1.1 }
1242    
1243 elmex 1.15 sub http_get($@) {
1244 root 1.1 unshift @_, "GET";
1245     &http_request
1246     }
1247    
1248 elmex 1.15 sub http_head($@) {
1249 root 1.4 unshift @_, "HEAD";
1250     &http_request
1251     }
1252    
1253 elmex 1.15 sub http_post($$@) {
1254 root 1.22 my $url = shift;
1255     unshift @_, "POST", $url, "body";
1256 root 1.3 &http_request
1257     }
1258    
1259 root 1.9 =back
1260    
1261 root 1.55 =head2 DNS CACHING
1262    
1263     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1264     the actual connection, which in turn uses AnyEvent::DNS to resolve
1265     hostnames. The latter is a simple stub resolver and does no caching
1266     on its own. If you want DNS caching, you currently have to provide
1267     your own default resolver (by storing a suitable resolver object in
1268 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1269 root 1.55
1270 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1271 root 1.1
1272     =over 4
1273    
1274 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1275    
1276     Sets the default proxy server to use. The proxy-url must begin with a
1277 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1278 root 1.52
1279     To clear an already-set proxy, use C<undef>.
1280 root 1.2
1281 root 1.113 When AnyEvent::HTTP is loaded for the first time it will query the
1282 root 1.102 default proxy from the operating system, currently by looking at
1283     C<$ENV{http_proxy>}.
1284    
1285 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1286    
1287     Remove all cookies from the cookie jar that have been expired. If
1288     C<$session_end> is given and true, then additionally remove all session
1289     cookies.
1290    
1291     You should call this function (with a true C<$session_end>) before you
1292     save cookies to disk, and you should call this function after loading them
1293 root 1.117 again. If you have a long-running program you can additionally call this
1294 root 1.80 function from time to time.
1295    
1296     A cookie jar is initially an empty hash-reference that is managed by this
1297 root 1.121 module. Its format is subject to change, but currently it is as follows:
1298 root 1.80
1299 root 1.131 The key C<version> has to contain C<2>, otherwise the hash gets
1300     cleared. All other keys are hostnames or IP addresses pointing to
1301 root 1.80 hash-references. The key for these inner hash references is the
1302     server path for which this cookie is meant, and the values are again
1303 root 1.115 hash-references. Each key of those hash-references is a cookie name, and
1304 root 1.80 the value, you guessed it, is another hash-reference, this time with the
1305     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1306     which have been replaced by a C<_expires> key that contains the cookie
1307 root 1.115 expiry timestamp. Session cookies are indicated by not having an
1308     C<_expires> key.
1309 root 1.80
1310     Here is an example of a cookie jar with a single cookie, so you have a
1311     chance of understanding the above paragraph:
1312    
1313     {
1314     version => 1,
1315     "10.0.0.1" => {
1316     "/" => {
1317     "mythweb_id" => {
1318     _expires => 1293917923,
1319     value => "ooRung9dThee3ooyXooM1Ohm",
1320     },
1321     },
1322     },
1323     }
1324    
1325 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1326    
1327     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1328     Date (RFC 2616).
1329    
1330     =item $timestamp = AnyEvent::HTTP::parse_date $date
1331    
1332 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1333     bunch of minor variations of those, and returns the corresponding POSIX
1334     timestamp, or C<undef> if the date cannot be parsed.
1335 root 1.61
1336 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1337 root 1.1
1338 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1339 root 1.1
1340 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1341    
1342 root 1.110 The default timeout for connection operations (default: C<300>).
1343 root 1.92
1344 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1345    
1346     The default value for the C<User-Agent> header (the default is
1347 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1348 root 1.1
1349 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1350 root 1.1
1351 root 1.47 The maximum number of concurrent connections to the same host (identified
1352 root 1.121 by the hostname). If the limit is exceeded, then additional requests
1353 root 1.92 are queued until previous connections are closed. Both persistent and
1354     non-persistent connections are counted in this limit.
1355 root 1.1
1356 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1357 root 1.92 increase it much.
1358    
1359     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1360 root 1.117 connections, older browsers used 2, newer ones (such as firefox 3)
1361     typically use 6, and Opera uses 8 because like, they have the fastest
1362     browser and give a shit for everybody else on the planet.
1363 root 1.92
1364     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1365    
1366 root 1.116 The time after which idle persistent connections get closed by
1367 root 1.92 AnyEvent::HTTP (default: C<3>).
1368 root 1.3
1369 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1370    
1371     The number of active connections. This is not the number of currently
1372     running requests, but the number of currently open and non-idle TCP
1373 root 1.92 connections. This number can be useful for load-leveling.
1374 root 1.14
1375 root 1.1 =back
1376    
1377     =cut
1378    
1379 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1380     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1381    
1382     sub format_date($) {
1383     my ($time) = @_;
1384    
1385     # RFC 822/1123 format
1386     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1387    
1388     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1389     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1390     $H, $M, $S;
1391     }
1392    
1393     sub parse_date($) {
1394     my ($date) = @_;
1395    
1396     my ($d, $m, $y, $H, $M, $S);
1397    
1398 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1399 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1400     # cookie dates (with "-")
1401    
1402 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1403    
1404 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1405 root 1.61 # RFC 850
1406     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1407    
1408 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1409 root 1.61 # ISO C's asctime
1410     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1411     }
1412     # other formats fail in the loop below
1413    
1414     for (0..11) {
1415     if ($m eq $month[$_]) {
1416     require Time::Local;
1417 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1418 root 1.61 }
1419     }
1420    
1421     undef
1422     }
1423    
1424 root 1.2 sub set_proxy($) {
1425 root 1.52 if (length $_[0]) {
1426 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1427 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1428     $PROXY = [$2, $3 || 3128, $1]
1429     } else {
1430     undef $PROXY;
1431     }
1432 root 1.2 }
1433    
1434     # initialise proxy from environment
1435 root 1.52 eval {
1436     set_proxy $ENV{http_proxy};
1437     };
1438 root 1.2
1439 root 1.93 =head2 SHOWCASE
1440    
1441 root 1.117 This section contains some more elaborate "real-world" examples or code
1442 root 1.93 snippets.
1443    
1444     =head2 HTTP/1.1 FILE DOWNLOAD
1445    
1446 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1447 root 1.98 goes wrong and you want to resume.
1448 root 1.93
1449     Here is a function that initiates and resumes a download. It uses the
1450     last modified time to check for file content changes, and works with many
1451     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1452     on older servers.
1453    
1454     It calls the completion callback with either C<undef>, which means a
1455 root 1.117 nonretryable error occurred, C<0> when the download was partial and should
1456 root 1.93 be retried, and C<1> if it was successful.
1457    
1458     use AnyEvent::HTTP;
1459    
1460     sub download($$$) {
1461     my ($url, $file, $cb) = @_;
1462    
1463     open my $fh, "+<", $file
1464     or die "$file: $!";
1465    
1466     my %hdr;
1467     my $ofs = 0;
1468    
1469     if (stat $fh and -s _) {
1470     $ofs = -s _;
1471 root 1.107 warn "-s is ", $ofs;
1472 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1473     $hdr{"range"} = "bytes=$ofs-";
1474     }
1475    
1476     http_get $url,
1477     headers => \%hdr,
1478     on_header => sub {
1479     my ($hdr) = @_;
1480    
1481     if ($hdr->{Status} == 200 && $ofs) {
1482     # resume failed
1483     truncate $fh, $ofs = 0;
1484     }
1485    
1486     sysseek $fh, $ofs, 0;
1487    
1488     1
1489     },
1490     on_body => sub {
1491     my ($data, $hdr) = @_;
1492    
1493     if ($hdr->{Status} =~ /^2/) {
1494     length $data == syswrite $fh, $data
1495     or return; # abort on write errors
1496     }
1497    
1498     1
1499     },
1500     sub {
1501     my (undef, $hdr) = @_;
1502    
1503     my $status = $hdr->{Status};
1504    
1505     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1506 root 1.127 utime $time, $time, $fh;
1507 root 1.93 }
1508    
1509     if ($status == 200 || $status == 206 || $status == 416) {
1510     # download ok || resume ok || file already fully downloaded
1511     $cb->(1, $hdr);
1512    
1513     } elsif ($status == 412) {
1514     # file has changed while resuming, delete and retry
1515     unlink $file;
1516     $cb->(0, $hdr);
1517    
1518     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1519     # retry later
1520     $cb->(0, $hdr);
1521    
1522     } else {
1523     $cb->(undef, $hdr);
1524     }
1525     }
1526     ;
1527     }
1528    
1529     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1530     if ($_[0]) {
1531     print "OK!\n";
1532     } elsif (defined $_[0]) {
1533     print "please retry later\n";
1534     } else {
1535     print "ERROR\n";
1536     }
1537     };
1538    
1539     =head3 SOCKS PROXIES
1540 root 1.60
1541     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1542     compile your perl to support socks, or use an external program such as
1543     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1544     transparently.
1545    
1546     Alternatively, for AnyEvent::HTTP only, you can use your own
1547     C<tcp_connect> function that does the proxy handshake - here is an example
1548     that works with socks4a proxies:
1549    
1550     use Errno;
1551     use AnyEvent::Util;
1552     use AnyEvent::Socket;
1553     use AnyEvent::Handle;
1554    
1555     # host, port and username of/for your socks4a proxy
1556     my $socks_host = "10.0.0.23";
1557     my $socks_port = 9050;
1558     my $socks_user = "";
1559    
1560     sub socks4a_connect {
1561     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1562    
1563     my $hdl = new AnyEvent::Handle
1564     connect => [$socks_host, $socks_port],
1565     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1566     on_error => sub { $connect_cb->() },
1567     ;
1568    
1569     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1570    
1571     $hdl->push_read (chunk => 8, sub {
1572     my ($hdl, $chunk) = @_;
1573     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1574    
1575     if ($status == 0x5a) {
1576     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1577     } else {
1578     $! = Errno::ENXIO; $connect_cb->();
1579     }
1580     });
1581    
1582     $hdl
1583     }
1584    
1585     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1586     possibly after switching off other proxy types:
1587    
1588     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1589    
1590     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1591     my ($data, $headers) = @_;
1592     ...
1593     };
1594    
1595 root 1.1 =head1 SEE ALSO
1596    
1597     L<AnyEvent>.
1598    
1599     =head1 AUTHOR
1600    
1601 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1602     http://home.schmorp.de/
1603 root 1.1
1604 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1605     testcases and bugreports.
1606    
1607 root 1.1 =cut
1608    
1609     1
1610