ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.139
Committed: Fri Aug 5 20:48:14 2022 UTC (21 months, 1 week ago) by root
Branch: MAIN
Changes since 1.138: +2 -1 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.137 our $VERSION = 2.25;
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.117 (or C<undef> if an error occurred), and a hash-ref with response headers
95 root 1.68 (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128 root 1.114 =item 595 - errors during connection establishment, proxy handshake.
129 root 1.77
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
160     other retries and so on, and how often to do so.
161 root 1.1
162 root 1.119 Only redirects to http and https URLs are supported. While most common
163     redirection forms are handled entirely within this module, some require
164     the use of the optional L<URI> module. If it is required but missing, then
165     the request will fail with an error.
166    
167 root 1.1 =item headers => hashref
168    
169 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
170     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
171 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
172     (this can be suppressed by using C<undef> for these headers in which case
173     they won't be sent at all).
174 root 1.1
175 root 1.90 You really should provide your own C<User-Agent:> header value that is
176     appropriate for your program - I wouldn't be surprised if the default
177     AnyEvent string gets blocked by webservers sooner or later.
178    
179 root 1.104 Also, make sure that your headers names and values do not contain any
180     embedded newlines.
181    
182 root 1.1 =item timeout => $seconds
183    
184     The time-out to use for various stages - each connect attempt will reset
185 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
186     timeout.
187    
188     Default timeout is 5 minutes.
189 root 1.2
190     =item proxy => [$host, $port[, $scheme]] or undef
191    
192 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
193     used.
194 root 1.2
195 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
196 root 1.1
197 root 1.102 If not specified, then the default proxy is used (see
198     C<AnyEvent::HTTP::set_proxy>).
199    
200 root 1.123 Currently, if your proxy requires authorization, you have to specify an
201     appropriate "Proxy-Authorization" header in every request.
202    
203 root 1.135 Note that this module will prefer an existing persistent connection,
204     even if that connection was made using another proxy. If you need to
205 root 1.136 ensure that a new connection is made in this case, you can either force
206 root 1.135 C<persistent> to false or e.g. use the proxy address in your C<sessionid>.
207    
208 root 1.3 =item body => $string
209    
210 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
211 root 1.3 this module might offer more options).
212    
213 root 1.10 =item cookie_jar => $hash_ref
214    
215     Passing this parameter enables (simplified) cookie-processing, loosely
216     based on the original netscape specification.
217    
218 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
219     will get updated automatically. It is possible to save the cookie jar
220     to persistent storage with something like JSON or Storable - see the
221     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
222     expired or session-only cookies, and also for documentation on the format
223     of the cookie jar.
224 root 1.10
225 root 1.70 Note that this cookie implementation is not meant to be complete. If
226     you want complete cookie management you have to do that on your
227 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
228 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
229     to.
230 root 1.10
231 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
232 root 1.70 headers will be set and handled by this module, otherwise they will be
233 root 1.69 left untouched.
234    
235 root 1.40 =item tls_ctx => $scheme | $tls_ctx
236    
237     Specifies the AnyEvent::TLS context to be used for https connections. This
238     parameter follows the same rules as the C<tls_ctx> parameter to
239     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
240     C<high> can be specified, which give you a predefined low-security (no
241     verification, highest compatibility) and high-security (CA and common-name
242     verification) TLS context.
243    
244     The default for this option is C<low>, which could be interpreted as "give
245     me the page, no matter what".
246    
247 root 1.92 See also the C<sessionid> parameter.
248    
249 root 1.135 =item sessionid => $string
250 root 1.92
251 root 1.135 The module might reuse connections to the same host internally (regardless
252     of other settings, such as C<tcp_connect> or C<proxy>). Sometimes (e.g.
253     when using TLS or a specfic proxy), you do not want to reuse connections
254     from other sessions. This can be achieved by setting this parameter to
255     some unique ID (such as the address of an object storing your state data
256     or the TLS context, or the proxy IP) - only connections using the same
257     unique ID will be reused.
258 root 1.92
259 root 1.51 =item on_prepare => $callback->($fh)
260    
261     In rare cases you need to "tune" the socket before it is used to
262 root 1.117 connect (for example, to bind it on a given IP address). This parameter
263 root 1.51 overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
264     and behaves exactly the same way (e.g. it has to provide a
265     timeout). See the description for the C<$prepare_cb> argument of
266     C<AnyEvent::Socket::tcp_connect> for details.
267    
268 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
269    
270     In even rarer cases you want total control over how AnyEvent::HTTP
271     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
272     to do this, but you can provide your own C<tcp_connect> function -
273 root 1.60 obviously, it has to follow the same calling conventions, except that it
274     may always return a connection guard object.
275 root 1.59
276 root 1.134 The connections made by this hook will be treated as equivalent to
277 root 1.136 connections made the built-in way, specifically, they will be put into
278     and taken from the persistent connection cache. If your C<$tcp_connect>
279 root 1.134 function is incompatible with this kind of re-use, consider switching off
280 root 1.135 C<persistent> connections and/or providing a C<sessionid> identifier.
281 root 1.134
282 root 1.59 There are probably lots of weird uses for this function, starting from
283     tracing the hosts C<http_request> actually tries to connect, to (inexact
284     but fast) host => IP address caching or even socks protocol support.
285    
286 root 1.42 =item on_header => $callback->($headers)
287 root 1.41
288     When specified, this callback will be called with the header hash as soon
289     as headers have been successfully received from the remote server (not on
290     locally-generated errors).
291    
292     It has to return either true (in which case AnyEvent::HTTP will continue),
293     or false, in which case AnyEvent::HTTP will cancel the download (and call
294     the finish callback with an error code of C<598>).
295    
296     This callback is useful, among other things, to quickly reject unwanted
297     content, which, if it is supposed to be rare, can be faster than first
298     doing a C<HEAD> request.
299    
300 root 1.68 The downside is that cancelling the request makes it impossible to re-use
301     the connection. Also, the C<on_header> callback will not receive any
302     trailer (headers sent after the response body).
303    
304 root 1.42 Example: cancel the request unless the content-type is "text/html".
305 root 1.41
306 root 1.42 on_header => sub {
307     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
308     },
309 root 1.41
310 root 1.42 =item on_body => $callback->($partial_body, $headers)
311 root 1.41
312 root 1.42 When specified, all body data will be passed to this callback instead of
313     to the completion callback. The completion callback will get the empty
314     string instead of the body data.
315 root 1.41
316 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
317     or false, in which case AnyEvent::HTTP will cancel the download (and call
318     the completion callback with an error code of C<598>).
319    
320 root 1.68 The downside to cancelling the request is that it makes it impossible to
321     re-use the connection.
322    
323 root 1.42 This callback is useful when the data is too large to be held in memory
324     (so the callback writes it to a file) or when only some information should
325     be extracted, or when the body should be processed incrementally.
326 root 1.41
327     It is usually preferred over doing your own body handling via
328 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
329     only used to create a connection, C<want_body_handle> is the better
330     alternative, as it allows you to install your own event handler, reducing
331     resource usage.
332 root 1.41
333     =item want_body_handle => $enable
334    
335     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
336     changes considerably: after parsing the headers, and instead of
337     downloading the body (if any), the completion callback will be
338     called. Instead of the C<$body> argument containing the body data, the
339     callback will receive the L<AnyEvent::Handle> object associated with the
340     connection. In error cases, C<undef> will be passed. When there is no body
341     (e.g. status C<304>), the empty string will be passed.
342    
343 root 1.92 The handle object might or might not be in TLS mode, might be connected
344     to a proxy, be a persistent connection, use chunked transfer encoding
345     etc., and configured in unspecified ways. The user is responsible for this
346     handle (it will not be used by this module anymore).
347 root 1.41
348     This is useful with some push-type services, where, after the initial
349     headers, an interactive protocol is used (typical example would be the
350     push-style twitter API which starts a JSON/XML stream).
351    
352     If you think you need this, first have a look at C<on_body>, to see if
353 root 1.45 that doesn't solve your problem in a better way.
354 root 1.41
355 root 1.92 =item persistent => $boolean
356    
357     Try to create/reuse a persistent connection. When this flag is set
358     (default: true for idempotent requests, false for all others), then
359     C<http_request> tries to re-use an existing (previously-created)
360 root 1.133 persistent connection to same host (i.e. identical URL scheme, hostname,
361 root 1.135 port and sessionid) and, failing that, tries to create a new one.
362 root 1.92
363     Requests failing in certain ways will be automatically retried once, which
364     is dangerous for non-idempotent requests, which is why it defaults to off
365     for them. The reason for this is because the bozos who designed HTTP/1.1
366     made it impossible to distinguish between a fatal error and a normal
367     connection timeout, so you never know whether there was a problem with
368     your request or not.
369    
370     When reusing an existent connection, many parameters (such as TLS context)
371 root 1.135 will be ignored. See the C<sessionid> parameter for a workaround.
372 root 1.92
373     =item keepalive => $boolean
374    
375     Only used when C<persistent> is also true. This parameter decides whether
376     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
377     (as opposed to only a HTTP/1.1 persistent connection).
378    
379     The default is true, except when using a proxy, in which case it defaults
380     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
381    
382     =item handle_params => { key => value ... }
383    
384     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
385     constructor that is called - not all requests will create a handle, and
386     sometimes more than one is created, so this parameter is only good for
387     setting hints.
388    
389     Example: set the maximum read size to 4096, to potentially conserve memory
390     at the cost of speed.
391    
392     handle_params => {
393     max_read_size => 4096,
394     },
395    
396 root 1.1 =back
397    
398 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
399     the response body.
400 root 1.9
401     http_request GET => "http://www.nethype.de/", sub {
402     my ($body, $hdr) = @_;
403     print "$body\n";
404     };
405    
406 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
407 root 1.9 timeout of 30 seconds.
408    
409     http_request
410 root 1.109 HEAD => "https://www.google.com",
411 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
412 root 1.9 timeout => 30,
413     sub {
414     my ($body, $hdr) = @_;
415     use Data::Dumper;
416     print Dumper $hdr;
417     }
418     ;
419 root 1.1
420 root 1.68 Example: do another simple HTTP GET request, but immediately try to
421 root 1.29 cancel it.
422    
423     my $request = http_request GET => "http://www.nethype.de/", sub {
424     my ($body, $hdr) = @_;
425     print "$body\n";
426     };
427    
428     undef $request;
429    
430 root 1.1 =cut
431    
432 root 1.92 #############################################################################
433     # wait queue/slots
434    
435 root 1.12 sub _slot_schedule;
436 root 1.11 sub _slot_schedule($) {
437     my $host = shift;
438    
439     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
440     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
441 root 1.12 # somebody wants that slot
442 root 1.11 ++$CO_SLOT{$host}[0];
443 root 1.14 ++$ACTIVE;
444 root 1.11
445     $cb->(AnyEvent::Util::guard {
446 root 1.14 --$ACTIVE;
447 root 1.11 --$CO_SLOT{$host}[0];
448     _slot_schedule $host;
449     });
450     } else {
451     # nobody wants the slot, maybe we can forget about it
452     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
453     last;
454     }
455     }
456     }
457    
458     # wait for a free slot on host, call callback
459     sub _get_slot($$) {
460     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
461    
462     _slot_schedule $_[0];
463     }
464    
465 root 1.80 #############################################################################
466 root 1.92 # cookie handling
467 root 1.80
468     # expire cookies
469     sub cookie_jar_expire($;$) {
470     my ($jar, $session_end) = @_;
471    
472 root 1.130 %$jar = () if $jar->{version} != 2;
473 root 1.80
474     my $anow = AE::now;
475    
476     while (my ($chost, $paths) = each %$jar) {
477     next unless ref $paths;
478    
479     while (my ($cpath, $cookies) = each %$paths) {
480     while (my ($cookie, $kv) = each %$cookies) {
481     if (exists $kv->{_expires}) {
482     delete $cookies->{$cookie}
483     if $anow > $kv->{_expires};
484     } elsif ($session_end) {
485     delete $cookies->{$cookie};
486     }
487     }
488    
489     delete $paths->{$cpath}
490     unless %$cookies;
491     }
492    
493     delete $jar->{$chost}
494     unless %$paths;
495     }
496     }
497    
498 root 1.72 # extract cookies from jar
499 root 1.71 sub cookie_jar_extract($$$$) {
500 root 1.92 my ($jar, $scheme, $host, $path) = @_;
501 root 1.71
502 root 1.130 %$jar = () if $jar->{version} != 2;
503    
504     $host = AnyEvent::Util::idn_to_ascii $host
505     if $host =~ /[^\x00-\x7f]/;
506 root 1.71
507     my @cookies;
508    
509     while (my ($chost, $paths) = each %$jar) {
510     next unless ref $paths;
511    
512 root 1.130 # exact match or suffix including . match
513     $chost eq $host or ".$chost" eq substr $host, -1 - length $chost
514     or next;
515 root 1.71
516     while (my ($cpath, $cookies) = each %$paths) {
517 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
518 root 1.71
519     while (my ($cookie, $kv) = each %$cookies) {
520 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
521 root 1.71
522 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
523     delete $cookies->{$cookie};
524     next;
525 root 1.71 }
526    
527     my $value = $kv->{value};
528    
529     if ($value =~ /[=;,[:space:]]/) {
530     $value =~ s/([\\"])/\\$1/g;
531     $value = "\"$value\"";
532     }
533    
534     push @cookies, "$cookie=$value";
535     }
536     }
537     }
538    
539     \@cookies
540     }
541    
542 root 1.72 # parse set_cookie header into jar
543 root 1.80 sub cookie_jar_set_cookie($$$$) {
544 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
545 root 1.80
546 root 1.130 %$jar = () if $jar->{version} != 2;
547    
548 root 1.80 my $anow = int AE::now;
549     my $snow; # server-now
550 root 1.72
551     for ($set_cookie) {
552     # parse NAME=VALUE
553     my @kv;
554    
555 root 1.79 # expires is not http-compliant in the original cookie-spec,
556     # we support the official date format and some extensions
557 root 1.72 while (
558     m{
559     \G\s*
560     (?:
561 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
562 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
563 root 1.72 )
564     }gcxsi
565     ) {
566     my $name = $2;
567     my $value = $4;
568    
569 root 1.82 if (defined $1) {
570 root 1.72 # expires
571     $name = "expires";
572     $value = $1;
573 root 1.82 } elsif (defined $3) {
574 root 1.72 # quoted
575     $value = $3;
576     $value =~ s/\\(.)/$1/gs;
577     }
578    
579 root 1.95 push @kv, @kv ? lc $name : $name, $value;
580 root 1.72
581     last unless /\G\s*;/gc;
582     }
583    
584     last unless @kv;
585    
586     my $name = shift @kv;
587     my %kv = (value => shift @kv, @kv);
588    
589 root 1.80 if (exists $kv{"max-age"}) {
590     $kv{_expires} = $anow + delete $kv{"max-age"};
591     } elsif (exists $kv{expires}) {
592     $snow ||= parse_date ($date) || $anow;
593     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
594     } else {
595     delete $kv{_expires};
596     }
597 root 1.72
598     my $cdom;
599     my $cpath = (delete $kv{path}) || "/";
600    
601     if (exists $kv{domain}) {
602 root 1.130 $cdom = $kv{domain};
603 root 1.72
604     $cdom =~ s/^\.?/./; # make sure it starts with a "."
605    
606     next if $cdom =~ /\.$/;
607    
608     # this is not rfc-like and not netscape-like. go figure.
609     my $ndots = $cdom =~ y/.//;
610     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
611 root 1.130
612     $cdom = substr $cdom, 1; # remove initial .
613 root 1.72 } else {
614 root 1.92 $cdom = $host;
615 root 1.72 }
616    
617     # store it
618 root 1.130 $jar->{version} = 2;
619 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
620 root 1.72
621     redo if /\G\s*,/gc;
622     }
623     }
624    
625 root 1.92 #############################################################################
626     # keepalive/persistent connection cache
627    
628     # fetch a connection from the keepalive cache
629     sub ka_fetch($) {
630     my $ka_key = shift;
631    
632     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
633     delete $KA_CACHE{$ka_key}
634     unless @{ $KA_CACHE{$ka_key} };
635    
636     $hdl
637     }
638    
639     sub ka_store($$) {
640     my ($ka_key, $hdl) = @_;
641    
642     my $kaa = $KA_CACHE{$ka_key} ||= [];
643    
644     my $destroy = sub {
645     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
646    
647     $hdl->destroy;
648    
649     @ka
650     ? $KA_CACHE{$ka_key} = \@ka
651     : delete $KA_CACHE{$ka_key};
652     };
653    
654     # on error etc., destroy
655     $hdl->on_error ($destroy);
656     $hdl->on_eof ($destroy);
657     $hdl->on_read ($destroy);
658     $hdl->timeout ($PERSISTENT_TIMEOUT);
659    
660     push @$kaa, $hdl;
661     shift @$kaa while @$kaa > $MAX_PER_HOST;
662     }
663    
664     #############################################################################
665     # utilities
666    
667 root 1.66 # continue to parse $_ for headers and place them into the arg
668 root 1.92 sub _parse_hdr() {
669 root 1.66 my %hdr;
670    
671     # things seen, not parsed:
672     # p3pP="NON CUR OTPi OUR NOR UNI"
673    
674     $hdr{lc $1} .= ",$2"
675     while /\G
676     ([^:\000-\037]*):
677     [\011\040]*
678     ((?: [^\012]+ | \012[\011\040] )*)
679     \012
680     /gxc;
681    
682     /\G$/
683     or return;
684    
685     # remove the "," prefix we added to all headers above
686     substr $_, 0, 1, ""
687     for values %hdr;
688    
689     \%hdr
690     }
691    
692 root 1.92 #############################################################################
693     # http_get
694    
695 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
696 root 1.34
697 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
698     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
699 root 1.40
700 root 1.92 # maybe it should just become a normal object :/
701    
702     sub _destroy_state(\%) {
703     my ($state) = @_;
704    
705     $state->{handle}->destroy if $state->{handle};
706     %$state = ();
707     }
708    
709     sub _error(\%$$) {
710     my ($state, $cb, $hdr) = @_;
711    
712     &_destroy_state ($state);
713    
714     $cb->(undef, $hdr);
715     ()
716     }
717    
718 root 1.116 our %IDEMPOTENT = (
719     DELETE => 1,
720     GET => 1,
721 root 1.139 QUERY => 1,
722 root 1.116 HEAD => 1,
723     OPTIONS => 1,
724     PUT => 1,
725     TRACE => 1,
726    
727     ACL => 1,
728     "BASELINE-CONTROL" => 1,
729     BIND => 1,
730     CHECKIN => 1,
731     CHECKOUT => 1,
732     COPY => 1,
733     LABEL => 1,
734     LINK => 1,
735     MERGE => 1,
736     MKACTIVITY => 1,
737     MKCALENDAR => 1,
738     MKCOL => 1,
739     MKREDIRECTREF => 1,
740     MKWORKSPACE => 1,
741     MOVE => 1,
742     ORDERPATCH => 1,
743 root 1.139 PRI => 1,
744 root 1.116 PROPFIND => 1,
745     PROPPATCH => 1,
746     REBIND => 1,
747     REPORT => 1,
748     SEARCH => 1,
749     UNBIND => 1,
750     UNCHECKOUT => 1,
751     UNLINK => 1,
752     UNLOCK => 1,
753     UPDATE => 1,
754     UPDATEREDIRECTREF => 1,
755     "VERSION-CONTROL" => 1,
756     );
757    
758 elmex 1.15 sub http_request($$@) {
759 root 1.1 my $cb = pop;
760     my ($method, $url, %arg) = @_;
761    
762     my %hdr;
763    
764 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
765     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
766    
767 root 1.3 $method = uc $method;
768    
769 root 1.8 if (my $hdr = $arg{headers}) {
770 root 1.1 while (my ($k, $v) = each %$hdr) {
771     $hdr{lc $k} = $v;
772     }
773     }
774    
775 root 1.55 # pseudo headers for all subsequent responses
776     my @pseudo = (URL => $url);
777     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
778    
779 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
780 root 1.8
781 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
782 root 1.8 if $recurse < 0;
783    
784 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
785 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
786    
787 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
788 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
789 root 1.2
790 root 1.31 $uscheme = lc $uscheme;
791 root 1.1
792 root 1.31 my $uport = $uscheme eq "http" ? 80
793     : $uscheme eq "https" ? 443
794 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
795 root 1.13
796 root 1.124 $uauthority =~ /^(?: .*\@ )? ([^\@]+?) (?: : (\d+) )?$/x
797 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
798 root 1.10
799 root 1.86 my $uhost = lc $1;
800 root 1.10 $uport = $2 if defined $2;
801    
802 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
803     unless exists $hdr{host};
804 root 1.43
805 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
806 root 1.56 $upath .= $query if length $query;
807 root 1.10
808     $upath =~ s%^/?%/%;
809    
810     # cookie processing
811     if (my $jar = $arg{cookie_jar}) {
812 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
813 root 1.70
814 root 1.71 $hdr{cookie} = join "; ", @$cookies
815     if @$cookies;
816 root 1.10 }
817 root 1.1
818 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
819 root 1.2
820 root 1.10 if ($proxy) {
821 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
822 root 1.31
823 root 1.47 $rscheme = "http" unless defined $rscheme;
824    
825 root 1.31 # don't support https requests over https-proxy transport,
826 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
827 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
828 root 1.86
829     $rhost = lc $rhost;
830     $rscheme = lc $rscheme;
831 root 1.10 } else {
832 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
833 root 1.2 }
834    
835 root 1.47 # leave out fragment and query string, just a heuristic
836 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
837     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
838 root 1.41
839 root 1.53 $hdr{"content-length"} = length $arg{body}
840     if length $arg{body} || $method ne "GET";
841 root 1.1
842 root 1.116 my $idempotent = $IDEMPOTENT{$method};
843 root 1.89
844     # default value for keepalive is true iff the request is for an idempotent method
845 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
846     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
847     my $was_persistent; # true if this is actually a recycled connection
848 root 1.92
849     # the key to use in the keepalive cache
850 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
851 root 1.89
852 root 1.118 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive, " : "" : "close, ") . "Te"; #1.1
853 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
854 root 1.66
855 root 1.11 my %state = (connect_guard => 1);
856    
857 root 1.88 my $ae_error = 595; # connecting
858 root 1.1
859 root 1.88 # handle actual, non-tunneled, request
860     my $handle_actual_request = sub {
861     $ae_error = 596; # request phase
862    
863 root 1.92 my $hdl = $state{handle};
864    
865     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
866 root 1.88
867     # send request
868 root 1.92 $hdl->push_write (
869 root 1.88 "$method $rpath HTTP/1.1\015\012"
870     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
871     . "\015\012"
872 root 1.120 . $arg{body}
873 root 1.88 );
874    
875 root 1.117 # return if error occurred during push_write()
876 root 1.88 return unless %state;
877    
878 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
879     %hdr = ();
880 root 1.88
881     # status line and headers
882     $state{read_response} = sub {
883 root 1.92 return unless %state;
884    
885 root 1.88 for ("$_[1]") {
886     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
887    
888     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
889 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
890 root 1.88
891     # 100 Continue handling
892     # should not happen as we don't send expect: 100-continue,
893     # but we handle it just in case.
894     # since we send the request body regardless, if we get an error
895     # we are out of-sync, which we currently do NOT handle correctly.
896     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
897     if $2 eq 100;
898    
899     push @pseudo,
900     HTTPVersion => $1,
901     Status => $2,
902     Reason => $3,
903     ;
904 root 1.1
905 root 1.92 my $hdr = _parse_hdr
906     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
907 root 1.77
908 root 1.88 %hdr = (%$hdr, @pseudo);
909     }
910 root 1.82
911 root 1.88 # redirect handling
912 root 1.119 # relative uri handling forced by microsoft and other shitheads.
913     # we give our best and fall back to URI if available.
914     if (exists $hdr{location}) {
915     my $loc = $hdr{location};
916    
917     if ($loc =~ m%^//%) { # //
918 root 1.126 $loc = "$uscheme:$loc";
919 root 1.119
920     } elsif ($loc eq "") {
921     $loc = $url;
922    
923     } elsif ($loc !~ /^(?: $ | [^:\/?\#]+ : )/x) { # anything "simple"
924     $loc =~ s/^\.\/+//;
925    
926     if ($loc !~ m%^[.?#]%) {
927 root 1.129 my $prefix = "$uscheme://$uauthority";
928 root 1.119
929     unless ($loc =~ s/^\///) {
930     $prefix .= $upath;
931     $prefix =~ s/\/[^\/]*$//;
932     }
933    
934     $loc = "$prefix/$loc";
935    
936     } elsif (eval { require URI }) { # uri
937     $loc = URI->new_abs ($loc, $url)->as_string;
938    
939     } else {
940     return _error %state, $cb, { @pseudo, Status => 599, Reason => "Cannot parse Location (URI module missing)" };
941     #$hdr{Status} = 599;
942     #$hdr{Reason} = "Unparsable Redirect (URI module missing)";
943     #$recurse = 0;
944     }
945 root 1.82 }
946    
947 root 1.119 $hdr{location} = $loc;
948 root 1.88 }
949 root 1.82
950 root 1.88 my $redirect;
951 root 1.82
952 root 1.88 if ($recurse) {
953     my $status = $hdr{Status};
954 root 1.82
955 root 1.88 # industry standard is to redirect POST as GET for
956     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
957     # also, the UA should ask the user for 301 and 307 and POST,
958     # industry standard seems to be to simply follow.
959 root 1.119 # we go with the industry standard. 308 is defined
960 root 1.121 # by rfc7538
961 root 1.88 if ($status == 301 or $status == 302 or $status == 303) {
962 root 1.122 $redirect = 1;
963 root 1.88 # HTTP/1.1 is unclear on how to mutate the method
964 root 1.122 unless ($method eq "HEAD") {
965     $method = "GET";
966     delete $arg{body};
967     }
968 root 1.119 } elsif ($status == 307 or $status == 308) {
969 root 1.88 $redirect = 1;
970 root 1.82 }
971 root 1.88 }
972 root 1.82
973 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
974 root 1.92 if ($state{handle}) {
975     # handle keepalive
976     if (
977 root 1.103 $persistent
978 root 1.92 && $_[3]
979     && ($hdr{HTTPVersion} < 1.1
980     ? $hdr{connection} =~ /\bkeep-?alive\b/i
981     : $hdr{connection} !~ /\bclose\b/i)
982     ) {
983     ka_store $ka_key, delete $state{handle};
984     } else {
985     # no keepalive, destroy the handle
986     $state{handle}->destroy;
987     }
988     }
989 root 1.82
990 root 1.88 %state = ();
991 root 1.82
992 root 1.88 if (defined $_[1]) {
993     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
994     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
995     }
996 root 1.82
997 root 1.88 # set-cookie processing
998     if ($arg{cookie_jar}) {
999     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
1000     }
1001 root 1.82
1002 root 1.88 if ($redirect && exists $hdr{location}) {
1003     # we ignore any errors, as it is very common to receive
1004     # Content-Length != 0 but no actual body
1005     # we also access %hdr, as $_[1] might be an erro
1006 root 1.103 $state{recurse} =
1007     http_request (
1008     $method => $hdr{location},
1009     %arg,
1010     recurse => $recurse - 1,
1011     Redirect => [$_[0], \%hdr],
1012     sub {
1013     %state = ();
1014     &$cb
1015     },
1016     );
1017 root 1.88 } else {
1018     $cb->($_[0], \%hdr);
1019     }
1020     };
1021 root 1.82
1022 root 1.88 $ae_error = 597; # body phase
1023 root 1.82
1024 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
1025    
1026     my $len = $chunked ? undef : $hdr{"content-length"};
1027 root 1.82
1028 root 1.88 # body handling, many different code paths
1029     # - no body expected
1030     # - want_body_handle
1031     # - te chunked
1032     # - 2x length known (with or without on_body)
1033     # - 2x length not known (with or without on_body)
1034     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
1035     $finish->(undef, 598 => "Request cancelled by on_header");
1036     } elsif (
1037     $hdr{Status} =~ /^(?:1..|204|205|304)$/
1038     or $method eq "HEAD"
1039     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
1040     ) {
1041     # no body
1042     $finish->("", undef, undef, 1);
1043    
1044     } elsif (!$redirect && $arg{want_body_handle}) {
1045     $_[0]->on_eof (undef);
1046     $_[0]->on_error (undef);
1047     $_[0]->on_read (undef);
1048    
1049     $finish->(delete $state{handle});
1050    
1051 root 1.91 } elsif ($chunked) {
1052 root 1.88 my $cl = 0;
1053 root 1.92 my $body = "";
1054 root 1.128 my $on_body = (!$redirect && $arg{on_body}) || sub { $body .= shift; 1 };
1055 root 1.88
1056     $state{read_chunk} = sub {
1057     $_[1] =~ /^([0-9a-fA-F]+)/
1058 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1059 root 1.82
1060 root 1.88 my $len = hex $1;
1061 root 1.82
1062 root 1.88 if ($len) {
1063     $cl += $len;
1064 root 1.82
1065 root 1.88 $_[0]->push_read (chunk => $len, sub {
1066     $on_body->($_[1], \%hdr)
1067     or return $finish->(undef, 598 => "Request cancelled by on_body");
1068 root 1.82
1069 root 1.88 $_[0]->push_read (line => sub {
1070     length $_[1]
1071     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1072     $_[0]->push_read (line => $state{read_chunk});
1073 root 1.82 });
1074 root 1.88 });
1075     } else {
1076     $hdr{"content-length"} ||= $cl;
1077 root 1.84
1078 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
1079     if (length $_[1]) {
1080     for ("$_[1]") {
1081     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
1082 root 1.84
1083 root 1.92 my $hdr = _parse_hdr
1084 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
1085 root 1.84
1086 root 1.88 %hdr = (%hdr, %$hdr);
1087     }
1088     }
1089 root 1.84
1090 root 1.88 $finish->($body, undef, undef, 1);
1091 root 1.84 });
1092     }
1093 root 1.88 };
1094    
1095     $_[0]->push_read (line => $state{read_chunk});
1096    
1097 root 1.128 } elsif (!$redirect && $arg{on_body}) {
1098 root 1.88 if (defined $len) {
1099     $_[0]->on_read (sub {
1100     $len -= length $_[0]{rbuf};
1101    
1102     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1103     or return $finish->(undef, 598 => "Request cancelled by on_body");
1104    
1105     $len > 0
1106     or $finish->("", undef, undef, 1);
1107     });
1108 root 1.84 } else {
1109 root 1.88 $_[0]->on_eof (sub {
1110     $finish->("");
1111     });
1112     $_[0]->on_read (sub {
1113     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1114     or $finish->(undef, 598 => "Request cancelled by on_body");
1115     });
1116     }
1117     } else {
1118     $_[0]->on_eof (undef);
1119 root 1.82
1120 root 1.88 if (defined $len) {
1121     $_[0]->on_read (sub {
1122     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1123     if $len <= length $_[0]{rbuf};
1124     });
1125     } else {
1126     $_[0]->on_error (sub {
1127     ($! == Errno::EPIPE || !$!)
1128     ? $finish->(delete $_[0]{rbuf})
1129     : $finish->(undef, $ae_error => $_[2]);
1130     });
1131     $_[0]->on_read (sub { });
1132 root 1.82 }
1133 root 1.88 }
1134     };
1135 root 1.82
1136 root 1.92 # if keepalive is enabled, then the server closing the connection
1137     # before a response can happen legally - we retry on idempotent methods.
1138 root 1.103 if ($was_persistent && $idempotent) {
1139 root 1.92 my $old_eof = $hdl->{on_eof};
1140     $hdl->{on_eof} = sub {
1141     _destroy_state %state;
1142    
1143 root 1.103 %state = ();
1144     $state{recurse} =
1145     http_request (
1146 root 1.124 $method => $url,
1147 root 1.103 %arg,
1148 root 1.124 recurse => $recurse - 1,
1149     persistent => 0,
1150 root 1.103 sub {
1151     %state = ();
1152     &$cb
1153     }
1154     );
1155 root 1.92 };
1156     $hdl->on_read (sub {
1157     return unless %state;
1158    
1159     # as soon as we receive something, a connection close
1160     # once more becomes a hard error
1161     $hdl->{on_eof} = $old_eof;
1162     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1163     });
1164     } else {
1165     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1166     }
1167     };
1168    
1169     my $prepare_handle = sub {
1170     my ($hdl) = $state{handle};
1171    
1172     $hdl->on_error (sub {
1173     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1174     });
1175     $hdl->on_eof (sub {
1176     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1177     });
1178 root 1.103 $hdl->timeout_reset;
1179     $hdl->timeout ($timeout);
1180 root 1.88 };
1181 root 1.82
1182 root 1.92 # connected to proxy (or origin server)
1183 root 1.88 my $connect_cb = sub {
1184 root 1.92 my $fh = shift
1185     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1186 root 1.44
1187 root 1.88 return unless delete $state{connect_guard};
1188 root 1.11
1189 root 1.88 # get handle
1190     $state{handle} = new AnyEvent::Handle
1191 root 1.92 %{ $arg{handle_params} },
1192     fh => $fh,
1193     peername => $uhost,
1194 root 1.88 tls_ctx => $arg{tls_ctx},
1195     ;
1196 root 1.11
1197 root 1.92 $prepare_handle->();
1198 root 1.1
1199 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1200 root 1.88
1201     # now handle proxy-CONNECT method
1202     if ($proxy && $uscheme eq "https") {
1203     # oh dear, we have to wrap it into a connect request
1204    
1205 root 1.122 my $auth = exists $hdr{"proxy-authorization"}
1206     ? "proxy-authorization: " . (delete $hdr{"proxy-authorization"}) . "\015\012"
1207     : "";
1208    
1209 root 1.88 # maybe re-use $uauthority with patched port?
1210 root 1.122 $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012$auth\015\012");
1211 root 1.88 $state{handle}->push_read (line => $qr_nlnl, sub {
1212     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1213 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1214 root 1.88
1215     if ($2 == 200) {
1216     $rpath = $upath;
1217     $handle_actual_request->();
1218     } else {
1219 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1220 root 1.88 }
1221     });
1222     } else {
1223 root 1.122 delete $hdr{"proxy-authorization"} unless $proxy;
1224    
1225 root 1.88 $handle_actual_request->();
1226     }
1227     };
1228    
1229     _get_slot $uhost, sub {
1230     $state{slot_guard} = shift;
1231 root 1.64
1232 root 1.88 return unless $state{connect_guard};
1233 root 1.64
1234 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1235     # on a keepalive request (in theory, this should be a separate config option).
1236 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1237     $was_persistent = 1;
1238    
1239 root 1.92 $state{handle} = ka_fetch $ka_key;
1240 root 1.129 # $state{handle}->destroyed
1241     # and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1242 root 1.92 $prepare_handle->();
1243 root 1.129 # $state{handle}->destroyed
1244     # and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1245 root 1.137 $rpath = $upath;
1246 root 1.92 $handle_actual_request->();
1247    
1248     } else {
1249     my $tcp_connect = $arg{tcp_connect}
1250     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1251 root 1.57
1252 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1253     }
1254 root 1.1 };
1255    
1256 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1257 root 1.1 }
1258    
1259 elmex 1.15 sub http_get($@) {
1260 root 1.1 unshift @_, "GET";
1261     &http_request
1262     }
1263    
1264 elmex 1.15 sub http_head($@) {
1265 root 1.4 unshift @_, "HEAD";
1266     &http_request
1267     }
1268    
1269 elmex 1.15 sub http_post($$@) {
1270 root 1.22 my $url = shift;
1271     unshift @_, "POST", $url, "body";
1272 root 1.3 &http_request
1273     }
1274    
1275 root 1.9 =back
1276    
1277 root 1.55 =head2 DNS CACHING
1278    
1279     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1280     the actual connection, which in turn uses AnyEvent::DNS to resolve
1281     hostnames. The latter is a simple stub resolver and does no caching
1282     on its own. If you want DNS caching, you currently have to provide
1283     your own default resolver (by storing a suitable resolver object in
1284 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1285 root 1.55
1286 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1287 root 1.1
1288     =over 4
1289    
1290 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1291    
1292     Sets the default proxy server to use. The proxy-url must begin with a
1293 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1294 root 1.52
1295     To clear an already-set proxy, use C<undef>.
1296 root 1.2
1297 root 1.113 When AnyEvent::HTTP is loaded for the first time it will query the
1298 root 1.102 default proxy from the operating system, currently by looking at
1299     C<$ENV{http_proxy>}.
1300    
1301 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1302    
1303     Remove all cookies from the cookie jar that have been expired. If
1304     C<$session_end> is given and true, then additionally remove all session
1305     cookies.
1306    
1307     You should call this function (with a true C<$session_end>) before you
1308     save cookies to disk, and you should call this function after loading them
1309 root 1.117 again. If you have a long-running program you can additionally call this
1310 root 1.80 function from time to time.
1311    
1312     A cookie jar is initially an empty hash-reference that is managed by this
1313 root 1.121 module. Its format is subject to change, but currently it is as follows:
1314 root 1.80
1315 root 1.131 The key C<version> has to contain C<2>, otherwise the hash gets
1316     cleared. All other keys are hostnames or IP addresses pointing to
1317 root 1.80 hash-references. The key for these inner hash references is the
1318     server path for which this cookie is meant, and the values are again
1319 root 1.115 hash-references. Each key of those hash-references is a cookie name, and
1320 root 1.80 the value, you guessed it, is another hash-reference, this time with the
1321     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1322     which have been replaced by a C<_expires> key that contains the cookie
1323 root 1.115 expiry timestamp. Session cookies are indicated by not having an
1324     C<_expires> key.
1325 root 1.80
1326     Here is an example of a cookie jar with a single cookie, so you have a
1327     chance of understanding the above paragraph:
1328    
1329     {
1330 root 1.132 version => 2,
1331 root 1.80 "10.0.0.1" => {
1332     "/" => {
1333     "mythweb_id" => {
1334     _expires => 1293917923,
1335     value => "ooRung9dThee3ooyXooM1Ohm",
1336     },
1337     },
1338     },
1339     }
1340    
1341 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1342    
1343     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1344     Date (RFC 2616).
1345    
1346     =item $timestamp = AnyEvent::HTTP::parse_date $date
1347    
1348 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1349     bunch of minor variations of those, and returns the corresponding POSIX
1350     timestamp, or C<undef> if the date cannot be parsed.
1351 root 1.61
1352 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1353 root 1.1
1354 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1355 root 1.1
1356 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1357    
1358 root 1.110 The default timeout for connection operations (default: C<300>).
1359 root 1.92
1360 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1361    
1362     The default value for the C<User-Agent> header (the default is
1363 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1364 root 1.1
1365 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1366 root 1.1
1367 root 1.47 The maximum number of concurrent connections to the same host (identified
1368 root 1.121 by the hostname). If the limit is exceeded, then additional requests
1369 root 1.92 are queued until previous connections are closed. Both persistent and
1370     non-persistent connections are counted in this limit.
1371 root 1.1
1372 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1373 root 1.92 increase it much.
1374    
1375     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1376 root 1.117 connections, older browsers used 2, newer ones (such as firefox 3)
1377     typically use 6, and Opera uses 8 because like, they have the fastest
1378     browser and give a shit for everybody else on the planet.
1379 root 1.92
1380     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1381    
1382 root 1.116 The time after which idle persistent connections get closed by
1383 root 1.92 AnyEvent::HTTP (default: C<3>).
1384 root 1.3
1385 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1386    
1387     The number of active connections. This is not the number of currently
1388     running requests, but the number of currently open and non-idle TCP
1389 root 1.92 connections. This number can be useful for load-leveling.
1390 root 1.14
1391 root 1.1 =back
1392    
1393     =cut
1394    
1395 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1396     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1397    
1398     sub format_date($) {
1399     my ($time) = @_;
1400    
1401     # RFC 822/1123 format
1402     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1403    
1404     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1405     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1406     $H, $M, $S;
1407     }
1408    
1409     sub parse_date($) {
1410     my ($date) = @_;
1411    
1412     my ($d, $m, $y, $H, $M, $S);
1413    
1414 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1415 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1416     # cookie dates (with "-")
1417    
1418 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1419    
1420 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1421 root 1.61 # RFC 850
1422     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1423    
1424 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1425 root 1.61 # ISO C's asctime
1426     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1427     }
1428     # other formats fail in the loop below
1429    
1430     for (0..11) {
1431     if ($m eq $month[$_]) {
1432     require Time::Local;
1433 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1434 root 1.61 }
1435     }
1436    
1437     undef
1438     }
1439    
1440 root 1.2 sub set_proxy($) {
1441 root 1.52 if (length $_[0]) {
1442 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1443 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1444     $PROXY = [$2, $3 || 3128, $1]
1445     } else {
1446     undef $PROXY;
1447     }
1448 root 1.2 }
1449    
1450     # initialise proxy from environment
1451 root 1.52 eval {
1452     set_proxy $ENV{http_proxy};
1453     };
1454 root 1.2
1455 root 1.93 =head2 SHOWCASE
1456    
1457 root 1.117 This section contains some more elaborate "real-world" examples or code
1458 root 1.93 snippets.
1459    
1460     =head2 HTTP/1.1 FILE DOWNLOAD
1461    
1462 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1463 root 1.98 goes wrong and you want to resume.
1464 root 1.93
1465     Here is a function that initiates and resumes a download. It uses the
1466     last modified time to check for file content changes, and works with many
1467     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1468     on older servers.
1469    
1470     It calls the completion callback with either C<undef>, which means a
1471 root 1.117 nonretryable error occurred, C<0> when the download was partial and should
1472 root 1.93 be retried, and C<1> if it was successful.
1473    
1474     use AnyEvent::HTTP;
1475    
1476     sub download($$$) {
1477     my ($url, $file, $cb) = @_;
1478    
1479     open my $fh, "+<", $file
1480     or die "$file: $!";
1481    
1482     my %hdr;
1483     my $ofs = 0;
1484    
1485     if (stat $fh and -s _) {
1486     $ofs = -s _;
1487 root 1.107 warn "-s is ", $ofs;
1488 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1489     $hdr{"range"} = "bytes=$ofs-";
1490     }
1491    
1492     http_get $url,
1493     headers => \%hdr,
1494     on_header => sub {
1495     my ($hdr) = @_;
1496    
1497     if ($hdr->{Status} == 200 && $ofs) {
1498     # resume failed
1499     truncate $fh, $ofs = 0;
1500     }
1501    
1502     sysseek $fh, $ofs, 0;
1503    
1504     1
1505     },
1506     on_body => sub {
1507     my ($data, $hdr) = @_;
1508    
1509     if ($hdr->{Status} =~ /^2/) {
1510     length $data == syswrite $fh, $data
1511     or return; # abort on write errors
1512     }
1513    
1514     1
1515     },
1516     sub {
1517     my (undef, $hdr) = @_;
1518    
1519     my $status = $hdr->{Status};
1520    
1521     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1522 root 1.127 utime $time, $time, $fh;
1523 root 1.93 }
1524    
1525     if ($status == 200 || $status == 206 || $status == 416) {
1526     # download ok || resume ok || file already fully downloaded
1527     $cb->(1, $hdr);
1528    
1529     } elsif ($status == 412) {
1530     # file has changed while resuming, delete and retry
1531     unlink $file;
1532     $cb->(0, $hdr);
1533    
1534     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1535     # retry later
1536     $cb->(0, $hdr);
1537    
1538     } else {
1539     $cb->(undef, $hdr);
1540     }
1541     }
1542     ;
1543     }
1544    
1545     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1546     if ($_[0]) {
1547     print "OK!\n";
1548     } elsif (defined $_[0]) {
1549     print "please retry later\n";
1550     } else {
1551     print "ERROR\n";
1552     }
1553     };
1554    
1555     =head3 SOCKS PROXIES
1556 root 1.60
1557     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1558     compile your perl to support socks, or use an external program such as
1559     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1560     transparently.
1561    
1562     Alternatively, for AnyEvent::HTTP only, you can use your own
1563     C<tcp_connect> function that does the proxy handshake - here is an example
1564     that works with socks4a proxies:
1565    
1566     use Errno;
1567     use AnyEvent::Util;
1568     use AnyEvent::Socket;
1569     use AnyEvent::Handle;
1570    
1571     # host, port and username of/for your socks4a proxy
1572     my $socks_host = "10.0.0.23";
1573     my $socks_port = 9050;
1574     my $socks_user = "";
1575    
1576     sub socks4a_connect {
1577     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1578    
1579     my $hdl = new AnyEvent::Handle
1580     connect => [$socks_host, $socks_port],
1581     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1582     on_error => sub { $connect_cb->() },
1583     ;
1584    
1585     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1586    
1587     $hdl->push_read (chunk => 8, sub {
1588     my ($hdl, $chunk) = @_;
1589     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1590    
1591     if ($status == 0x5a) {
1592     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1593     } else {
1594     $! = Errno::ENXIO; $connect_cb->();
1595     }
1596     });
1597    
1598     $hdl
1599     }
1600    
1601     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1602     possibly after switching off other proxy types:
1603    
1604     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1605    
1606     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1607     my ($data, $headers) = @_;
1608     ...
1609     };
1610    
1611 root 1.1 =head1 SEE ALSO
1612    
1613     L<AnyEvent>.
1614    
1615     =head1 AUTHOR
1616    
1617 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1618     http://home.schmorp.de/
1619 root 1.1
1620 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1621     testcases and bugreports.
1622    
1623 root 1.1 =cut
1624    
1625     1
1626