ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.140
Committed: Wed Mar 6 19:29:18 2024 UTC (2 months, 1 week ago) by root
Branch: MAIN
CVS Tags: HEAD
Changes since 1.139: +5 -1 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.140 http_get "http://www.nethype.de/", sub {
10     my ($body, $hdr) = @_;
11     print "$hdr->{URL} Status: $hdr->{Status}\n";
12     print $body;
13     };
14 root 1.17
15     # ... do something else here
16    
17 root 1.1 =head1 DESCRIPTION
18    
19     This module is an L<AnyEvent> user, you need to make sure that you use and
20     run a supported event loop.
21    
22 root 1.11 This module implements a simple, stateless and non-blocking HTTP
23     client. It supports GET, POST and other request methods, cookies and more,
24 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
25 root 1.11 automatically limits the number of connections to the values specified in
26     the RFC.
27    
28     It should generally be a "good client" that is enough for most HTTP
29     tasks. Simple tasks should be simple, but complex tasks should still be
30     possible as the user retains control over request and response headers.
31    
32     The caller is responsible for authentication management, cookies (if
33     the simplistic implementation in this module doesn't suffice), referer
34     and other high-level protocol details for which this module offers only
35     limited support.
36    
37 root 1.1 =head2 METHODS
38    
39     =over 4
40    
41     =cut
42    
43     package AnyEvent::HTTP;
44    
45 root 1.85 use common::sense;
46 root 1.1
47 root 1.41 use Errno ();
48 root 1.1
49 root 1.51 use AnyEvent 5.0 ();
50 root 1.1 use AnyEvent::Util ();
51     use AnyEvent::Handle ();
52    
53     use base Exporter::;
54    
55 root 1.137 our $VERSION = 2.25;
56 root 1.1
57 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
58 root 1.1
59 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
60 root 1.3 our $MAX_RECURSE = 10;
61 root 1.92 our $PERSISTENT_TIMEOUT = 3;
62 root 1.2 our $TIMEOUT = 300;
63 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
64 root 1.1
65 root 1.2 our $PROXY;
66 root 1.14 our $ACTIVE = 0;
67 root 1.2
68 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
69 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
70 root 1.1
71     =item http_get $url, key => value..., $cb->($data, $headers)
72    
73     Executes an HTTP-GET request. See the http_request function for details on
74 root 1.29 additional parameters and the return value.
75 root 1.1
76 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
77    
78 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
79     on additional parameters and the return value.
80 root 1.5
81     =item http_post $url, $body, key => value..., $cb->($data, $headers)
82 root 1.3
83 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
84 root 1.29 http_request function for details on additional parameters and the return
85     value.
86 root 1.3
87 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
88    
89     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
90     must be an absolute http or https URL.
91    
92 root 1.29 When called in void context, nothing is returned. In other contexts,
93     C<http_request> returns a "cancellation guard" - you have to keep the
94     object at least alive until the callback get called. If the object gets
95 root 1.58 destroyed before the callback is called, the request will be cancelled.
96 root 1.29
97 root 1.42 The callback will be called with the response body data as first argument
98 root 1.117 (or C<undef> if an error occurred), and a hash-ref with response headers
99 root 1.68 (and trailers) as second argument.
100 root 1.2
101 root 1.7 All the headers in that hash are lowercased. In addition to the response
102 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
103     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
104 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
105     during the body phase of a request, then the original C<Status> and
106     C<Reason> values from the header are available as C<OrigStatus> and
107     C<OrigReason>.
108 root 1.55
109     The pseudo-header C<URL> contains the actual URL (which can differ from
110     the requested URL when following redirects - for example, you might get
111     an error that your URL scheme is not supported even though your URL is a
112     valid http URL because it redirected to an ftp URL, in which case you can
113     look at the URL pseudo header).
114    
115     The pseudo-header C<Redirect> only exists when the request was a result
116     of an internal redirect. In that case it is an array reference with
117     the C<($data, $headers)> from the redirect response. Note that this
118     response could in turn be the result of a redirect itself, and C<<
119     $headers->{Redirect}[1]{Redirect} >> will then contain the original
120     response, and so on.
121 root 1.20
122 root 1.32 If the server sends a header multiple times, then their contents will be
123     joined together with a comma (C<,>), as per the HTTP spec.
124 root 1.2
125     If an internal error occurs, such as not being able to resolve a hostname,
126 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
127     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
128     message. Currently the following status codes are used:
129    
130     =over 4
131    
132 root 1.114 =item 595 - errors during connection establishment, proxy handshake.
133 root 1.77
134     =item 596 - errors during TLS negotiation, request sending and header processing.
135    
136 root 1.78 =item 597 - errors during body receiving or processing.
137 root 1.77
138 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
139 root 1.77
140     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
141    
142     =back
143 root 1.2
144 root 1.6 A typical callback might look like this:
145    
146     sub {
147     my ($body, $hdr) = @_;
148    
149     if ($hdr->{Status} =~ /^2/) {
150     ... everything should be ok
151     } else {
152     print "error, $hdr->{Status} $hdr->{Reason}\n";
153     }
154     }
155    
156 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
157     include:
158    
159     =over 4
160    
161 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
162 root 1.1
163 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
164     other retries and so on, and how often to do so.
165 root 1.1
166 root 1.119 Only redirects to http and https URLs are supported. While most common
167     redirection forms are handled entirely within this module, some require
168     the use of the optional L<URI> module. If it is required but missing, then
169     the request will fail with an error.
170    
171 root 1.1 =item headers => hashref
172    
173 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
174     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
175 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
176     (this can be suppressed by using C<undef> for these headers in which case
177     they won't be sent at all).
178 root 1.1
179 root 1.90 You really should provide your own C<User-Agent:> header value that is
180     appropriate for your program - I wouldn't be surprised if the default
181     AnyEvent string gets blocked by webservers sooner or later.
182    
183 root 1.104 Also, make sure that your headers names and values do not contain any
184     embedded newlines.
185    
186 root 1.1 =item timeout => $seconds
187    
188     The time-out to use for various stages - each connect attempt will reset
189 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
190     timeout.
191    
192     Default timeout is 5 minutes.
193 root 1.2
194     =item proxy => [$host, $port[, $scheme]] or undef
195    
196 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
197     used.
198 root 1.2
199 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
200 root 1.1
201 root 1.102 If not specified, then the default proxy is used (see
202     C<AnyEvent::HTTP::set_proxy>).
203    
204 root 1.123 Currently, if your proxy requires authorization, you have to specify an
205     appropriate "Proxy-Authorization" header in every request.
206    
207 root 1.135 Note that this module will prefer an existing persistent connection,
208     even if that connection was made using another proxy. If you need to
209 root 1.136 ensure that a new connection is made in this case, you can either force
210 root 1.135 C<persistent> to false or e.g. use the proxy address in your C<sessionid>.
211    
212 root 1.3 =item body => $string
213    
214 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
215 root 1.3 this module might offer more options).
216    
217 root 1.10 =item cookie_jar => $hash_ref
218    
219     Passing this parameter enables (simplified) cookie-processing, loosely
220     based on the original netscape specification.
221    
222 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
223     will get updated automatically. It is possible to save the cookie jar
224     to persistent storage with something like JSON or Storable - see the
225     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
226     expired or session-only cookies, and also for documentation on the format
227     of the cookie jar.
228 root 1.10
229 root 1.70 Note that this cookie implementation is not meant to be complete. If
230     you want complete cookie management you have to do that on your
231 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
232 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
233     to.
234 root 1.10
235 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
236 root 1.70 headers will be set and handled by this module, otherwise they will be
237 root 1.69 left untouched.
238    
239 root 1.40 =item tls_ctx => $scheme | $tls_ctx
240    
241     Specifies the AnyEvent::TLS context to be used for https connections. This
242     parameter follows the same rules as the C<tls_ctx> parameter to
243     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
244     C<high> can be specified, which give you a predefined low-security (no
245     verification, highest compatibility) and high-security (CA and common-name
246     verification) TLS context.
247    
248     The default for this option is C<low>, which could be interpreted as "give
249     me the page, no matter what".
250    
251 root 1.92 See also the C<sessionid> parameter.
252    
253 root 1.135 =item sessionid => $string
254 root 1.92
255 root 1.135 The module might reuse connections to the same host internally (regardless
256     of other settings, such as C<tcp_connect> or C<proxy>). Sometimes (e.g.
257     when using TLS or a specfic proxy), you do not want to reuse connections
258     from other sessions. This can be achieved by setting this parameter to
259     some unique ID (such as the address of an object storing your state data
260     or the TLS context, or the proxy IP) - only connections using the same
261     unique ID will be reused.
262 root 1.92
263 root 1.51 =item on_prepare => $callback->($fh)
264    
265     In rare cases you need to "tune" the socket before it is used to
266 root 1.117 connect (for example, to bind it on a given IP address). This parameter
267 root 1.51 overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
268     and behaves exactly the same way (e.g. it has to provide a
269     timeout). See the description for the C<$prepare_cb> argument of
270     C<AnyEvent::Socket::tcp_connect> for details.
271    
272 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
273    
274     In even rarer cases you want total control over how AnyEvent::HTTP
275     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
276     to do this, but you can provide your own C<tcp_connect> function -
277 root 1.60 obviously, it has to follow the same calling conventions, except that it
278     may always return a connection guard object.
279 root 1.59
280 root 1.134 The connections made by this hook will be treated as equivalent to
281 root 1.136 connections made the built-in way, specifically, they will be put into
282     and taken from the persistent connection cache. If your C<$tcp_connect>
283 root 1.134 function is incompatible with this kind of re-use, consider switching off
284 root 1.135 C<persistent> connections and/or providing a C<sessionid> identifier.
285 root 1.134
286 root 1.59 There are probably lots of weird uses for this function, starting from
287     tracing the hosts C<http_request> actually tries to connect, to (inexact
288     but fast) host => IP address caching or even socks protocol support.
289    
290 root 1.42 =item on_header => $callback->($headers)
291 root 1.41
292     When specified, this callback will be called with the header hash as soon
293     as headers have been successfully received from the remote server (not on
294     locally-generated errors).
295    
296     It has to return either true (in which case AnyEvent::HTTP will continue),
297     or false, in which case AnyEvent::HTTP will cancel the download (and call
298     the finish callback with an error code of C<598>).
299    
300     This callback is useful, among other things, to quickly reject unwanted
301     content, which, if it is supposed to be rare, can be faster than first
302     doing a C<HEAD> request.
303    
304 root 1.68 The downside is that cancelling the request makes it impossible to re-use
305     the connection. Also, the C<on_header> callback will not receive any
306     trailer (headers sent after the response body).
307    
308 root 1.42 Example: cancel the request unless the content-type is "text/html".
309 root 1.41
310 root 1.42 on_header => sub {
311     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
312     },
313 root 1.41
314 root 1.42 =item on_body => $callback->($partial_body, $headers)
315 root 1.41
316 root 1.42 When specified, all body data will be passed to this callback instead of
317     to the completion callback. The completion callback will get the empty
318     string instead of the body data.
319 root 1.41
320 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
321     or false, in which case AnyEvent::HTTP will cancel the download (and call
322     the completion callback with an error code of C<598>).
323    
324 root 1.68 The downside to cancelling the request is that it makes it impossible to
325     re-use the connection.
326    
327 root 1.42 This callback is useful when the data is too large to be held in memory
328     (so the callback writes it to a file) or when only some information should
329     be extracted, or when the body should be processed incrementally.
330 root 1.41
331     It is usually preferred over doing your own body handling via
332 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
333     only used to create a connection, C<want_body_handle> is the better
334     alternative, as it allows you to install your own event handler, reducing
335     resource usage.
336 root 1.41
337     =item want_body_handle => $enable
338    
339     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
340     changes considerably: after parsing the headers, and instead of
341     downloading the body (if any), the completion callback will be
342     called. Instead of the C<$body> argument containing the body data, the
343     callback will receive the L<AnyEvent::Handle> object associated with the
344     connection. In error cases, C<undef> will be passed. When there is no body
345     (e.g. status C<304>), the empty string will be passed.
346    
347 root 1.92 The handle object might or might not be in TLS mode, might be connected
348     to a proxy, be a persistent connection, use chunked transfer encoding
349     etc., and configured in unspecified ways. The user is responsible for this
350     handle (it will not be used by this module anymore).
351 root 1.41
352     This is useful with some push-type services, where, after the initial
353     headers, an interactive protocol is used (typical example would be the
354     push-style twitter API which starts a JSON/XML stream).
355    
356     If you think you need this, first have a look at C<on_body>, to see if
357 root 1.45 that doesn't solve your problem in a better way.
358 root 1.41
359 root 1.92 =item persistent => $boolean
360    
361     Try to create/reuse a persistent connection. When this flag is set
362     (default: true for idempotent requests, false for all others), then
363     C<http_request> tries to re-use an existing (previously-created)
364 root 1.133 persistent connection to same host (i.e. identical URL scheme, hostname,
365 root 1.135 port and sessionid) and, failing that, tries to create a new one.
366 root 1.92
367     Requests failing in certain ways will be automatically retried once, which
368     is dangerous for non-idempotent requests, which is why it defaults to off
369     for them. The reason for this is because the bozos who designed HTTP/1.1
370     made it impossible to distinguish between a fatal error and a normal
371     connection timeout, so you never know whether there was a problem with
372     your request or not.
373    
374     When reusing an existent connection, many parameters (such as TLS context)
375 root 1.135 will be ignored. See the C<sessionid> parameter for a workaround.
376 root 1.92
377     =item keepalive => $boolean
378    
379     Only used when C<persistent> is also true. This parameter decides whether
380     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
381     (as opposed to only a HTTP/1.1 persistent connection).
382    
383     The default is true, except when using a proxy, in which case it defaults
384     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
385    
386     =item handle_params => { key => value ... }
387    
388     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
389     constructor that is called - not all requests will create a handle, and
390     sometimes more than one is created, so this parameter is only good for
391     setting hints.
392    
393     Example: set the maximum read size to 4096, to potentially conserve memory
394     at the cost of speed.
395    
396     handle_params => {
397     max_read_size => 4096,
398     },
399    
400 root 1.1 =back
401    
402 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
403     the response body.
404 root 1.9
405     http_request GET => "http://www.nethype.de/", sub {
406     my ($body, $hdr) = @_;
407     print "$body\n";
408     };
409    
410 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
411 root 1.9 timeout of 30 seconds.
412    
413     http_request
414 root 1.109 HEAD => "https://www.google.com",
415 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
416 root 1.9 timeout => 30,
417     sub {
418     my ($body, $hdr) = @_;
419     use Data::Dumper;
420     print Dumper $hdr;
421     }
422     ;
423 root 1.1
424 root 1.68 Example: do another simple HTTP GET request, but immediately try to
425 root 1.29 cancel it.
426    
427     my $request = http_request GET => "http://www.nethype.de/", sub {
428     my ($body, $hdr) = @_;
429     print "$body\n";
430     };
431    
432     undef $request;
433    
434 root 1.1 =cut
435    
436 root 1.92 #############################################################################
437     # wait queue/slots
438    
439 root 1.12 sub _slot_schedule;
440 root 1.11 sub _slot_schedule($) {
441     my $host = shift;
442    
443     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
444     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
445 root 1.12 # somebody wants that slot
446 root 1.11 ++$CO_SLOT{$host}[0];
447 root 1.14 ++$ACTIVE;
448 root 1.11
449     $cb->(AnyEvent::Util::guard {
450 root 1.14 --$ACTIVE;
451 root 1.11 --$CO_SLOT{$host}[0];
452     _slot_schedule $host;
453     });
454     } else {
455     # nobody wants the slot, maybe we can forget about it
456     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
457     last;
458     }
459     }
460     }
461    
462     # wait for a free slot on host, call callback
463     sub _get_slot($$) {
464     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
465    
466     _slot_schedule $_[0];
467     }
468    
469 root 1.80 #############################################################################
470 root 1.92 # cookie handling
471 root 1.80
472     # expire cookies
473     sub cookie_jar_expire($;$) {
474     my ($jar, $session_end) = @_;
475    
476 root 1.130 %$jar = () if $jar->{version} != 2;
477 root 1.80
478     my $anow = AE::now;
479    
480     while (my ($chost, $paths) = each %$jar) {
481     next unless ref $paths;
482    
483     while (my ($cpath, $cookies) = each %$paths) {
484     while (my ($cookie, $kv) = each %$cookies) {
485     if (exists $kv->{_expires}) {
486     delete $cookies->{$cookie}
487     if $anow > $kv->{_expires};
488     } elsif ($session_end) {
489     delete $cookies->{$cookie};
490     }
491     }
492    
493     delete $paths->{$cpath}
494     unless %$cookies;
495     }
496    
497     delete $jar->{$chost}
498     unless %$paths;
499     }
500     }
501    
502 root 1.72 # extract cookies from jar
503 root 1.71 sub cookie_jar_extract($$$$) {
504 root 1.92 my ($jar, $scheme, $host, $path) = @_;
505 root 1.71
506 root 1.130 %$jar = () if $jar->{version} != 2;
507    
508     $host = AnyEvent::Util::idn_to_ascii $host
509     if $host =~ /[^\x00-\x7f]/;
510 root 1.71
511     my @cookies;
512    
513     while (my ($chost, $paths) = each %$jar) {
514     next unless ref $paths;
515    
516 root 1.130 # exact match or suffix including . match
517     $chost eq $host or ".$chost" eq substr $host, -1 - length $chost
518     or next;
519 root 1.71
520     while (my ($cpath, $cookies) = each %$paths) {
521 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
522 root 1.71
523     while (my ($cookie, $kv) = each %$cookies) {
524 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
525 root 1.71
526 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
527     delete $cookies->{$cookie};
528     next;
529 root 1.71 }
530    
531     my $value = $kv->{value};
532    
533     if ($value =~ /[=;,[:space:]]/) {
534     $value =~ s/([\\"])/\\$1/g;
535     $value = "\"$value\"";
536     }
537    
538     push @cookies, "$cookie=$value";
539     }
540     }
541     }
542    
543     \@cookies
544     }
545    
546 root 1.72 # parse set_cookie header into jar
547 root 1.80 sub cookie_jar_set_cookie($$$$) {
548 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
549 root 1.80
550 root 1.130 %$jar = () if $jar->{version} != 2;
551    
552 root 1.80 my $anow = int AE::now;
553     my $snow; # server-now
554 root 1.72
555     for ($set_cookie) {
556     # parse NAME=VALUE
557     my @kv;
558    
559 root 1.79 # expires is not http-compliant in the original cookie-spec,
560     # we support the official date format and some extensions
561 root 1.72 while (
562     m{
563     \G\s*
564     (?:
565 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
566 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
567 root 1.72 )
568     }gcxsi
569     ) {
570     my $name = $2;
571     my $value = $4;
572    
573 root 1.82 if (defined $1) {
574 root 1.72 # expires
575     $name = "expires";
576     $value = $1;
577 root 1.82 } elsif (defined $3) {
578 root 1.72 # quoted
579     $value = $3;
580     $value =~ s/\\(.)/$1/gs;
581     }
582    
583 root 1.95 push @kv, @kv ? lc $name : $name, $value;
584 root 1.72
585     last unless /\G\s*;/gc;
586     }
587    
588     last unless @kv;
589    
590     my $name = shift @kv;
591     my %kv = (value => shift @kv, @kv);
592    
593 root 1.80 if (exists $kv{"max-age"}) {
594     $kv{_expires} = $anow + delete $kv{"max-age"};
595     } elsif (exists $kv{expires}) {
596     $snow ||= parse_date ($date) || $anow;
597     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
598     } else {
599     delete $kv{_expires};
600     }
601 root 1.72
602     my $cdom;
603     my $cpath = (delete $kv{path}) || "/";
604    
605     if (exists $kv{domain}) {
606 root 1.130 $cdom = $kv{domain};
607 root 1.72
608     $cdom =~ s/^\.?/./; # make sure it starts with a "."
609    
610     next if $cdom =~ /\.$/;
611    
612     # this is not rfc-like and not netscape-like. go figure.
613     my $ndots = $cdom =~ y/.//;
614     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
615 root 1.130
616     $cdom = substr $cdom, 1; # remove initial .
617 root 1.72 } else {
618 root 1.92 $cdom = $host;
619 root 1.72 }
620    
621     # store it
622 root 1.130 $jar->{version} = 2;
623 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
624 root 1.72
625     redo if /\G\s*,/gc;
626     }
627     }
628    
629 root 1.92 #############################################################################
630     # keepalive/persistent connection cache
631    
632     # fetch a connection from the keepalive cache
633     sub ka_fetch($) {
634     my $ka_key = shift;
635    
636     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
637     delete $KA_CACHE{$ka_key}
638     unless @{ $KA_CACHE{$ka_key} };
639    
640     $hdl
641     }
642    
643     sub ka_store($$) {
644     my ($ka_key, $hdl) = @_;
645    
646     my $kaa = $KA_CACHE{$ka_key} ||= [];
647    
648     my $destroy = sub {
649     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
650    
651     $hdl->destroy;
652    
653     @ka
654     ? $KA_CACHE{$ka_key} = \@ka
655     : delete $KA_CACHE{$ka_key};
656     };
657    
658     # on error etc., destroy
659     $hdl->on_error ($destroy);
660     $hdl->on_eof ($destroy);
661     $hdl->on_read ($destroy);
662     $hdl->timeout ($PERSISTENT_TIMEOUT);
663    
664     push @$kaa, $hdl;
665     shift @$kaa while @$kaa > $MAX_PER_HOST;
666     }
667    
668     #############################################################################
669     # utilities
670    
671 root 1.66 # continue to parse $_ for headers and place them into the arg
672 root 1.92 sub _parse_hdr() {
673 root 1.66 my %hdr;
674    
675     # things seen, not parsed:
676     # p3pP="NON CUR OTPi OUR NOR UNI"
677    
678     $hdr{lc $1} .= ",$2"
679     while /\G
680     ([^:\000-\037]*):
681     [\011\040]*
682     ((?: [^\012]+ | \012[\011\040] )*)
683     \012
684     /gxc;
685    
686     /\G$/
687     or return;
688    
689     # remove the "," prefix we added to all headers above
690     substr $_, 0, 1, ""
691     for values %hdr;
692    
693     \%hdr
694     }
695    
696 root 1.92 #############################################################################
697     # http_get
698    
699 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
700 root 1.34
701 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
702     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
703 root 1.40
704 root 1.92 # maybe it should just become a normal object :/
705    
706     sub _destroy_state(\%) {
707     my ($state) = @_;
708    
709     $state->{handle}->destroy if $state->{handle};
710     %$state = ();
711     }
712    
713     sub _error(\%$$) {
714     my ($state, $cb, $hdr) = @_;
715    
716     &_destroy_state ($state);
717    
718     $cb->(undef, $hdr);
719     ()
720     }
721    
722 root 1.116 our %IDEMPOTENT = (
723     DELETE => 1,
724     GET => 1,
725 root 1.139 QUERY => 1,
726 root 1.116 HEAD => 1,
727     OPTIONS => 1,
728     PUT => 1,
729     TRACE => 1,
730    
731     ACL => 1,
732     "BASELINE-CONTROL" => 1,
733     BIND => 1,
734     CHECKIN => 1,
735     CHECKOUT => 1,
736     COPY => 1,
737     LABEL => 1,
738     LINK => 1,
739     MERGE => 1,
740     MKACTIVITY => 1,
741     MKCALENDAR => 1,
742     MKCOL => 1,
743     MKREDIRECTREF => 1,
744     MKWORKSPACE => 1,
745     MOVE => 1,
746     ORDERPATCH => 1,
747 root 1.139 PRI => 1,
748 root 1.116 PROPFIND => 1,
749     PROPPATCH => 1,
750     REBIND => 1,
751     REPORT => 1,
752     SEARCH => 1,
753     UNBIND => 1,
754     UNCHECKOUT => 1,
755     UNLINK => 1,
756     UNLOCK => 1,
757     UPDATE => 1,
758     UPDATEREDIRECTREF => 1,
759     "VERSION-CONTROL" => 1,
760     );
761    
762 elmex 1.15 sub http_request($$@) {
763 root 1.1 my $cb = pop;
764     my ($method, $url, %arg) = @_;
765    
766     my %hdr;
767    
768 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
769     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
770    
771 root 1.3 $method = uc $method;
772    
773 root 1.8 if (my $hdr = $arg{headers}) {
774 root 1.1 while (my ($k, $v) = each %$hdr) {
775     $hdr{lc $k} = $v;
776     }
777     }
778    
779 root 1.55 # pseudo headers for all subsequent responses
780     my @pseudo = (URL => $url);
781     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
782    
783 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
784 root 1.8
785 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
786 root 1.8 if $recurse < 0;
787    
788 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
789 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
790    
791 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
792 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
793 root 1.2
794 root 1.31 $uscheme = lc $uscheme;
795 root 1.1
796 root 1.31 my $uport = $uscheme eq "http" ? 80
797     : $uscheme eq "https" ? 443
798 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
799 root 1.13
800 root 1.124 $uauthority =~ /^(?: .*\@ )? ([^\@]+?) (?: : (\d+) )?$/x
801 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
802 root 1.10
803 root 1.86 my $uhost = lc $1;
804 root 1.10 $uport = $2 if defined $2;
805    
806 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
807     unless exists $hdr{host};
808 root 1.43
809 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
810 root 1.56 $upath .= $query if length $query;
811 root 1.10
812     $upath =~ s%^/?%/%;
813    
814     # cookie processing
815     if (my $jar = $arg{cookie_jar}) {
816 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
817 root 1.70
818 root 1.71 $hdr{cookie} = join "; ", @$cookies
819     if @$cookies;
820 root 1.10 }
821 root 1.1
822 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
823 root 1.2
824 root 1.10 if ($proxy) {
825 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
826 root 1.31
827 root 1.47 $rscheme = "http" unless defined $rscheme;
828    
829 root 1.31 # don't support https requests over https-proxy transport,
830 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
831 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
832 root 1.86
833     $rhost = lc $rhost;
834     $rscheme = lc $rscheme;
835 root 1.10 } else {
836 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
837 root 1.2 }
838    
839 root 1.47 # leave out fragment and query string, just a heuristic
840 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
841     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
842 root 1.41
843 root 1.53 $hdr{"content-length"} = length $arg{body}
844     if length $arg{body} || $method ne "GET";
845 root 1.1
846 root 1.116 my $idempotent = $IDEMPOTENT{$method};
847 root 1.89
848     # default value for keepalive is true iff the request is for an idempotent method
849 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
850     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
851     my $was_persistent; # true if this is actually a recycled connection
852 root 1.92
853     # the key to use in the keepalive cache
854 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
855 root 1.89
856 root 1.118 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive, " : "" : "close, ") . "Te"; #1.1
857 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
858 root 1.66
859 root 1.11 my %state = (connect_guard => 1);
860    
861 root 1.88 my $ae_error = 595; # connecting
862 root 1.1
863 root 1.88 # handle actual, non-tunneled, request
864     my $handle_actual_request = sub {
865     $ae_error = 596; # request phase
866    
867 root 1.92 my $hdl = $state{handle};
868    
869     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
870 root 1.88
871     # send request
872 root 1.92 $hdl->push_write (
873 root 1.88 "$method $rpath HTTP/1.1\015\012"
874     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
875     . "\015\012"
876 root 1.120 . $arg{body}
877 root 1.88 );
878    
879 root 1.117 # return if error occurred during push_write()
880 root 1.88 return unless %state;
881    
882 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
883     %hdr = ();
884 root 1.88
885     # status line and headers
886     $state{read_response} = sub {
887 root 1.92 return unless %state;
888    
889 root 1.88 for ("$_[1]") {
890     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
891    
892     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
893 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
894 root 1.88
895     # 100 Continue handling
896     # should not happen as we don't send expect: 100-continue,
897     # but we handle it just in case.
898     # since we send the request body regardless, if we get an error
899     # we are out of-sync, which we currently do NOT handle correctly.
900     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
901     if $2 eq 100;
902    
903     push @pseudo,
904     HTTPVersion => $1,
905     Status => $2,
906     Reason => $3,
907     ;
908 root 1.1
909 root 1.92 my $hdr = _parse_hdr
910     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
911 root 1.77
912 root 1.88 %hdr = (%$hdr, @pseudo);
913     }
914 root 1.82
915 root 1.88 # redirect handling
916 root 1.119 # relative uri handling forced by microsoft and other shitheads.
917     # we give our best and fall back to URI if available.
918     if (exists $hdr{location}) {
919     my $loc = $hdr{location};
920    
921     if ($loc =~ m%^//%) { # //
922 root 1.126 $loc = "$uscheme:$loc";
923 root 1.119
924     } elsif ($loc eq "") {
925     $loc = $url;
926    
927     } elsif ($loc !~ /^(?: $ | [^:\/?\#]+ : )/x) { # anything "simple"
928     $loc =~ s/^\.\/+//;
929    
930     if ($loc !~ m%^[.?#]%) {
931 root 1.129 my $prefix = "$uscheme://$uauthority";
932 root 1.119
933     unless ($loc =~ s/^\///) {
934     $prefix .= $upath;
935     $prefix =~ s/\/[^\/]*$//;
936     }
937    
938     $loc = "$prefix/$loc";
939    
940     } elsif (eval { require URI }) { # uri
941     $loc = URI->new_abs ($loc, $url)->as_string;
942    
943     } else {
944     return _error %state, $cb, { @pseudo, Status => 599, Reason => "Cannot parse Location (URI module missing)" };
945     #$hdr{Status} = 599;
946     #$hdr{Reason} = "Unparsable Redirect (URI module missing)";
947     #$recurse = 0;
948     }
949 root 1.82 }
950    
951 root 1.119 $hdr{location} = $loc;
952 root 1.88 }
953 root 1.82
954 root 1.88 my $redirect;
955 root 1.82
956 root 1.88 if ($recurse) {
957     my $status = $hdr{Status};
958 root 1.82
959 root 1.88 # industry standard is to redirect POST as GET for
960     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
961     # also, the UA should ask the user for 301 and 307 and POST,
962     # industry standard seems to be to simply follow.
963 root 1.119 # we go with the industry standard. 308 is defined
964 root 1.121 # by rfc7538
965 root 1.88 if ($status == 301 or $status == 302 or $status == 303) {
966 root 1.122 $redirect = 1;
967 root 1.88 # HTTP/1.1 is unclear on how to mutate the method
968 root 1.122 unless ($method eq "HEAD") {
969     $method = "GET";
970     delete $arg{body};
971     }
972 root 1.119 } elsif ($status == 307 or $status == 308) {
973 root 1.88 $redirect = 1;
974 root 1.82 }
975 root 1.88 }
976 root 1.82
977 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
978 root 1.92 if ($state{handle}) {
979     # handle keepalive
980     if (
981 root 1.103 $persistent
982 root 1.92 && $_[3]
983     && ($hdr{HTTPVersion} < 1.1
984     ? $hdr{connection} =~ /\bkeep-?alive\b/i
985     : $hdr{connection} !~ /\bclose\b/i)
986     ) {
987     ka_store $ka_key, delete $state{handle};
988     } else {
989     # no keepalive, destroy the handle
990     $state{handle}->destroy;
991     }
992     }
993 root 1.82
994 root 1.88 %state = ();
995 root 1.82
996 root 1.88 if (defined $_[1]) {
997     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
998     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
999     }
1000 root 1.82
1001 root 1.88 # set-cookie processing
1002     if ($arg{cookie_jar}) {
1003     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
1004     }
1005 root 1.82
1006 root 1.88 if ($redirect && exists $hdr{location}) {
1007     # we ignore any errors, as it is very common to receive
1008     # Content-Length != 0 but no actual body
1009     # we also access %hdr, as $_[1] might be an erro
1010 root 1.103 $state{recurse} =
1011     http_request (
1012     $method => $hdr{location},
1013     %arg,
1014     recurse => $recurse - 1,
1015     Redirect => [$_[0], \%hdr],
1016     sub {
1017     %state = ();
1018     &$cb
1019     },
1020     );
1021 root 1.88 } else {
1022     $cb->($_[0], \%hdr);
1023     }
1024     };
1025 root 1.82
1026 root 1.88 $ae_error = 597; # body phase
1027 root 1.82
1028 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
1029    
1030     my $len = $chunked ? undef : $hdr{"content-length"};
1031 root 1.82
1032 root 1.88 # body handling, many different code paths
1033     # - no body expected
1034     # - want_body_handle
1035     # - te chunked
1036     # - 2x length known (with or without on_body)
1037     # - 2x length not known (with or without on_body)
1038     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
1039     $finish->(undef, 598 => "Request cancelled by on_header");
1040     } elsif (
1041     $hdr{Status} =~ /^(?:1..|204|205|304)$/
1042     or $method eq "HEAD"
1043     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
1044     ) {
1045     # no body
1046     $finish->("", undef, undef, 1);
1047    
1048     } elsif (!$redirect && $arg{want_body_handle}) {
1049     $_[0]->on_eof (undef);
1050     $_[0]->on_error (undef);
1051     $_[0]->on_read (undef);
1052    
1053     $finish->(delete $state{handle});
1054    
1055 root 1.91 } elsif ($chunked) {
1056 root 1.88 my $cl = 0;
1057 root 1.92 my $body = "";
1058 root 1.128 my $on_body = (!$redirect && $arg{on_body}) || sub { $body .= shift; 1 };
1059 root 1.88
1060     $state{read_chunk} = sub {
1061     $_[1] =~ /^([0-9a-fA-F]+)/
1062 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1063 root 1.82
1064 root 1.88 my $len = hex $1;
1065 root 1.82
1066 root 1.88 if ($len) {
1067     $cl += $len;
1068 root 1.82
1069 root 1.88 $_[0]->push_read (chunk => $len, sub {
1070     $on_body->($_[1], \%hdr)
1071     or return $finish->(undef, 598 => "Request cancelled by on_body");
1072 root 1.82
1073 root 1.88 $_[0]->push_read (line => sub {
1074     length $_[1]
1075     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1076     $_[0]->push_read (line => $state{read_chunk});
1077 root 1.82 });
1078 root 1.88 });
1079     } else {
1080     $hdr{"content-length"} ||= $cl;
1081 root 1.84
1082 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
1083     if (length $_[1]) {
1084     for ("$_[1]") {
1085     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
1086 root 1.84
1087 root 1.92 my $hdr = _parse_hdr
1088 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
1089 root 1.84
1090 root 1.88 %hdr = (%hdr, %$hdr);
1091     }
1092     }
1093 root 1.84
1094 root 1.88 $finish->($body, undef, undef, 1);
1095 root 1.84 });
1096     }
1097 root 1.88 };
1098    
1099     $_[0]->push_read (line => $state{read_chunk});
1100    
1101 root 1.128 } elsif (!$redirect && $arg{on_body}) {
1102 root 1.88 if (defined $len) {
1103     $_[0]->on_read (sub {
1104     $len -= length $_[0]{rbuf};
1105    
1106     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1107     or return $finish->(undef, 598 => "Request cancelled by on_body");
1108    
1109     $len > 0
1110     or $finish->("", undef, undef, 1);
1111     });
1112 root 1.84 } else {
1113 root 1.88 $_[0]->on_eof (sub {
1114     $finish->("");
1115     });
1116     $_[0]->on_read (sub {
1117     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1118     or $finish->(undef, 598 => "Request cancelled by on_body");
1119     });
1120     }
1121     } else {
1122     $_[0]->on_eof (undef);
1123 root 1.82
1124 root 1.88 if (defined $len) {
1125     $_[0]->on_read (sub {
1126     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1127     if $len <= length $_[0]{rbuf};
1128     });
1129     } else {
1130     $_[0]->on_error (sub {
1131     ($! == Errno::EPIPE || !$!)
1132     ? $finish->(delete $_[0]{rbuf})
1133     : $finish->(undef, $ae_error => $_[2]);
1134     });
1135     $_[0]->on_read (sub { });
1136 root 1.82 }
1137 root 1.88 }
1138     };
1139 root 1.82
1140 root 1.92 # if keepalive is enabled, then the server closing the connection
1141     # before a response can happen legally - we retry on idempotent methods.
1142 root 1.103 if ($was_persistent && $idempotent) {
1143 root 1.92 my $old_eof = $hdl->{on_eof};
1144     $hdl->{on_eof} = sub {
1145     _destroy_state %state;
1146    
1147 root 1.103 %state = ();
1148     $state{recurse} =
1149     http_request (
1150 root 1.124 $method => $url,
1151 root 1.103 %arg,
1152 root 1.124 recurse => $recurse - 1,
1153     persistent => 0,
1154 root 1.103 sub {
1155     %state = ();
1156     &$cb
1157     }
1158     );
1159 root 1.92 };
1160     $hdl->on_read (sub {
1161     return unless %state;
1162    
1163     # as soon as we receive something, a connection close
1164     # once more becomes a hard error
1165     $hdl->{on_eof} = $old_eof;
1166     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1167     });
1168     } else {
1169     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1170     }
1171     };
1172    
1173     my $prepare_handle = sub {
1174     my ($hdl) = $state{handle};
1175    
1176     $hdl->on_error (sub {
1177     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1178     });
1179     $hdl->on_eof (sub {
1180     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1181     });
1182 root 1.103 $hdl->timeout_reset;
1183     $hdl->timeout ($timeout);
1184 root 1.88 };
1185 root 1.82
1186 root 1.92 # connected to proxy (or origin server)
1187 root 1.88 my $connect_cb = sub {
1188 root 1.92 my $fh = shift
1189     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1190 root 1.44
1191 root 1.88 return unless delete $state{connect_guard};
1192 root 1.11
1193 root 1.88 # get handle
1194     $state{handle} = new AnyEvent::Handle
1195 root 1.92 %{ $arg{handle_params} },
1196     fh => $fh,
1197     peername => $uhost,
1198 root 1.88 tls_ctx => $arg{tls_ctx},
1199     ;
1200 root 1.11
1201 root 1.92 $prepare_handle->();
1202 root 1.1
1203 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1204 root 1.88
1205     # now handle proxy-CONNECT method
1206     if ($proxy && $uscheme eq "https") {
1207     # oh dear, we have to wrap it into a connect request
1208    
1209 root 1.122 my $auth = exists $hdr{"proxy-authorization"}
1210     ? "proxy-authorization: " . (delete $hdr{"proxy-authorization"}) . "\015\012"
1211     : "";
1212    
1213 root 1.88 # maybe re-use $uauthority with patched port?
1214 root 1.122 $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012$auth\015\012");
1215 root 1.88 $state{handle}->push_read (line => $qr_nlnl, sub {
1216     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1217 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1218 root 1.88
1219     if ($2 == 200) {
1220     $rpath = $upath;
1221     $handle_actual_request->();
1222     } else {
1223 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1224 root 1.88 }
1225     });
1226     } else {
1227 root 1.122 delete $hdr{"proxy-authorization"} unless $proxy;
1228    
1229 root 1.88 $handle_actual_request->();
1230     }
1231     };
1232    
1233     _get_slot $uhost, sub {
1234     $state{slot_guard} = shift;
1235 root 1.64
1236 root 1.88 return unless $state{connect_guard};
1237 root 1.64
1238 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1239     # on a keepalive request (in theory, this should be a separate config option).
1240 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1241     $was_persistent = 1;
1242    
1243 root 1.92 $state{handle} = ka_fetch $ka_key;
1244 root 1.129 # $state{handle}->destroyed
1245     # and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1246 root 1.92 $prepare_handle->();
1247 root 1.129 # $state{handle}->destroyed
1248     # and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1249 root 1.137 $rpath = $upath;
1250 root 1.92 $handle_actual_request->();
1251    
1252     } else {
1253     my $tcp_connect = $arg{tcp_connect}
1254     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1255 root 1.57
1256 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1257     }
1258 root 1.1 };
1259    
1260 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1261 root 1.1 }
1262    
1263 elmex 1.15 sub http_get($@) {
1264 root 1.1 unshift @_, "GET";
1265     &http_request
1266     }
1267    
1268 elmex 1.15 sub http_head($@) {
1269 root 1.4 unshift @_, "HEAD";
1270     &http_request
1271     }
1272    
1273 elmex 1.15 sub http_post($$@) {
1274 root 1.22 my $url = shift;
1275     unshift @_, "POST", $url, "body";
1276 root 1.3 &http_request
1277     }
1278    
1279 root 1.9 =back
1280    
1281 root 1.55 =head2 DNS CACHING
1282    
1283     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1284     the actual connection, which in turn uses AnyEvent::DNS to resolve
1285     hostnames. The latter is a simple stub resolver and does no caching
1286     on its own. If you want DNS caching, you currently have to provide
1287     your own default resolver (by storing a suitable resolver object in
1288 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1289 root 1.55
1290 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1291 root 1.1
1292     =over 4
1293    
1294 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1295    
1296     Sets the default proxy server to use. The proxy-url must begin with a
1297 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1298 root 1.52
1299     To clear an already-set proxy, use C<undef>.
1300 root 1.2
1301 root 1.113 When AnyEvent::HTTP is loaded for the first time it will query the
1302 root 1.102 default proxy from the operating system, currently by looking at
1303     C<$ENV{http_proxy>}.
1304    
1305 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1306    
1307     Remove all cookies from the cookie jar that have been expired. If
1308     C<$session_end> is given and true, then additionally remove all session
1309     cookies.
1310    
1311     You should call this function (with a true C<$session_end>) before you
1312     save cookies to disk, and you should call this function after loading them
1313 root 1.117 again. If you have a long-running program you can additionally call this
1314 root 1.80 function from time to time.
1315    
1316     A cookie jar is initially an empty hash-reference that is managed by this
1317 root 1.121 module. Its format is subject to change, but currently it is as follows:
1318 root 1.80
1319 root 1.131 The key C<version> has to contain C<2>, otherwise the hash gets
1320     cleared. All other keys are hostnames or IP addresses pointing to
1321 root 1.80 hash-references. The key for these inner hash references is the
1322     server path for which this cookie is meant, and the values are again
1323 root 1.115 hash-references. Each key of those hash-references is a cookie name, and
1324 root 1.80 the value, you guessed it, is another hash-reference, this time with the
1325     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1326     which have been replaced by a C<_expires> key that contains the cookie
1327 root 1.115 expiry timestamp. Session cookies are indicated by not having an
1328     C<_expires> key.
1329 root 1.80
1330     Here is an example of a cookie jar with a single cookie, so you have a
1331     chance of understanding the above paragraph:
1332    
1333     {
1334 root 1.132 version => 2,
1335 root 1.80 "10.0.0.1" => {
1336     "/" => {
1337     "mythweb_id" => {
1338     _expires => 1293917923,
1339     value => "ooRung9dThee3ooyXooM1Ohm",
1340     },
1341     },
1342     },
1343     }
1344    
1345 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1346    
1347     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1348     Date (RFC 2616).
1349    
1350     =item $timestamp = AnyEvent::HTTP::parse_date $date
1351    
1352 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1353     bunch of minor variations of those, and returns the corresponding POSIX
1354     timestamp, or C<undef> if the date cannot be parsed.
1355 root 1.61
1356 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1357 root 1.1
1358 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1359 root 1.1
1360 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1361    
1362 root 1.110 The default timeout for connection operations (default: C<300>).
1363 root 1.92
1364 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1365    
1366     The default value for the C<User-Agent> header (the default is
1367 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1368 root 1.1
1369 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1370 root 1.1
1371 root 1.47 The maximum number of concurrent connections to the same host (identified
1372 root 1.121 by the hostname). If the limit is exceeded, then additional requests
1373 root 1.92 are queued until previous connections are closed. Both persistent and
1374     non-persistent connections are counted in this limit.
1375 root 1.1
1376 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1377 root 1.92 increase it much.
1378    
1379     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1380 root 1.117 connections, older browsers used 2, newer ones (such as firefox 3)
1381     typically use 6, and Opera uses 8 because like, they have the fastest
1382     browser and give a shit for everybody else on the planet.
1383 root 1.92
1384     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1385    
1386 root 1.116 The time after which idle persistent connections get closed by
1387 root 1.92 AnyEvent::HTTP (default: C<3>).
1388 root 1.3
1389 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1390    
1391     The number of active connections. This is not the number of currently
1392     running requests, but the number of currently open and non-idle TCP
1393 root 1.92 connections. This number can be useful for load-leveling.
1394 root 1.14
1395 root 1.1 =back
1396    
1397     =cut
1398    
1399 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1400     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1401    
1402     sub format_date($) {
1403     my ($time) = @_;
1404    
1405     # RFC 822/1123 format
1406     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1407    
1408     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1409     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1410     $H, $M, $S;
1411     }
1412    
1413     sub parse_date($) {
1414     my ($date) = @_;
1415    
1416     my ($d, $m, $y, $H, $M, $S);
1417    
1418 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1419 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1420     # cookie dates (with "-")
1421    
1422 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1423    
1424 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1425 root 1.61 # RFC 850
1426     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1427    
1428 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1429 root 1.61 # ISO C's asctime
1430     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1431     }
1432     # other formats fail in the loop below
1433    
1434     for (0..11) {
1435     if ($m eq $month[$_]) {
1436     require Time::Local;
1437 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1438 root 1.61 }
1439     }
1440    
1441     undef
1442     }
1443    
1444 root 1.2 sub set_proxy($) {
1445 root 1.52 if (length $_[0]) {
1446 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1447 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1448     $PROXY = [$2, $3 || 3128, $1]
1449     } else {
1450     undef $PROXY;
1451     }
1452 root 1.2 }
1453    
1454     # initialise proxy from environment
1455 root 1.52 eval {
1456     set_proxy $ENV{http_proxy};
1457     };
1458 root 1.2
1459 root 1.93 =head2 SHOWCASE
1460    
1461 root 1.117 This section contains some more elaborate "real-world" examples or code
1462 root 1.93 snippets.
1463    
1464     =head2 HTTP/1.1 FILE DOWNLOAD
1465    
1466 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1467 root 1.98 goes wrong and you want to resume.
1468 root 1.93
1469     Here is a function that initiates and resumes a download. It uses the
1470     last modified time to check for file content changes, and works with many
1471     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1472     on older servers.
1473    
1474     It calls the completion callback with either C<undef>, which means a
1475 root 1.117 nonretryable error occurred, C<0> when the download was partial and should
1476 root 1.93 be retried, and C<1> if it was successful.
1477    
1478     use AnyEvent::HTTP;
1479    
1480     sub download($$$) {
1481     my ($url, $file, $cb) = @_;
1482    
1483     open my $fh, "+<", $file
1484     or die "$file: $!";
1485    
1486     my %hdr;
1487     my $ofs = 0;
1488    
1489     if (stat $fh and -s _) {
1490     $ofs = -s _;
1491 root 1.107 warn "-s is ", $ofs;
1492 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1493     $hdr{"range"} = "bytes=$ofs-";
1494     }
1495    
1496     http_get $url,
1497     headers => \%hdr,
1498     on_header => sub {
1499     my ($hdr) = @_;
1500    
1501     if ($hdr->{Status} == 200 && $ofs) {
1502     # resume failed
1503     truncate $fh, $ofs = 0;
1504     }
1505    
1506     sysseek $fh, $ofs, 0;
1507    
1508     1
1509     },
1510     on_body => sub {
1511     my ($data, $hdr) = @_;
1512    
1513     if ($hdr->{Status} =~ /^2/) {
1514     length $data == syswrite $fh, $data
1515     or return; # abort on write errors
1516     }
1517    
1518     1
1519     },
1520     sub {
1521     my (undef, $hdr) = @_;
1522    
1523     my $status = $hdr->{Status};
1524    
1525     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1526 root 1.127 utime $time, $time, $fh;
1527 root 1.93 }
1528    
1529     if ($status == 200 || $status == 206 || $status == 416) {
1530     # download ok || resume ok || file already fully downloaded
1531     $cb->(1, $hdr);
1532    
1533     } elsif ($status == 412) {
1534     # file has changed while resuming, delete and retry
1535     unlink $file;
1536     $cb->(0, $hdr);
1537    
1538     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1539     # retry later
1540     $cb->(0, $hdr);
1541    
1542     } else {
1543     $cb->(undef, $hdr);
1544     }
1545     }
1546     ;
1547     }
1548    
1549     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1550     if ($_[0]) {
1551     print "OK!\n";
1552     } elsif (defined $_[0]) {
1553     print "please retry later\n";
1554     } else {
1555     print "ERROR\n";
1556     }
1557     };
1558    
1559     =head3 SOCKS PROXIES
1560 root 1.60
1561     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1562     compile your perl to support socks, or use an external program such as
1563     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1564     transparently.
1565    
1566     Alternatively, for AnyEvent::HTTP only, you can use your own
1567     C<tcp_connect> function that does the proxy handshake - here is an example
1568     that works with socks4a proxies:
1569    
1570     use Errno;
1571     use AnyEvent::Util;
1572     use AnyEvent::Socket;
1573     use AnyEvent::Handle;
1574    
1575     # host, port and username of/for your socks4a proxy
1576     my $socks_host = "10.0.0.23";
1577     my $socks_port = 9050;
1578     my $socks_user = "";
1579    
1580     sub socks4a_connect {
1581     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1582    
1583     my $hdl = new AnyEvent::Handle
1584     connect => [$socks_host, $socks_port],
1585     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1586     on_error => sub { $connect_cb->() },
1587     ;
1588    
1589     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1590    
1591     $hdl->push_read (chunk => 8, sub {
1592     my ($hdl, $chunk) = @_;
1593     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1594    
1595     if ($status == 0x5a) {
1596     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1597     } else {
1598     $! = Errno::ENXIO; $connect_cb->();
1599     }
1600     });
1601    
1602     $hdl
1603     }
1604    
1605     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1606     possibly after switching off other proxy types:
1607    
1608     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1609    
1610     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1611     my ($data, $headers) = @_;
1612     ...
1613     };
1614    
1615 root 1.1 =head1 SEE ALSO
1616    
1617     L<AnyEvent>.
1618    
1619     =head1 AUTHOR
1620    
1621 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1622     http://home.schmorp.de/
1623 root 1.1
1624 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1625     testcases and bugreports.
1626    
1627 root 1.1 =cut
1628    
1629     1
1630