ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.125
Committed: Thu May 14 02:04:35 2015 UTC (9 years ago) by root
Branch: MAIN
CVS Tags: rel-2_22
Changes since 1.124: +1 -1 lines
Log Message:
2.22

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20 root 1.94 all on a very low level. It can follow redirects, supports proxies, and
21 root 1.11 automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41 root 1.85 use common::sense;
42 root 1.1
43 root 1.41 use Errno ();
44 root 1.1
45 root 1.51 use AnyEvent 5.0 ();
46 root 1.1 use AnyEvent::Util ();
47     use AnyEvent::Handle ();
48    
49     use base Exporter::;
50    
51 root 1.125 our $VERSION = 2.22;
52 root 1.1
53 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
54 root 1.1
55 root 1.40 our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
56 root 1.3 our $MAX_RECURSE = 10;
57 root 1.92 our $PERSISTENT_TIMEOUT = 3;
58 root 1.2 our $TIMEOUT = 300;
59 root 1.92 our $MAX_PER_HOST = 4; # changing this is evil
60 root 1.1
61 root 1.2 our $PROXY;
62 root 1.14 our $ACTIVE = 0;
63 root 1.2
64 root 1.92 my %KA_CACHE; # indexed by uhost currently, points to [$handle...] array
65 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
66 root 1.1
67     =item http_get $url, key => value..., $cb->($data, $headers)
68    
69     Executes an HTTP-GET request. See the http_request function for details on
70 root 1.29 additional parameters and the return value.
71 root 1.1
72 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
73    
74 root 1.29 Executes an HTTP-HEAD request. See the http_request function for details
75     on additional parameters and the return value.
76 root 1.5
77     =item http_post $url, $body, key => value..., $cb->($data, $headers)
78 root 1.3
79 root 1.26 Executes an HTTP-POST request with a request body of C<$body>. See the
80 root 1.29 http_request function for details on additional parameters and the return
81     value.
82 root 1.3
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.29 When called in void context, nothing is returned. In other contexts,
89     C<http_request> returns a "cancellation guard" - you have to keep the
90     object at least alive until the callback get called. If the object gets
91 root 1.58 destroyed before the callback is called, the request will be cancelled.
92 root 1.29
93 root 1.42 The callback will be called with the response body data as first argument
94 root 1.117 (or C<undef> if an error occurred), and a hash-ref with response headers
95 root 1.68 (and trailers) as second argument.
96 root 1.2
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.55 headers, the "pseudo-headers" (uppercase to avoid clashing with possible
99     response headers) C<HTTPVersion>, C<Status> and C<Reason> contain the
100 root 1.64 three parts of the HTTP Status-Line of the same name. If an error occurs
101     during the body phase of a request, then the original C<Status> and
102     C<Reason> values from the header are available as C<OrigStatus> and
103     C<OrigReason>.
104 root 1.55
105     The pseudo-header C<URL> contains the actual URL (which can differ from
106     the requested URL when following redirects - for example, you might get
107     an error that your URL scheme is not supported even though your URL is a
108     valid http URL because it redirected to an ftp URL, in which case you can
109     look at the URL pseudo header).
110    
111     The pseudo-header C<Redirect> only exists when the request was a result
112     of an internal redirect. In that case it is an array reference with
113     the C<($data, $headers)> from the redirect response. Note that this
114     response could in turn be the result of a redirect itself, and C<<
115     $headers->{Redirect}[1]{Redirect} >> will then contain the original
116     response, and so on.
117 root 1.20
118 root 1.32 If the server sends a header multiple times, then their contents will be
119     joined together with a comma (C<,>), as per the HTTP spec.
120 root 1.2
121     If an internal error occurs, such as not being able to resolve a hostname,
122 root 1.77 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be
123     C<590>-C<599> and the C<Reason> pseudo-header will contain an error
124     message. Currently the following status codes are used:
125    
126     =over 4
127    
128 root 1.114 =item 595 - errors during connection establishment, proxy handshake.
129 root 1.77
130     =item 596 - errors during TLS negotiation, request sending and header processing.
131    
132 root 1.78 =item 597 - errors during body receiving or processing.
133 root 1.77
134 root 1.78 =item 598 - user aborted request via C<on_header> or C<on_body>.
135 root 1.77
136     =item 599 - other, usually nonretryable, errors (garbled URL etc.).
137    
138     =back
139 root 1.2
140 root 1.6 A typical callback might look like this:
141    
142     sub {
143     my ($body, $hdr) = @_;
144    
145     if ($hdr->{Status} =~ /^2/) {
146     ... everything should be ok
147     } else {
148     print "error, $hdr->{Status} $hdr->{Reason}\n";
149     }
150     }
151    
152 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
153     include:
154    
155     =over 4
156    
157 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
158 root 1.1
159 root 1.112 Whether to recurse requests or not, e.g. on redirects, authentication and
160     other retries and so on, and how often to do so.
161 root 1.1
162 root 1.119 Only redirects to http and https URLs are supported. While most common
163     redirection forms are handled entirely within this module, some require
164     the use of the optional L<URI> module. If it is required but missing, then
165     the request will fail with an error.
166    
167 root 1.1 =item headers => hashref
168    
169 root 1.68 The request headers to use. Currently, C<http_request> may provide its own
170     C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers and
171 root 1.69 will provide defaults at least for C<TE:>, C<Referer:> and C<User-Agent:>
172     (this can be suppressed by using C<undef> for these headers in which case
173     they won't be sent at all).
174 root 1.1
175 root 1.90 You really should provide your own C<User-Agent:> header value that is
176     appropriate for your program - I wouldn't be surprised if the default
177     AnyEvent string gets blocked by webservers sooner or later.
178    
179 root 1.104 Also, make sure that your headers names and values do not contain any
180     embedded newlines.
181    
182 root 1.1 =item timeout => $seconds
183    
184     The time-out to use for various stages - each connect attempt will reset
185 root 1.51 the timeout, as will read or write activity, i.e. this is not an overall
186     timeout.
187    
188     Default timeout is 5 minutes.
189 root 1.2
190     =item proxy => [$host, $port[, $scheme]] or undef
191    
192 root 1.102 Use the given http proxy for all requests, or no proxy if C<undef> is
193     used.
194 root 1.2
195 root 1.92 C<$scheme> must be either missing or must be C<http> for HTTP.
196 root 1.1
197 root 1.102 If not specified, then the default proxy is used (see
198     C<AnyEvent::HTTP::set_proxy>).
199    
200 root 1.123 Currently, if your proxy requires authorization, you have to specify an
201     appropriate "Proxy-Authorization" header in every request.
202    
203 root 1.3 =item body => $string
204    
205 root 1.68 The request body, usually empty. Will be sent as-is (future versions of
206 root 1.3 this module might offer more options).
207    
208 root 1.10 =item cookie_jar => $hash_ref
209    
210     Passing this parameter enables (simplified) cookie-processing, loosely
211     based on the original netscape specification.
212    
213 root 1.80 The C<$hash_ref> must be an (initially empty) hash reference which
214     will get updated automatically. It is possible to save the cookie jar
215     to persistent storage with something like JSON or Storable - see the
216     C<AnyEvent::HTTP::cookie_jar_expire> function if you wish to remove
217     expired or session-only cookies, and also for documentation on the format
218     of the cookie jar.
219 root 1.10
220 root 1.70 Note that this cookie implementation is not meant to be complete. If
221     you want complete cookie management you have to do that on your
222 root 1.80 own. C<cookie_jar> is meant as a quick fix to get most cookie-using sites
223 root 1.70 working. Cookies are a privacy disaster, do not use them unless required
224     to.
225 root 1.10
226 root 1.69 When cookie processing is enabled, the C<Cookie:> and C<Set-Cookie:>
227 root 1.70 headers will be set and handled by this module, otherwise they will be
228 root 1.69 left untouched.
229    
230 root 1.40 =item tls_ctx => $scheme | $tls_ctx
231    
232     Specifies the AnyEvent::TLS context to be used for https connections. This
233     parameter follows the same rules as the C<tls_ctx> parameter to
234     L<AnyEvent::Handle>, but additionally, the two strings C<low> or
235     C<high> can be specified, which give you a predefined low-security (no
236     verification, highest compatibility) and high-security (CA and common-name
237     verification) TLS context.
238    
239     The default for this option is C<low>, which could be interpreted as "give
240     me the page, no matter what".
241    
242 root 1.92 See also the C<sessionid> parameter.
243    
244     =item session => $string
245    
246     The module might reuse connections to the same host internally. Sometimes
247     (e.g. when using TLS), you do not want to reuse connections from other
248     sessions. This can be achieved by setting this parameter to some unique
249     ID (such as the address of an object storing your state data, or the TLS
250     context) - only connections using the same unique ID will be reused.
251    
252 root 1.51 =item on_prepare => $callback->($fh)
253    
254     In rare cases you need to "tune" the socket before it is used to
255 root 1.117 connect (for example, to bind it on a given IP address). This parameter
256 root 1.51 overrides the prepare callback passed to C<AnyEvent::Socket::tcp_connect>
257     and behaves exactly the same way (e.g. it has to provide a
258     timeout). See the description for the C<$prepare_cb> argument of
259     C<AnyEvent::Socket::tcp_connect> for details.
260    
261 root 1.59 =item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb)
262    
263     In even rarer cases you want total control over how AnyEvent::HTTP
264     establishes connections. Normally it uses L<AnyEvent::Socket::tcp_connect>
265     to do this, but you can provide your own C<tcp_connect> function -
266 root 1.60 obviously, it has to follow the same calling conventions, except that it
267     may always return a connection guard object.
268 root 1.59
269     There are probably lots of weird uses for this function, starting from
270     tracing the hosts C<http_request> actually tries to connect, to (inexact
271     but fast) host => IP address caching or even socks protocol support.
272    
273 root 1.42 =item on_header => $callback->($headers)
274 root 1.41
275     When specified, this callback will be called with the header hash as soon
276     as headers have been successfully received from the remote server (not on
277     locally-generated errors).
278    
279     It has to return either true (in which case AnyEvent::HTTP will continue),
280     or false, in which case AnyEvent::HTTP will cancel the download (and call
281     the finish callback with an error code of C<598>).
282    
283     This callback is useful, among other things, to quickly reject unwanted
284     content, which, if it is supposed to be rare, can be faster than first
285     doing a C<HEAD> request.
286    
287 root 1.68 The downside is that cancelling the request makes it impossible to re-use
288     the connection. Also, the C<on_header> callback will not receive any
289     trailer (headers sent after the response body).
290    
291 root 1.42 Example: cancel the request unless the content-type is "text/html".
292 root 1.41
293 root 1.42 on_header => sub {
294     $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/
295     },
296 root 1.41
297 root 1.42 =item on_body => $callback->($partial_body, $headers)
298 root 1.41
299 root 1.42 When specified, all body data will be passed to this callback instead of
300     to the completion callback. The completion callback will get the empty
301     string instead of the body data.
302 root 1.41
303 root 1.42 It has to return either true (in which case AnyEvent::HTTP will continue),
304     or false, in which case AnyEvent::HTTP will cancel the download (and call
305     the completion callback with an error code of C<598>).
306    
307 root 1.68 The downside to cancelling the request is that it makes it impossible to
308     re-use the connection.
309    
310 root 1.42 This callback is useful when the data is too large to be held in memory
311     (so the callback writes it to a file) or when only some information should
312     be extracted, or when the body should be processed incrementally.
313 root 1.41
314     It is usually preferred over doing your own body handling via
315 root 1.45 C<want_body_handle>, but in case of streaming APIs, where HTTP is
316     only used to create a connection, C<want_body_handle> is the better
317     alternative, as it allows you to install your own event handler, reducing
318     resource usage.
319 root 1.41
320     =item want_body_handle => $enable
321    
322     When enabled (default is disabled), the behaviour of AnyEvent::HTTP
323     changes considerably: after parsing the headers, and instead of
324     downloading the body (if any), the completion callback will be
325     called. Instead of the C<$body> argument containing the body data, the
326     callback will receive the L<AnyEvent::Handle> object associated with the
327     connection. In error cases, C<undef> will be passed. When there is no body
328     (e.g. status C<304>), the empty string will be passed.
329    
330 root 1.92 The handle object might or might not be in TLS mode, might be connected
331     to a proxy, be a persistent connection, use chunked transfer encoding
332     etc., and configured in unspecified ways. The user is responsible for this
333     handle (it will not be used by this module anymore).
334 root 1.41
335     This is useful with some push-type services, where, after the initial
336     headers, an interactive protocol is used (typical example would be the
337     push-style twitter API which starts a JSON/XML stream).
338    
339     If you think you need this, first have a look at C<on_body>, to see if
340 root 1.45 that doesn't solve your problem in a better way.
341 root 1.41
342 root 1.92 =item persistent => $boolean
343    
344     Try to create/reuse a persistent connection. When this flag is set
345     (default: true for idempotent requests, false for all others), then
346     C<http_request> tries to re-use an existing (previously-created)
347     persistent connection to the host and, failing that, tries to create a new
348     one.
349    
350     Requests failing in certain ways will be automatically retried once, which
351     is dangerous for non-idempotent requests, which is why it defaults to off
352     for them. The reason for this is because the bozos who designed HTTP/1.1
353     made it impossible to distinguish between a fatal error and a normal
354     connection timeout, so you never know whether there was a problem with
355     your request or not.
356    
357     When reusing an existent connection, many parameters (such as TLS context)
358     will be ignored. See the C<session> parameter for a workaround.
359    
360     =item keepalive => $boolean
361    
362     Only used when C<persistent> is also true. This parameter decides whether
363     C<http_request> tries to handshake a HTTP/1.0-style keep-alive connection
364     (as opposed to only a HTTP/1.1 persistent connection).
365    
366     The default is true, except when using a proxy, in which case it defaults
367     to false, as HTTP/1.0 proxies cannot support this in a meaningful way.
368    
369     =item handle_params => { key => value ... }
370    
371     The key-value pairs in this hash will be passed to any L<AnyEvent::Handle>
372     constructor that is called - not all requests will create a handle, and
373     sometimes more than one is created, so this parameter is only good for
374     setting hints.
375    
376     Example: set the maximum read size to 4096, to potentially conserve memory
377     at the cost of speed.
378    
379     handle_params => {
380     max_read_size => 4096,
381     },
382    
383 root 1.1 =back
384    
385 root 1.68 Example: do a simple HTTP GET request for http://www.nethype.de/ and print
386     the response body.
387 root 1.9
388     http_request GET => "http://www.nethype.de/", sub {
389     my ($body, $hdr) = @_;
390     print "$body\n";
391     };
392    
393 root 1.68 Example: do a HTTP HEAD request on https://www.google.com/, use a
394 root 1.9 timeout of 30 seconds.
395    
396     http_request
397 root 1.109 HEAD => "https://www.google.com",
398 root 1.90 headers => { "user-agent" => "MySearchClient 1.0" },
399 root 1.9 timeout => 30,
400     sub {
401     my ($body, $hdr) = @_;
402     use Data::Dumper;
403     print Dumper $hdr;
404     }
405     ;
406 root 1.1
407 root 1.68 Example: do another simple HTTP GET request, but immediately try to
408 root 1.29 cancel it.
409    
410     my $request = http_request GET => "http://www.nethype.de/", sub {
411     my ($body, $hdr) = @_;
412     print "$body\n";
413     };
414    
415     undef $request;
416    
417 root 1.1 =cut
418    
419 root 1.92 #############################################################################
420     # wait queue/slots
421    
422 root 1.12 sub _slot_schedule;
423 root 1.11 sub _slot_schedule($) {
424     my $host = shift;
425    
426     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
427     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
428 root 1.12 # somebody wants that slot
429 root 1.11 ++$CO_SLOT{$host}[0];
430 root 1.14 ++$ACTIVE;
431 root 1.11
432     $cb->(AnyEvent::Util::guard {
433 root 1.14 --$ACTIVE;
434 root 1.11 --$CO_SLOT{$host}[0];
435     _slot_schedule $host;
436     });
437     } else {
438     # nobody wants the slot, maybe we can forget about it
439     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
440     last;
441     }
442     }
443     }
444    
445     # wait for a free slot on host, call callback
446     sub _get_slot($$) {
447     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
448    
449     _slot_schedule $_[0];
450     }
451    
452 root 1.80 #############################################################################
453 root 1.92 # cookie handling
454 root 1.80
455     # expire cookies
456     sub cookie_jar_expire($;$) {
457     my ($jar, $session_end) = @_;
458    
459     %$jar = () if $jar->{version} != 1;
460    
461     my $anow = AE::now;
462    
463     while (my ($chost, $paths) = each %$jar) {
464     next unless ref $paths;
465    
466     while (my ($cpath, $cookies) = each %$paths) {
467     while (my ($cookie, $kv) = each %$cookies) {
468     if (exists $kv->{_expires}) {
469     delete $cookies->{$cookie}
470     if $anow > $kv->{_expires};
471     } elsif ($session_end) {
472     delete $cookies->{$cookie};
473     }
474     }
475    
476     delete $paths->{$cpath}
477     unless %$cookies;
478     }
479    
480     delete $jar->{$chost}
481     unless %$paths;
482     }
483     }
484    
485 root 1.72 # extract cookies from jar
486 root 1.71 sub cookie_jar_extract($$$$) {
487 root 1.92 my ($jar, $scheme, $host, $path) = @_;
488 root 1.71
489     %$jar = () if $jar->{version} != 1;
490    
491     my @cookies;
492    
493     while (my ($chost, $paths) = each %$jar) {
494     next unless ref $paths;
495    
496     if ($chost =~ /^\./) {
497 root 1.92 next unless $chost eq substr $host, -length $chost;
498 root 1.71 } elsif ($chost =~ /\./) {
499 root 1.92 next unless $chost eq $host;
500 root 1.71 } else {
501     next;
502     }
503    
504     while (my ($cpath, $cookies) = each %$paths) {
505 root 1.92 next unless $cpath eq substr $path, 0, length $cpath;
506 root 1.71
507     while (my ($cookie, $kv) = each %$cookies) {
508 root 1.92 next if $scheme ne "https" && exists $kv->{secure};
509 root 1.71
510 root 1.80 if (exists $kv->{_expires} and AE::now > $kv->{_expires}) {
511     delete $cookies->{$cookie};
512     next;
513 root 1.71 }
514    
515     my $value = $kv->{value};
516    
517     if ($value =~ /[=;,[:space:]]/) {
518     $value =~ s/([\\"])/\\$1/g;
519     $value = "\"$value\"";
520     }
521    
522     push @cookies, "$cookie=$value";
523     }
524     }
525     }
526    
527     \@cookies
528     }
529    
530 root 1.72 # parse set_cookie header into jar
531 root 1.80 sub cookie_jar_set_cookie($$$$) {
532 root 1.92 my ($jar, $set_cookie, $host, $date) = @_;
533 root 1.80
534     my $anow = int AE::now;
535     my $snow; # server-now
536 root 1.72
537     for ($set_cookie) {
538     # parse NAME=VALUE
539     my @kv;
540    
541 root 1.79 # expires is not http-compliant in the original cookie-spec,
542     # we support the official date format and some extensions
543 root 1.72 while (
544     m{
545     \G\s*
546     (?:
547 root 1.79 expires \s*=\s* ([A-Z][a-z][a-z]+,\ [^,;]+)
548 root 1.101 | ([^=;,[:space:]]+) (?: \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^;,[:space:]]*) ) )?
549 root 1.72 )
550     }gcxsi
551     ) {
552     my $name = $2;
553     my $value = $4;
554    
555 root 1.82 if (defined $1) {
556 root 1.72 # expires
557     $name = "expires";
558     $value = $1;
559 root 1.82 } elsif (defined $3) {
560 root 1.72 # quoted
561     $value = $3;
562     $value =~ s/\\(.)/$1/gs;
563     }
564    
565 root 1.95 push @kv, @kv ? lc $name : $name, $value;
566 root 1.72
567     last unless /\G\s*;/gc;
568     }
569    
570     last unless @kv;
571    
572     my $name = shift @kv;
573     my %kv = (value => shift @kv, @kv);
574    
575 root 1.80 if (exists $kv{"max-age"}) {
576     $kv{_expires} = $anow + delete $kv{"max-age"};
577     } elsif (exists $kv{expires}) {
578     $snow ||= parse_date ($date) || $anow;
579     $kv{_expires} = $anow + (parse_date (delete $kv{expires}) - $snow);
580     } else {
581     delete $kv{_expires};
582     }
583 root 1.72
584     my $cdom;
585     my $cpath = (delete $kv{path}) || "/";
586    
587     if (exists $kv{domain}) {
588     $cdom = delete $kv{domain};
589    
590     $cdom =~ s/^\.?/./; # make sure it starts with a "."
591    
592     next if $cdom =~ /\.$/;
593    
594     # this is not rfc-like and not netscape-like. go figure.
595     my $ndots = $cdom =~ y/.//;
596     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
597     } else {
598 root 1.92 $cdom = $host;
599 root 1.72 }
600    
601     # store it
602 root 1.73 $jar->{version} = 1;
603 root 1.83 $jar->{lc $cdom}{$cpath}{$name} = \%kv;
604 root 1.72
605     redo if /\G\s*,/gc;
606     }
607     }
608    
609 root 1.92 #############################################################################
610     # keepalive/persistent connection cache
611    
612     # fetch a connection from the keepalive cache
613     sub ka_fetch($) {
614     my $ka_key = shift;
615    
616     my $hdl = pop @{ $KA_CACHE{$ka_key} }; # currently we reuse the MOST RECENTLY USED connection
617     delete $KA_CACHE{$ka_key}
618     unless @{ $KA_CACHE{$ka_key} };
619    
620     $hdl
621     }
622    
623     sub ka_store($$) {
624     my ($ka_key, $hdl) = @_;
625    
626     my $kaa = $KA_CACHE{$ka_key} ||= [];
627    
628     my $destroy = sub {
629     my @ka = grep $_ != $hdl, @{ $KA_CACHE{$ka_key} };
630    
631     $hdl->destroy;
632    
633     @ka
634     ? $KA_CACHE{$ka_key} = \@ka
635     : delete $KA_CACHE{$ka_key};
636     };
637    
638     # on error etc., destroy
639     $hdl->on_error ($destroy);
640     $hdl->on_eof ($destroy);
641     $hdl->on_read ($destroy);
642     $hdl->timeout ($PERSISTENT_TIMEOUT);
643    
644     push @$kaa, $hdl;
645     shift @$kaa while @$kaa > $MAX_PER_HOST;
646     }
647    
648     #############################################################################
649     # utilities
650    
651 root 1.66 # continue to parse $_ for headers and place them into the arg
652 root 1.92 sub _parse_hdr() {
653 root 1.66 my %hdr;
654    
655     # things seen, not parsed:
656     # p3pP="NON CUR OTPi OUR NOR UNI"
657    
658     $hdr{lc $1} .= ",$2"
659     while /\G
660     ([^:\000-\037]*):
661     [\011\040]*
662     ((?: [^\012]+ | \012[\011\040] )*)
663     \012
664     /gxc;
665    
666     /\G$/
667     or return;
668    
669     # remove the "," prefix we added to all headers above
670     substr $_, 0, 1, ""
671     for values %hdr;
672    
673     \%hdr
674     }
675    
676 root 1.92 #############################################################################
677     # http_get
678    
679 root 1.46 our $qr_nlnl = qr{(?<![^\012])\015?\012};
680 root 1.34
681 root 1.41 our $TLS_CTX_LOW = { cache => 1, sslv2 => 1 };
682     our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_peername => "https" };
683 root 1.40
684 root 1.92 # maybe it should just become a normal object :/
685    
686     sub _destroy_state(\%) {
687     my ($state) = @_;
688    
689     $state->{handle}->destroy if $state->{handle};
690     %$state = ();
691     }
692    
693     sub _error(\%$$) {
694     my ($state, $cb, $hdr) = @_;
695    
696     &_destroy_state ($state);
697    
698     $cb->(undef, $hdr);
699     ()
700     }
701    
702 root 1.116 our %IDEMPOTENT = (
703     DELETE => 1,
704     GET => 1,
705     HEAD => 1,
706     OPTIONS => 1,
707     PUT => 1,
708     TRACE => 1,
709    
710     ACL => 1,
711     "BASELINE-CONTROL" => 1,
712     BIND => 1,
713     CHECKIN => 1,
714     CHECKOUT => 1,
715     COPY => 1,
716     LABEL => 1,
717     LINK => 1,
718     MERGE => 1,
719     MKACTIVITY => 1,
720     MKCALENDAR => 1,
721     MKCOL => 1,
722     MKREDIRECTREF => 1,
723     MKWORKSPACE => 1,
724     MOVE => 1,
725     ORDERPATCH => 1,
726     PROPFIND => 1,
727     PROPPATCH => 1,
728     REBIND => 1,
729     REPORT => 1,
730     SEARCH => 1,
731     UNBIND => 1,
732     UNCHECKOUT => 1,
733     UNLINK => 1,
734     UNLOCK => 1,
735     UPDATE => 1,
736     UPDATEREDIRECTREF => 1,
737     "VERSION-CONTROL" => 1,
738     );
739    
740 elmex 1.15 sub http_request($$@) {
741 root 1.1 my $cb = pop;
742     my ($method, $url, %arg) = @_;
743    
744     my %hdr;
745    
746 root 1.40 $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx};
747     $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high";
748    
749 root 1.3 $method = uc $method;
750    
751 root 1.8 if (my $hdr = $arg{headers}) {
752 root 1.1 while (my ($k, $v) = each %$hdr) {
753     $hdr{lc $k} = $v;
754     }
755     }
756    
757 root 1.55 # pseudo headers for all subsequent responses
758     my @pseudo = (URL => $url);
759     push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect};
760    
761 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
762 root 1.8
763 root 1.64 return $cb->(undef, { @pseudo, Status => 599, Reason => "Too many redirections" })
764 root 1.8 if $recurse < 0;
765    
766 root 1.102 my $proxy = exists $arg{proxy} ? $arg{proxy} : $PROXY;
767 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
768    
769 root 1.92 my ($uscheme, $uauthority, $upath, $query, undef) = # ignore fragment
770 root 1.100 $url =~ m|^([^:]+):(?://([^/?#]*))?([^?#]*)(?:(\?[^#]*))?(?:#(.*))?$|;
771 root 1.2
772 root 1.31 $uscheme = lc $uscheme;
773 root 1.1
774 root 1.31 my $uport = $uscheme eq "http" ? 80
775     : $uscheme eq "https" ? 443
776 root 1.64 : return $cb->(undef, { @pseudo, Status => 599, Reason => "Only http and https URL schemes supported" });
777 root 1.13
778 root 1.124 $uauthority =~ /^(?: .*\@ )? ([^\@]+?) (?: : (\d+) )?$/x
779 root 1.64 or return $cb->(undef, { @pseudo, Status => 599, Reason => "Unparsable URL" });
780 root 1.10
781 root 1.86 my $uhost = lc $1;
782 root 1.10 $uport = $2 if defined $2;
783    
784 root 1.53 $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"
785     unless exists $hdr{host};
786 root 1.43
787 root 1.10 $uhost =~ s/^\[(.*)\]$/$1/;
788 root 1.56 $upath .= $query if length $query;
789 root 1.10
790     $upath =~ s%^/?%/%;
791    
792     # cookie processing
793     if (my $jar = $arg{cookie_jar}) {
794 root 1.71 my $cookies = cookie_jar_extract $jar, $uscheme, $uhost, $upath;
795 root 1.70
796 root 1.71 $hdr{cookie} = join "; ", @$cookies
797     if @$cookies;
798 root 1.10 }
799 root 1.1
800 root 1.31 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
801 root 1.2
802 root 1.10 if ($proxy) {
803 root 1.38 ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy);
804 root 1.31
805 root 1.47 $rscheme = "http" unless defined $rscheme;
806    
807 root 1.31 # don't support https requests over https-proxy transport,
808 root 1.38 # can't be done with tls as spec'ed, unless you double-encrypt.
809 root 1.31 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
810 root 1.86
811     $rhost = lc $rhost;
812     $rscheme = lc $rscheme;
813 root 1.10 } else {
814 root 1.31 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
815 root 1.2 }
816    
817 root 1.47 # leave out fragment and query string, just a heuristic
818 root 1.66 $hdr{referer} = "$uscheme://$uauthority$upath" unless exists $hdr{referer};
819     $hdr{"user-agent"} = $USERAGENT unless exists $hdr{"user-agent"};
820 root 1.41
821 root 1.53 $hdr{"content-length"} = length $arg{body}
822     if length $arg{body} || $method ne "GET";
823 root 1.1
824 root 1.116 my $idempotent = $IDEMPOTENT{$method};
825 root 1.89
826     # default value for keepalive is true iff the request is for an idempotent method
827 root 1.103 my $persistent = exists $arg{persistent} ? !!$arg{persistent} : $idempotent;
828     my $keepalive = exists $arg{keepalive} ? !!$arg{keepalive} : !$proxy;
829     my $was_persistent; # true if this is actually a recycled connection
830 root 1.92
831     # the key to use in the keepalive cache
832 root 1.105 my $ka_key = "$uscheme\x00$uhost\x00$uport\x00$arg{sessionid}";
833 root 1.89
834 root 1.118 $hdr{connection} = ($persistent ? $keepalive ? "keep-alive, " : "" : "close, ") . "Te"; #1.1
835 root 1.68 $hdr{te} = "trailers" unless exists $hdr{te}; #1.1
836 root 1.66
837 root 1.11 my %state = (connect_guard => 1);
838    
839 root 1.88 my $ae_error = 595; # connecting
840 root 1.1
841 root 1.88 # handle actual, non-tunneled, request
842     my $handle_actual_request = sub {
843     $ae_error = 596; # request phase
844    
845 root 1.92 my $hdl = $state{handle};
846    
847     $hdl->starttls ("connect") if $uscheme eq "https" && !exists $hdl->{tls};
848 root 1.88
849     # send request
850 root 1.92 $hdl->push_write (
851 root 1.88 "$method $rpath HTTP/1.1\015\012"
852     . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr)
853     . "\015\012"
854 root 1.120 . $arg{body}
855 root 1.88 );
856    
857 root 1.117 # return if error occurred during push_write()
858 root 1.88 return unless %state;
859    
860 root 1.89 # reduce memory usage, save a kitten, also re-use it for the response headers.
861     %hdr = ();
862 root 1.88
863     # status line and headers
864     $state{read_response} = sub {
865 root 1.92 return unless %state;
866    
867 root 1.88 for ("$_[1]") {
868     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
869    
870     /^HTTP\/0*([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\012]*) )? \012/gxci
871 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid server response" };
872 root 1.88
873     # 100 Continue handling
874     # should not happen as we don't send expect: 100-continue,
875     # but we handle it just in case.
876     # since we send the request body regardless, if we get an error
877     # we are out of-sync, which we currently do NOT handle correctly.
878     return $state{handle}->push_read (line => $qr_nlnl, $state{read_response})
879     if $2 eq 100;
880    
881     push @pseudo,
882     HTTPVersion => $1,
883     Status => $2,
884     Reason => $3,
885     ;
886 root 1.1
887 root 1.92 my $hdr = _parse_hdr
888     or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Garbled response headers" };
889 root 1.77
890 root 1.88 %hdr = (%$hdr, @pseudo);
891     }
892 root 1.82
893 root 1.88 # redirect handling
894 root 1.119 # relative uri handling forced by microsoft and other shitheads.
895     # we give our best and fall back to URI if available.
896     if (exists $hdr{location}) {
897     my $loc = $hdr{location};
898    
899     if ($loc =~ m%^//%) { # //
900     $loc = "$rscheme:$loc";
901    
902     } elsif ($loc eq "") {
903     $loc = $url;
904    
905     } elsif ($loc !~ /^(?: $ | [^:\/?\#]+ : )/x) { # anything "simple"
906     $loc =~ s/^\.\/+//;
907    
908     if ($loc !~ m%^[.?#]%) {
909     my $prefix = "$rscheme://$uhost:$uport";
910    
911     unless ($loc =~ s/^\///) {
912     $prefix .= $upath;
913     $prefix =~ s/\/[^\/]*$//;
914     }
915    
916     $loc = "$prefix/$loc";
917    
918     } elsif (eval { require URI }) { # uri
919     $loc = URI->new_abs ($loc, $url)->as_string;
920    
921     } else {
922     return _error %state, $cb, { @pseudo, Status => 599, Reason => "Cannot parse Location (URI module missing)" };
923     #$hdr{Status} = 599;
924     #$hdr{Reason} = "Unparsable Redirect (URI module missing)";
925     #$recurse = 0;
926     }
927 root 1.82 }
928    
929 root 1.119 $hdr{location} = $loc;
930 root 1.88 }
931 root 1.82
932 root 1.88 my $redirect;
933 root 1.82
934 root 1.88 if ($recurse) {
935     my $status = $hdr{Status};
936 root 1.82
937 root 1.88 # industry standard is to redirect POST as GET for
938     # 301, 302 and 303, in contrast to HTTP/1.0 and 1.1.
939     # also, the UA should ask the user for 301 and 307 and POST,
940     # industry standard seems to be to simply follow.
941 root 1.119 # we go with the industry standard. 308 is defined
942 root 1.121 # by rfc7538
943 root 1.88 if ($status == 301 or $status == 302 or $status == 303) {
944 root 1.122 $redirect = 1;
945 root 1.88 # HTTP/1.1 is unclear on how to mutate the method
946 root 1.122 unless ($method eq "HEAD") {
947     $method = "GET";
948     delete $arg{body};
949     }
950 root 1.119 } elsif ($status == 307 or $status == 308) {
951 root 1.88 $redirect = 1;
952 root 1.82 }
953 root 1.88 }
954 root 1.82
955 root 1.103 my $finish = sub { # ($data, $err_status, $err_reason[, $persistent])
956 root 1.92 if ($state{handle}) {
957     # handle keepalive
958     if (
959 root 1.103 $persistent
960 root 1.92 && $_[3]
961     && ($hdr{HTTPVersion} < 1.1
962     ? $hdr{connection} =~ /\bkeep-?alive\b/i
963     : $hdr{connection} !~ /\bclose\b/i)
964     ) {
965     ka_store $ka_key, delete $state{handle};
966     } else {
967     # no keepalive, destroy the handle
968     $state{handle}->destroy;
969     }
970     }
971 root 1.82
972 root 1.88 %state = ();
973 root 1.82
974 root 1.88 if (defined $_[1]) {
975     $hdr{OrigStatus} = $hdr{Status}; $hdr{Status} = $_[1];
976     $hdr{OrigReason} = $hdr{Reason}; $hdr{Reason} = $_[2];
977     }
978 root 1.82
979 root 1.88 # set-cookie processing
980     if ($arg{cookie_jar}) {
981     cookie_jar_set_cookie $arg{cookie_jar}, $hdr{"set-cookie"}, $uhost, $hdr{date};
982     }
983 root 1.82
984 root 1.88 if ($redirect && exists $hdr{location}) {
985     # we ignore any errors, as it is very common to receive
986     # Content-Length != 0 but no actual body
987     # we also access %hdr, as $_[1] might be an erro
988 root 1.103 $state{recurse} =
989     http_request (
990     $method => $hdr{location},
991     %arg,
992     recurse => $recurse - 1,
993     Redirect => [$_[0], \%hdr],
994     sub {
995     %state = ();
996     &$cb
997     },
998     );
999 root 1.88 } else {
1000     $cb->($_[0], \%hdr);
1001     }
1002     };
1003 root 1.82
1004 root 1.88 $ae_error = 597; # body phase
1005 root 1.82
1006 root 1.91 my $chunked = $hdr{"transfer-encoding"} =~ /\bchunked\b/i; # not quite correct...
1007    
1008     my $len = $chunked ? undef : $hdr{"content-length"};
1009 root 1.82
1010 root 1.88 # body handling, many different code paths
1011     # - no body expected
1012     # - want_body_handle
1013     # - te chunked
1014     # - 2x length known (with or without on_body)
1015     # - 2x length not known (with or without on_body)
1016     if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) {
1017     $finish->(undef, 598 => "Request cancelled by on_header");
1018     } elsif (
1019     $hdr{Status} =~ /^(?:1..|204|205|304)$/
1020     or $method eq "HEAD"
1021     or (defined $len && $len == 0) # == 0, not !, because "0 " is true
1022     ) {
1023     # no body
1024     $finish->("", undef, undef, 1);
1025    
1026     } elsif (!$redirect && $arg{want_body_handle}) {
1027     $_[0]->on_eof (undef);
1028     $_[0]->on_error (undef);
1029     $_[0]->on_read (undef);
1030    
1031     $finish->(delete $state{handle});
1032    
1033 root 1.91 } elsif ($chunked) {
1034 root 1.88 my $cl = 0;
1035 root 1.92 my $body = "";
1036 root 1.88 my $on_body = $arg{on_body} || sub { $body .= shift; 1 };
1037    
1038     $state{read_chunk} = sub {
1039     $_[1] =~ /^([0-9a-fA-F]+)/
1040 root 1.108 or return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1041 root 1.82
1042 root 1.88 my $len = hex $1;
1043 root 1.82
1044 root 1.88 if ($len) {
1045     $cl += $len;
1046 root 1.82
1047 root 1.88 $_[0]->push_read (chunk => $len, sub {
1048     $on_body->($_[1], \%hdr)
1049     or return $finish->(undef, 598 => "Request cancelled by on_body");
1050 root 1.82
1051 root 1.88 $_[0]->push_read (line => sub {
1052     length $_[1]
1053     and return $finish->(undef, $ae_error => "Garbled chunked transfer encoding");
1054     $_[0]->push_read (line => $state{read_chunk});
1055 root 1.82 });
1056 root 1.88 });
1057     } else {
1058     $hdr{"content-length"} ||= $cl;
1059 root 1.84
1060 root 1.88 $_[0]->push_read (line => $qr_nlnl, sub {
1061     if (length $_[1]) {
1062     for ("$_[1]") {
1063     y/\015//d; # weed out any \015, as they show up in the weirdest of places.
1064 root 1.84
1065 root 1.92 my $hdr = _parse_hdr
1066 root 1.88 or return $finish->(undef, $ae_error => "Garbled response trailers");
1067 root 1.84
1068 root 1.88 %hdr = (%hdr, %$hdr);
1069     }
1070     }
1071 root 1.84
1072 root 1.88 $finish->($body, undef, undef, 1);
1073 root 1.84 });
1074     }
1075 root 1.88 };
1076    
1077     $_[0]->push_read (line => $state{read_chunk});
1078    
1079     } elsif ($arg{on_body}) {
1080     if (defined $len) {
1081     $_[0]->on_read (sub {
1082     $len -= length $_[0]{rbuf};
1083    
1084     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1085     or return $finish->(undef, 598 => "Request cancelled by on_body");
1086    
1087     $len > 0
1088     or $finish->("", undef, undef, 1);
1089     });
1090 root 1.84 } else {
1091 root 1.88 $_[0]->on_eof (sub {
1092     $finish->("");
1093     });
1094     $_[0]->on_read (sub {
1095     $arg{on_body}(delete $_[0]{rbuf}, \%hdr)
1096     or $finish->(undef, 598 => "Request cancelled by on_body");
1097     });
1098     }
1099     } else {
1100     $_[0]->on_eof (undef);
1101 root 1.82
1102 root 1.88 if (defined $len) {
1103     $_[0]->on_read (sub {
1104     $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), undef, undef, 1)
1105     if $len <= length $_[0]{rbuf};
1106     });
1107     } else {
1108     $_[0]->on_error (sub {
1109     ($! == Errno::EPIPE || !$!)
1110     ? $finish->(delete $_[0]{rbuf})
1111     : $finish->(undef, $ae_error => $_[2]);
1112     });
1113     $_[0]->on_read (sub { });
1114 root 1.82 }
1115 root 1.88 }
1116     };
1117 root 1.82
1118 root 1.92 # if keepalive is enabled, then the server closing the connection
1119     # before a response can happen legally - we retry on idempotent methods.
1120 root 1.103 if ($was_persistent && $idempotent) {
1121 root 1.92 my $old_eof = $hdl->{on_eof};
1122     $hdl->{on_eof} = sub {
1123     _destroy_state %state;
1124    
1125 root 1.103 %state = ();
1126     $state{recurse} =
1127     http_request (
1128 root 1.124 $method => $url,
1129 root 1.103 %arg,
1130 root 1.124 recurse => $recurse - 1,
1131     persistent => 0,
1132 root 1.103 sub {
1133     %state = ();
1134     &$cb
1135     }
1136     );
1137 root 1.92 };
1138     $hdl->on_read (sub {
1139     return unless %state;
1140    
1141     # as soon as we receive something, a connection close
1142     # once more becomes a hard error
1143     $hdl->{on_eof} = $old_eof;
1144     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1145     });
1146     } else {
1147     $hdl->push_read (line => $qr_nlnl, $state{read_response});
1148     }
1149     };
1150    
1151     my $prepare_handle = sub {
1152     my ($hdl) = $state{handle};
1153    
1154     $hdl->on_error (sub {
1155     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => $_[2] };
1156     });
1157     $hdl->on_eof (sub {
1158     _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "Unexpected end-of-file" };
1159     });
1160 root 1.103 $hdl->timeout_reset;
1161     $hdl->timeout ($timeout);
1162 root 1.88 };
1163 root 1.82
1164 root 1.92 # connected to proxy (or origin server)
1165 root 1.88 my $connect_cb = sub {
1166 root 1.92 my $fh = shift
1167     or return _error %state, $cb, { @pseudo, Status => $ae_error, Reason => "$!" };
1168 root 1.44
1169 root 1.88 return unless delete $state{connect_guard};
1170 root 1.11
1171 root 1.88 # get handle
1172     $state{handle} = new AnyEvent::Handle
1173 root 1.92 %{ $arg{handle_params} },
1174     fh => $fh,
1175     peername => $uhost,
1176 root 1.88 tls_ctx => $arg{tls_ctx},
1177     ;
1178 root 1.11
1179 root 1.92 $prepare_handle->();
1180 root 1.1
1181 root 1.92 #$state{handle}->starttls ("connect") if $rscheme eq "https";
1182 root 1.88
1183     # now handle proxy-CONNECT method
1184     if ($proxy && $uscheme eq "https") {
1185     # oh dear, we have to wrap it into a connect request
1186    
1187 root 1.122 my $auth = exists $hdr{"proxy-authorization"}
1188     ? "proxy-authorization: " . (delete $hdr{"proxy-authorization"}) . "\015\012"
1189     : "";
1190    
1191 root 1.88 # maybe re-use $uauthority with patched port?
1192 root 1.122 $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012$auth\015\012");
1193 root 1.88 $state{handle}->push_read (line => $qr_nlnl, sub {
1194     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
1195 root 1.92 or return _error %state, $cb, { @pseudo, Status => 599, Reason => "Invalid proxy connect response ($_[1])" };
1196 root 1.88
1197     if ($2 == 200) {
1198     $rpath = $upath;
1199     $handle_actual_request->();
1200     } else {
1201 root 1.92 _error %state, $cb, { @pseudo, Status => $2, Reason => $3 };
1202 root 1.88 }
1203     });
1204     } else {
1205 root 1.122 delete $hdr{"proxy-authorization"} unless $proxy;
1206    
1207 root 1.88 $handle_actual_request->();
1208     }
1209     };
1210    
1211     _get_slot $uhost, sub {
1212     $state{slot_guard} = shift;
1213 root 1.64
1214 root 1.88 return unless $state{connect_guard};
1215 root 1.64
1216 root 1.92 # try to use an existing keepalive connection, but only if we, ourselves, plan
1217     # on a keepalive request (in theory, this should be a separate config option).
1218 root 1.103 if ($persistent && $KA_CACHE{$ka_key}) {
1219     $was_persistent = 1;
1220    
1221 root 1.92 $state{handle} = ka_fetch $ka_key;
1222 root 1.103 $state{handle}->destroyed
1223 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (1), please report.";#d#
1224 root 1.92 $prepare_handle->();
1225 root 1.103 $state{handle}->destroyed
1226 root 1.107 and die "AnyEvent::HTTP: unexpectedly got a destructed handle (2), please report.";#d#
1227 root 1.92 $handle_actual_request->();
1228    
1229     } else {
1230     my $tcp_connect = $arg{tcp_connect}
1231     || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect };
1232 root 1.57
1233 root 1.92 $state{connect_guard} = $tcp_connect->($rhost, $rport, $connect_cb, $arg{on_prepare} || sub { $timeout });
1234     }
1235 root 1.1 };
1236    
1237 root 1.92 defined wantarray && AnyEvent::Util::guard { _destroy_state %state }
1238 root 1.1 }
1239    
1240 elmex 1.15 sub http_get($@) {
1241 root 1.1 unshift @_, "GET";
1242     &http_request
1243     }
1244    
1245 elmex 1.15 sub http_head($@) {
1246 root 1.4 unshift @_, "HEAD";
1247     &http_request
1248     }
1249    
1250 elmex 1.15 sub http_post($$@) {
1251 root 1.22 my $url = shift;
1252     unshift @_, "POST", $url, "body";
1253 root 1.3 &http_request
1254     }
1255    
1256 root 1.9 =back
1257    
1258 root 1.55 =head2 DNS CACHING
1259    
1260     AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for
1261     the actual connection, which in turn uses AnyEvent::DNS to resolve
1262     hostnames. The latter is a simple stub resolver and does no caching
1263     on its own. If you want DNS caching, you currently have to provide
1264     your own default resolver (by storing a suitable resolver object in
1265 root 1.92 C<$AnyEvent::DNS::RESOLVER>) or your own C<tcp_connect> callback.
1266 root 1.55
1267 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
1268 root 1.1
1269     =over 4
1270    
1271 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
1272    
1273     Sets the default proxy server to use. The proxy-url must begin with a
1274 root 1.92 string of the form C<http://host:port>, croaks otherwise.
1275 root 1.52
1276     To clear an already-set proxy, use C<undef>.
1277 root 1.2
1278 root 1.113 When AnyEvent::HTTP is loaded for the first time it will query the
1279 root 1.102 default proxy from the operating system, currently by looking at
1280     C<$ENV{http_proxy>}.
1281    
1282 root 1.80 =item AnyEvent::HTTP::cookie_jar_expire $jar[, $session_end]
1283    
1284     Remove all cookies from the cookie jar that have been expired. If
1285     C<$session_end> is given and true, then additionally remove all session
1286     cookies.
1287    
1288     You should call this function (with a true C<$session_end>) before you
1289     save cookies to disk, and you should call this function after loading them
1290 root 1.117 again. If you have a long-running program you can additionally call this
1291 root 1.80 function from time to time.
1292    
1293     A cookie jar is initially an empty hash-reference that is managed by this
1294 root 1.121 module. Its format is subject to change, but currently it is as follows:
1295 root 1.80
1296     The key C<version> has to contain C<1>, otherwise the hash gets
1297     emptied. All other keys are hostnames or IP addresses pointing to
1298     hash-references. The key for these inner hash references is the
1299     server path for which this cookie is meant, and the values are again
1300 root 1.115 hash-references. Each key of those hash-references is a cookie name, and
1301 root 1.80 the value, you guessed it, is another hash-reference, this time with the
1302     key-value pairs from the cookie, except for C<expires> and C<max-age>,
1303     which have been replaced by a C<_expires> key that contains the cookie
1304 root 1.115 expiry timestamp. Session cookies are indicated by not having an
1305     C<_expires> key.
1306 root 1.80
1307     Here is an example of a cookie jar with a single cookie, so you have a
1308     chance of understanding the above paragraph:
1309    
1310     {
1311     version => 1,
1312     "10.0.0.1" => {
1313     "/" => {
1314     "mythweb_id" => {
1315     _expires => 1293917923,
1316     value => "ooRung9dThee3ooyXooM1Ohm",
1317     },
1318     },
1319     },
1320     }
1321    
1322 root 1.61 =item $date = AnyEvent::HTTP::format_date $timestamp
1323    
1324     Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP
1325     Date (RFC 2616).
1326    
1327     =item $timestamp = AnyEvent::HTTP::parse_date $date
1328    
1329 root 1.79 Takes a HTTP Date (RFC 2616) or a Cookie date (netscape cookie spec) or a
1330     bunch of minor variations of those, and returns the corresponding POSIX
1331     timestamp, or C<undef> if the date cannot be parsed.
1332 root 1.61
1333 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
1334 root 1.1
1335 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
1336 root 1.1
1337 root 1.92 =item $AnyEvent::HTTP::TIMEOUT
1338    
1339 root 1.110 The default timeout for connection operations (default: C<300>).
1340 root 1.92
1341 root 1.1 =item $AnyEvent::HTTP::USERAGENT
1342    
1343     The default value for the C<User-Agent> header (the default is
1344 root 1.40 C<Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
1345 root 1.1
1346 root 1.43 =item $AnyEvent::HTTP::MAX_PER_HOST
1347 root 1.1
1348 root 1.47 The maximum number of concurrent connections to the same host (identified
1349 root 1.121 by the hostname). If the limit is exceeded, then additional requests
1350 root 1.92 are queued until previous connections are closed. Both persistent and
1351     non-persistent connections are counted in this limit.
1352 root 1.1
1353 root 1.43 The default value for this is C<4>, and it is highly advisable to not
1354 root 1.92 increase it much.
1355    
1356     For comparison: the RFC's recommend 4 non-persistent or 2 persistent
1357 root 1.117 connections, older browsers used 2, newer ones (such as firefox 3)
1358     typically use 6, and Opera uses 8 because like, they have the fastest
1359     browser and give a shit for everybody else on the planet.
1360 root 1.92
1361     =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
1362    
1363 root 1.116 The time after which idle persistent connections get closed by
1364 root 1.92 AnyEvent::HTTP (default: C<3>).
1365 root 1.3
1366 root 1.14 =item $AnyEvent::HTTP::ACTIVE
1367    
1368     The number of active connections. This is not the number of currently
1369     running requests, but the number of currently open and non-idle TCP
1370 root 1.92 connections. This number can be useful for load-leveling.
1371 root 1.14
1372 root 1.1 =back
1373    
1374     =cut
1375    
1376 root 1.61 our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
1377     our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat);
1378    
1379     sub format_date($) {
1380     my ($time) = @_;
1381    
1382     # RFC 822/1123 format
1383     my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time;
1384    
1385     sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT",
1386     $weekday[$wday], $mday, $month[$mon], $year + 1900,
1387     $H, $M, $S;
1388     }
1389    
1390     sub parse_date($) {
1391     my ($date) = @_;
1392    
1393     my ($d, $m, $y, $H, $M, $S);
1394    
1395 root 1.79 if ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)[\- ]([A-Z][a-z][a-z])[\- ]([0-9][0-9][0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1396 root 1.70 # RFC 822/1123, required by RFC 2616 (with " ")
1397     # cookie dates (with "-")
1398    
1399 root 1.61 ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6);
1400    
1401 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+, ([0-9][0-9]?)-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) GMT$/) {
1402 root 1.61 # RFC 850
1403     ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6);
1404    
1405 root 1.79 } elsif ($date =~ /^[A-Z][a-z][a-z]+ ([A-Z][a-z][a-z]) ([0-9 ]?[0-9]) ([0-9][0-9]?):([0-9][0-9]?):([0-9][0-9]?) ([0-9][0-9][0-9][0-9])$/) {
1406 root 1.61 # ISO C's asctime
1407     ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5);
1408     }
1409     # other formats fail in the loop below
1410    
1411     for (0..11) {
1412     if ($m eq $month[$_]) {
1413     require Time::Local;
1414 root 1.111 return eval { Time::Local::timegm ($S, $M, $H, $d, $_, $y) };
1415 root 1.61 }
1416     }
1417    
1418     undef
1419     }
1420    
1421 root 1.2 sub set_proxy($) {
1422 root 1.52 if (length $_[0]) {
1423 root 1.92 $_[0] =~ m%^(http):// ([^:/]+) (?: : (\d*) )?%ix
1424 root 1.52 or Carp::croak "$_[0]: invalid proxy URL";
1425     $PROXY = [$2, $3 || 3128, $1]
1426     } else {
1427     undef $PROXY;
1428     }
1429 root 1.2 }
1430    
1431     # initialise proxy from environment
1432 root 1.52 eval {
1433     set_proxy $ENV{http_proxy};
1434     };
1435 root 1.2
1436 root 1.93 =head2 SHOWCASE
1437    
1438 root 1.117 This section contains some more elaborate "real-world" examples or code
1439 root 1.93 snippets.
1440    
1441     =head2 HTTP/1.1 FILE DOWNLOAD
1442    
1443 root 1.96 Downloading files with HTTP can be quite tricky, especially when something
1444 root 1.98 goes wrong and you want to resume.
1445 root 1.93
1446     Here is a function that initiates and resumes a download. It uses the
1447     last modified time to check for file content changes, and works with many
1448     HTTP/1.0 servers as well, and usually falls back to a complete re-download
1449     on older servers.
1450    
1451     It calls the completion callback with either C<undef>, which means a
1452 root 1.117 nonretryable error occurred, C<0> when the download was partial and should
1453 root 1.93 be retried, and C<1> if it was successful.
1454    
1455     use AnyEvent::HTTP;
1456    
1457     sub download($$$) {
1458     my ($url, $file, $cb) = @_;
1459    
1460     open my $fh, "+<", $file
1461     or die "$file: $!";
1462    
1463     my %hdr;
1464     my $ofs = 0;
1465    
1466     warn stat $fh;
1467     warn -s _;
1468     if (stat $fh and -s _) {
1469     $ofs = -s _;
1470 root 1.107 warn "-s is ", $ofs;
1471 root 1.93 $hdr{"if-unmodified-since"} = AnyEvent::HTTP::format_date +(stat _)[9];
1472     $hdr{"range"} = "bytes=$ofs-";
1473     }
1474    
1475     http_get $url,
1476     headers => \%hdr,
1477     on_header => sub {
1478     my ($hdr) = @_;
1479    
1480     if ($hdr->{Status} == 200 && $ofs) {
1481     # resume failed
1482     truncate $fh, $ofs = 0;
1483     }
1484    
1485     sysseek $fh, $ofs, 0;
1486    
1487     1
1488     },
1489     on_body => sub {
1490     my ($data, $hdr) = @_;
1491    
1492     if ($hdr->{Status} =~ /^2/) {
1493     length $data == syswrite $fh, $data
1494     or return; # abort on write errors
1495     }
1496    
1497     1
1498     },
1499     sub {
1500     my (undef, $hdr) = @_;
1501    
1502     my $status = $hdr->{Status};
1503    
1504     if (my $time = AnyEvent::HTTP::parse_date $hdr->{"last-modified"}) {
1505     utime $fh, $time, $time;
1506     }
1507    
1508     if ($status == 200 || $status == 206 || $status == 416) {
1509     # download ok || resume ok || file already fully downloaded
1510     $cb->(1, $hdr);
1511    
1512     } elsif ($status == 412) {
1513     # file has changed while resuming, delete and retry
1514     unlink $file;
1515     $cb->(0, $hdr);
1516    
1517     } elsif ($status == 500 or $status == 503 or $status =~ /^59/) {
1518     # retry later
1519     $cb->(0, $hdr);
1520    
1521     } else {
1522     $cb->(undef, $hdr);
1523     }
1524     }
1525     ;
1526     }
1527    
1528     download "http://server/somelargefile", "/tmp/somelargefile", sub {
1529     if ($_[0]) {
1530     print "OK!\n";
1531     } elsif (defined $_[0]) {
1532     print "please retry later\n";
1533     } else {
1534     print "ERROR\n";
1535     }
1536     };
1537    
1538     =head3 SOCKS PROXIES
1539 root 1.60
1540     Socks proxies are not directly supported by AnyEvent::HTTP. You can
1541     compile your perl to support socks, or use an external program such as
1542     F<socksify> (dante) or F<tsocks> to make your program use a socks proxy
1543     transparently.
1544    
1545     Alternatively, for AnyEvent::HTTP only, you can use your own
1546     C<tcp_connect> function that does the proxy handshake - here is an example
1547     that works with socks4a proxies:
1548    
1549     use Errno;
1550     use AnyEvent::Util;
1551     use AnyEvent::Socket;
1552     use AnyEvent::Handle;
1553    
1554     # host, port and username of/for your socks4a proxy
1555     my $socks_host = "10.0.0.23";
1556     my $socks_port = 9050;
1557     my $socks_user = "";
1558    
1559     sub socks4a_connect {
1560     my ($host, $port, $connect_cb, $prepare_cb) = @_;
1561    
1562     my $hdl = new AnyEvent::Handle
1563     connect => [$socks_host, $socks_port],
1564     on_prepare => sub { $prepare_cb->($_[0]{fh}) },
1565     on_error => sub { $connect_cb->() },
1566     ;
1567    
1568     $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host);
1569    
1570     $hdl->push_read (chunk => 8, sub {
1571     my ($hdl, $chunk) = @_;
1572     my ($status, $port, $ipn) = unpack "xCna4", $chunk;
1573    
1574     if ($status == 0x5a) {
1575     $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port");
1576     } else {
1577     $! = Errno::ENXIO; $connect_cb->();
1578     }
1579     });
1580    
1581     $hdl
1582     }
1583    
1584     Use C<socks4a_connect> instead of C<tcp_connect> when doing C<http_request>s,
1585     possibly after switching off other proxy types:
1586    
1587     AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies
1588    
1589     http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub {
1590     my ($data, $headers) = @_;
1591     ...
1592     };
1593    
1594 root 1.1 =head1 SEE ALSO
1595    
1596     L<AnyEvent>.
1597    
1598     =head1 AUTHOR
1599    
1600 root 1.18 Marc Lehmann <schmorp@schmorp.de>
1601     http://home.schmorp.de/
1602 root 1.1
1603 root 1.36 With many thanks to Дмитрий Шалашов, who provided countless
1604     testcases and bugreports.
1605    
1606 root 1.1 =cut
1607    
1608     1
1609