ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.13
Committed: Thu Jun 5 16:43:45 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
Changes since 1.12: +3 -3 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9     =head1 DESCRIPTION
10    
11     This module is an L<AnyEvent> user, you need to make sure that you use and
12     run a supported event loop.
13    
14 root 1.11 This module implements a simple, stateless and non-blocking HTTP
15     client. It supports GET, POST and other request methods, cookies and more,
16     all on a very low level. It can follow redirects supports proxies and
17     automatically limits the number of connections to the values specified in
18     the RFC.
19    
20     It should generally be a "good client" that is enough for most HTTP
21     tasks. Simple tasks should be simple, but complex tasks should still be
22     possible as the user retains control over request and response headers.
23    
24     The caller is responsible for authentication management, cookies (if
25     the simplistic implementation in this module doesn't suffice), referer
26     and other high-level protocol details for which this module offers only
27     limited support.
28    
29 root 1.1 =head2 METHODS
30    
31     =over 4
32    
33     =cut
34    
35     package AnyEvent::HTTP;
36    
37     use strict;
38     no warnings;
39    
40     use Carp;
41    
42     use AnyEvent ();
43     use AnyEvent::Util ();
44     use AnyEvent::Socket ();
45     use AnyEvent::Handle ();
46    
47     use base Exporter::;
48    
49     our $VERSION = '1.0';
50    
51     our @EXPORT = qw(http_get http_request);
52    
53     our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
54 root 1.3 our $MAX_RECURSE = 10;
55 root 1.2 our $MAX_PERSISTENT = 8;
56     our $PERSISTENT_TIMEOUT = 2;
57     our $TIMEOUT = 300;
58 root 1.1
59     # changing these is evil
60     our $MAX_PERSISTENT_PER_HOST = 2;
61 root 1.11 our $MAX_PER_HOST = 4;
62 root 1.1
63 root 1.2 our $PROXY;
64    
65 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
66 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
67 root 1.1
68     =item http_get $url, key => value..., $cb->($data, $headers)
69    
70     Executes an HTTP-GET request. See the http_request function for details on
71     additional parameters.
72    
73 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-HEAD request. See the http_request function for details on
76     additional parameters.
77    
78     =item http_post $url, $body, key => value..., $cb->($data, $headers)
79 root 1.3
80 root 1.7 Executes an HTTP-POST request with a request body of C<$bod>. See the
81 root 1.3 http_request function for details on additional parameters.
82    
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.2 The callback will be called with the response data as first argument
89     (or C<undef> if it wasn't available due to errors), and a hash-ref with
90     response headers as second argument.
91    
92 root 1.7 All the headers in that hash are lowercased. In addition to the response
93 root 1.2 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
94     C<Reason> contain the three parts of the HTTP Status-Line of the same
95 root 1.10 name. If the server sends a header multiple lines, then their contents
96     will be joined together with C<\x00>.
97 root 1.2
98     If an internal error occurs, such as not being able to resolve a hostname,
99     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
100     and the C<Reason> pseudo-header will contain an error message.
101    
102 root 1.6 A typical callback might look like this:
103    
104     sub {
105     my ($body, $hdr) = @_;
106    
107     if ($hdr->{Status} =~ /^2/) {
108     ... everything should be ok
109     } else {
110     print "error, $hdr->{Status} $hdr->{Reason}\n";
111     }
112     }
113    
114 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
115     include:
116    
117     =over 4
118    
119 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
120 root 1.1
121     Whether to recurse requests or not, e.g. on redirects, authentication
122 root 1.3 retries and so on, and how often to do so.
123 root 1.1
124     =item headers => hashref
125    
126 root 1.12 The request headers to use. Currently, C<http_request> may provide its
127     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
128     and will provide defaults for C<User-Agent:> and C<Referer:>.
129 root 1.1
130     =item timeout => $seconds
131    
132     The time-out to use for various stages - each connect attempt will reset
133 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
134    
135     =item proxy => [$host, $port[, $scheme]] or undef
136    
137     Use the given http proxy for all requests. If not specified, then the
138     default proxy (as specified by C<$ENV{http_proxy}>) is used.
139    
140     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
141     HTTPS.
142 root 1.1
143 root 1.3 =item body => $string
144    
145     The request body, usually empty. Will be-sent as-is (future versions of
146     this module might offer more options).
147    
148 root 1.10 =item cookie_jar => $hash_ref
149    
150     Passing this parameter enables (simplified) cookie-processing, loosely
151     based on the original netscape specification.
152    
153     The C<$hash_ref> must be an (initially empty) hash reference which will
154     get updated automatically. It is possible to save the cookie_jar to
155     persistent storage with something like JSON or Storable, but this is not
156     recommended, as expire times are currently being ignored.
157    
158     Note that this cookie implementation is not of very high quality, nor
159     meant to be complete. If you want complete cookie management you have to
160     do that on your own. C<cookie_jar> is meant as a quick fix to get some
161     cookie-using sites working. Cookies are a privacy disaster, do not use
162     them unless required to.
163    
164 root 1.1 =back
165    
166 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
167    
168     http_request GET => "http://www.nethype.de/", sub {
169     my ($body, $hdr) = @_;
170     print "$body\n";
171     };
172    
173     Example: make a HTTP HEAD request on https://www.google.com/, use a
174     timeout of 30 seconds.
175    
176     http_request
177     GET => "https://www.google.com",
178     timeout => 30,
179     sub {
180     my ($body, $hdr) = @_;
181     use Data::Dumper;
182     print Dumper $hdr;
183     }
184     ;
185 root 1.1
186     =cut
187    
188 root 1.12 sub _slot_schedule;
189 root 1.11 sub _slot_schedule($) {
190     my $host = shift;
191    
192     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
193     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
194 root 1.12 # somebody wants that slot
195 root 1.11 ++$CO_SLOT{$host}[0];
196    
197     $cb->(AnyEvent::Util::guard {
198     --$CO_SLOT{$host}[0];
199     _slot_schedule $host;
200     });
201     } else {
202     # nobody wants the slot, maybe we can forget about it
203     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
204     last;
205     }
206     }
207     }
208    
209     # wait for a free slot on host, call callback
210     sub _get_slot($$) {
211     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
212    
213     _slot_schedule $_[0];
214     }
215    
216 root 1.1 sub http_request($$$;@) {
217     my $cb = pop;
218     my ($method, $url, %arg) = @_;
219    
220     my %hdr;
221    
222 root 1.3 $method = uc $method;
223    
224 root 1.8 if (my $hdr = $arg{headers}) {
225 root 1.1 while (my ($k, $v) = each %$hdr) {
226     $hdr{lc $k} = $v;
227     }
228     }
229    
230 root 1.8 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
231    
232     return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
233     if $recurse < 0;
234    
235 root 1.2 my $proxy = $arg{proxy} || $PROXY;
236 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
237    
238     $hdr{"user-agent"} ||= $USERAGENT;
239    
240 root 1.10 my ($scheme, $authority, $upath, $query, $fragment) =
241     $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
242 root 1.2
243 root 1.10 $scheme = lc $scheme;
244 root 1.1
245 root 1.10 my $uport = $scheme eq "http" ? 80
246     : $scheme eq "https" ? 443
247     : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
248    
249 root 1.13 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
250    
251 root 1.10 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
252     or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
253    
254     my $uhost = $1;
255     $uport = $2 if defined $2;
256    
257     $uhost =~ s/^\[(.*)\]$/$1/;
258     $upath .= "?$query" if length $query;
259    
260     $upath =~ s%^/?%/%;
261    
262     # cookie processing
263     if (my $jar = $arg{cookie_jar}) {
264     %$jar = () if $jar->{version} < 1;
265    
266     my @cookie;
267    
268     while (my ($chost, $v) = each %$jar) {
269     next unless $chost eq substr $uhost, -length $chost;
270     next unless $chost =~ /^\./;
271    
272     while (my ($cpath, $v) = each %$v) {
273     next unless $cpath eq substr $upath, 0, length $cpath;
274    
275     while (my ($k, $v) = each %$v) {
276     next if $scheme ne "https" && exists $v->{secure};
277     push @cookie, "$k=$v->{value}";
278     }
279     }
280     }
281    
282     $hdr{cookie} = join "; ", @cookie
283     if @cookie;
284     }
285 root 1.1
286 root 1.10 my ($rhost, $rport, $rpath); # request host, port, path
287 root 1.2
288 root 1.10 if ($proxy) {
289     ($rhost, $rport, $scheme) = @$proxy;
290     $rpath = $url;
291     } else {
292     ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
293     $hdr{host} = $uhost;
294 root 1.2 }
295    
296 root 1.10 $hdr{"content-length"} = length $arg{body};
297 root 1.1
298 root 1.11 my %state = (connect_guard => 1);
299    
300     _get_slot $uhost, sub {
301     $state{slot_guard} = shift;
302 root 1.1
303 root 1.11 return unless $state{connect_guard};
304 root 1.1
305 root 1.11 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
306     $state{fh} = shift
307     or return $cb->(undef, { Status => 599, Reason => "$!" });
308    
309     delete $state{connect_guard}; # reduce memory usage, save a tree
310    
311     # get handle
312     $state{handle} = new AnyEvent::Handle
313     fh => $state{fh},
314     ($scheme eq "https" ? (tls => "connect") : ());
315    
316     # limit the number of persistent connections
317     if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
318     ++$KA_COUNT{$_[1]};
319     $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
320     $hdr{connection} = "keep-alive";
321     delete $hdr{connection}; # keep-alive not yet supported
322     } else {
323     delete $hdr{connection};
324     }
325 root 1.1
326 root 1.11 # (re-)configure handle
327     $state{handle}->timeout ($timeout);
328     $state{handle}->on_error (sub {
329     %state = ();
330     $cb->(undef, { Status => 599, Reason => "$!" });
331     });
332     $state{handle}->on_eof (sub {
333     %state = ();
334     $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
335     });
336 root 1.1
337 root 1.11 # send request
338     $state{handle}->push_write (
339     "$method $rpath HTTP/1.0\015\012"
340     . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
341     . "\015\012"
342     . (delete $arg{body})
343     );
344 root 1.1
345 root 1.11 %hdr = (); # reduce memory usage, save a kitten
346 root 1.1
347 root 1.11 # status line
348     $state{handle}->push_read (line => qr/\015?\012/, sub {
349     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
350     or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
351    
352     my %hdr = ( # response headers
353     HTTPVersion => "\x00$1",
354     Status => "\x00$2",
355     Reason => "\x00$3",
356     );
357    
358     # headers, could be optimized a bit
359     $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
360     for ("$_[1]\012") {
361     # we support spaces in field names, as lotus domino
362     # creates them.
363     $hdr{lc $1} .= "\x00$2"
364     while /\G
365     ([^:\000-\037]+):
366     [\011\040]*
367     ((?: [^\015\012]+ | \015?\012[\011\040] )*)
368     \015?\012
369     /gxc;
370 root 1.10
371 root 1.11 /\G$/
372     or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
373 root 1.10 }
374    
375 root 1.11 substr $_, 0, 1, ""
376     for values %hdr;
377    
378     my $finish = sub {
379     %state = ();
380 root 1.10
381 root 1.11 # set-cookie processing
382     if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
383     for (split /\x00/, $hdr{"set-cookie"}) {
384     my ($cookie, @arg) = split /;\s*/;
385     my ($name, $value) = split /=/, $cookie, 2;
386     my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
387    
388     my $cdom = (delete $kv{domain}) || $uhost;
389     my $cpath = (delete $kv{path}) || "/";
390    
391     $cdom =~ s/^.?/./; # make sure it starts with a "."
392    
393     next if $cdom =~ /\.$/;
394    
395     # this is not rfc-like and not netscape-like. go figure.
396     my $ndots = $cdom =~ y/.//;
397     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
398    
399     # store it
400     $arg{cookie_jar}{version} = 1;
401     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
402     }
403     }
404 root 1.8
405 root 1.13 if ($_[1]{Status} =~ /^30[12]$/ && $recurse) {
406 root 1.11 # microsoft and other assholes don't give a shit for following standards,
407     # try to support a common form of broken Location header.
408     $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
409    
410     http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
411     } else {
412     $cb->($_[0], $_[1]);
413     }
414     };
415 root 1.3
416 root 1.11 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
417     $finish->(undef, \%hdr);
418 root 1.3 } else {
419 root 1.11 if (exists $hdr{"content-length"}) {
420     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
421     # could cache persistent connection now
422     if ($hdr{connection} =~ /\bkeep-alive\b/i) {
423     # but we don't, due to misdesigns, this is annoyingly complex
424     };
425    
426     $finish->($_[1], \%hdr);
427     });
428     } else {
429     # too bad, need to read until we get an error or EOF,
430     # no way to detect winged data.
431     $_[0]->on_error (sub {
432     $finish->($_[0]{rbuf}, \%hdr);
433     });
434     $_[0]->on_eof (undef);
435     $_[0]->on_read (sub { });
436     }
437 root 1.3 }
438 root 1.11 });
439 root 1.1 });
440 root 1.11 }, sub {
441     $timeout
442     };
443 root 1.1 };
444    
445     defined wantarray && AnyEvent::Util::guard { %state = () }
446     }
447    
448     sub http_get($$;@) {
449     unshift @_, "GET";
450     &http_request
451     }
452    
453 root 1.4 sub http_head($$;@) {
454     unshift @_, "HEAD";
455     &http_request
456     }
457    
458 root 1.3 sub http_post($$$;@) {
459     unshift @_, "POST", "body";
460     &http_request
461     }
462    
463 root 1.9 =back
464    
465 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
466 root 1.1
467     =over 4
468    
469 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
470    
471     Sets the default proxy server to use. The proxy-url must begin with a
472     string of the form C<http://host:port> (optionally C<https:...>).
473    
474 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
475 root 1.1
476 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
477 root 1.1
478     =item $AnyEvent::HTTP::USERAGENT
479    
480     The default value for the C<User-Agent> header (the default is
481     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
482    
483     =item $AnyEvent::HTTP::MAX_PERSISTENT
484    
485     The maximum number of persistent connections to keep open (default: 8).
486    
487 root 1.3 Not implemented currently.
488    
489 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
490    
491 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
492 root 1.1
493 root 1.3 Not implemented currently.
494    
495 root 1.1 =back
496    
497     =cut
498    
499 root 1.2 sub set_proxy($) {
500     $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
501     }
502    
503     # initialise proxy from environment
504     set_proxy $ENV{http_proxy};
505    
506 root 1.1 =head1 SEE ALSO
507    
508     L<AnyEvent>.
509    
510     =head1 AUTHOR
511    
512     Marc Lehmann <schmorp@schmorp.de>
513     http://home.schmorp.de/
514    
515     =cut
516    
517     1
518