ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.18
Committed: Fri Jun 6 16:23:57 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
Changes since 1.17: +2 -2 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44     use Carp;
45    
46     use AnyEvent ();
47     use AnyEvent::Util ();
48     use AnyEvent::Socket ();
49     use AnyEvent::Handle ();
50    
51     use base Exporter::;
52    
53 root 1.16 our $VERSION = '1.01';
54 root 1.1
55 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
56 root 1.1
57     our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
58 root 1.3 our $MAX_RECURSE = 10;
59 root 1.2 our $MAX_PERSISTENT = 8;
60     our $PERSISTENT_TIMEOUT = 2;
61     our $TIMEOUT = 300;
62 root 1.1
63     # changing these is evil
64     our $MAX_PERSISTENT_PER_HOST = 2;
65 root 1.11 our $MAX_PER_HOST = 4;
66 root 1.1
67 root 1.2 our $PROXY;
68 root 1.14 our $ACTIVE = 0;
69 root 1.2
70 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
71 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
72 root 1.1
73     =item http_get $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-GET request. See the http_request function for details on
76     additional parameters.
77    
78 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
79    
80     Executes an HTTP-HEAD request. See the http_request function for details on
81     additional parameters.
82    
83     =item http_post $url, $body, key => value..., $cb->($data, $headers)
84 root 1.3
85 root 1.7 Executes an HTTP-POST request with a request body of C<$bod>. See the
86 root 1.3 http_request function for details on additional parameters.
87    
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.2 The callback will be called with the response data as first argument
94     (or C<undef> if it wasn't available due to errors), and a hash-ref with
95     response headers as second argument.
96    
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.2 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
99     C<Reason> contain the three parts of the HTTP Status-Line of the same
100 root 1.10 name. If the server sends a header multiple lines, then their contents
101     will be joined together with C<\x00>.
102 root 1.2
103     If an internal error occurs, such as not being able to resolve a hostname,
104     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
105     and the C<Reason> pseudo-header will contain an error message.
106    
107 root 1.6 A typical callback might look like this:
108    
109     sub {
110     my ($body, $hdr) = @_;
111    
112     if ($hdr->{Status} =~ /^2/) {
113     ... everything should be ok
114     } else {
115     print "error, $hdr->{Status} $hdr->{Reason}\n";
116     }
117     }
118    
119 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
120     include:
121    
122     =over 4
123    
124 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
125 root 1.1
126     Whether to recurse requests or not, e.g. on redirects, authentication
127 root 1.3 retries and so on, and how often to do so.
128 root 1.1
129     =item headers => hashref
130    
131 root 1.12 The request headers to use. Currently, C<http_request> may provide its
132     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
133     and will provide defaults for C<User-Agent:> and C<Referer:>.
134 root 1.1
135     =item timeout => $seconds
136    
137     The time-out to use for various stages - each connect attempt will reset
138 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
139    
140     =item proxy => [$host, $port[, $scheme]] or undef
141    
142     Use the given http proxy for all requests. If not specified, then the
143     default proxy (as specified by C<$ENV{http_proxy}>) is used.
144    
145     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
146     HTTPS.
147 root 1.1
148 root 1.3 =item body => $string
149    
150     The request body, usually empty. Will be-sent as-is (future versions of
151     this module might offer more options).
152    
153 root 1.10 =item cookie_jar => $hash_ref
154    
155     Passing this parameter enables (simplified) cookie-processing, loosely
156     based on the original netscape specification.
157    
158     The C<$hash_ref> must be an (initially empty) hash reference which will
159     get updated automatically. It is possible to save the cookie_jar to
160     persistent storage with something like JSON or Storable, but this is not
161     recommended, as expire times are currently being ignored.
162    
163     Note that this cookie implementation is not of very high quality, nor
164     meant to be complete. If you want complete cookie management you have to
165     do that on your own. C<cookie_jar> is meant as a quick fix to get some
166     cookie-using sites working. Cookies are a privacy disaster, do not use
167     them unless required to.
168    
169 root 1.1 =back
170    
171 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
172    
173     http_request GET => "http://www.nethype.de/", sub {
174     my ($body, $hdr) = @_;
175     print "$body\n";
176     };
177    
178     Example: make a HTTP HEAD request on https://www.google.com/, use a
179     timeout of 30 seconds.
180    
181     http_request
182     GET => "https://www.google.com",
183     timeout => 30,
184     sub {
185     my ($body, $hdr) = @_;
186     use Data::Dumper;
187     print Dumper $hdr;
188     }
189     ;
190 root 1.1
191     =cut
192    
193 root 1.12 sub _slot_schedule;
194 root 1.11 sub _slot_schedule($) {
195     my $host = shift;
196    
197     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
198     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
199 root 1.12 # somebody wants that slot
200 root 1.11 ++$CO_SLOT{$host}[0];
201 root 1.14 ++$ACTIVE;
202 root 1.11
203     $cb->(AnyEvent::Util::guard {
204 root 1.14 --$ACTIVE;
205 root 1.11 --$CO_SLOT{$host}[0];
206     _slot_schedule $host;
207     });
208     } else {
209     # nobody wants the slot, maybe we can forget about it
210     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
211     last;
212     }
213     }
214     }
215    
216     # wait for a free slot on host, call callback
217     sub _get_slot($$) {
218     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
219    
220     _slot_schedule $_[0];
221     }
222    
223 elmex 1.15 sub http_request($$@) {
224 root 1.1 my $cb = pop;
225     my ($method, $url, %arg) = @_;
226    
227     my %hdr;
228    
229 root 1.3 $method = uc $method;
230    
231 root 1.8 if (my $hdr = $arg{headers}) {
232 root 1.1 while (my ($k, $v) = each %$hdr) {
233     $hdr{lc $k} = $v;
234     }
235     }
236    
237 root 1.8 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
238    
239     return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
240     if $recurse < 0;
241    
242 root 1.2 my $proxy = $arg{proxy} || $PROXY;
243 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
244    
245     $hdr{"user-agent"} ||= $USERAGENT;
246    
247 root 1.10 my ($scheme, $authority, $upath, $query, $fragment) =
248     $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
249 root 1.2
250 root 1.10 $scheme = lc $scheme;
251 root 1.1
252 root 1.10 my $uport = $scheme eq "http" ? 80
253     : $scheme eq "https" ? 443
254     : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
255    
256 root 1.13 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
257    
258 root 1.10 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
259     or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
260    
261     my $uhost = $1;
262     $uport = $2 if defined $2;
263    
264     $uhost =~ s/^\[(.*)\]$/$1/;
265     $upath .= "?$query" if length $query;
266    
267     $upath =~ s%^/?%/%;
268    
269     # cookie processing
270     if (my $jar = $arg{cookie_jar}) {
271     %$jar = () if $jar->{version} < 1;
272    
273     my @cookie;
274    
275     while (my ($chost, $v) = each %$jar) {
276     next unless $chost eq substr $uhost, -length $chost;
277     next unless $chost =~ /^\./;
278    
279     while (my ($cpath, $v) = each %$v) {
280     next unless $cpath eq substr $upath, 0, length $cpath;
281    
282     while (my ($k, $v) = each %$v) {
283     next if $scheme ne "https" && exists $v->{secure};
284     push @cookie, "$k=$v->{value}";
285     }
286     }
287     }
288    
289     $hdr{cookie} = join "; ", @cookie
290     if @cookie;
291     }
292 root 1.1
293 root 1.10 my ($rhost, $rport, $rpath); # request host, port, path
294 root 1.2
295 root 1.10 if ($proxy) {
296     ($rhost, $rport, $scheme) = @$proxy;
297     $rpath = $url;
298     } else {
299     ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
300     $hdr{host} = $uhost;
301 root 1.2 }
302    
303 root 1.10 $hdr{"content-length"} = length $arg{body};
304 root 1.1
305 root 1.11 my %state = (connect_guard => 1);
306    
307     _get_slot $uhost, sub {
308     $state{slot_guard} = shift;
309 root 1.1
310 root 1.11 return unless $state{connect_guard};
311 root 1.1
312 root 1.11 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
313     $state{fh} = shift
314     or return $cb->(undef, { Status => 599, Reason => "$!" });
315    
316     delete $state{connect_guard}; # reduce memory usage, save a tree
317    
318     # get handle
319     $state{handle} = new AnyEvent::Handle
320     fh => $state{fh},
321     ($scheme eq "https" ? (tls => "connect") : ());
322    
323     # limit the number of persistent connections
324     if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
325     ++$KA_COUNT{$_[1]};
326     $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
327     $hdr{connection} = "keep-alive";
328     delete $hdr{connection}; # keep-alive not yet supported
329     } else {
330     delete $hdr{connection};
331     }
332 root 1.1
333 root 1.11 # (re-)configure handle
334     $state{handle}->timeout ($timeout);
335     $state{handle}->on_error (sub {
336 root 1.14 my $errno = "$!";
337 root 1.11 %state = ();
338 root 1.14 $cb->(undef, { Status => 599, Reason => $errno });
339 root 1.11 });
340     $state{handle}->on_eof (sub {
341     %state = ();
342     $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
343     });
344 root 1.1
345 root 1.11 # send request
346     $state{handle}->push_write (
347     "$method $rpath HTTP/1.0\015\012"
348     . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
349     . "\015\012"
350     . (delete $arg{body})
351     );
352 root 1.1
353 root 1.11 %hdr = (); # reduce memory usage, save a kitten
354 root 1.1
355 root 1.11 # status line
356     $state{handle}->push_read (line => qr/\015?\012/, sub {
357     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
358     or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
359    
360     my %hdr = ( # response headers
361     HTTPVersion => "\x00$1",
362     Status => "\x00$2",
363     Reason => "\x00$3",
364     );
365    
366     # headers, could be optimized a bit
367     $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
368     for ("$_[1]\012") {
369     # we support spaces in field names, as lotus domino
370     # creates them.
371     $hdr{lc $1} .= "\x00$2"
372     while /\G
373     ([^:\000-\037]+):
374     [\011\040]*
375     ((?: [^\015\012]+ | \015?\012[\011\040] )*)
376     \015?\012
377     /gxc;
378 root 1.10
379 root 1.11 /\G$/
380     or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
381 root 1.10 }
382    
383 root 1.11 substr $_, 0, 1, ""
384     for values %hdr;
385    
386     my $finish = sub {
387     %state = ();
388 root 1.10
389 root 1.11 # set-cookie processing
390     if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
391     for (split /\x00/, $hdr{"set-cookie"}) {
392     my ($cookie, @arg) = split /;\s*/;
393     my ($name, $value) = split /=/, $cookie, 2;
394     my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
395    
396     my $cdom = (delete $kv{domain}) || $uhost;
397     my $cpath = (delete $kv{path}) || "/";
398    
399     $cdom =~ s/^.?/./; # make sure it starts with a "."
400    
401     next if $cdom =~ /\.$/;
402    
403     # this is not rfc-like and not netscape-like. go figure.
404     my $ndots = $cdom =~ y/.//;
405     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
406    
407     # store it
408     $arg{cookie_jar}{version} = 1;
409     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
410     }
411     }
412 root 1.8
413 root 1.13 if ($_[1]{Status} =~ /^30[12]$/ && $recurse) {
414 root 1.11 # microsoft and other assholes don't give a shit for following standards,
415     # try to support a common form of broken Location header.
416     $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
417    
418     http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
419     } else {
420     $cb->($_[0], $_[1]);
421     }
422     };
423 root 1.3
424 root 1.11 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
425     $finish->(undef, \%hdr);
426 root 1.3 } else {
427 root 1.11 if (exists $hdr{"content-length"}) {
428     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
429     # could cache persistent connection now
430     if ($hdr{connection} =~ /\bkeep-alive\b/i) {
431     # but we don't, due to misdesigns, this is annoyingly complex
432     };
433    
434     $finish->($_[1], \%hdr);
435     });
436     } else {
437     # too bad, need to read until we get an error or EOF,
438     # no way to detect winged data.
439     $_[0]->on_error (sub {
440     $finish->($_[0]{rbuf}, \%hdr);
441     });
442     $_[0]->on_eof (undef);
443     $_[0]->on_read (sub { });
444     }
445 root 1.3 }
446 root 1.11 });
447 root 1.1 });
448 root 1.11 }, sub {
449     $timeout
450     };
451 root 1.1 };
452    
453     defined wantarray && AnyEvent::Util::guard { %state = () }
454     }
455    
456 elmex 1.15 sub http_get($@) {
457 root 1.1 unshift @_, "GET";
458     &http_request
459     }
460    
461 elmex 1.15 sub http_head($@) {
462 root 1.4 unshift @_, "HEAD";
463     &http_request
464     }
465    
466 elmex 1.15 sub http_post($$@) {
467 root 1.3 unshift @_, "POST", "body";
468     &http_request
469     }
470    
471 root 1.9 =back
472    
473 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
474 root 1.1
475     =over 4
476    
477 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
478    
479     Sets the default proxy server to use. The proxy-url must begin with a
480     string of the form C<http://host:port> (optionally C<https:...>).
481    
482 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
483 root 1.1
484 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
485 root 1.1
486     =item $AnyEvent::HTTP::USERAGENT
487    
488     The default value for the C<User-Agent> header (the default is
489     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
490    
491     =item $AnyEvent::HTTP::MAX_PERSISTENT
492    
493     The maximum number of persistent connections to keep open (default: 8).
494    
495 root 1.3 Not implemented currently.
496    
497 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
498    
499 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
500 root 1.1
501 root 1.3 Not implemented currently.
502    
503 root 1.14 =item $AnyEvent::HTTP::ACTIVE
504    
505     The number of active connections. This is not the number of currently
506     running requests, but the number of currently open and non-idle TCP
507     connections. This number of can be useful for load-leveling.
508    
509 root 1.1 =back
510    
511     =cut
512    
513 root 1.2 sub set_proxy($) {
514     $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
515     }
516    
517     # initialise proxy from environment
518     set_proxy $ENV{http_proxy};
519    
520 root 1.1 =head1 SEE ALSO
521    
522     L<AnyEvent>.
523    
524     =head1 AUTHOR
525    
526 root 1.18 Marc Lehmann <schmorp@schmorp.de>
527     http://home.schmorp.de/
528 root 1.1
529     =cut
530    
531     1
532