ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.16
Committed: Fri Jun 6 12:57:48 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
CVS Tags: rel-1_01
Changes since 1.15: +1 -1 lines
Log Message:
1.01

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9     =head1 DESCRIPTION
10    
11     This module is an L<AnyEvent> user, you need to make sure that you use and
12     run a supported event loop.
13    
14 root 1.11 This module implements a simple, stateless and non-blocking HTTP
15     client. It supports GET, POST and other request methods, cookies and more,
16     all on a very low level. It can follow redirects supports proxies and
17     automatically limits the number of connections to the values specified in
18     the RFC.
19    
20     It should generally be a "good client" that is enough for most HTTP
21     tasks. Simple tasks should be simple, but complex tasks should still be
22     possible as the user retains control over request and response headers.
23    
24     The caller is responsible for authentication management, cookies (if
25     the simplistic implementation in this module doesn't suffice), referer
26     and other high-level protocol details for which this module offers only
27     limited support.
28    
29 root 1.1 =head2 METHODS
30    
31     =over 4
32    
33     =cut
34    
35     package AnyEvent::HTTP;
36    
37     use strict;
38     no warnings;
39    
40     use Carp;
41    
42     use AnyEvent ();
43     use AnyEvent::Util ();
44     use AnyEvent::Socket ();
45     use AnyEvent::Handle ();
46    
47     use base Exporter::;
48    
49 root 1.16 our $VERSION = '1.01';
50 root 1.1
51     our @EXPORT = qw(http_get http_request);
52    
53     our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
54 root 1.3 our $MAX_RECURSE = 10;
55 root 1.2 our $MAX_PERSISTENT = 8;
56     our $PERSISTENT_TIMEOUT = 2;
57     our $TIMEOUT = 300;
58 root 1.1
59     # changing these is evil
60     our $MAX_PERSISTENT_PER_HOST = 2;
61 root 1.11 our $MAX_PER_HOST = 4;
62 root 1.1
63 root 1.2 our $PROXY;
64 root 1.14 our $ACTIVE = 0;
65 root 1.2
66 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
67 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
68 root 1.1
69     =item http_get $url, key => value..., $cb->($data, $headers)
70    
71     Executes an HTTP-GET request. See the http_request function for details on
72     additional parameters.
73    
74 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
75    
76     Executes an HTTP-HEAD request. See the http_request function for details on
77     additional parameters.
78    
79     =item http_post $url, $body, key => value..., $cb->($data, $headers)
80 root 1.3
81 root 1.7 Executes an HTTP-POST request with a request body of C<$bod>. See the
82 root 1.3 http_request function for details on additional parameters.
83    
84 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
85    
86     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
87     must be an absolute http or https URL.
88    
89 root 1.2 The callback will be called with the response data as first argument
90     (or C<undef> if it wasn't available due to errors), and a hash-ref with
91     response headers as second argument.
92    
93 root 1.7 All the headers in that hash are lowercased. In addition to the response
94 root 1.2 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
95     C<Reason> contain the three parts of the HTTP Status-Line of the same
96 root 1.10 name. If the server sends a header multiple lines, then their contents
97     will be joined together with C<\x00>.
98 root 1.2
99     If an internal error occurs, such as not being able to resolve a hostname,
100     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
101     and the C<Reason> pseudo-header will contain an error message.
102    
103 root 1.6 A typical callback might look like this:
104    
105     sub {
106     my ($body, $hdr) = @_;
107    
108     if ($hdr->{Status} =~ /^2/) {
109     ... everything should be ok
110     } else {
111     print "error, $hdr->{Status} $hdr->{Reason}\n";
112     }
113     }
114    
115 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
116     include:
117    
118     =over 4
119    
120 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
121 root 1.1
122     Whether to recurse requests or not, e.g. on redirects, authentication
123 root 1.3 retries and so on, and how often to do so.
124 root 1.1
125     =item headers => hashref
126    
127 root 1.12 The request headers to use. Currently, C<http_request> may provide its
128     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
129     and will provide defaults for C<User-Agent:> and C<Referer:>.
130 root 1.1
131     =item timeout => $seconds
132    
133     The time-out to use for various stages - each connect attempt will reset
134 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
135    
136     =item proxy => [$host, $port[, $scheme]] or undef
137    
138     Use the given http proxy for all requests. If not specified, then the
139     default proxy (as specified by C<$ENV{http_proxy}>) is used.
140    
141     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
142     HTTPS.
143 root 1.1
144 root 1.3 =item body => $string
145    
146     The request body, usually empty. Will be-sent as-is (future versions of
147     this module might offer more options).
148    
149 root 1.10 =item cookie_jar => $hash_ref
150    
151     Passing this parameter enables (simplified) cookie-processing, loosely
152     based on the original netscape specification.
153    
154     The C<$hash_ref> must be an (initially empty) hash reference which will
155     get updated automatically. It is possible to save the cookie_jar to
156     persistent storage with something like JSON or Storable, but this is not
157     recommended, as expire times are currently being ignored.
158    
159     Note that this cookie implementation is not of very high quality, nor
160     meant to be complete. If you want complete cookie management you have to
161     do that on your own. C<cookie_jar> is meant as a quick fix to get some
162     cookie-using sites working. Cookies are a privacy disaster, do not use
163     them unless required to.
164    
165 root 1.1 =back
166    
167 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
168    
169     http_request GET => "http://www.nethype.de/", sub {
170     my ($body, $hdr) = @_;
171     print "$body\n";
172     };
173    
174     Example: make a HTTP HEAD request on https://www.google.com/, use a
175     timeout of 30 seconds.
176    
177     http_request
178     GET => "https://www.google.com",
179     timeout => 30,
180     sub {
181     my ($body, $hdr) = @_;
182     use Data::Dumper;
183     print Dumper $hdr;
184     }
185     ;
186 root 1.1
187     =cut
188    
189 root 1.12 sub _slot_schedule;
190 root 1.11 sub _slot_schedule($) {
191     my $host = shift;
192    
193     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
194     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
195 root 1.12 # somebody wants that slot
196 root 1.11 ++$CO_SLOT{$host}[0];
197 root 1.14 ++$ACTIVE;
198 root 1.11
199     $cb->(AnyEvent::Util::guard {
200 root 1.14 --$ACTIVE;
201 root 1.11 --$CO_SLOT{$host}[0];
202     _slot_schedule $host;
203     });
204     } else {
205     # nobody wants the slot, maybe we can forget about it
206     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
207     last;
208     }
209     }
210     }
211    
212     # wait for a free slot on host, call callback
213     sub _get_slot($$) {
214     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
215    
216     _slot_schedule $_[0];
217     }
218    
219 elmex 1.15 sub http_request($$@) {
220 root 1.1 my $cb = pop;
221     my ($method, $url, %arg) = @_;
222    
223     my %hdr;
224    
225 root 1.3 $method = uc $method;
226    
227 root 1.8 if (my $hdr = $arg{headers}) {
228 root 1.1 while (my ($k, $v) = each %$hdr) {
229     $hdr{lc $k} = $v;
230     }
231     }
232    
233 root 1.8 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
234    
235     return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
236     if $recurse < 0;
237    
238 root 1.2 my $proxy = $arg{proxy} || $PROXY;
239 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
240    
241     $hdr{"user-agent"} ||= $USERAGENT;
242    
243 root 1.10 my ($scheme, $authority, $upath, $query, $fragment) =
244     $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
245 root 1.2
246 root 1.10 $scheme = lc $scheme;
247 root 1.1
248 root 1.10 my $uport = $scheme eq "http" ? 80
249     : $scheme eq "https" ? 443
250     : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
251    
252 root 1.13 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
253    
254 root 1.10 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
255     or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
256    
257     my $uhost = $1;
258     $uport = $2 if defined $2;
259    
260     $uhost =~ s/^\[(.*)\]$/$1/;
261     $upath .= "?$query" if length $query;
262    
263     $upath =~ s%^/?%/%;
264    
265     # cookie processing
266     if (my $jar = $arg{cookie_jar}) {
267     %$jar = () if $jar->{version} < 1;
268    
269     my @cookie;
270    
271     while (my ($chost, $v) = each %$jar) {
272     next unless $chost eq substr $uhost, -length $chost;
273     next unless $chost =~ /^\./;
274    
275     while (my ($cpath, $v) = each %$v) {
276     next unless $cpath eq substr $upath, 0, length $cpath;
277    
278     while (my ($k, $v) = each %$v) {
279     next if $scheme ne "https" && exists $v->{secure};
280     push @cookie, "$k=$v->{value}";
281     }
282     }
283     }
284    
285     $hdr{cookie} = join "; ", @cookie
286     if @cookie;
287     }
288 root 1.1
289 root 1.10 my ($rhost, $rport, $rpath); # request host, port, path
290 root 1.2
291 root 1.10 if ($proxy) {
292     ($rhost, $rport, $scheme) = @$proxy;
293     $rpath = $url;
294     } else {
295     ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
296     $hdr{host} = $uhost;
297 root 1.2 }
298    
299 root 1.10 $hdr{"content-length"} = length $arg{body};
300 root 1.1
301 root 1.11 my %state = (connect_guard => 1);
302    
303     _get_slot $uhost, sub {
304     $state{slot_guard} = shift;
305 root 1.1
306 root 1.11 return unless $state{connect_guard};
307 root 1.1
308 root 1.11 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
309     $state{fh} = shift
310     or return $cb->(undef, { Status => 599, Reason => "$!" });
311    
312     delete $state{connect_guard}; # reduce memory usage, save a tree
313    
314     # get handle
315     $state{handle} = new AnyEvent::Handle
316     fh => $state{fh},
317     ($scheme eq "https" ? (tls => "connect") : ());
318    
319     # limit the number of persistent connections
320     if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
321     ++$KA_COUNT{$_[1]};
322     $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
323     $hdr{connection} = "keep-alive";
324     delete $hdr{connection}; # keep-alive not yet supported
325     } else {
326     delete $hdr{connection};
327     }
328 root 1.1
329 root 1.11 # (re-)configure handle
330     $state{handle}->timeout ($timeout);
331     $state{handle}->on_error (sub {
332 root 1.14 my $errno = "$!";
333 root 1.11 %state = ();
334 root 1.14 $cb->(undef, { Status => 599, Reason => $errno });
335 root 1.11 });
336     $state{handle}->on_eof (sub {
337     %state = ();
338     $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
339     });
340 root 1.1
341 root 1.11 # send request
342     $state{handle}->push_write (
343     "$method $rpath HTTP/1.0\015\012"
344     . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
345     . "\015\012"
346     . (delete $arg{body})
347     );
348 root 1.1
349 root 1.11 %hdr = (); # reduce memory usage, save a kitten
350 root 1.1
351 root 1.11 # status line
352     $state{handle}->push_read (line => qr/\015?\012/, sub {
353     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
354     or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
355    
356     my %hdr = ( # response headers
357     HTTPVersion => "\x00$1",
358     Status => "\x00$2",
359     Reason => "\x00$3",
360     );
361    
362     # headers, could be optimized a bit
363     $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
364     for ("$_[1]\012") {
365     # we support spaces in field names, as lotus domino
366     # creates them.
367     $hdr{lc $1} .= "\x00$2"
368     while /\G
369     ([^:\000-\037]+):
370     [\011\040]*
371     ((?: [^\015\012]+ | \015?\012[\011\040] )*)
372     \015?\012
373     /gxc;
374 root 1.10
375 root 1.11 /\G$/
376     or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
377 root 1.10 }
378    
379 root 1.11 substr $_, 0, 1, ""
380     for values %hdr;
381    
382     my $finish = sub {
383     %state = ();
384 root 1.10
385 root 1.11 # set-cookie processing
386     if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
387     for (split /\x00/, $hdr{"set-cookie"}) {
388     my ($cookie, @arg) = split /;\s*/;
389     my ($name, $value) = split /=/, $cookie, 2;
390     my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
391    
392     my $cdom = (delete $kv{domain}) || $uhost;
393     my $cpath = (delete $kv{path}) || "/";
394    
395     $cdom =~ s/^.?/./; # make sure it starts with a "."
396    
397     next if $cdom =~ /\.$/;
398    
399     # this is not rfc-like and not netscape-like. go figure.
400     my $ndots = $cdom =~ y/.//;
401     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
402    
403     # store it
404     $arg{cookie_jar}{version} = 1;
405     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
406     }
407     }
408 root 1.8
409 root 1.13 if ($_[1]{Status} =~ /^30[12]$/ && $recurse) {
410 root 1.11 # microsoft and other assholes don't give a shit for following standards,
411     # try to support a common form of broken Location header.
412     $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
413    
414     http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
415     } else {
416     $cb->($_[0], $_[1]);
417     }
418     };
419 root 1.3
420 root 1.11 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
421     $finish->(undef, \%hdr);
422 root 1.3 } else {
423 root 1.11 if (exists $hdr{"content-length"}) {
424     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
425     # could cache persistent connection now
426     if ($hdr{connection} =~ /\bkeep-alive\b/i) {
427     # but we don't, due to misdesigns, this is annoyingly complex
428     };
429    
430     $finish->($_[1], \%hdr);
431     });
432     } else {
433     # too bad, need to read until we get an error or EOF,
434     # no way to detect winged data.
435     $_[0]->on_error (sub {
436     $finish->($_[0]{rbuf}, \%hdr);
437     });
438     $_[0]->on_eof (undef);
439     $_[0]->on_read (sub { });
440     }
441 root 1.3 }
442 root 1.11 });
443 root 1.1 });
444 root 1.11 }, sub {
445     $timeout
446     };
447 root 1.1 };
448    
449     defined wantarray && AnyEvent::Util::guard { %state = () }
450     }
451    
452 elmex 1.15 sub http_get($@) {
453 root 1.1 unshift @_, "GET";
454     &http_request
455     }
456    
457 elmex 1.15 sub http_head($@) {
458 root 1.4 unshift @_, "HEAD";
459     &http_request
460     }
461    
462 elmex 1.15 sub http_post($$@) {
463 root 1.3 unshift @_, "POST", "body";
464     &http_request
465     }
466    
467 root 1.9 =back
468    
469 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
470 root 1.1
471     =over 4
472    
473 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
474    
475     Sets the default proxy server to use. The proxy-url must begin with a
476     string of the form C<http://host:port> (optionally C<https:...>).
477    
478 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
479 root 1.1
480 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
481 root 1.1
482     =item $AnyEvent::HTTP::USERAGENT
483    
484     The default value for the C<User-Agent> header (the default is
485     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
486    
487     =item $AnyEvent::HTTP::MAX_PERSISTENT
488    
489     The maximum number of persistent connections to keep open (default: 8).
490    
491 root 1.3 Not implemented currently.
492    
493 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
494    
495 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
496 root 1.1
497 root 1.3 Not implemented currently.
498    
499 root 1.14 =item $AnyEvent::HTTP::ACTIVE
500    
501     The number of active connections. This is not the number of currently
502     running requests, but the number of currently open and non-idle TCP
503     connections. This number of can be useful for load-leveling.
504    
505 root 1.1 =back
506    
507     =cut
508    
509 root 1.2 sub set_proxy($) {
510     $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
511     }
512    
513     # initialise proxy from environment
514     set_proxy $ENV{http_proxy};
515    
516 root 1.1 =head1 SEE ALSO
517    
518     L<AnyEvent>.
519    
520     =head1 AUTHOR
521    
522     Marc Lehmann <schmorp@schmorp.de>
523     http://home.schmorp.de/
524    
525     =cut
526    
527     1
528