ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.23
Committed: Wed Jul 2 01:23:41 2008 UTC (15 years, 10 months ago) by root
Branch: MAIN
Changes since 1.22: +9 -2 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44     use Carp;
45    
46     use AnyEvent ();
47     use AnyEvent::Util ();
48     use AnyEvent::Socket ();
49     use AnyEvent::Handle ();
50    
51     use base Exporter::;
52    
53 root 1.22 our $VERSION = '1.03';
54 root 1.1
55 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
56 root 1.1
57     our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
58 root 1.3 our $MAX_RECURSE = 10;
59 root 1.2 our $MAX_PERSISTENT = 8;
60     our $PERSISTENT_TIMEOUT = 2;
61     our $TIMEOUT = 300;
62 root 1.1
63     # changing these is evil
64     our $MAX_PERSISTENT_PER_HOST = 2;
65 root 1.11 our $MAX_PER_HOST = 4;
66 root 1.1
67 root 1.2 our $PROXY;
68 root 1.14 our $ACTIVE = 0;
69 root 1.2
70 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
71 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
72 root 1.1
73     =item http_get $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-GET request. See the http_request function for details on
76     additional parameters.
77    
78 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
79    
80     Executes an HTTP-HEAD request. See the http_request function for details on
81     additional parameters.
82    
83     =item http_post $url, $body, key => value..., $cb->($data, $headers)
84 root 1.3
85 root 1.7 Executes an HTTP-POST request with a request body of C<$bod>. See the
86 root 1.3 http_request function for details on additional parameters.
87    
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.2 The callback will be called with the response data as first argument
94     (or C<undef> if it wasn't available due to errors), and a hash-ref with
95     response headers as second argument.
96    
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.20 headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason>
99     contain the three parts of the HTTP Status-Line of the same name. The
100     pseudo-header C<URL> contains the original URL (which can differ from the
101     requested URL when following redirects).
102    
103     If the server sends a header multiple lines, then their contents will be
104     joined together with C<\x00>.
105 root 1.2
106     If an internal error occurs, such as not being able to resolve a hostname,
107     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
108     and the C<Reason> pseudo-header will contain an error message.
109    
110 root 1.6 A typical callback might look like this:
111    
112     sub {
113     my ($body, $hdr) = @_;
114    
115     if ($hdr->{Status} =~ /^2/) {
116     ... everything should be ok
117     } else {
118     print "error, $hdr->{Status} $hdr->{Reason}\n";
119     }
120     }
121    
122 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
123     include:
124    
125     =over 4
126    
127 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
128 root 1.1
129     Whether to recurse requests or not, e.g. on redirects, authentication
130 root 1.3 retries and so on, and how often to do so.
131 root 1.1
132     =item headers => hashref
133    
134 root 1.12 The request headers to use. Currently, C<http_request> may provide its
135     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
136     and will provide defaults for C<User-Agent:> and C<Referer:>.
137 root 1.1
138     =item timeout => $seconds
139    
140     The time-out to use for various stages - each connect attempt will reset
141 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
142    
143     =item proxy => [$host, $port[, $scheme]] or undef
144    
145     Use the given http proxy for all requests. If not specified, then the
146     default proxy (as specified by C<$ENV{http_proxy}>) is used.
147    
148     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
149     HTTPS.
150 root 1.1
151 root 1.3 =item body => $string
152    
153     The request body, usually empty. Will be-sent as-is (future versions of
154     this module might offer more options).
155    
156 root 1.10 =item cookie_jar => $hash_ref
157    
158     Passing this parameter enables (simplified) cookie-processing, loosely
159     based on the original netscape specification.
160    
161     The C<$hash_ref> must be an (initially empty) hash reference which will
162     get updated automatically. It is possible to save the cookie_jar to
163     persistent storage with something like JSON or Storable, but this is not
164     recommended, as expire times are currently being ignored.
165    
166     Note that this cookie implementation is not of very high quality, nor
167     meant to be complete. If you want complete cookie management you have to
168     do that on your own. C<cookie_jar> is meant as a quick fix to get some
169     cookie-using sites working. Cookies are a privacy disaster, do not use
170     them unless required to.
171    
172 root 1.1 =back
173    
174 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
175    
176     http_request GET => "http://www.nethype.de/", sub {
177     my ($body, $hdr) = @_;
178     print "$body\n";
179     };
180    
181     Example: make a HTTP HEAD request on https://www.google.com/, use a
182     timeout of 30 seconds.
183    
184     http_request
185     GET => "https://www.google.com",
186     timeout => 30,
187     sub {
188     my ($body, $hdr) = @_;
189     use Data::Dumper;
190     print Dumper $hdr;
191     }
192     ;
193 root 1.1
194     =cut
195    
196 root 1.12 sub _slot_schedule;
197 root 1.11 sub _slot_schedule($) {
198     my $host = shift;
199    
200     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
201     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
202 root 1.12 # somebody wants that slot
203 root 1.11 ++$CO_SLOT{$host}[0];
204 root 1.14 ++$ACTIVE;
205 root 1.11
206     $cb->(AnyEvent::Util::guard {
207 root 1.14 --$ACTIVE;
208 root 1.11 --$CO_SLOT{$host}[0];
209     _slot_schedule $host;
210     });
211     } else {
212     # nobody wants the slot, maybe we can forget about it
213     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
214     last;
215     }
216     }
217     }
218    
219     # wait for a free slot on host, call callback
220     sub _get_slot($$) {
221     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
222    
223     _slot_schedule $_[0];
224     }
225    
226 elmex 1.15 sub http_request($$@) {
227 root 1.1 my $cb = pop;
228     my ($method, $url, %arg) = @_;
229    
230     my %hdr;
231    
232 root 1.3 $method = uc $method;
233    
234 root 1.8 if (my $hdr = $arg{headers}) {
235 root 1.1 while (my ($k, $v) = each %$hdr) {
236     $hdr{lc $k} = $v;
237     }
238     }
239    
240 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
241 root 1.8
242 elmex 1.19 return $cb->(undef, { Status => 599, Reason => "recursion limit reached", URL => $url })
243 root 1.8 if $recurse < 0;
244    
245 root 1.2 my $proxy = $arg{proxy} || $PROXY;
246 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
247    
248     $hdr{"user-agent"} ||= $USERAGENT;
249    
250 root 1.10 my ($scheme, $authority, $upath, $query, $fragment) =
251     $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
252 root 1.2
253 root 1.10 $scheme = lc $scheme;
254 root 1.1
255 root 1.10 my $uport = $scheme eq "http" ? 80
256     : $scheme eq "https" ? 443
257 elmex 1.19 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url });
258 root 1.10
259 root 1.13 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
260    
261 root 1.10 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
262 elmex 1.19 or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url });
263 root 1.10
264     my $uhost = $1;
265     $uport = $2 if defined $2;
266    
267     $uhost =~ s/^\[(.*)\]$/$1/;
268     $upath .= "?$query" if length $query;
269    
270     $upath =~ s%^/?%/%;
271    
272     # cookie processing
273     if (my $jar = $arg{cookie_jar}) {
274     %$jar = () if $jar->{version} < 1;
275    
276     my @cookie;
277    
278     while (my ($chost, $v) = each %$jar) {
279     next unless $chost eq substr $uhost, -length $chost;
280     next unless $chost =~ /^\./;
281    
282     while (my ($cpath, $v) = each %$v) {
283     next unless $cpath eq substr $upath, 0, length $cpath;
284    
285     while (my ($k, $v) = each %$v) {
286     next if $scheme ne "https" && exists $v->{secure};
287     push @cookie, "$k=$v->{value}";
288     }
289     }
290     }
291    
292     $hdr{cookie} = join "; ", @cookie
293     if @cookie;
294     }
295 root 1.1
296 root 1.10 my ($rhost, $rport, $rpath); # request host, port, path
297 root 1.2
298 root 1.10 if ($proxy) {
299     ($rhost, $rport, $scheme) = @$proxy;
300     $rpath = $url;
301     } else {
302     ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
303     $hdr{host} = $uhost;
304 root 1.2 }
305    
306 root 1.10 $hdr{"content-length"} = length $arg{body};
307 root 1.1
308 root 1.11 my %state = (connect_guard => 1);
309    
310     _get_slot $uhost, sub {
311     $state{slot_guard} = shift;
312 root 1.1
313 root 1.11 return unless $state{connect_guard};
314 root 1.1
315 root 1.11 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
316     $state{fh} = shift
317 elmex 1.19 or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url });
318 root 1.11
319     delete $state{connect_guard}; # reduce memory usage, save a tree
320    
321     # get handle
322     $state{handle} = new AnyEvent::Handle
323     fh => $state{fh},
324     ($scheme eq "https" ? (tls => "connect") : ());
325    
326     # limit the number of persistent connections
327     if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
328     ++$KA_COUNT{$_[1]};
329     $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
330     $hdr{connection} = "keep-alive";
331     delete $hdr{connection}; # keep-alive not yet supported
332     } else {
333     delete $hdr{connection};
334     }
335 root 1.1
336 root 1.11 # (re-)configure handle
337     $state{handle}->timeout ($timeout);
338     $state{handle}->on_error (sub {
339 root 1.14 my $errno = "$!";
340 root 1.11 %state = ();
341 elmex 1.19 $cb->(undef, { Status => 599, Reason => $errno, URL => $url });
342 root 1.11 });
343     $state{handle}->on_eof (sub {
344     %state = ();
345 elmex 1.19 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url });
346 root 1.11 });
347 root 1.1
348 root 1.11 # send request
349     $state{handle}->push_write (
350     "$method $rpath HTTP/1.0\015\012"
351     . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
352     . "\015\012"
353     . (delete $arg{body})
354     );
355 root 1.1
356 root 1.11 %hdr = (); # reduce memory usage, save a kitten
357 root 1.1
358 root 1.11 # status line
359     $state{handle}->push_read (line => qr/\015?\012/, sub {
360     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
361 elmex 1.19 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));
362 root 1.11
363     my %hdr = ( # response headers
364     HTTPVersion => "\x00$1",
365     Status => "\x00$2",
366     Reason => "\x00$3",
367 elmex 1.19 URL => "\x00$url"
368 root 1.11 );
369    
370     # headers, could be optimized a bit
371     $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
372     for ("$_[1]\012") {
373     # we support spaces in field names, as lotus domino
374     # creates them.
375     $hdr{lc $1} .= "\x00$2"
376     while /\G
377     ([^:\000-\037]+):
378     [\011\040]*
379     ((?: [^\015\012]+ | \015?\012[\011\040] )*)
380     \015?\012
381     /gxc;
382 root 1.10
383 root 1.11 /\G$/
384 elmex 1.19 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url }));
385 root 1.10 }
386    
387 root 1.11 substr $_, 0, 1, ""
388     for values %hdr;
389    
390     my $finish = sub {
391     %state = ();
392 root 1.10
393 root 1.11 # set-cookie processing
394     if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
395     for (split /\x00/, $hdr{"set-cookie"}) {
396     my ($cookie, @arg) = split /;\s*/;
397     my ($name, $value) = split /=/, $cookie, 2;
398     my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
399    
400     my $cdom = (delete $kv{domain}) || $uhost;
401     my $cpath = (delete $kv{path}) || "/";
402    
403     $cdom =~ s/^.?/./; # make sure it starts with a "."
404    
405     next if $cdom =~ /\.$/;
406    
407     # this is not rfc-like and not netscape-like. go figure.
408     my $ndots = $cdom =~ y/.//;
409     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
410    
411     # store it
412     $arg{cookie_jar}{version} = 1;
413     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
414     }
415     }
416 root 1.8
417 root 1.23 if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
418 root 1.11 # microsoft and other assholes don't give a shit for following standards,
419     # try to support a common form of broken Location header.
420     $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
421    
422 root 1.23 # apparently, mozilla et al. just change POST to GET here
423     # more research is needed before we do the same
424    
425 root 1.11 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
426 root 1.23 } elsif ($_[1]{Status} == 303 && $recurse) {
427     $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
428    
429     http_request (GET => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
430 root 1.11 } else {
431     $cb->($_[0], $_[1]);
432     }
433     };
434 root 1.3
435 root 1.11 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
436     $finish->(undef, \%hdr);
437 root 1.3 } else {
438 root 1.11 if (exists $hdr{"content-length"}) {
439     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
440     # could cache persistent connection now
441     if ($hdr{connection} =~ /\bkeep-alive\b/i) {
442     # but we don't, due to misdesigns, this is annoyingly complex
443     };
444    
445     $finish->($_[1], \%hdr);
446     });
447     } else {
448     # too bad, need to read until we get an error or EOF,
449     # no way to detect winged data.
450     $_[0]->on_error (sub {
451     $finish->($_[0]{rbuf}, \%hdr);
452     });
453     $_[0]->on_eof (undef);
454     $_[0]->on_read (sub { });
455     }
456 root 1.3 }
457 root 1.11 });
458 root 1.1 });
459 root 1.11 }, sub {
460     $timeout
461     };
462 root 1.1 };
463    
464     defined wantarray && AnyEvent::Util::guard { %state = () }
465     }
466    
467 elmex 1.15 sub http_get($@) {
468 root 1.1 unshift @_, "GET";
469     &http_request
470     }
471    
472 elmex 1.15 sub http_head($@) {
473 root 1.4 unshift @_, "HEAD";
474     &http_request
475     }
476    
477 elmex 1.15 sub http_post($$@) {
478 root 1.22 my $url = shift;
479     unshift @_, "POST", $url, "body";
480 root 1.3 &http_request
481     }
482    
483 root 1.9 =back
484    
485 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
486 root 1.1
487     =over 4
488    
489 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
490    
491     Sets the default proxy server to use. The proxy-url must begin with a
492     string of the form C<http://host:port> (optionally C<https:...>).
493    
494 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
495 root 1.1
496 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
497 root 1.1
498     =item $AnyEvent::HTTP::USERAGENT
499    
500     The default value for the C<User-Agent> header (the default is
501     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
502    
503     =item $AnyEvent::HTTP::MAX_PERSISTENT
504    
505     The maximum number of persistent connections to keep open (default: 8).
506    
507 root 1.3 Not implemented currently.
508    
509 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
510    
511 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
512 root 1.1
513 root 1.3 Not implemented currently.
514    
515 root 1.14 =item $AnyEvent::HTTP::ACTIVE
516    
517     The number of active connections. This is not the number of currently
518     running requests, but the number of currently open and non-idle TCP
519     connections. This number of can be useful for load-leveling.
520    
521 root 1.1 =back
522    
523     =cut
524    
525 root 1.2 sub set_proxy($) {
526     $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
527     }
528    
529     # initialise proxy from environment
530     set_proxy $ENV{http_proxy};
531    
532 root 1.1 =head1 SEE ALSO
533    
534     L<AnyEvent>.
535    
536     =head1 AUTHOR
537    
538 root 1.18 Marc Lehmann <schmorp@schmorp.de>
539     http://home.schmorp.de/
540 root 1.1
541     =cut
542    
543     1
544