ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.11
Committed: Thu Jun 5 15:34:00 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
Changes since 1.10: +175 -122 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9     =head1 DESCRIPTION
10    
11     This module is an L<AnyEvent> user, you need to make sure that you use and
12     run a supported event loop.
13    
14 root 1.11 This module implements a simple, stateless and non-blocking HTTP
15     client. It supports GET, POST and other request methods, cookies and more,
16     all on a very low level. It can follow redirects supports proxies and
17     automatically limits the number of connections to the values specified in
18     the RFC.
19    
20     It should generally be a "good client" that is enough for most HTTP
21     tasks. Simple tasks should be simple, but complex tasks should still be
22     possible as the user retains control over request and response headers.
23    
24     The caller is responsible for authentication management, cookies (if
25     the simplistic implementation in this module doesn't suffice), referer
26     and other high-level protocol details for which this module offers only
27     limited support.
28    
29 root 1.1 =head2 METHODS
30    
31     =over 4
32    
33     =cut
34    
35     package AnyEvent::HTTP;
36    
37     use strict;
38     no warnings;
39    
40     use Carp;
41    
42     use AnyEvent ();
43     use AnyEvent::Util ();
44     use AnyEvent::Socket ();
45     use AnyEvent::Handle ();
46    
47     use base Exporter::;
48    
49     our $VERSION = '1.0';
50    
51     our @EXPORT = qw(http_get http_request);
52    
53     our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
54 root 1.3 our $MAX_RECURSE = 10;
55 root 1.2 our $MAX_PERSISTENT = 8;
56     our $PERSISTENT_TIMEOUT = 2;
57     our $TIMEOUT = 300;
58 root 1.1
59     # changing these is evil
60     our $MAX_PERSISTENT_PER_HOST = 2;
61 root 1.11 our $MAX_PER_HOST = 4;
62 root 1.1
63 root 1.2 our $PROXY;
64    
65 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
66 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
67 root 1.1
68     =item http_get $url, key => value..., $cb->($data, $headers)
69    
70     Executes an HTTP-GET request. See the http_request function for details on
71     additional parameters.
72    
73 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-HEAD request. See the http_request function for details on
76     additional parameters.
77    
78     =item http_post $url, $body, key => value..., $cb->($data, $headers)
79 root 1.3
80 root 1.7 Executes an HTTP-POST request with a request body of C<$bod>. See the
81 root 1.3 http_request function for details on additional parameters.
82    
83 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84    
85     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86     must be an absolute http or https URL.
87    
88 root 1.2 The callback will be called with the response data as first argument
89     (or C<undef> if it wasn't available due to errors), and a hash-ref with
90     response headers as second argument.
91    
92 root 1.7 All the headers in that hash are lowercased. In addition to the response
93 root 1.2 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
94     C<Reason> contain the three parts of the HTTP Status-Line of the same
95 root 1.10 name. If the server sends a header multiple lines, then their contents
96     will be joined together with C<\x00>.
97 root 1.2
98     If an internal error occurs, such as not being able to resolve a hostname,
99     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
100     and the C<Reason> pseudo-header will contain an error message.
101    
102 root 1.6 A typical callback might look like this:
103    
104     sub {
105     my ($body, $hdr) = @_;
106    
107     if ($hdr->{Status} =~ /^2/) {
108     ... everything should be ok
109     } else {
110     print "error, $hdr->{Status} $hdr->{Reason}\n";
111     }
112     }
113    
114 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
115     include:
116    
117     =over 4
118    
119 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
120 root 1.1
121     Whether to recurse requests or not, e.g. on redirects, authentication
122 root 1.3 retries and so on, and how often to do so.
123 root 1.1
124     =item headers => hashref
125    
126     The request headers to use.
127    
128     =item timeout => $seconds
129    
130     The time-out to use for various stages - each connect attempt will reset
131 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
132    
133     =item proxy => [$host, $port[, $scheme]] or undef
134    
135     Use the given http proxy for all requests. If not specified, then the
136     default proxy (as specified by C<$ENV{http_proxy}>) is used.
137    
138     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
139     HTTPS.
140 root 1.1
141 root 1.3 =item body => $string
142    
143     The request body, usually empty. Will be-sent as-is (future versions of
144     this module might offer more options).
145    
146 root 1.10 =item cookie_jar => $hash_ref
147    
148     Passing this parameter enables (simplified) cookie-processing, loosely
149     based on the original netscape specification.
150    
151     The C<$hash_ref> must be an (initially empty) hash reference which will
152     get updated automatically. It is possible to save the cookie_jar to
153     persistent storage with something like JSON or Storable, but this is not
154     recommended, as expire times are currently being ignored.
155    
156     Note that this cookie implementation is not of very high quality, nor
157     meant to be complete. If you want complete cookie management you have to
158     do that on your own. C<cookie_jar> is meant as a quick fix to get some
159     cookie-using sites working. Cookies are a privacy disaster, do not use
160     them unless required to.
161    
162 root 1.1 =back
163    
164 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
165    
166     http_request GET => "http://www.nethype.de/", sub {
167     my ($body, $hdr) = @_;
168     print "$body\n";
169     };
170    
171     Example: make a HTTP HEAD request on https://www.google.com/, use a
172     timeout of 30 seconds.
173    
174     http_request
175     GET => "https://www.google.com",
176     timeout => 30,
177     sub {
178     my ($body, $hdr) = @_;
179     use Data::Dumper;
180     print Dumper $hdr;
181     }
182     ;
183 root 1.1
184     =cut
185    
186 root 1.11 sub _slot_schedule($) {
187     my $host = shift;
188    
189     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
190     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
191     # somebody wnats that slot
192     ++$CO_SLOT{$host}[0];
193    
194     $cb->(AnyEvent::Util::guard {
195     --$CO_SLOT{$host}[0];
196     _slot_schedule $host;
197     });
198     } else {
199     # nobody wants the slot, maybe we can forget about it
200     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
201     warn "$host deleted" unless $CO_SLOT{$host}[0];#d#
202     last;
203     }
204     }
205     }
206    
207     # wait for a free slot on host, call callback
208     sub _get_slot($$) {
209     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
210    
211     _slot_schedule $_[0];
212     }
213    
214 root 1.1 sub http_request($$$;@) {
215     my $cb = pop;
216     my ($method, $url, %arg) = @_;
217    
218     my %hdr;
219    
220 root 1.3 $method = uc $method;
221    
222 root 1.8 if (my $hdr = $arg{headers}) {
223 root 1.1 while (my ($k, $v) = each %$hdr) {
224     $hdr{lc $k} = $v;
225     }
226     }
227    
228 root 1.8 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
229    
230     return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
231     if $recurse < 0;
232    
233 root 1.2 my $proxy = $arg{proxy} || $PROXY;
234 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
235    
236     $hdr{"user-agent"} ||= $USERAGENT;
237    
238 root 1.10 my ($scheme, $authority, $upath, $query, $fragment) =
239     $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
240 root 1.2
241 root 1.10 $scheme = lc $scheme;
242 root 1.1
243 root 1.10 my $uport = $scheme eq "http" ? 80
244     : $scheme eq "https" ? 443
245     : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
246    
247     $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
248     or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
249    
250     my $uhost = $1;
251     $uport = $2 if defined $2;
252    
253     $uhost =~ s/^\[(.*)\]$/$1/;
254     $upath .= "?$query" if length $query;
255    
256     $upath =~ s%^/?%/%;
257    
258     # cookie processing
259     if (my $jar = $arg{cookie_jar}) {
260     %$jar = () if $jar->{version} < 1;
261    
262     my @cookie;
263    
264     while (my ($chost, $v) = each %$jar) {
265     next unless $chost eq substr $uhost, -length $chost;
266     next unless $chost =~ /^\./;
267    
268     while (my ($cpath, $v) = each %$v) {
269     next unless $cpath eq substr $upath, 0, length $cpath;
270    
271     while (my ($k, $v) = each %$v) {
272     next if $scheme ne "https" && exists $v->{secure};
273     push @cookie, "$k=$v->{value}";
274     }
275     }
276     }
277    
278     $hdr{cookie} = join "; ", @cookie
279     if @cookie;
280     }
281 root 1.1
282 root 1.10 my ($rhost, $rport, $rpath); # request host, port, path
283 root 1.2
284 root 1.10 if ($proxy) {
285     ($rhost, $rport, $scheme) = @$proxy;
286     $rpath = $url;
287     } else {
288     ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
289     $hdr{host} = $uhost;
290 root 1.2 }
291    
292 root 1.10 $hdr{"content-length"} = length $arg{body};
293 root 1.1
294 root 1.11 my %state = (connect_guard => 1);
295    
296     _get_slot $uhost, sub {
297     $state{slot_guard} = shift;
298 root 1.1
299 root 1.11 return unless $state{connect_guard};
300 root 1.1
301 root 1.11 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
302     $state{fh} = shift
303     or return $cb->(undef, { Status => 599, Reason => "$!" });
304    
305     delete $state{connect_guard}; # reduce memory usage, save a tree
306    
307     # get handle
308     $state{handle} = new AnyEvent::Handle
309     fh => $state{fh},
310     ($scheme eq "https" ? (tls => "connect") : ());
311    
312     # limit the number of persistent connections
313     if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
314     ++$KA_COUNT{$_[1]};
315     $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
316     $hdr{connection} = "keep-alive";
317     delete $hdr{connection}; # keep-alive not yet supported
318     } else {
319     delete $hdr{connection};
320     }
321 root 1.1
322 root 1.11 # (re-)configure handle
323     $state{handle}->timeout ($timeout);
324     $state{handle}->on_error (sub {
325     %state = ();
326     $cb->(undef, { Status => 599, Reason => "$!" });
327     });
328     $state{handle}->on_eof (sub {
329     %state = ();
330     $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
331     });
332 root 1.1
333 root 1.11 # send request
334     $state{handle}->push_write (
335     "$method $rpath HTTP/1.0\015\012"
336     . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
337     . "\015\012"
338     . (delete $arg{body})
339     );
340 root 1.1
341 root 1.11 %hdr = (); # reduce memory usage, save a kitten
342 root 1.1
343 root 1.11 # status line
344     $state{handle}->push_read (line => qr/\015?\012/, sub {
345     $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
346     or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
347    
348     my %hdr = ( # response headers
349     HTTPVersion => "\x00$1",
350     Status => "\x00$2",
351     Reason => "\x00$3",
352     );
353    
354     # headers, could be optimized a bit
355     $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
356     for ("$_[1]\012") {
357     # we support spaces in field names, as lotus domino
358     # creates them.
359     $hdr{lc $1} .= "\x00$2"
360     while /\G
361     ([^:\000-\037]+):
362     [\011\040]*
363     ((?: [^\015\012]+ | \015?\012[\011\040] )*)
364     \015?\012
365     /gxc;
366 root 1.10
367 root 1.11 /\G$/
368     or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
369 root 1.10 }
370    
371 root 1.11 substr $_, 0, 1, ""
372     for values %hdr;
373    
374     my $finish = sub {
375     %state = ();
376 root 1.10
377 root 1.11 # set-cookie processing
378     if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
379     for (split /\x00/, $hdr{"set-cookie"}) {
380     my ($cookie, @arg) = split /;\s*/;
381     my ($name, $value) = split /=/, $cookie, 2;
382     my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
383    
384     my $cdom = (delete $kv{domain}) || $uhost;
385     my $cpath = (delete $kv{path}) || "/";
386    
387     $cdom =~ s/^.?/./; # make sure it starts with a "."
388    
389     next if $cdom =~ /\.$/;
390    
391     # this is not rfc-like and not netscape-like. go figure.
392     my $ndots = $cdom =~ y/.//;
393     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
394    
395     # store it
396     $arg{cookie_jar}{version} = 1;
397     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
398     }
399     }
400 root 1.8
401 root 1.11 if ($_[1]{Status} =~ /^x30[12]$/ && $recurse) {
402     # microsoft and other assholes don't give a shit for following standards,
403     # try to support a common form of broken Location header.
404     $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
405    
406     http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
407     } else {
408     $cb->($_[0], $_[1]);
409     }
410     };
411 root 1.3
412 root 1.11 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
413     $finish->(undef, \%hdr);
414 root 1.3 } else {
415 root 1.11 if (exists $hdr{"content-length"}) {
416     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
417     # could cache persistent connection now
418     if ($hdr{connection} =~ /\bkeep-alive\b/i) {
419     # but we don't, due to misdesigns, this is annoyingly complex
420     };
421    
422     $finish->($_[1], \%hdr);
423     });
424     } else {
425     # too bad, need to read until we get an error or EOF,
426     # no way to detect winged data.
427     $_[0]->on_error (sub {
428     $finish->($_[0]{rbuf}, \%hdr);
429     });
430     $_[0]->on_eof (undef);
431     $_[0]->on_read (sub { });
432     }
433 root 1.3 }
434 root 1.11 });
435 root 1.1 });
436 root 1.11 }, sub {
437     $timeout
438     };
439 root 1.1 };
440    
441     defined wantarray && AnyEvent::Util::guard { %state = () }
442     }
443    
444     sub http_get($$;@) {
445     unshift @_, "GET";
446     &http_request
447     }
448    
449 root 1.4 sub http_head($$;@) {
450     unshift @_, "HEAD";
451     &http_request
452     }
453    
454 root 1.3 sub http_post($$$;@) {
455     unshift @_, "POST", "body";
456     &http_request
457     }
458    
459 root 1.9 =back
460    
461 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
462 root 1.1
463     =over 4
464    
465 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
466    
467     Sets the default proxy server to use. The proxy-url must begin with a
468     string of the form C<http://host:port> (optionally C<https:...>).
469    
470 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
471 root 1.1
472 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
473 root 1.1
474     =item $AnyEvent::HTTP::USERAGENT
475    
476     The default value for the C<User-Agent> header (the default is
477     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
478    
479     =item $AnyEvent::HTTP::MAX_PERSISTENT
480    
481     The maximum number of persistent connections to keep open (default: 8).
482    
483 root 1.3 Not implemented currently.
484    
485 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
486    
487 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
488 root 1.1
489 root 1.3 Not implemented currently.
490    
491 root 1.1 =back
492    
493     =cut
494    
495 root 1.2 sub set_proxy($) {
496     $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
497     }
498    
499     # initialise proxy from environment
500     set_proxy $ENV{http_proxy};
501    
502 root 1.1 =head1 SEE ALSO
503    
504     L<AnyEvent>.
505    
506     =head1 AUTHOR
507    
508     Marc Lehmann <schmorp@schmorp.de>
509     http://home.schmorp.de/
510    
511     =cut
512    
513     1
514