ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.25
Committed: Mon Jul 21 05:42:07 2008 UTC (15 years, 10 months ago) by root
Branch: MAIN
Changes since 1.24: +15 -5 lines
Log Message:
*** empty log message ***

File Contents

# User Rev Content
1 root 1.1 =head1 NAME
2    
3     AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4    
5     =head1 SYNOPSIS
6    
7     use AnyEvent::HTTP;
8    
9 root 1.17 http_get "http://www.nethype.de/", sub { print $_[1] };
10    
11     # ... do something else here
12    
13 root 1.1 =head1 DESCRIPTION
14    
15     This module is an L<AnyEvent> user, you need to make sure that you use and
16     run a supported event loop.
17    
18 root 1.11 This module implements a simple, stateless and non-blocking HTTP
19     client. It supports GET, POST and other request methods, cookies and more,
20     all on a very low level. It can follow redirects supports proxies and
21     automatically limits the number of connections to the values specified in
22     the RFC.
23    
24     It should generally be a "good client" that is enough for most HTTP
25     tasks. Simple tasks should be simple, but complex tasks should still be
26     possible as the user retains control over request and response headers.
27    
28     The caller is responsible for authentication management, cookies (if
29     the simplistic implementation in this module doesn't suffice), referer
30     and other high-level protocol details for which this module offers only
31     limited support.
32    
33 root 1.1 =head2 METHODS
34    
35     =over 4
36    
37     =cut
38    
39     package AnyEvent::HTTP;
40    
41     use strict;
42     no warnings;
43    
44     use Carp;
45    
46     use AnyEvent ();
47     use AnyEvent::Util ();
48     use AnyEvent::Socket ();
49     use AnyEvent::Handle ();
50    
51     use base Exporter::;
52    
53 root 1.22 our $VERSION = '1.03';
54 root 1.1
55 root 1.17 our @EXPORT = qw(http_get http_post http_head http_request);
56 root 1.1
57     our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
58 root 1.3 our $MAX_RECURSE = 10;
59 root 1.2 our $MAX_PERSISTENT = 8;
60     our $PERSISTENT_TIMEOUT = 2;
61     our $TIMEOUT = 300;
62 root 1.1
63     # changing these is evil
64     our $MAX_PERSISTENT_PER_HOST = 2;
65 root 1.11 our $MAX_PER_HOST = 4;
66 root 1.1
67 root 1.2 our $PROXY;
68 root 1.14 our $ACTIVE = 0;
69 root 1.2
70 root 1.1 my %KA_COUNT; # number of open keep-alive connections per host
71 root 1.11 my %CO_SLOT; # number of open connections, and wait queue, per host
72 root 1.1
73     =item http_get $url, key => value..., $cb->($data, $headers)
74    
75     Executes an HTTP-GET request. See the http_request function for details on
76     additional parameters.
77    
78 root 1.5 =item http_head $url, key => value..., $cb->($data, $headers)
79    
80     Executes an HTTP-HEAD request. See the http_request function for details on
81     additional parameters.
82    
83     =item http_post $url, $body, key => value..., $cb->($data, $headers)
84 root 1.3
85 root 1.7 Executes an HTTP-POST request with a request body of C<$bod>. See the
86 root 1.3 http_request function for details on additional parameters.
87    
88 root 1.1 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89    
90     Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91     must be an absolute http or https URL.
92    
93 root 1.2 The callback will be called with the response data as first argument
94     (or C<undef> if it wasn't available due to errors), and a hash-ref with
95     response headers as second argument.
96    
97 root 1.7 All the headers in that hash are lowercased. In addition to the response
98 root 1.20 headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason>
99     contain the three parts of the HTTP Status-Line of the same name. The
100     pseudo-header C<URL> contains the original URL (which can differ from the
101     requested URL when following redirects).
102    
103     If the server sends a header multiple lines, then their contents will be
104     joined together with C<\x00>.
105 root 1.2
106     If an internal error occurs, such as not being able to resolve a hostname,
107     then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
108     and the C<Reason> pseudo-header will contain an error message.
109    
110 root 1.6 A typical callback might look like this:
111    
112     sub {
113     my ($body, $hdr) = @_;
114    
115     if ($hdr->{Status} =~ /^2/) {
116     ... everything should be ok
117     } else {
118     print "error, $hdr->{Status} $hdr->{Reason}\n";
119     }
120     }
121    
122 root 1.1 Additional parameters are key-value pairs, and are fully optional. They
123     include:
124    
125     =over 4
126    
127 root 1.3 =item recurse => $count (default: $MAX_RECURSE)
128 root 1.1
129     Whether to recurse requests or not, e.g. on redirects, authentication
130 root 1.3 retries and so on, and how often to do so.
131 root 1.1
132     =item headers => hashref
133    
134 root 1.12 The request headers to use. Currently, C<http_request> may provide its
135     own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
136     and will provide defaults for C<User-Agent:> and C<Referer:>.
137 root 1.1
138     =item timeout => $seconds
139    
140     The time-out to use for various stages - each connect attempt will reset
141 root 1.2 the timeout, as will read or write activity. Default timeout is 5 minutes.
142    
143     =item proxy => [$host, $port[, $scheme]] or undef
144    
145     Use the given http proxy for all requests. If not specified, then the
146     default proxy (as specified by C<$ENV{http_proxy}>) is used.
147    
148     C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
149     HTTPS.
150 root 1.1
151 root 1.3 =item body => $string
152    
153     The request body, usually empty. Will be-sent as-is (future versions of
154     this module might offer more options).
155    
156 root 1.10 =item cookie_jar => $hash_ref
157    
158     Passing this parameter enables (simplified) cookie-processing, loosely
159     based on the original netscape specification.
160    
161     The C<$hash_ref> must be an (initially empty) hash reference which will
162     get updated automatically. It is possible to save the cookie_jar to
163     persistent storage with something like JSON or Storable, but this is not
164     recommended, as expire times are currently being ignored.
165    
166     Note that this cookie implementation is not of very high quality, nor
167     meant to be complete. If you want complete cookie management you have to
168     do that on your own. C<cookie_jar> is meant as a quick fix to get some
169     cookie-using sites working. Cookies are a privacy disaster, do not use
170     them unless required to.
171    
172 root 1.1 =back
173    
174 root 1.9 Example: make a simple HTTP GET request for http://www.nethype.de/
175    
176     http_request GET => "http://www.nethype.de/", sub {
177     my ($body, $hdr) = @_;
178     print "$body\n";
179     };
180    
181     Example: make a HTTP HEAD request on https://www.google.com/, use a
182     timeout of 30 seconds.
183    
184     http_request
185     GET => "https://www.google.com",
186     timeout => 30,
187     sub {
188     my ($body, $hdr) = @_;
189     use Data::Dumper;
190     print Dumper $hdr;
191     }
192     ;
193 root 1.1
194     =cut
195    
196 root 1.12 sub _slot_schedule;
197 root 1.11 sub _slot_schedule($) {
198     my $host = shift;
199    
200     while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
201     if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
202 root 1.12 # somebody wants that slot
203 root 1.11 ++$CO_SLOT{$host}[0];
204 root 1.14 ++$ACTIVE;
205 root 1.11
206     $cb->(AnyEvent::Util::guard {
207 root 1.14 --$ACTIVE;
208 root 1.11 --$CO_SLOT{$host}[0];
209     _slot_schedule $host;
210     });
211     } else {
212     # nobody wants the slot, maybe we can forget about it
213     delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
214     last;
215     }
216     }
217     }
218    
219     # wait for a free slot on host, call callback
220     sub _get_slot($$) {
221     push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
222    
223     _slot_schedule $_[0];
224     }
225    
226 elmex 1.15 sub http_request($$@) {
227 root 1.1 my $cb = pop;
228     my ($method, $url, %arg) = @_;
229    
230     my %hdr;
231    
232 root 1.3 $method = uc $method;
233    
234 root 1.8 if (my $hdr = $arg{headers}) {
235 root 1.1 while (my ($k, $v) = each %$hdr) {
236     $hdr{lc $k} = $v;
237     }
238     }
239    
240 root 1.23 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
241 root 1.8
242 elmex 1.19 return $cb->(undef, { Status => 599, Reason => "recursion limit reached", URL => $url })
243 root 1.8 if $recurse < 0;
244    
245 root 1.2 my $proxy = $arg{proxy} || $PROXY;
246 root 1.1 my $timeout = $arg{timeout} || $TIMEOUT;
247    
248     $hdr{"user-agent"} ||= $USERAGENT;
249    
250 root 1.10 my ($scheme, $authority, $upath, $query, $fragment) =
251     $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
252 root 1.2
253 root 1.10 $scheme = lc $scheme;
254 root 1.1
255 root 1.10 my $uport = $scheme eq "http" ? 80
256     : $scheme eq "https" ? 443
257 elmex 1.19 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url });
258 root 1.10
259 root 1.13 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
260    
261 root 1.10 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
262 elmex 1.19 or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url });
263 root 1.10
264     my $uhost = $1;
265     $uport = $2 if defined $2;
266    
267     $uhost =~ s/^\[(.*)\]$/$1/;
268     $upath .= "?$query" if length $query;
269    
270     $upath =~ s%^/?%/%;
271    
272     # cookie processing
273     if (my $jar = $arg{cookie_jar}) {
274     %$jar = () if $jar->{version} < 1;
275    
276     my @cookie;
277    
278     while (my ($chost, $v) = each %$jar) {
279     next unless $chost eq substr $uhost, -length $chost;
280     next unless $chost =~ /^\./;
281    
282     while (my ($cpath, $v) = each %$v) {
283     next unless $cpath eq substr $upath, 0, length $cpath;
284    
285     while (my ($k, $v) = each %$v) {
286     next if $scheme ne "https" && exists $v->{secure};
287     push @cookie, "$k=$v->{value}";
288     }
289     }
290     }
291    
292     $hdr{cookie} = join "; ", @cookie
293     if @cookie;
294     }
295 root 1.1
296 root 1.10 my ($rhost, $rport, $rpath); # request host, port, path
297 root 1.2
298 root 1.10 if ($proxy) {
299     ($rhost, $rport, $scheme) = @$proxy;
300     $rpath = $url;
301     } else {
302     ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
303     $hdr{host} = $uhost;
304 root 1.2 }
305    
306 root 1.10 $hdr{"content-length"} = length $arg{body};
307 root 1.1
308 root 1.11 my %state = (connect_guard => 1);
309    
310     _get_slot $uhost, sub {
311     $state{slot_guard} = shift;
312 root 1.1
313 root 1.11 return unless $state{connect_guard};
314 root 1.1
315 root 1.11 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
316     $state{fh} = shift
317 elmex 1.19 or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url });
318 root 1.11
319     delete $state{connect_guard}; # reduce memory usage, save a tree
320    
321     # get handle
322     $state{handle} = new AnyEvent::Handle
323     fh => $state{fh},
324     ($scheme eq "https" ? (tls => "connect") : ());
325    
326     # limit the number of persistent connections
327     if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
328     ++$KA_COUNT{$_[1]};
329     $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
330     $hdr{connection} = "keep-alive";
331     delete $hdr{connection}; # keep-alive not yet supported
332     } else {
333     delete $hdr{connection};
334     }
335 root 1.1
336 root 1.11 # (re-)configure handle
337     $state{handle}->timeout ($timeout);
338     $state{handle}->on_error (sub {
339 root 1.14 my $errno = "$!";
340 root 1.11 %state = ();
341 elmex 1.19 $cb->(undef, { Status => 599, Reason => $errno, URL => $url });
342 root 1.11 });
343     $state{handle}->on_eof (sub {
344     %state = ();
345 elmex 1.19 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url });
346 root 1.11 });
347 root 1.1
348 root 1.11 # send request
349     $state{handle}->push_write (
350     "$method $rpath HTTP/1.0\015\012"
351     . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
352     . "\015\012"
353     . (delete $arg{body})
354     );
355 root 1.1
356 root 1.11 %hdr = (); # reduce memory usage, save a kitten
357 root 1.1
358 root 1.11 # status line
359     $state{handle}->push_read (line => qr/\015?\012/, sub {
360 root 1.25 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
361 elmex 1.19 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));
362 root 1.11
363     my %hdr = ( # response headers
364     HTTPVersion => "\x00$1",
365     Status => "\x00$2",
366     Reason => "\x00$3",
367 elmex 1.19 URL => "\x00$url"
368 root 1.11 );
369    
370     # headers, could be optimized a bit
371     $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
372     for ("$_[1]\012") {
373     # we support spaces in field names, as lotus domino
374     # creates them.
375     $hdr{lc $1} .= "\x00$2"
376     while /\G
377     ([^:\000-\037]+):
378     [\011\040]*
379     ((?: [^\015\012]+ | \015?\012[\011\040] )*)
380     \015?\012
381     /gxc;
382 root 1.10
383 root 1.11 /\G$/
384 elmex 1.19 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url }));
385 root 1.10 }
386    
387 root 1.11 substr $_, 0, 1, ""
388     for values %hdr;
389    
390     my $finish = sub {
391     %state = ();
392 root 1.10
393 root 1.11 # set-cookie processing
394     if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
395     for (split /\x00/, $hdr{"set-cookie"}) {
396     my ($cookie, @arg) = split /;\s*/;
397     my ($name, $value) = split /=/, $cookie, 2;
398     my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
399    
400     my $cdom = (delete $kv{domain}) || $uhost;
401     my $cpath = (delete $kv{path}) || "/";
402    
403     $cdom =~ s/^.?/./; # make sure it starts with a "."
404    
405     next if $cdom =~ /\.$/;
406    
407     # this is not rfc-like and not netscape-like. go figure.
408     my $ndots = $cdom =~ y/.//;
409     next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
410    
411     # store it
412     $arg{cookie_jar}{version} = 1;
413     $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
414     }
415     }
416 root 1.8
417 root 1.25 # microsoft and other shitheads don't give a shit for following standards,
418     # try to support some common forms of broken Location headers.
419     if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
420     $_[1]{location} =~ s/^\.\/+//;
421    
422     my $url = "$scheme://$uhost:$uport";
423    
424     unless ($_[1]{location} =~ s/^\///) {
425     $url .= $upath;
426     $url =~ s/\/[^\/]*$//;
427     }
428    
429     $_[1]{location} = "$url/$_[1]{location}";
430     }
431 root 1.24
432 root 1.23 if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
433     # apparently, mozilla et al. just change POST to GET here
434     # more research is needed before we do the same
435 root 1.11 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
436 root 1.23 } elsif ($_[1]{Status} == 303 && $recurse) {
437 root 1.24 # even http/1.1 is unlear on how to mutate the method
438     $method = "GET" unless $method eq "HEAD";
439     http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
440     } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
441     http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
442 root 1.11 } else {
443     $cb->($_[0], $_[1]);
444     }
445     };
446 root 1.3
447 root 1.11 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
448     $finish->(undef, \%hdr);
449 root 1.3 } else {
450 root 1.11 if (exists $hdr{"content-length"}) {
451     $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
452     # could cache persistent connection now
453     if ($hdr{connection} =~ /\bkeep-alive\b/i) {
454     # but we don't, due to misdesigns, this is annoyingly complex
455     };
456    
457     $finish->($_[1], \%hdr);
458     });
459     } else {
460     # too bad, need to read until we get an error or EOF,
461     # no way to detect winged data.
462     $_[0]->on_error (sub {
463     $finish->($_[0]{rbuf}, \%hdr);
464     });
465     $_[0]->on_eof (undef);
466     $_[0]->on_read (sub { });
467     }
468 root 1.3 }
469 root 1.11 });
470 root 1.1 });
471 root 1.11 }, sub {
472     $timeout
473     };
474 root 1.1 };
475    
476     defined wantarray && AnyEvent::Util::guard { %state = () }
477     }
478    
479 elmex 1.15 sub http_get($@) {
480 root 1.1 unshift @_, "GET";
481     &http_request
482     }
483    
484 elmex 1.15 sub http_head($@) {
485 root 1.4 unshift @_, "HEAD";
486     &http_request
487     }
488    
489 elmex 1.15 sub http_post($$@) {
490 root 1.22 my $url = shift;
491     unshift @_, "POST", $url, "body";
492 root 1.3 &http_request
493     }
494    
495 root 1.9 =back
496    
497 root 1.2 =head2 GLOBAL FUNCTIONS AND VARIABLES
498 root 1.1
499     =over 4
500    
501 root 1.2 =item AnyEvent::HTTP::set_proxy "proxy-url"
502    
503     Sets the default proxy server to use. The proxy-url must begin with a
504     string of the form C<http://host:port> (optionally C<https:...>).
505    
506 root 1.3 =item $AnyEvent::HTTP::MAX_RECURSE
507 root 1.1
508 root 1.3 The default value for the C<recurse> request parameter (default: C<10>).
509 root 1.1
510     =item $AnyEvent::HTTP::USERAGENT
511    
512     The default value for the C<User-Agent> header (the default is
513     C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
514    
515     =item $AnyEvent::HTTP::MAX_PERSISTENT
516    
517     The maximum number of persistent connections to keep open (default: 8).
518    
519 root 1.3 Not implemented currently.
520    
521 root 1.1 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
522    
523 root 1.2 The maximum time to cache a persistent connection, in seconds (default: 2).
524 root 1.1
525 root 1.3 Not implemented currently.
526    
527 root 1.14 =item $AnyEvent::HTTP::ACTIVE
528    
529     The number of active connections. This is not the number of currently
530     running requests, but the number of currently open and non-idle TCP
531     connections. This number of can be useful for load-leveling.
532    
533 root 1.1 =back
534    
535     =cut
536    
537 root 1.2 sub set_proxy($) {
538     $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
539     }
540    
541     # initialise proxy from environment
542     set_proxy $ENV{http_proxy};
543    
544 root 1.1 =head1 SEE ALSO
545    
546     L<AnyEvent>.
547    
548     =head1 AUTHOR
549    
550 root 1.18 Marc Lehmann <schmorp@schmorp.de>
551     http://home.schmorp.de/
552 root 1.1
553     =cut
554    
555     1
556