ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.17
Committed: Fri Jun 6 13:02:38 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
Changes since 1.16: +5 -1 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 =head1 NAME
2
3 AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4
5 =head1 SYNOPSIS
6
7 use AnyEvent::HTTP;
8
9 http_get "http://www.nethype.de/", sub { print $_[1] };
10
11 # ... do something else here
12
13 =head1 DESCRIPTION
14
15 This module is an L<AnyEvent> user, you need to make sure that you use and
16 run a supported event loop.
17
18 This module implements a simple, stateless and non-blocking HTTP
19 client. It supports GET, POST and other request methods, cookies and more,
20 all on a very low level. It can follow redirects supports proxies and
21 automatically limits the number of connections to the values specified in
22 the RFC.
23
24 It should generally be a "good client" that is enough for most HTTP
25 tasks. Simple tasks should be simple, but complex tasks should still be
26 possible as the user retains control over request and response headers.
27
28 The caller is responsible for authentication management, cookies (if
29 the simplistic implementation in this module doesn't suffice), referer
30 and other high-level protocol details for which this module offers only
31 limited support.
32
33 =head2 METHODS
34
35 =over 4
36
37 =cut
38
39 package AnyEvent::HTTP;
40
41 use strict;
42 no warnings;
43
44 use Carp;
45
46 use AnyEvent ();
47 use AnyEvent::Util ();
48 use AnyEvent::Socket ();
49 use AnyEvent::Handle ();
50
51 use base Exporter::;
52
53 our $VERSION = '1.01';
54
55 our @EXPORT = qw(http_get http_post http_head http_request);
56
57 our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
58 our $MAX_RECURSE = 10;
59 our $MAX_PERSISTENT = 8;
60 our $PERSISTENT_TIMEOUT = 2;
61 our $TIMEOUT = 300;
62
63 # changing these is evil
64 our $MAX_PERSISTENT_PER_HOST = 2;
65 our $MAX_PER_HOST = 4;
66
67 our $PROXY;
68 our $ACTIVE = 0;
69
70 my %KA_COUNT; # number of open keep-alive connections per host
71 my %CO_SLOT; # number of open connections, and wait queue, per host
72
73 =item http_get $url, key => value..., $cb->($data, $headers)
74
75 Executes an HTTP-GET request. See the http_request function for details on
76 additional parameters.
77
78 =item http_head $url, key => value..., $cb->($data, $headers)
79
80 Executes an HTTP-HEAD request. See the http_request function for details on
81 additional parameters.
82
83 =item http_post $url, $body, key => value..., $cb->($data, $headers)
84
85 Executes an HTTP-POST request with a request body of C<$bod>. See the
86 http_request function for details on additional parameters.
87
88 =item http_request $method => $url, key => value..., $cb->($data, $headers)
89
90 Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
91 must be an absolute http or https URL.
92
93 The callback will be called with the response data as first argument
94 (or C<undef> if it wasn't available due to errors), and a hash-ref with
95 response headers as second argument.
96
97 All the headers in that hash are lowercased. In addition to the response
98 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
99 C<Reason> contain the three parts of the HTTP Status-Line of the same
100 name. If the server sends a header multiple lines, then their contents
101 will be joined together with C<\x00>.
102
103 If an internal error occurs, such as not being able to resolve a hostname,
104 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
105 and the C<Reason> pseudo-header will contain an error message.
106
107 A typical callback might look like this:
108
109 sub {
110 my ($body, $hdr) = @_;
111
112 if ($hdr->{Status} =~ /^2/) {
113 ... everything should be ok
114 } else {
115 print "error, $hdr->{Status} $hdr->{Reason}\n";
116 }
117 }
118
119 Additional parameters are key-value pairs, and are fully optional. They
120 include:
121
122 =over 4
123
124 =item recurse => $count (default: $MAX_RECURSE)
125
126 Whether to recurse requests or not, e.g. on redirects, authentication
127 retries and so on, and how often to do so.
128
129 =item headers => hashref
130
131 The request headers to use. Currently, C<http_request> may provide its
132 own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
133 and will provide defaults for C<User-Agent:> and C<Referer:>.
134
135 =item timeout => $seconds
136
137 The time-out to use for various stages - each connect attempt will reset
138 the timeout, as will read or write activity. Default timeout is 5 minutes.
139
140 =item proxy => [$host, $port[, $scheme]] or undef
141
142 Use the given http proxy for all requests. If not specified, then the
143 default proxy (as specified by C<$ENV{http_proxy}>) is used.
144
145 C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
146 HTTPS.
147
148 =item body => $string
149
150 The request body, usually empty. Will be-sent as-is (future versions of
151 this module might offer more options).
152
153 =item cookie_jar => $hash_ref
154
155 Passing this parameter enables (simplified) cookie-processing, loosely
156 based on the original netscape specification.
157
158 The C<$hash_ref> must be an (initially empty) hash reference which will
159 get updated automatically. It is possible to save the cookie_jar to
160 persistent storage with something like JSON or Storable, but this is not
161 recommended, as expire times are currently being ignored.
162
163 Note that this cookie implementation is not of very high quality, nor
164 meant to be complete. If you want complete cookie management you have to
165 do that on your own. C<cookie_jar> is meant as a quick fix to get some
166 cookie-using sites working. Cookies are a privacy disaster, do not use
167 them unless required to.
168
169 =back
170
171 Example: make a simple HTTP GET request for http://www.nethype.de/
172
173 http_request GET => "http://www.nethype.de/", sub {
174 my ($body, $hdr) = @_;
175 print "$body\n";
176 };
177
178 Example: make a HTTP HEAD request on https://www.google.com/, use a
179 timeout of 30 seconds.
180
181 http_request
182 GET => "https://www.google.com",
183 timeout => 30,
184 sub {
185 my ($body, $hdr) = @_;
186 use Data::Dumper;
187 print Dumper $hdr;
188 }
189 ;
190
191 =cut
192
193 sub _slot_schedule;
194 sub _slot_schedule($) {
195 my $host = shift;
196
197 while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
198 if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
199 # somebody wants that slot
200 ++$CO_SLOT{$host}[0];
201 ++$ACTIVE;
202
203 $cb->(AnyEvent::Util::guard {
204 --$ACTIVE;
205 --$CO_SLOT{$host}[0];
206 _slot_schedule $host;
207 });
208 } else {
209 # nobody wants the slot, maybe we can forget about it
210 delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
211 last;
212 }
213 }
214 }
215
216 # wait for a free slot on host, call callback
217 sub _get_slot($$) {
218 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
219
220 _slot_schedule $_[0];
221 }
222
223 sub http_request($$@) {
224 my $cb = pop;
225 my ($method, $url, %arg) = @_;
226
227 my %hdr;
228
229 $method = uc $method;
230
231 if (my $hdr = $arg{headers}) {
232 while (my ($k, $v) = each %$hdr) {
233 $hdr{lc $k} = $v;
234 }
235 }
236
237 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
238
239 return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
240 if $recurse < 0;
241
242 my $proxy = $arg{proxy} || $PROXY;
243 my $timeout = $arg{timeout} || $TIMEOUT;
244
245 $hdr{"user-agent"} ||= $USERAGENT;
246
247 my ($scheme, $authority, $upath, $query, $fragment) =
248 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
249
250 $scheme = lc $scheme;
251
252 my $uport = $scheme eq "http" ? 80
253 : $scheme eq "https" ? 443
254 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
255
256 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
257
258 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
259 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
260
261 my $uhost = $1;
262 $uport = $2 if defined $2;
263
264 $uhost =~ s/^\[(.*)\]$/$1/;
265 $upath .= "?$query" if length $query;
266
267 $upath =~ s%^/?%/%;
268
269 # cookie processing
270 if (my $jar = $arg{cookie_jar}) {
271 %$jar = () if $jar->{version} < 1;
272
273 my @cookie;
274
275 while (my ($chost, $v) = each %$jar) {
276 next unless $chost eq substr $uhost, -length $chost;
277 next unless $chost =~ /^\./;
278
279 while (my ($cpath, $v) = each %$v) {
280 next unless $cpath eq substr $upath, 0, length $cpath;
281
282 while (my ($k, $v) = each %$v) {
283 next if $scheme ne "https" && exists $v->{secure};
284 push @cookie, "$k=$v->{value}";
285 }
286 }
287 }
288
289 $hdr{cookie} = join "; ", @cookie
290 if @cookie;
291 }
292
293 my ($rhost, $rport, $rpath); # request host, port, path
294
295 if ($proxy) {
296 ($rhost, $rport, $scheme) = @$proxy;
297 $rpath = $url;
298 } else {
299 ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
300 $hdr{host} = $uhost;
301 }
302
303 $hdr{"content-length"} = length $arg{body};
304
305 my %state = (connect_guard => 1);
306
307 _get_slot $uhost, sub {
308 $state{slot_guard} = shift;
309
310 return unless $state{connect_guard};
311
312 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
313 $state{fh} = shift
314 or return $cb->(undef, { Status => 599, Reason => "$!" });
315
316 delete $state{connect_guard}; # reduce memory usage, save a tree
317
318 # get handle
319 $state{handle} = new AnyEvent::Handle
320 fh => $state{fh},
321 ($scheme eq "https" ? (tls => "connect") : ());
322
323 # limit the number of persistent connections
324 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
325 ++$KA_COUNT{$_[1]};
326 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
327 $hdr{connection} = "keep-alive";
328 delete $hdr{connection}; # keep-alive not yet supported
329 } else {
330 delete $hdr{connection};
331 }
332
333 # (re-)configure handle
334 $state{handle}->timeout ($timeout);
335 $state{handle}->on_error (sub {
336 my $errno = "$!";
337 %state = ();
338 $cb->(undef, { Status => 599, Reason => $errno });
339 });
340 $state{handle}->on_eof (sub {
341 %state = ();
342 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
343 });
344
345 # send request
346 $state{handle}->push_write (
347 "$method $rpath HTTP/1.0\015\012"
348 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
349 . "\015\012"
350 . (delete $arg{body})
351 );
352
353 %hdr = (); # reduce memory usage, save a kitten
354
355 # status line
356 $state{handle}->push_read (line => qr/\015?\012/, sub {
357 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
358 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
359
360 my %hdr = ( # response headers
361 HTTPVersion => "\x00$1",
362 Status => "\x00$2",
363 Reason => "\x00$3",
364 );
365
366 # headers, could be optimized a bit
367 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
368 for ("$_[1]\012") {
369 # we support spaces in field names, as lotus domino
370 # creates them.
371 $hdr{lc $1} .= "\x00$2"
372 while /\G
373 ([^:\000-\037]+):
374 [\011\040]*
375 ((?: [^\015\012]+ | \015?\012[\011\040] )*)
376 \015?\012
377 /gxc;
378
379 /\G$/
380 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
381 }
382
383 substr $_, 0, 1, ""
384 for values %hdr;
385
386 my $finish = sub {
387 %state = ();
388
389 # set-cookie processing
390 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
391 for (split /\x00/, $hdr{"set-cookie"}) {
392 my ($cookie, @arg) = split /;\s*/;
393 my ($name, $value) = split /=/, $cookie, 2;
394 my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
395
396 my $cdom = (delete $kv{domain}) || $uhost;
397 my $cpath = (delete $kv{path}) || "/";
398
399 $cdom =~ s/^.?/./; # make sure it starts with a "."
400
401 next if $cdom =~ /\.$/;
402
403 # this is not rfc-like and not netscape-like. go figure.
404 my $ndots = $cdom =~ y/.//;
405 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
406
407 # store it
408 $arg{cookie_jar}{version} = 1;
409 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
410 }
411 }
412
413 if ($_[1]{Status} =~ /^30[12]$/ && $recurse) {
414 # microsoft and other assholes don't give a shit for following standards,
415 # try to support a common form of broken Location header.
416 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
417
418 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
419 } else {
420 $cb->($_[0], $_[1]);
421 }
422 };
423
424 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
425 $finish->(undef, \%hdr);
426 } else {
427 if (exists $hdr{"content-length"}) {
428 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
429 # could cache persistent connection now
430 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
431 # but we don't, due to misdesigns, this is annoyingly complex
432 };
433
434 $finish->($_[1], \%hdr);
435 });
436 } else {
437 # too bad, need to read until we get an error or EOF,
438 # no way to detect winged data.
439 $_[0]->on_error (sub {
440 $finish->($_[0]{rbuf}, \%hdr);
441 });
442 $_[0]->on_eof (undef);
443 $_[0]->on_read (sub { });
444 }
445 }
446 });
447 });
448 }, sub {
449 $timeout
450 };
451 };
452
453 defined wantarray && AnyEvent::Util::guard { %state = () }
454 }
455
456 sub http_get($@) {
457 unshift @_, "GET";
458 &http_request
459 }
460
461 sub http_head($@) {
462 unshift @_, "HEAD";
463 &http_request
464 }
465
466 sub http_post($$@) {
467 unshift @_, "POST", "body";
468 &http_request
469 }
470
471 =back
472
473 =head2 GLOBAL FUNCTIONS AND VARIABLES
474
475 =over 4
476
477 =item AnyEvent::HTTP::set_proxy "proxy-url"
478
479 Sets the default proxy server to use. The proxy-url must begin with a
480 string of the form C<http://host:port> (optionally C<https:...>).
481
482 =item $AnyEvent::HTTP::MAX_RECURSE
483
484 The default value for the C<recurse> request parameter (default: C<10>).
485
486 =item $AnyEvent::HTTP::USERAGENT
487
488 The default value for the C<User-Agent> header (the default is
489 C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
490
491 =item $AnyEvent::HTTP::MAX_PERSISTENT
492
493 The maximum number of persistent connections to keep open (default: 8).
494
495 Not implemented currently.
496
497 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
498
499 The maximum time to cache a persistent connection, in seconds (default: 2).
500
501 Not implemented currently.
502
503 =item $AnyEvent::HTTP::ACTIVE
504
505 The number of active connections. This is not the number of currently
506 running requests, but the number of currently open and non-idle TCP
507 connections. This number of can be useful for load-leveling.
508
509 =back
510
511 =cut
512
513 sub set_proxy($) {
514 $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
515 }
516
517 # initialise proxy from environment
518 set_proxy $ENV{http_proxy};
519
520 =head1 SEE ALSO
521
522 L<AnyEvent>.
523
524 =head1 AUTHOR
525
526 Marc Lehmann <schmorp@schmorp.de>
527 http://home.schmorp.de/
528
529 =cut
530
531 1
532