ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.15
Committed: Fri Jun 6 10:12:44 2008 UTC (15 years, 11 months ago) by elmex
Branch: MAIN
Changes since 1.14: +5 -5 lines
Log Message:
fixed prototypes

File Contents

# Content
1 =head1 NAME
2
3 AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4
5 =head1 SYNOPSIS
6
7 use AnyEvent::HTTP;
8
9 =head1 DESCRIPTION
10
11 This module is an L<AnyEvent> user, you need to make sure that you use and
12 run a supported event loop.
13
14 This module implements a simple, stateless and non-blocking HTTP
15 client. It supports GET, POST and other request methods, cookies and more,
16 all on a very low level. It can follow redirects supports proxies and
17 automatically limits the number of connections to the values specified in
18 the RFC.
19
20 It should generally be a "good client" that is enough for most HTTP
21 tasks. Simple tasks should be simple, but complex tasks should still be
22 possible as the user retains control over request and response headers.
23
24 The caller is responsible for authentication management, cookies (if
25 the simplistic implementation in this module doesn't suffice), referer
26 and other high-level protocol details for which this module offers only
27 limited support.
28
29 =head2 METHODS
30
31 =over 4
32
33 =cut
34
35 package AnyEvent::HTTP;
36
37 use strict;
38 no warnings;
39
40 use Carp;
41
42 use AnyEvent ();
43 use AnyEvent::Util ();
44 use AnyEvent::Socket ();
45 use AnyEvent::Handle ();
46
47 use base Exporter::;
48
49 our $VERSION = '1.1';
50
51 our @EXPORT = qw(http_get http_request);
52
53 our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
54 our $MAX_RECURSE = 10;
55 our $MAX_PERSISTENT = 8;
56 our $PERSISTENT_TIMEOUT = 2;
57 our $TIMEOUT = 300;
58
59 # changing these is evil
60 our $MAX_PERSISTENT_PER_HOST = 2;
61 our $MAX_PER_HOST = 4;
62
63 our $PROXY;
64 our $ACTIVE = 0;
65
66 my %KA_COUNT; # number of open keep-alive connections per host
67 my %CO_SLOT; # number of open connections, and wait queue, per host
68
69 =item http_get $url, key => value..., $cb->($data, $headers)
70
71 Executes an HTTP-GET request. See the http_request function for details on
72 additional parameters.
73
74 =item http_head $url, key => value..., $cb->($data, $headers)
75
76 Executes an HTTP-HEAD request. See the http_request function for details on
77 additional parameters.
78
79 =item http_post $url, $body, key => value..., $cb->($data, $headers)
80
81 Executes an HTTP-POST request with a request body of C<$bod>. See the
82 http_request function for details on additional parameters.
83
84 =item http_request $method => $url, key => value..., $cb->($data, $headers)
85
86 Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
87 must be an absolute http or https URL.
88
89 The callback will be called with the response data as first argument
90 (or C<undef> if it wasn't available due to errors), and a hash-ref with
91 response headers as second argument.
92
93 All the headers in that hash are lowercased. In addition to the response
94 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
95 C<Reason> contain the three parts of the HTTP Status-Line of the same
96 name. If the server sends a header multiple lines, then their contents
97 will be joined together with C<\x00>.
98
99 If an internal error occurs, such as not being able to resolve a hostname,
100 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
101 and the C<Reason> pseudo-header will contain an error message.
102
103 A typical callback might look like this:
104
105 sub {
106 my ($body, $hdr) = @_;
107
108 if ($hdr->{Status} =~ /^2/) {
109 ... everything should be ok
110 } else {
111 print "error, $hdr->{Status} $hdr->{Reason}\n";
112 }
113 }
114
115 Additional parameters are key-value pairs, and are fully optional. They
116 include:
117
118 =over 4
119
120 =item recurse => $count (default: $MAX_RECURSE)
121
122 Whether to recurse requests or not, e.g. on redirects, authentication
123 retries and so on, and how often to do so.
124
125 =item headers => hashref
126
127 The request headers to use. Currently, C<http_request> may provide its
128 own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
129 and will provide defaults for C<User-Agent:> and C<Referer:>.
130
131 =item timeout => $seconds
132
133 The time-out to use for various stages - each connect attempt will reset
134 the timeout, as will read or write activity. Default timeout is 5 minutes.
135
136 =item proxy => [$host, $port[, $scheme]] or undef
137
138 Use the given http proxy for all requests. If not specified, then the
139 default proxy (as specified by C<$ENV{http_proxy}>) is used.
140
141 C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
142 HTTPS.
143
144 =item body => $string
145
146 The request body, usually empty. Will be-sent as-is (future versions of
147 this module might offer more options).
148
149 =item cookie_jar => $hash_ref
150
151 Passing this parameter enables (simplified) cookie-processing, loosely
152 based on the original netscape specification.
153
154 The C<$hash_ref> must be an (initially empty) hash reference which will
155 get updated automatically. It is possible to save the cookie_jar to
156 persistent storage with something like JSON or Storable, but this is not
157 recommended, as expire times are currently being ignored.
158
159 Note that this cookie implementation is not of very high quality, nor
160 meant to be complete. If you want complete cookie management you have to
161 do that on your own. C<cookie_jar> is meant as a quick fix to get some
162 cookie-using sites working. Cookies are a privacy disaster, do not use
163 them unless required to.
164
165 =back
166
167 Example: make a simple HTTP GET request for http://www.nethype.de/
168
169 http_request GET => "http://www.nethype.de/", sub {
170 my ($body, $hdr) = @_;
171 print "$body\n";
172 };
173
174 Example: make a HTTP HEAD request on https://www.google.com/, use a
175 timeout of 30 seconds.
176
177 http_request
178 GET => "https://www.google.com",
179 timeout => 30,
180 sub {
181 my ($body, $hdr) = @_;
182 use Data::Dumper;
183 print Dumper $hdr;
184 }
185 ;
186
187 =cut
188
189 sub _slot_schedule;
190 sub _slot_schedule($) {
191 my $host = shift;
192
193 while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
194 if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
195 # somebody wants that slot
196 ++$CO_SLOT{$host}[0];
197 ++$ACTIVE;
198
199 $cb->(AnyEvent::Util::guard {
200 --$ACTIVE;
201 --$CO_SLOT{$host}[0];
202 _slot_schedule $host;
203 });
204 } else {
205 # nobody wants the slot, maybe we can forget about it
206 delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
207 last;
208 }
209 }
210 }
211
212 # wait for a free slot on host, call callback
213 sub _get_slot($$) {
214 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
215
216 _slot_schedule $_[0];
217 }
218
219 sub http_request($$@) {
220 my $cb = pop;
221 my ($method, $url, %arg) = @_;
222
223 my %hdr;
224
225 $method = uc $method;
226
227 if (my $hdr = $arg{headers}) {
228 while (my ($k, $v) = each %$hdr) {
229 $hdr{lc $k} = $v;
230 }
231 }
232
233 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
234
235 return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
236 if $recurse < 0;
237
238 my $proxy = $arg{proxy} || $PROXY;
239 my $timeout = $arg{timeout} || $TIMEOUT;
240
241 $hdr{"user-agent"} ||= $USERAGENT;
242
243 my ($scheme, $authority, $upath, $query, $fragment) =
244 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
245
246 $scheme = lc $scheme;
247
248 my $uport = $scheme eq "http" ? 80
249 : $scheme eq "https" ? 443
250 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
251
252 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
253
254 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
255 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
256
257 my $uhost = $1;
258 $uport = $2 if defined $2;
259
260 $uhost =~ s/^\[(.*)\]$/$1/;
261 $upath .= "?$query" if length $query;
262
263 $upath =~ s%^/?%/%;
264
265 # cookie processing
266 if (my $jar = $arg{cookie_jar}) {
267 %$jar = () if $jar->{version} < 1;
268
269 my @cookie;
270
271 while (my ($chost, $v) = each %$jar) {
272 next unless $chost eq substr $uhost, -length $chost;
273 next unless $chost =~ /^\./;
274
275 while (my ($cpath, $v) = each %$v) {
276 next unless $cpath eq substr $upath, 0, length $cpath;
277
278 while (my ($k, $v) = each %$v) {
279 next if $scheme ne "https" && exists $v->{secure};
280 push @cookie, "$k=$v->{value}";
281 }
282 }
283 }
284
285 $hdr{cookie} = join "; ", @cookie
286 if @cookie;
287 }
288
289 my ($rhost, $rport, $rpath); # request host, port, path
290
291 if ($proxy) {
292 ($rhost, $rport, $scheme) = @$proxy;
293 $rpath = $url;
294 } else {
295 ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
296 $hdr{host} = $uhost;
297 }
298
299 $hdr{"content-length"} = length $arg{body};
300
301 my %state = (connect_guard => 1);
302
303 _get_slot $uhost, sub {
304 $state{slot_guard} = shift;
305
306 return unless $state{connect_guard};
307
308 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
309 $state{fh} = shift
310 or return $cb->(undef, { Status => 599, Reason => "$!" });
311
312 delete $state{connect_guard}; # reduce memory usage, save a tree
313
314 # get handle
315 $state{handle} = new AnyEvent::Handle
316 fh => $state{fh},
317 ($scheme eq "https" ? (tls => "connect") : ());
318
319 # limit the number of persistent connections
320 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
321 ++$KA_COUNT{$_[1]};
322 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
323 $hdr{connection} = "keep-alive";
324 delete $hdr{connection}; # keep-alive not yet supported
325 } else {
326 delete $hdr{connection};
327 }
328
329 # (re-)configure handle
330 $state{handle}->timeout ($timeout);
331 $state{handle}->on_error (sub {
332 my $errno = "$!";
333 %state = ();
334 $cb->(undef, { Status => 599, Reason => $errno });
335 });
336 $state{handle}->on_eof (sub {
337 %state = ();
338 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
339 });
340
341 # send request
342 $state{handle}->push_write (
343 "$method $rpath HTTP/1.0\015\012"
344 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
345 . "\015\012"
346 . (delete $arg{body})
347 );
348
349 %hdr = (); # reduce memory usage, save a kitten
350
351 # status line
352 $state{handle}->push_read (line => qr/\015?\012/, sub {
353 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
354 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
355
356 my %hdr = ( # response headers
357 HTTPVersion => "\x00$1",
358 Status => "\x00$2",
359 Reason => "\x00$3",
360 );
361
362 # headers, could be optimized a bit
363 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
364 for ("$_[1]\012") {
365 # we support spaces in field names, as lotus domino
366 # creates them.
367 $hdr{lc $1} .= "\x00$2"
368 while /\G
369 ([^:\000-\037]+):
370 [\011\040]*
371 ((?: [^\015\012]+ | \015?\012[\011\040] )*)
372 \015?\012
373 /gxc;
374
375 /\G$/
376 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
377 }
378
379 substr $_, 0, 1, ""
380 for values %hdr;
381
382 my $finish = sub {
383 %state = ();
384
385 # set-cookie processing
386 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
387 for (split /\x00/, $hdr{"set-cookie"}) {
388 my ($cookie, @arg) = split /;\s*/;
389 my ($name, $value) = split /=/, $cookie, 2;
390 my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
391
392 my $cdom = (delete $kv{domain}) || $uhost;
393 my $cpath = (delete $kv{path}) || "/";
394
395 $cdom =~ s/^.?/./; # make sure it starts with a "."
396
397 next if $cdom =~ /\.$/;
398
399 # this is not rfc-like and not netscape-like. go figure.
400 my $ndots = $cdom =~ y/.//;
401 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
402
403 # store it
404 $arg{cookie_jar}{version} = 1;
405 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
406 }
407 }
408
409 if ($_[1]{Status} =~ /^30[12]$/ && $recurse) {
410 # microsoft and other assholes don't give a shit for following standards,
411 # try to support a common form of broken Location header.
412 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
413
414 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
415 } else {
416 $cb->($_[0], $_[1]);
417 }
418 };
419
420 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
421 $finish->(undef, \%hdr);
422 } else {
423 if (exists $hdr{"content-length"}) {
424 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
425 # could cache persistent connection now
426 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
427 # but we don't, due to misdesigns, this is annoyingly complex
428 };
429
430 $finish->($_[1], \%hdr);
431 });
432 } else {
433 # too bad, need to read until we get an error or EOF,
434 # no way to detect winged data.
435 $_[0]->on_error (sub {
436 $finish->($_[0]{rbuf}, \%hdr);
437 });
438 $_[0]->on_eof (undef);
439 $_[0]->on_read (sub { });
440 }
441 }
442 });
443 });
444 }, sub {
445 $timeout
446 };
447 };
448
449 defined wantarray && AnyEvent::Util::guard { %state = () }
450 }
451
452 sub http_get($@) {
453 unshift @_, "GET";
454 &http_request
455 }
456
457 sub http_head($@) {
458 unshift @_, "HEAD";
459 &http_request
460 }
461
462 sub http_post($$@) {
463 unshift @_, "POST", "body";
464 &http_request
465 }
466
467 =back
468
469 =head2 GLOBAL FUNCTIONS AND VARIABLES
470
471 =over 4
472
473 =item AnyEvent::HTTP::set_proxy "proxy-url"
474
475 Sets the default proxy server to use. The proxy-url must begin with a
476 string of the form C<http://host:port> (optionally C<https:...>).
477
478 =item $AnyEvent::HTTP::MAX_RECURSE
479
480 The default value for the C<recurse> request parameter (default: C<10>).
481
482 =item $AnyEvent::HTTP::USERAGENT
483
484 The default value for the C<User-Agent> header (the default is
485 C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
486
487 =item $AnyEvent::HTTP::MAX_PERSISTENT
488
489 The maximum number of persistent connections to keep open (default: 8).
490
491 Not implemented currently.
492
493 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
494
495 The maximum time to cache a persistent connection, in seconds (default: 2).
496
497 Not implemented currently.
498
499 =item $AnyEvent::HTTP::ACTIVE
500
501 The number of active connections. This is not the number of currently
502 running requests, but the number of currently open and non-idle TCP
503 connections. This number of can be useful for load-leveling.
504
505 =back
506
507 =cut
508
509 sub set_proxy($) {
510 $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
511 }
512
513 # initialise proxy from environment
514 set_proxy $ENV{http_proxy};
515
516 =head1 SEE ALSO
517
518 L<AnyEvent>.
519
520 =head1 AUTHOR
521
522 Marc Lehmann <schmorp@schmorp.de>
523 http://home.schmorp.de/
524
525 =cut
526
527 1
528