ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.13
Committed: Thu Jun 5 16:43:45 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
Changes since 1.12: +3 -3 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 =head1 NAME
2
3 AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4
5 =head1 SYNOPSIS
6
7 use AnyEvent::HTTP;
8
9 =head1 DESCRIPTION
10
11 This module is an L<AnyEvent> user, you need to make sure that you use and
12 run a supported event loop.
13
14 This module implements a simple, stateless and non-blocking HTTP
15 client. It supports GET, POST and other request methods, cookies and more,
16 all on a very low level. It can follow redirects supports proxies and
17 automatically limits the number of connections to the values specified in
18 the RFC.
19
20 It should generally be a "good client" that is enough for most HTTP
21 tasks. Simple tasks should be simple, but complex tasks should still be
22 possible as the user retains control over request and response headers.
23
24 The caller is responsible for authentication management, cookies (if
25 the simplistic implementation in this module doesn't suffice), referer
26 and other high-level protocol details for which this module offers only
27 limited support.
28
29 =head2 METHODS
30
31 =over 4
32
33 =cut
34
35 package AnyEvent::HTTP;
36
37 use strict;
38 no warnings;
39
40 use Carp;
41
42 use AnyEvent ();
43 use AnyEvent::Util ();
44 use AnyEvent::Socket ();
45 use AnyEvent::Handle ();
46
47 use base Exporter::;
48
49 our $VERSION = '1.0';
50
51 our @EXPORT = qw(http_get http_request);
52
53 our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
54 our $MAX_RECURSE = 10;
55 our $MAX_PERSISTENT = 8;
56 our $PERSISTENT_TIMEOUT = 2;
57 our $TIMEOUT = 300;
58
59 # changing these is evil
60 our $MAX_PERSISTENT_PER_HOST = 2;
61 our $MAX_PER_HOST = 4;
62
63 our $PROXY;
64
65 my %KA_COUNT; # number of open keep-alive connections per host
66 my %CO_SLOT; # number of open connections, and wait queue, per host
67
68 =item http_get $url, key => value..., $cb->($data, $headers)
69
70 Executes an HTTP-GET request. See the http_request function for details on
71 additional parameters.
72
73 =item http_head $url, key => value..., $cb->($data, $headers)
74
75 Executes an HTTP-HEAD request. See the http_request function for details on
76 additional parameters.
77
78 =item http_post $url, $body, key => value..., $cb->($data, $headers)
79
80 Executes an HTTP-POST request with a request body of C<$bod>. See the
81 http_request function for details on additional parameters.
82
83 =item http_request $method => $url, key => value..., $cb->($data, $headers)
84
85 Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
86 must be an absolute http or https URL.
87
88 The callback will be called with the response data as first argument
89 (or C<undef> if it wasn't available due to errors), and a hash-ref with
90 response headers as second argument.
91
92 All the headers in that hash are lowercased. In addition to the response
93 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
94 C<Reason> contain the three parts of the HTTP Status-Line of the same
95 name. If the server sends a header multiple lines, then their contents
96 will be joined together with C<\x00>.
97
98 If an internal error occurs, such as not being able to resolve a hostname,
99 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
100 and the C<Reason> pseudo-header will contain an error message.
101
102 A typical callback might look like this:
103
104 sub {
105 my ($body, $hdr) = @_;
106
107 if ($hdr->{Status} =~ /^2/) {
108 ... everything should be ok
109 } else {
110 print "error, $hdr->{Status} $hdr->{Reason}\n";
111 }
112 }
113
114 Additional parameters are key-value pairs, and are fully optional. They
115 include:
116
117 =over 4
118
119 =item recurse => $count (default: $MAX_RECURSE)
120
121 Whether to recurse requests or not, e.g. on redirects, authentication
122 retries and so on, and how often to do so.
123
124 =item headers => hashref
125
126 The request headers to use. Currently, C<http_request> may provide its
127 own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
128 and will provide defaults for C<User-Agent:> and C<Referer:>.
129
130 =item timeout => $seconds
131
132 The time-out to use for various stages - each connect attempt will reset
133 the timeout, as will read or write activity. Default timeout is 5 minutes.
134
135 =item proxy => [$host, $port[, $scheme]] or undef
136
137 Use the given http proxy for all requests. If not specified, then the
138 default proxy (as specified by C<$ENV{http_proxy}>) is used.
139
140 C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
141 HTTPS.
142
143 =item body => $string
144
145 The request body, usually empty. Will be-sent as-is (future versions of
146 this module might offer more options).
147
148 =item cookie_jar => $hash_ref
149
150 Passing this parameter enables (simplified) cookie-processing, loosely
151 based on the original netscape specification.
152
153 The C<$hash_ref> must be an (initially empty) hash reference which will
154 get updated automatically. It is possible to save the cookie_jar to
155 persistent storage with something like JSON or Storable, but this is not
156 recommended, as expire times are currently being ignored.
157
158 Note that this cookie implementation is not of very high quality, nor
159 meant to be complete. If you want complete cookie management you have to
160 do that on your own. C<cookie_jar> is meant as a quick fix to get some
161 cookie-using sites working. Cookies are a privacy disaster, do not use
162 them unless required to.
163
164 =back
165
166 Example: make a simple HTTP GET request for http://www.nethype.de/
167
168 http_request GET => "http://www.nethype.de/", sub {
169 my ($body, $hdr) = @_;
170 print "$body\n";
171 };
172
173 Example: make a HTTP HEAD request on https://www.google.com/, use a
174 timeout of 30 seconds.
175
176 http_request
177 GET => "https://www.google.com",
178 timeout => 30,
179 sub {
180 my ($body, $hdr) = @_;
181 use Data::Dumper;
182 print Dumper $hdr;
183 }
184 ;
185
186 =cut
187
188 sub _slot_schedule;
189 sub _slot_schedule($) {
190 my $host = shift;
191
192 while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
193 if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
194 # somebody wants that slot
195 ++$CO_SLOT{$host}[0];
196
197 $cb->(AnyEvent::Util::guard {
198 --$CO_SLOT{$host}[0];
199 _slot_schedule $host;
200 });
201 } else {
202 # nobody wants the slot, maybe we can forget about it
203 delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
204 last;
205 }
206 }
207 }
208
209 # wait for a free slot on host, call callback
210 sub _get_slot($$) {
211 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
212
213 _slot_schedule $_[0];
214 }
215
216 sub http_request($$$;@) {
217 my $cb = pop;
218 my ($method, $url, %arg) = @_;
219
220 my %hdr;
221
222 $method = uc $method;
223
224 if (my $hdr = $arg{headers}) {
225 while (my ($k, $v) = each %$hdr) {
226 $hdr{lc $k} = $v;
227 }
228 }
229
230 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
231
232 return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
233 if $recurse < 0;
234
235 my $proxy = $arg{proxy} || $PROXY;
236 my $timeout = $arg{timeout} || $TIMEOUT;
237
238 $hdr{"user-agent"} ||= $USERAGENT;
239
240 my ($scheme, $authority, $upath, $query, $fragment) =
241 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
242
243 $scheme = lc $scheme;
244
245 my $uport = $scheme eq "http" ? 80
246 : $scheme eq "https" ? 443
247 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
248
249 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
250
251 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
252 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
253
254 my $uhost = $1;
255 $uport = $2 if defined $2;
256
257 $uhost =~ s/^\[(.*)\]$/$1/;
258 $upath .= "?$query" if length $query;
259
260 $upath =~ s%^/?%/%;
261
262 # cookie processing
263 if (my $jar = $arg{cookie_jar}) {
264 %$jar = () if $jar->{version} < 1;
265
266 my @cookie;
267
268 while (my ($chost, $v) = each %$jar) {
269 next unless $chost eq substr $uhost, -length $chost;
270 next unless $chost =~ /^\./;
271
272 while (my ($cpath, $v) = each %$v) {
273 next unless $cpath eq substr $upath, 0, length $cpath;
274
275 while (my ($k, $v) = each %$v) {
276 next if $scheme ne "https" && exists $v->{secure};
277 push @cookie, "$k=$v->{value}";
278 }
279 }
280 }
281
282 $hdr{cookie} = join "; ", @cookie
283 if @cookie;
284 }
285
286 my ($rhost, $rport, $rpath); # request host, port, path
287
288 if ($proxy) {
289 ($rhost, $rport, $scheme) = @$proxy;
290 $rpath = $url;
291 } else {
292 ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
293 $hdr{host} = $uhost;
294 }
295
296 $hdr{"content-length"} = length $arg{body};
297
298 my %state = (connect_guard => 1);
299
300 _get_slot $uhost, sub {
301 $state{slot_guard} = shift;
302
303 return unless $state{connect_guard};
304
305 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
306 $state{fh} = shift
307 or return $cb->(undef, { Status => 599, Reason => "$!" });
308
309 delete $state{connect_guard}; # reduce memory usage, save a tree
310
311 # get handle
312 $state{handle} = new AnyEvent::Handle
313 fh => $state{fh},
314 ($scheme eq "https" ? (tls => "connect") : ());
315
316 # limit the number of persistent connections
317 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
318 ++$KA_COUNT{$_[1]};
319 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
320 $hdr{connection} = "keep-alive";
321 delete $hdr{connection}; # keep-alive not yet supported
322 } else {
323 delete $hdr{connection};
324 }
325
326 # (re-)configure handle
327 $state{handle}->timeout ($timeout);
328 $state{handle}->on_error (sub {
329 %state = ();
330 $cb->(undef, { Status => 599, Reason => "$!" });
331 });
332 $state{handle}->on_eof (sub {
333 %state = ();
334 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
335 });
336
337 # send request
338 $state{handle}->push_write (
339 "$method $rpath HTTP/1.0\015\012"
340 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
341 . "\015\012"
342 . (delete $arg{body})
343 );
344
345 %hdr = (); # reduce memory usage, save a kitten
346
347 # status line
348 $state{handle}->push_read (line => qr/\015?\012/, sub {
349 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
350 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
351
352 my %hdr = ( # response headers
353 HTTPVersion => "\x00$1",
354 Status => "\x00$2",
355 Reason => "\x00$3",
356 );
357
358 # headers, could be optimized a bit
359 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
360 for ("$_[1]\012") {
361 # we support spaces in field names, as lotus domino
362 # creates them.
363 $hdr{lc $1} .= "\x00$2"
364 while /\G
365 ([^:\000-\037]+):
366 [\011\040]*
367 ((?: [^\015\012]+ | \015?\012[\011\040] )*)
368 \015?\012
369 /gxc;
370
371 /\G$/
372 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
373 }
374
375 substr $_, 0, 1, ""
376 for values %hdr;
377
378 my $finish = sub {
379 %state = ();
380
381 # set-cookie processing
382 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
383 for (split /\x00/, $hdr{"set-cookie"}) {
384 my ($cookie, @arg) = split /;\s*/;
385 my ($name, $value) = split /=/, $cookie, 2;
386 my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
387
388 my $cdom = (delete $kv{domain}) || $uhost;
389 my $cpath = (delete $kv{path}) || "/";
390
391 $cdom =~ s/^.?/./; # make sure it starts with a "."
392
393 next if $cdom =~ /\.$/;
394
395 # this is not rfc-like and not netscape-like. go figure.
396 my $ndots = $cdom =~ y/.//;
397 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
398
399 # store it
400 $arg{cookie_jar}{version} = 1;
401 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
402 }
403 }
404
405 if ($_[1]{Status} =~ /^30[12]$/ && $recurse) {
406 # microsoft and other assholes don't give a shit for following standards,
407 # try to support a common form of broken Location header.
408 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
409
410 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
411 } else {
412 $cb->($_[0], $_[1]);
413 }
414 };
415
416 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
417 $finish->(undef, \%hdr);
418 } else {
419 if (exists $hdr{"content-length"}) {
420 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
421 # could cache persistent connection now
422 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
423 # but we don't, due to misdesigns, this is annoyingly complex
424 };
425
426 $finish->($_[1], \%hdr);
427 });
428 } else {
429 # too bad, need to read until we get an error or EOF,
430 # no way to detect winged data.
431 $_[0]->on_error (sub {
432 $finish->($_[0]{rbuf}, \%hdr);
433 });
434 $_[0]->on_eof (undef);
435 $_[0]->on_read (sub { });
436 }
437 }
438 });
439 });
440 }, sub {
441 $timeout
442 };
443 };
444
445 defined wantarray && AnyEvent::Util::guard { %state = () }
446 }
447
448 sub http_get($$;@) {
449 unshift @_, "GET";
450 &http_request
451 }
452
453 sub http_head($$;@) {
454 unshift @_, "HEAD";
455 &http_request
456 }
457
458 sub http_post($$$;@) {
459 unshift @_, "POST", "body";
460 &http_request
461 }
462
463 =back
464
465 =head2 GLOBAL FUNCTIONS AND VARIABLES
466
467 =over 4
468
469 =item AnyEvent::HTTP::set_proxy "proxy-url"
470
471 Sets the default proxy server to use. The proxy-url must begin with a
472 string of the form C<http://host:port> (optionally C<https:...>).
473
474 =item $AnyEvent::HTTP::MAX_RECURSE
475
476 The default value for the C<recurse> request parameter (default: C<10>).
477
478 =item $AnyEvent::HTTP::USERAGENT
479
480 The default value for the C<User-Agent> header (the default is
481 C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
482
483 =item $AnyEvent::HTTP::MAX_PERSISTENT
484
485 The maximum number of persistent connections to keep open (default: 8).
486
487 Not implemented currently.
488
489 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
490
491 The maximum time to cache a persistent connection, in seconds (default: 2).
492
493 Not implemented currently.
494
495 =back
496
497 =cut
498
499 sub set_proxy($) {
500 $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
501 }
502
503 # initialise proxy from environment
504 set_proxy $ENV{http_proxy};
505
506 =head1 SEE ALSO
507
508 L<AnyEvent>.
509
510 =head1 AUTHOR
511
512 Marc Lehmann <schmorp@schmorp.de>
513 http://home.schmorp.de/
514
515 =cut
516
517 1
518