ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
(Generate patch)

Comparing AnyEvent-HTTP/HTTP.pm (file contents):
Revision 1.10 by root, Thu Jun 5 13:06:43 2008 UTC vs.
Revision 1.16 by root, Fri Jun 6 12:57:48 2008 UTC

8 8
9=head1 DESCRIPTION 9=head1 DESCRIPTION
10 10
11This module is an L<AnyEvent> user; you need to make sure that you use and 11This module is an L<AnyEvent> user; you need to make sure that you use and
12run a supported event loop. 12run a supported event loop.
13
14This module implements a simple, stateless and non-blocking HTTP
15client. It supports GET, POST and other request methods, cookies and more,
16all on a very low level. It can follow redirects, supports proxies and
17automatically limits the number of connections to the values specified in
18the RFC.
19
20It should generally be a "good client" that is enough for most HTTP
21tasks. Simple tasks should be simple, but complex tasks should still be
22possible as the user retains control over request and response headers.
23
24The caller is responsible for authentication management, cookies (if
25the simplistic implementation in this module doesn't suffice), referer
26and other high-level protocol details for which this module offers only
27limited support.
13 28
14=head2 METHODS 29=head2 METHODS
15 30
16=over 4 31=over 4
17 32
29use AnyEvent::Socket (); 44use AnyEvent::Socket ();
30use AnyEvent::Handle (); 45use AnyEvent::Handle ();
31 46
32use base Exporter::; 47use base Exporter::;
33 48
34our $VERSION = '1.0'; 49our $VERSION = '1.01';
35 50
36our @EXPORT = qw(http_get http_request); 51our @EXPORT = qw(http_get http_request);
37 52
38our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; 53our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
39our $MAX_RECURSE = 10; 54our $MAX_RECURSE = 10;
41our $PERSISTENT_TIMEOUT = 2; 56our $PERSISTENT_TIMEOUT = 2;
42our $TIMEOUT = 300; 57our $TIMEOUT = 300;
43 58
44# changing these is evil 59# changing these is evil
45our $MAX_PERSISTENT_PER_HOST = 2; 60our $MAX_PERSISTENT_PER_HOST = 2;
46our $MAX_PER_HOST = 4; # not respected yet :( 61our $MAX_PER_HOST = 4;
47 62
48our $PROXY; 63our $PROXY;
64our $ACTIVE = 0;
49 65
50my %KA_COUNT; # number of open keep-alive connections per host 66my %KA_COUNT; # number of open keep-alive connections per host
67my %CO_SLOT; # number of open connections, and wait queue, per host
51 68
52=item http_get $url, key => value..., $cb->($data, $headers) 69=item http_get $url, key => value..., $cb->($data, $headers)
53 70
54Executes an HTTP-GET request. See the http_request function for details on 71Executes an HTTP-GET request. See the http_request function for details on
55additional parameters. 72additional parameters.
105Whether to recurse requests or not, e.g. on redirects, authentication 122Whether to recurse requests or not, e.g. on redirects, authentication
106retries and so on, and how often to do so. 123retries and so on, and how often to do so.
107 124
108=item headers => hashref 125=item headers => hashref
109 126
110The request headers to use. 127The request headers to use. Currently, C<http_request> may provide its
128own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
129and will provide defaults for C<User-Agent:> and C<Referer:>.
111 130
112=item timeout => $seconds 131=item timeout => $seconds
113 132
114The time-out to use for various stages - each connect attempt will reset 133The time-out to use for various stages - each connect attempt will reset
115the timeout, as will read or write activity. Default timeout is 5 minutes. 134the timeout, as will read or write activity. Default timeout is 5 minutes.
165 } 184 }
166 ; 185 ;
167 186
168=cut 187=cut
169 188
189sub _slot_schedule;
190sub _slot_schedule($) {
191 my $host = shift;
192
193 while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
194 if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
195 # somebody wants that slot
196 ++$CO_SLOT{$host}[0];
197 ++$ACTIVE;
198
199 $cb->(AnyEvent::Util::guard {
200 --$ACTIVE;
201 --$CO_SLOT{$host}[0];
202 _slot_schedule $host;
203 });
204 } else {
205 # nobody wants the slot, maybe we can forget about it
206 delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
207 last;
208 }
209 }
210}
211
212# wait for a free slot on host, call callback
213sub _get_slot($$) {
214 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
215
216 _slot_schedule $_[0];
217}
218
170sub http_request($$$;@) { 219sub http_request($$@) {
171 my $cb = pop; 220 my $cb = pop;
172 my ($method, $url, %arg) = @_; 221 my ($method, $url, %arg) = @_;
173 222
174 my %hdr; 223 my %hdr;
175 224
197 $scheme = lc $scheme; 246 $scheme = lc $scheme;
198 247
199 my $uport = $scheme eq "http" ? 80 248 my $uport = $scheme eq "http" ? 80
200 : $scheme eq "https" ? 443 249 : $scheme eq "https" ? 443
201 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" }); 250 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
251
252 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
202 253
203 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x 254 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
204 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" }); 255 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
205 256
206 my $uhost = $1; 257 my $uhost = $1;
245 $hdr{host} = $uhost; 296 $hdr{host} = $uhost;
246 } 297 }
247 298
248 $hdr{"content-length"} = length $arg{body}; 299 $hdr{"content-length"} = length $arg{body};
249 300
250 my %state; 301 my %state = (connect_guard => 1);
251 302
303 _get_slot $uhost, sub {
304 $state{slot_guard} = shift;
305
306 return unless $state{connect_guard};
307
252 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { 308 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
253 $state{fh} = shift 309 $state{fh} = shift
254 or return $cb->(undef, { Status => 599, Reason => "$!" }); 310 or return $cb->(undef, { Status => 599, Reason => "$!" });
255 311
256 delete $state{connect_guard}; # reduce memory usage, save a tree 312 delete $state{connect_guard}; # reduce memory usage, save a tree
257 313
258 # get handle 314 # get handle
259 $state{handle} = new AnyEvent::Handle 315 $state{handle} = new AnyEvent::Handle
260 fh => $state{fh}, 316 fh => $state{fh},
261 ($scheme eq "https" ? (tls => "connect") : ()); 317 ($scheme eq "https" ? (tls => "connect") : ());
262 318
263 # limit the number of persistent connections 319 # limit the number of persistent connections
264 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) { 320 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
265 ++$KA_COUNT{$_[1]}; 321 ++$KA_COUNT{$_[1]};
266 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} }; 322 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
267 $hdr{connection} = "keep-alive"; 323 $hdr{connection} = "keep-alive";
268 delete $hdr{connection}; # keep-alive not yet supported 324 delete $hdr{connection}; # keep-alive not yet supported
269 } else { 325 } else {
270 delete $hdr{connection}; 326 delete $hdr{connection};
271 } 327 }
272 328
273 # (re-)configure handle 329 # (re-)configure handle
274 $state{handle}->timeout ($timeout); 330 $state{handle}->timeout ($timeout);
275 $state{handle}->on_error (sub { 331 $state{handle}->on_error (sub {
332 my $errno = "$!";
276 %state = (); 333 %state = ();
277 $cb->(undef, { Status => 599, Reason => "$!" }); 334 $cb->(undef, { Status => 599, Reason => $errno });
278 }); 335 });
279 $state{handle}->on_eof (sub { 336 $state{handle}->on_eof (sub {
280 %state = (); 337 %state = ();
281 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" }); 338 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
282 }); 339 });
283 340
284 # send request 341 # send request
285 $state{handle}->push_write ( 342 $state{handle}->push_write (
286 "$method $rpath HTTP/1.0\015\012" 343 "$method $rpath HTTP/1.0\015\012"
287 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr) 344 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
288 . "\015\012" 345 . "\015\012"
289 . (delete $arg{body}) 346 . (delete $arg{body})
290 );
291
292 %hdr = (); # reduce memory usage, save a kitten
293
294 # status line
295 $state{handle}->push_read (line => qr/\015?\012/, sub {
296 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
297 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
298
299 my %hdr = ( # response headers
300 HTTPVersion => "\x00$1",
301 Status => "\x00$2",
302 Reason => "\x00$3",
303 ); 347 );
304 348
349 %hdr = (); # reduce memory usage, save a kitten
350
351 # status line
352 $state{handle}->push_read (line => qr/\015?\012/, sub {
353 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
354 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
355
356 my %hdr = ( # response headers
357 HTTPVersion => "\x00$1",
358 Status => "\x00$2",
359 Reason => "\x00$3",
360 );
361
305 # headers, could be optimized a bit 362 # headers, could be optimized a bit
306 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub { 363 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
307 for ("$_[1]\012") { 364 for ("$_[1]\012") {
308 # we support spaces in field names, as lotus domino 365 # we support spaces in field names, as lotus domino
309 # creates them. 366 # creates them.
310 $hdr{lc $1} .= "\x00$2" 367 $hdr{lc $1} .= "\x00$2"
311 while /\G 368 while /\G
312 ([^:\000-\037]+): 369 ([^:\000-\037]+):
313 [\011\040]* 370 [\011\040]*
314 ((?: [^\015\012]+ | \015?\012[\011\040] )*) 371 ((?: [^\015\012]+ | \015?\012[\011\040] )*)
315 \015?\012 372 \015?\012
316 /gxc; 373 /gxc;
317 374
318 /\G$/ 375 /\G$/
319 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" })); 376 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
320 } 377 }
321 378
322 substr $_, 0, 1, "" 379 substr $_, 0, 1, ""
323 for values %hdr; 380 for values %hdr;
324 381
325 my $finish = sub { 382 my $finish = sub {
326 %state = (); 383 %state = ();
327 384
328 # set-cookie processing 385 # set-cookie processing
329 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) { 386 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
330 for (split /\x00/, $hdr{"set-cookie"}) { 387 for (split /\x00/, $hdr{"set-cookie"}) {
331 my ($cookie, @arg) = split /;\s*/; 388 my ($cookie, @arg) = split /;\s*/;
332 my ($name, $value) = split /=/, $cookie, 2; 389 my ($name, $value) = split /=/, $cookie, 2;
333 my %kv = (value => $value, map { split /=/, $_, 2 } @arg); 390 my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
334 391
335 my $cdom = (delete $kv{domain}) || $uhost; 392 my $cdom = (delete $kv{domain}) || $uhost;
336 my $cpath = (delete $kv{path}) || "/"; 393 my $cpath = (delete $kv{path}) || "/";
337 394
338 $cdom =~ s/^.?/./; # make sure it starts with a "." 395 $cdom =~ s/^.?/./; # make sure it starts with a "."
339 396
397 next if $cdom =~ /\.$/;
398
399 # this is not rfc-like and not netscape-like. go figure.
340 my $ndots = $cdom =~ y/.//; 400 my $ndots = $cdom =~ y/.//;
341 next if $ndots < ($cdom =~ /[^.]{3}$/ ? 2 : 3); 401 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
342 402
343 # store it 403 # store it
344 $arg{cookie_jar}{version} = 1; 404 $arg{cookie_jar}{version} = 1;
345 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; 405 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
406 }
407 }
408
409 if ($_[1]{Status} =~ /^30[12]$/ && $recurse) {
410 # microsoft and other assholes don't give a shit for following standards,
411 # try to support a common form of broken Location header.
412 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
413
414 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
415 } else {
416 $cb->($_[0], $_[1]);
417 }
418 };
419
420 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
421 $finish->(undef, \%hdr);
422 } else {
423 if (exists $hdr{"content-length"}) {
424 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
425 # could cache persistent connection now
426 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
427 # but we don't, due to misdesigns, this is annoyingly complex
428 };
429
430 $finish->($_[1], \%hdr);
431 });
432 } else {
433 # too bad, need to read until we get an error or EOF,
434 # no way to detect winged data.
435 $_[0]->on_error (sub {
436 $finish->($_[0]{rbuf}, \%hdr);
437 });
438 $_[0]->on_eof (undef);
439 $_[0]->on_read (sub { });
346 } 440 }
347 } 441 }
348
349 if ($_[1]{Status} =~ /^x30[12]$/ && $recurse) {
350 # microsoft and other assholes don't give a shit for following standards,
351 # try to support a common form of broken Location header.
352 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
353
354 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
355 } else {
356 $cb->($_[0], $_[1]);
357 }
358 }; 442 });
359
360 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
361 $finish->(undef, \%hdr);
362 } else {
363 if (exists $hdr{"content-length"}) {
364 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
365 # could cache persistent connection now
366 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
367 # but we don't, due to misdesigns, this is annoyingly complex
368 };
369
370 $finish->($_[1], \%hdr);
371 });
372 } else {
373 # too bad, need to read until we get an error or EOF,
374 # no way to detect winged data.
375 $_[0]->on_error (sub {
376 $finish->($_[0]{rbuf}, \%hdr);
377 });
378 $_[0]->on_eof (undef);
379 $_[0]->on_read (sub { });
380 }
381 }
382 }); 443 });
444 }, sub {
445 $timeout
383 }); 446 };
384 }, sub {
385 $timeout
386 }; 447 };
387 448
388 defined wantarray && AnyEvent::Util::guard { %state = () } 449 defined wantarray && AnyEvent::Util::guard { %state = () }
389} 450}
390 451
391sub http_get($$;@) { 452sub http_get($@) {
392 unshift @_, "GET"; 453 unshift @_, "GET";
393 &http_request 454 &http_request
394} 455}
395 456
396sub http_head($$;@) { 457sub http_head($@) {
397 unshift @_, "HEAD"; 458 unshift @_, "HEAD";
398 &http_request 459 &http_request
399} 460}
400 461
401sub http_post($$$;@) { 462sub http_post($$@) {
402 unshift @_, "POST", "body"; 463 unshift @_, "POST", "body";
403 &http_request 464 &http_request
404} 465}
405 466
406=back 467=back
433 494
434The maximum time to cache a persistent connection, in seconds (default: 2). 495The maximum time to cache a persistent connection, in seconds (default: 2).
435 496
436Not implemented currently. 497Not implemented currently.
437 498
499=item $AnyEvent::HTTP::ACTIVE
500
501The number of active connections. This is not the number of currently
502running requests, but the number of currently open and non-idle TCP
503connections. This number can be useful for load-leveling.
504
438=back 505=back
439 506
440=cut 507=cut
441 508
442sub set_proxy($) { 509sub set_proxy($) {

Diff Legend

- Removed lines
+ Added lines
< Changed lines
> Changed lines