ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
(Generate patch)

Comparing AnyEvent-HTTP/HTTP.pm (file contents):
Revision 1.11 by root, Thu Jun 5 15:34:00 2008 UTC vs.
Revision 1.25 by root, Mon Jul 21 05:42:07 2008 UTC

3AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client 3AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4 4
5=head1 SYNOPSIS 5=head1 SYNOPSIS
6 6
7 use AnyEvent::HTTP; 7 use AnyEvent::HTTP;
8
9 http_get "http://www.nethype.de/", sub { print $_[1] };
10
11 # ... do something else here
8 12
9=head1 DESCRIPTION 13=head1 DESCRIPTION
10 14
11This module is an L<AnyEvent> user, you need to make sure that you use and 15This module is an L<AnyEvent> user, you need to make sure that you use and
12run a supported event loop. 16run a supported event loop.
44use AnyEvent::Socket (); 48use AnyEvent::Socket ();
45use AnyEvent::Handle (); 49use AnyEvent::Handle ();
46 50
47use base Exporter::; 51use base Exporter::;
48 52
49our $VERSION = '1.0'; 53our $VERSION = '1.03';
50 54
51our @EXPORT = qw(http_get http_request); 55our @EXPORT = qw(http_get http_post http_head http_request);
52 56
53our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; 57our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
54our $MAX_RECURSE = 10; 58our $MAX_RECURSE = 10;
55our $MAX_PERSISTENT = 8; 59our $MAX_PERSISTENT = 8;
56our $PERSISTENT_TIMEOUT = 2; 60our $PERSISTENT_TIMEOUT = 2;
59# changing these is evil 63# changing these is evil
60our $MAX_PERSISTENT_PER_HOST = 2; 64our $MAX_PERSISTENT_PER_HOST = 2;
61our $MAX_PER_HOST = 4; 65our $MAX_PER_HOST = 4;
62 66
63our $PROXY; 67our $PROXY;
68our $ACTIVE = 0;
64 69
65my %KA_COUNT; # number of open keep-alive connections per host 70my %KA_COUNT; # number of open keep-alive connections per host
66my %CO_SLOT; # number of open connections, and wait queue, per host 71my %CO_SLOT; # number of open connections, and wait queue, per host
67 72
68=item http_get $url, key => value..., $cb->($data, $headers) 73=item http_get $url, key => value..., $cb->($data, $headers)
88The callback will be called with the response data as first argument 93The callback will be called with the response data as first argument
89(or C<undef> if it wasn't available due to errors), and a hash-ref with 94(or C<undef> if it wasn't available due to errors), and a hash-ref with
90response headers as second argument. 95response headers as second argument.
91 96
92All the headers in that hash are lowercased. In addition to the response 97All the headers in that hash are lowercased. In addition to the response
93headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and 98headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason>
94C<Reason> contain the three parts of the HTTP Status-Line of the same 99contain the three parts of the HTTP Status-Line of the same name. The
100pseudo-header C<URL> contains the original URL (which can differ from the
101requested URL when following redirects).
102
95name. If the server sends a header multiple lines, then their contents 103If the server sends a header multiple lines, then their contents will be
96will be joined together with C<\x00>. 104joined together with C<\x00>.
97 105
98If an internal error occurs, such as not being able to resolve a hostname, 106If an internal error occurs, such as not being able to resolve a hostname,
99then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599> 107then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
100and the C<Reason> pseudo-header will contain an error message. 108and the C<Reason> pseudo-header will contain an error message.
101 109
121Whether to recurse requests or not, e.g. on redirects, authentication 129Whether to recurse requests or not, e.g. on redirects, authentication
122retries and so on, and how often to do so. 130retries and so on, and how often to do so.
123 131
124=item headers => hashref 132=item headers => hashref
125 133
126The request headers to use. 134The request headers to use. Currently, C<http_request> may provide its
135own C<Host:>, C<Content-Length:>, C<Connection:> and C<Cookie:> headers
136and will provide defaults for C<User-Agent:> and C<Referer:>.
127 137
128=item timeout => $seconds 138=item timeout => $seconds
129 139
130The time-out to use for various stages - each connect attempt will reset 140The time-out to use for various stages - each connect attempt will reset
131the timeout, as will read or write activity. Default timeout is 5 minutes. 141the timeout, as will read or write activity. Default timeout is 5 minutes.
181 } 191 }
182 ; 192 ;
183 193
184=cut 194=cut
185 195
196sub _slot_schedule;
186sub _slot_schedule($) { 197sub _slot_schedule($) {
187 my $host = shift; 198 my $host = shift;
188 199
189 while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) { 200 while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) {
190 if (my $cb = shift @{ $CO_SLOT{$host}[1] }) { 201 if (my $cb = shift @{ $CO_SLOT{$host}[1] }) {
191 # somebody wnats that slot 202 # somebody wants that slot
192 ++$CO_SLOT{$host}[0]; 203 ++$CO_SLOT{$host}[0];
204 ++$ACTIVE;
193 205
194 $cb->(AnyEvent::Util::guard { 206 $cb->(AnyEvent::Util::guard {
207 --$ACTIVE;
195 --$CO_SLOT{$host}[0]; 208 --$CO_SLOT{$host}[0];
196 _slot_schedule $host; 209 _slot_schedule $host;
197 }); 210 });
198 } else { 211 } else {
199 # nobody wants the slot, maybe we can forget about it 212 # nobody wants the slot, maybe we can forget about it
200 delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0]; 213 delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0];
201 warn "$host deleted" unless $CO_SLOT{$host}[0];#d#
202 last; 214 last;
203 } 215 }
204 } 216 }
205} 217}
206 218
209 push @{ $CO_SLOT{$_[0]}[1] }, $_[1]; 221 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
210 222
211 _slot_schedule $_[0]; 223 _slot_schedule $_[0];
212} 224}
213 225
214sub http_request($$$;@) { 226sub http_request($$@) {
215 my $cb = pop; 227 my $cb = pop;
216 my ($method, $url, %arg) = @_; 228 my ($method, $url, %arg) = @_;
217 229
218 my %hdr; 230 my %hdr;
219 231
223 while (my ($k, $v) = each %$hdr) { 235 while (my ($k, $v) = each %$hdr) {
224 $hdr{lc $k} = $v; 236 $hdr{lc $k} = $v;
225 } 237 }
226 } 238 }
227 239
228 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE; 240 my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE;
229 241
230 return $cb->(undef, { Status => 599, Reason => "recursion limit reached" }) 242 return $cb->(undef, { Status => 599, Reason => "recursion limit reached", URL => $url })
231 if $recurse < 0; 243 if $recurse < 0;
232 244
233 my $proxy = $arg{proxy} || $PROXY; 245 my $proxy = $arg{proxy} || $PROXY;
234 my $timeout = $arg{timeout} || $TIMEOUT; 246 my $timeout = $arg{timeout} || $TIMEOUT;
235 247
240 252
241 $scheme = lc $scheme; 253 $scheme = lc $scheme;
242 254
243 my $uport = $scheme eq "http" ? 80 255 my $uport = $scheme eq "http" ? 80
244 : $scheme eq "https" ? 443 256 : $scheme eq "https" ? 443
245 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" }); 257 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url });
258
259 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic
246 260
247 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x 261 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
248 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" }); 262 or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url });
249 263
250 my $uhost = $1; 264 my $uhost = $1;
251 $uport = $2 if defined $2; 265 $uport = $2 if defined $2;
252 266
253 $uhost =~ s/^\[(.*)\]$/$1/; 267 $uhost =~ s/^\[(.*)\]$/$1/;
298 312
299 return unless $state{connect_guard}; 313 return unless $state{connect_guard};
300 314
301 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { 315 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
302 $state{fh} = shift 316 $state{fh} = shift
303 or return $cb->(undef, { Status => 599, Reason => "$!" }); 317 or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url });
304 318
305 delete $state{connect_guard}; # reduce memory usage, save a tree 319 delete $state{connect_guard}; # reduce memory usage, save a tree
306 320
307 # get handle 321 # get handle
308 $state{handle} = new AnyEvent::Handle 322 $state{handle} = new AnyEvent::Handle
320 } 334 }
321 335
322 # (re-)configure handle 336 # (re-)configure handle
323 $state{handle}->timeout ($timeout); 337 $state{handle}->timeout ($timeout);
324 $state{handle}->on_error (sub { 338 $state{handle}->on_error (sub {
339 my $errno = "$!";
325 %state = (); 340 %state = ();
326 $cb->(undef, { Status => 599, Reason => "$!" }); 341 $cb->(undef, { Status => 599, Reason => $errno, URL => $url });
327 }); 342 });
328 $state{handle}->on_eof (sub { 343 $state{handle}->on_eof (sub {
329 %state = (); 344 %state = ();
330 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" }); 345 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url });
331 }); 346 });
332 347
333 # send request 348 # send request
334 $state{handle}->push_write ( 349 $state{handle}->push_write (
335 "$method $rpath HTTP/1.0\015\012" 350 "$method $rpath HTTP/1.0\015\012"
340 355
341 %hdr = (); # reduce memory usage, save a kitten 356 %hdr = (); # reduce memory usage, save a kitten
342 357
343 # status line 358 # status line
344 $state{handle}->push_read (line => qr/\015?\012/, sub { 359 $state{handle}->push_read (line => qr/\015?\012/, sub {
345 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix 360 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
346 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" })); 361 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));
347 362
348 my %hdr = ( # response headers 363 my %hdr = ( # response headers
349 HTTPVersion => "\x00$1", 364 HTTPVersion => "\x00$1",
350 Status => "\x00$2", 365 Status => "\x00$2",
351 Reason => "\x00$3", 366 Reason => "\x00$3",
367 URL => "\x00$url"
352 ); 368 );
353 369
354 # headers, could be optimized a bit 370 # headers, could be optimized a bit
355 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub { 371 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
356 for ("$_[1]\012") { 372 for ("$_[1]\012") {
363 ((?: [^\015\012]+ | \015?\012[\011\040] )*) 379 ((?: [^\015\012]+ | \015?\012[\011\040] )*)
364 \015?\012 380 \015?\012
365 /gxc; 381 /gxc;
366 382
367 /\G$/ 383 /\G$/
368 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" })); 384 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url }));
369 } 385 }
370 386
371 substr $_, 0, 1, "" 387 substr $_, 0, 1, ""
372 for values %hdr; 388 for values %hdr;
373 389
396 $arg{cookie_jar}{version} = 1; 412 $arg{cookie_jar}{version} = 1;
397 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; 413 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
398 } 414 }
399 } 415 }
400 416
401 if ($_[1]{Status} =~ /^x30[12]$/ && $recurse) {
402 # microsoft and other assholes don't give a shit for following standards, 417 # microsoft and other shitheads don't give a shit for following standards,
403 # try to support a common form of broken Location header. 418 # try to support some common forms of broken Location headers.
419 if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
420 $_[1]{location} =~ s/^\.\/+//;
421
404 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%; 422 my $url = "$scheme://$uhost:$uport";
405 423
424 unless ($_[1]{location} =~ s/^\///) {
425 $url .= $upath;
426 $url =~ s/\/[^\/]*$//;
427 }
428
429 $_[1]{location} = "$url/$_[1]{location}";
430 }
431
432 if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
433 # apparently, mozilla et al. just change POST to GET here
434 # more research is needed before we do the same
406 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb); 435 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
436 } elsif ($_[1]{Status} == 303 && $recurse) {
437 # even http/1.1 is unlear on how to mutate the method
438 $method = "GET" unless $method eq "HEAD";
439 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
440 } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
441 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
407 } else { 442 } else {
408 $cb->($_[0], $_[1]); 443 $cb->($_[0], $_[1]);
409 } 444 }
410 }; 445 };
411 446
439 }; 474 };
440 475
441 defined wantarray && AnyEvent::Util::guard { %state = () } 476 defined wantarray && AnyEvent::Util::guard { %state = () }
442} 477}
443 478
444sub http_get($$;@) { 479sub http_get($@) {
445 unshift @_, "GET"; 480 unshift @_, "GET";
446 &http_request 481 &http_request
447} 482}
448 483
449sub http_head($$;@) { 484sub http_head($@) {
450 unshift @_, "HEAD"; 485 unshift @_, "HEAD";
451 &http_request 486 &http_request
452} 487}
453 488
454sub http_post($$$;@) { 489sub http_post($$@) {
490 my $url = shift;
455 unshift @_, "POST", "body"; 491 unshift @_, "POST", $url, "body";
456 &http_request 492 &http_request
457} 493}
458 494
459=back 495=back
460 496
486 522
487The maximum time to cache a persistent connection, in seconds (default: 2). 523The maximum time to cache a persistent connection, in seconds (default: 2).
488 524
489Not implemented currently. 525Not implemented currently.
490 526
527=item $AnyEvent::HTTP::ACTIVE
528
529The number of active connections. This is not the number of currently
530running requests, but the number of currently open and non-idle TCP
531connections. This number of can be useful for load-leveling.
532
491=back 533=back
492 534
493=cut 535=cut
494 536
495sub set_proxy($) { 537sub set_proxy($) {
503 545
504L<AnyEvent>. 546L<AnyEvent>.
505 547
506=head1 AUTHOR 548=head1 AUTHOR
507 549
508 Marc Lehmann <schmorp@schmorp.de> 550 Marc Lehmann <schmorp@schmorp.de>
509 http://home.schmorp.de/ 551 http://home.schmorp.de/
510 552
511=cut 553=cut
512 554
5131 5551
514 556

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines