--- AnyEvent-HTTP/HTTP.pm 2008/06/05 15:34:00 1.11 +++ AnyEvent-HTTP/HTTP.pm 2008/09/29 13:50:39 1.28 @@ -6,6 +6,10 @@ use AnyEvent::HTTP; + http_get "http://www.nethype.de/", sub { print $_[1] }; + + # ... do something else here + =head1 DESCRIPTION This module is an L user, you need to make sure that you use and @@ -46,9 +50,9 @@ use base Exporter::; -our $VERSION = '1.0'; +our $VERSION = '1.05'; -our @EXPORT = qw(http_get http_request); +our @EXPORT = qw(http_get http_post http_head http_request); our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; our $MAX_RECURSE = 10; @@ -61,6 +65,7 @@ our $MAX_PER_HOST = 4; our $PROXY; +our $ACTIVE = 0; my %KA_COUNT; # number of open keep-alive connections per host my %CO_SLOT; # number of open connections, and wait queue, per host @@ -77,7 +82,7 @@ =item http_post $url, $body, key => value..., $cb->($data, $headers) -Executes an HTTP-POST request with a request body of C<$bod>. See the +Executes an HTTP-POST request with a request body of C<$body>. See the http_request function for details on additional parameters. =item http_request $method => $url, key => value..., $cb->($data, $headers) @@ -90,10 +95,13 @@ response headers as second argument. All the headers in that hash are lowercased. In addition to the response -headers, the three "pseudo-headers" C, C and -C contain the three parts of the HTTP Status-Line of the same -name. If the server sends a header multiple lines, then their contents -will be joined together with C<\x00>. +headers, the "pseudo-headers" C, C and C +contain the three parts of the HTTP Status-Line of the same name. The +pseudo-header C contains the original URL (which can differ from the +requested URL when following redirects). + +If the server sends a header multiple lines, then their contents will be +joined together with C<\x00>. 
If an internal error occurs, such as not being able to resolve a hostname, then C<$data> will be C, C<< $headers->{Status} >> will be C<599> @@ -123,7 +131,9 @@ =item headers => hashref -The request headers to use. +The request headers to use. Currently, C may provide its +own C, C, C and C headers +and will provide defaults for C and C. =item timeout => $seconds @@ -183,22 +193,24 @@ =cut +sub _slot_schedule; sub _slot_schedule($) { my $host = shift; while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) { if (my $cb = shift @{ $CO_SLOT{$host}[1] }) { - # somebody wnats that slot + # somebody wants that slot ++$CO_SLOT{$host}[0]; + ++$ACTIVE; $cb->(AnyEvent::Util::guard { + --$ACTIVE; --$CO_SLOT{$host}[0]; _slot_schedule $host; }); } else { # nobody wants the slot, maybe we can forget about it delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0]; - warn "$host deleted" unless $CO_SLOT{$host}[0];#d# last; } } @@ -211,7 +223,7 @@ _slot_schedule $_[0]; } -sub http_request($$$;@) { +sub http_request($$@) { my $cb = pop; my ($method, $url, %arg) = @_; @@ -225,9 +237,9 @@ } } - my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE; + my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE; - return $cb->(undef, { Status => 599, Reason => "recursion limit reached" }) + return $cb->(undef, { Status => 599, Reason => "recursion limit reached", URL => $url }) if $recurse < 0; my $proxy = $arg{proxy} || $PROXY; @@ -242,10 +254,12 @@ my $uport = $scheme eq "http" ? 80 : $scheme eq "https" ? 443 - : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" }); + : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url }); + + $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic $authority =~ /^(?: .*\@ )? 
([^\@:]+) (?: : (\d+) )?$/x - or return $cb->(undef, { Status => 599, Reason => "unparsable URL" }); + or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url }); my $uhost = $1; $uport = $2 if defined $2; @@ -300,7 +314,7 @@ $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { $state{fh} = shift - or return $cb->(undef, { Status => 599, Reason => "$!" }); + or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url }); delete $state{connect_guard}; # reduce memory usage, save a tree @@ -322,12 +336,13 @@ # (re-)configure handle $state{handle}->timeout ($timeout); $state{handle}->on_error (sub { + my $errno = "$!"; %state = (); - $cb->(undef, { Status => 599, Reason => "$!" }); + $cb->(undef, { Status => 599, Reason => $errno, URL => $url }); }); $state{handle}->on_eof (sub { %state = (); - $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" }); + $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url }); }); # send request @@ -342,13 +357,14 @@ # status line $state{handle}->push_read (line => qr/\015?\012/, sub { - $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix - or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" })); + $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix + or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url })); my %hdr = ( # response headers HTTPVersion => "\x00$1", Status => "\x00$2", Reason => "\x00$3", + URL => "\x00$url" ); # headers, could be optimized a bit @@ -365,7 +381,7 @@ /gxc; /\G$/ - or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" })); + or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url })); } substr $_, 0, 1, "" @@ -384,7 +400,7 @@ my $cdom = (delete $kv{domain}) || $uhost; my $cpath = (delete $kv{path}) 
|| "/"; - $cdom =~ s/^.?/./; # make sure it starts with a "." + $cdom =~ s/^\.?/./; # make sure it starts with a "." next if $cdom =~ /\.$/; @@ -398,12 +414,31 @@ } } - if ($_[1]{Status} =~ /^x30[12]$/ && $recurse) { - # microsoft and other assholes don't give a shit for following standards, - # try to support a common form of broken Location header. - $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%; + # microsoft and other shitheads don't give a shit for following standards, + # try to support some common forms of broken Location headers. + if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) { + $_[1]{location} =~ s/^\.\/+//; + + my $url = "$scheme://$uhost:$uport"; + + unless ($_[1]{location} =~ s/^\///) { + $url .= $upath; + $url =~ s/\/[^\/]*$//; + } + $_[1]{location} = "$url/$_[1]{location}"; + } + + if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") { + # apparently, mozilla et al. just change POST to GET here + # more research is needed before we do the same http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } elsif ($_[1]{Status} == 303 && $recurse) { + # even http/1.1 is unlear on how to mutate the method + $method = "GET" unless $method eq "HEAD"; + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) { + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); } else { $cb->($_[0], $_[1]); } @@ -441,18 +476,19 @@ defined wantarray && AnyEvent::Util::guard { %state = () } } -sub http_get($$;@) { +sub http_get($@) { unshift @_, "GET"; &http_request } -sub http_head($$;@) { +sub http_head($@) { unshift @_, "HEAD"; &http_request } -sub http_post($$$;@) { - unshift @_, "POST", "body"; +sub http_post($$@) { + my $url = shift; + unshift @_, "POST", $url, "body"; &http_request } @@ -488,6 +524,12 @@ Not implemented currently. +=item $AnyEvent::HTTP::ACTIVE + +The number of active connections. 
This is not the number of currently +running requests, but the number of currently open and non-idle TCP +connections. This number can be useful for load-leveling. + =back =cut @@ -505,8 +547,8 @@ =head1 AUTHOR - Marc Lehmann - http://home.schmorp.de/ + Marc Lehmann + http://home.schmorp.de/ =cut