--- AnyEvent-HTTP/HTTP.pm 2009/08/05 16:23:48 1.49 +++ AnyEvent-HTTP/HTTP.pm 2010/06/16 19:17:30 1.55 @@ -43,14 +43,14 @@ use Errno (); -use AnyEvent 4.8 (); +use AnyEvent 5.0 (); use AnyEvent::Util (); use AnyEvent::Socket (); use AnyEvent::Handle (); use base Exporter::; -our $VERSION = '1.42'; +our $VERSION = '1.45'; our @EXPORT = qw(http_get http_post http_head http_request); @@ -101,10 +101,22 @@ second argument. All the headers in that hash are lowercased. In addition to the response -headers, the "pseudo-headers" C, C and C -contain the three parts of the HTTP Status-Line of the same name. The -pseudo-header C contains the original URL (which can differ from the -requested URL when following redirects). +headers, the "pseudo-headers" (uppercase to avoid clashing with possible +response headers) C, C and C contain the +three parts of the HTTP Status-Line of the same name. + +The pseudo-header C contains the actual URL (which can differ from +the requested URL when following redirects - for example, you might get +an error that your URL scheme is not supported even though your URL is a +valid http URL because it redirected to an ftp URL, in which case you can +look at the URL pseudo header). + +The pseudo-header C only exists when the request was a result +of an internal redirect. In that case it is an array reference with +the C<($data, $headers)> from the redirect response. Note that this +response could in turn be the result of a redirect itself, and C<< +$headers->{Redirect}[1]{Redirect} >> will then contain the original +response, and so on. If the server sends a header multiple times, then their contents will be joined together with a comma (C<,>), as per the HTTP spec. @@ -147,7 +159,10 @@ =item timeout => $seconds The time-out to use for various stages - each connect attempt will reset -the timeout, as will read or write activity. Default timeout is 5 minutes. +the timeout, as will read or write activity, i.e. this is not an overall +timeout. + +Default timeout is 5 minutes. =item proxy => [$host, $port[, $scheme]] or undef @@ -190,6 +205,15 @@ The default for this option is C, which could be interpreted as "give me the page, no matter what". +=item on_prepare => $callback->($fh) + +In rare cases you need to "tune" the socket before it is used to +connect (for exmaple, to bind it on a given IP address). This parameter +overrides the prepare callback passed to C +and behaves exactly the same way (e.g. it has to provide a +timeout). See the description for the C<$prepare_cb> argument of +C for details. + =item on_header => $callback->($headers) When specified, this callback will be called with the header hash as soon @@ -339,9 +363,13 @@ } } + # pseudo headers for all subsequent responses + my @pseudo = (URL => $url); + push @pseudo, Redirect => delete $arg{Redirect} if exists $arg{Redirect}; + my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE; - return $cb->(undef, { Status => 599, Reason => "Too many redirections", URL => $url }) + return $cb->(undef, { Status => 599, Reason => "Too many redirections", @pseudo }) if $recurse < 0; my $proxy = $arg{proxy} || $PROXY; @@ -354,15 +382,16 @@ my $uport = $uscheme eq "http" ? 80 : $uscheme eq "https" ? 443 - : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported", URL => $url }); + : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported", @pseudo }); $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x - or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", URL => $url }); + or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", @pseudo }); my $uhost = $1; $uport = $2 if defined $2; - $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost"; + $hdr{host} = defined $2 ? "$uhost:$2" : "$uhost" + unless exists $hdr{host}; $uhost =~ s/^\[(.*)\]$/$1/; $upath .= "?$query" if length $query; @@ -418,7 +447,8 @@ $hdr{referer} ||= "$uscheme://$uauthority$upath" unless exists $hdr{referer}; $hdr{"user-agent"} ||= $USERAGENT unless exists $hdr{"user-agent"}; - $hdr{"content-length"} = length $arg{body}; + $hdr{"content-length"} = length $arg{body} + if length $arg{body} || $method ne "GET"; my %state = (connect_guard => 1); @@ -432,7 +462,7 @@ or do { my $err = "$!"; %state = (); - return $cb->(undef, { Status => 599, Reason => $err, URL => $url }); + return $cb->(undef, { Status => 599, Reason => $err, @pseudo }); }; pop; # free memory, save a tree @@ -461,11 +491,11 @@ # (re-)configure handle $state{handle}->on_error (sub { %state = (); - $cb->(undef, { Status => 599, Reason => $_[2], URL => $url }); + $cb->(undef, { Status => 599, Reason => $_[2], @pseudo }); }); $state{handle}->on_eof (sub { %state = (); - $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", URL => $url }); + $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", @pseudo }); }); $state{handle}->starttls ("connect") if $rscheme eq "https"; @@ -482,19 +512,21 @@ . (delete $arg{body}) ); - %hdr = (); # reduce memory usage, save a kitten + # return if error occured during push_write() + return unless %state; + + %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use # status line $state{handle}->push_read (line => $qr_nl, sub { $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix - or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response ($_[1])", URL => $url })); + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response ($_[1])", @pseudo })); - my %hdr = ( # response headers - HTTPVersion => ",$1", - Status => ",$2", - Reason => ",$3", - URL => ",$url" - ); + push @pseudo, + HTTPVersion => $1, + Status => $2, + Reason => $3, + ; # headers, could be optimized a bit $state{handle}->unshift_read (line => $qr_nlnl, sub { @@ -513,12 +545,16 @@ /gxc; /\G$/ - or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", URL => $url })); + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", @pseudo })); } + # remove the "," prefix we added to all headers above substr $_, 0, 1, "" for values %hdr; + # patch in all pseudo headers + %hdr = (%hdr, @pseudo); + # redirect handling # microsoft and other shitheads don't give a shit for following standards, # try to support some common forms of broken Location headers. @@ -538,15 +574,17 @@ my $redirect; if ($recurse) { - if ($hdr{Status} =~ /^30[12]$/ && $method ne "POST") { + my $status = $hdr{Status}; + + if (($status == 301 || $status == 302) && $method ne "POST") { # apparently, mozilla et al. just change POST to GET here # more research is needed before we do the same $redirect = 1; - } elsif ($hdr{Status} == 303) { + } elsif ($status == 303) { # even http/1.1 is unclear on how to mutate the method $method = "GET" unless $method eq "HEAD"; $redirect = 1; - } elsif ($hdr{Status} == 307 && $method =~ /^(?:GET|HEAD)$/) { + } elsif ($status == 307 && $method =~ /^(?:GET|HEAD)$/) { $redirect = 1; } } @@ -609,7 +647,12 @@ # we ignore any errors, as it is very common to receive # Content-Length != 0 but no actual body # we also access %hdr, as $_[1] might be an erro - http_request ($method => $hdr{location}, %arg, recurse => $recurse - 1, $cb); + http_request ( + $method => $hdr{location}, + %arg, + recurse => $recurse - 1, + Redirect => \@_, + $cb); } else { $cb->($_[0], $_[1]); } @@ -618,7 +661,7 @@ my $len = $hdr{"content-length"}; if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) { - $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", URL => $url }); + $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", @pseudo }); } elsif ( $hdr{Status} =~ /^(?:1..|[23]04)$/ or $method eq "HEAD" @@ -638,14 +681,14 @@ $finish->(delete $state{handle}, \%hdr); } elsif ($arg{on_body}) { - $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], URL => $url }) }); + $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) }); if ($len) { $_[0]->on_eof (undef); $_[0]->on_read (sub { $len -= length $_[0]{rbuf}; $arg{on_body}(delete $_[0]{rbuf}, \%hdr) - or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", URL => $url }); + or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo }); $len > 0 or $finish->("", \%hdr); @@ -656,14 +699,14 @@ }); $_[0]->on_read (sub { $arg{on_body}(delete $_[0]{rbuf}, \%hdr) - or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", URL => $url }); + or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo }); }); } } else { $_[0]->on_eof (undef); if ($len) { - $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], URL => $url }) }); + $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) }); $_[0]->on_read (sub { $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), \%hdr) if $len <= length $_[0]{rbuf}; @@ -672,7 +715,7 @@ $_[0]->on_error (sub { $! == Errno::EPIPE || !$! ? $finish->(delete $_[0]{rbuf}, \%hdr) - : $finish->(undef, { Status => 599, Reason => $_[2], URL => $url }); + : $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }); }); $_[0]->on_read (sub { }); } @@ -690,23 +733,21 @@ $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012"); $state{handle}->push_read (line => $qr_nlnl, sub { $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix - or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", URL => $url })); + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", @pseudo })); if ($2 == 200) { $rpath = $upath; &$handle_actual_request; } else { %state = (); - $cb->(undef, { Status => $2, Reason => $3, URL => $url }); + $cb->(undef, { Status => $2, Reason => $3, @pseudo }); } }); } else { &$handle_actual_request; } - }, sub { - $timeout - }; + }, $arg{on_prepare} || sub { $timeout }; }; defined wantarray && AnyEvent::Util::guard { %state = () } @@ -730,6 +771,15 @@ =back +=head2 DNS CACHING + +AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for +the actual connection, which in turn uses AnyEvent::DNS to resolve +hostnames. The latter is a simple stub resolver and does no caching +on its own. If you want DNS caching, you currently have to provide +your own default resolver (by storing a suitable resolver object in +C<$AnyEvent::DNS::RESOLVER>). + =head2 GLOBAL FUNCTIONS AND VARIABLES =over 4 @@ -737,7 +787,10 @@ =item AnyEvent::HTTP::set_proxy "proxy-url" Sets the default proxy server to use. The proxy-url must begin with a -string of the form C (optionally C). +string of the form C (optionally C), croaks +otherwise. + +To clear an already-set proxy, use C. =item $AnyEvent::HTTP::MAX_RECURSE @@ -768,11 +821,19 @@ =cut sub set_proxy($) { - $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix; + if (length $_[0]) { + $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix + or Carp::croak "$_[0]: invalid proxy URL"; + $PROXY = [$2, $3 || 3128, $1] + } else { + undef $PROXY; + } } # initialise proxy from environment -set_proxy $ENV{http_proxy}; +eval { + set_proxy $ENV{http_proxy}; +}; =head1 SEE ALSO