--- AnyEvent-HTTP/HTTP.pm 2010/09/06 06:31:32 1.57 +++ AnyEvent-HTTP/HTTP.pm 2010/12/30 04:31:55 1.63 @@ -45,7 +45,6 @@ use AnyEvent 5.0 (); use AnyEvent::Util (); -use AnyEvent::Socket (); use AnyEvent::Handle (); use base Exporter::; @@ -94,7 +93,7 @@ When called in void context, nothing is returned. In other contexts, C returns a "cancellation guard" - you have to keep the object at least alive until the callback get called. If the object gets -destroyed before the callbakc is called, the request will be cancelled. +destroyed before the callback is called, the request will be cancelled. The callback will be called with the response body data as first argument (or C if an error occured), and a hash-ref with response headers as @@ -214,6 +213,18 @@ timeout). See the description for the C<$prepare_cb> argument of C for details. +=item tcp_connect => $callback->($host, $service, $connect_cb, $prepare_cb) + +In even rarer cases you want total control over how AnyEvent::HTTP +establishes connections. Normally it uses L +to do this, but you can provide your own C function - +obviously, it has to follow the same calling conventions, except that it +may always return a connection guard object. + +There are probably lots of weird uses for this function, starting from +tracing the hosts C actually tries to connect, to (inexact +but fast) host => IP address caching or even socks protocol support. + =item on_header => $callback->($headers) When specified, this callback will be called with the header hash as soon @@ -456,37 +467,43 @@ return unless $state{connect_guard}; - $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { - $state{fh} = shift - or do { - my $err = "$!"; - %state = (); - return $cb->(undef, { Status => 599, Reason => $err, @pseudo }); - }; + my $tcp_connect = $arg{tcp_connect} + || do { require AnyEvent::Socket; \&AnyEvent::Socket::tcp_connect }; + + $state{connect_guard} = $tcp_connect->( + $rhost, + $rport, + sub { + $state{fh} = shift + or do { + my $err = "$!"; + %state = (); + return $cb->(undef, { Status => 599, Reason => $err, @pseudo }); + }; - pop; # free memory, save a tree + pop; # free memory, save a tree - return unless delete $state{connect_guard}; + return unless delete $state{connect_guard}; - # get handle - $state{handle} = new AnyEvent::Handle - fh => $state{fh}, - peername => $rhost, - tls_ctx => $arg{tls_ctx}, - # these need to be reconfigured on keepalive handles - timeout => $timeout, - on_error => sub { - %state = (); - $cb->(undef, { Status => 599, Reason => $_[2], @pseudo }); - }, - on_eof => sub { - %state = (); - $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", @pseudo }); - }, - ; + # get handle + $state{handle} = new AnyEvent::Handle + fh => $state{fh}, + peername => $rhost, + tls_ctx => $arg{tls_ctx}, + # these need to be reconfigured on keepalive handles + timeout => $timeout, + on_error => sub { + %state = (); + $cb->(undef, { Status => 599, Reason => $_[2], @pseudo }); + }, + on_eof => sub { + %state = (); + $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", @pseudo }); + }, + ; - # limit the number of persistent connections - # keepalive not yet supported + # limit the number of persistent connections + # keepalive not yet supported # if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) { # ++$KA_COUNT{$_[1]}; # $state{handle}{ka_count_guard} = AnyEvent::Util::guard { @@ -494,257 +511,259 @@ # }; # $hdr{connection} = "keep-alive"; # } else { - delete $hdr{connection}; + delete $hdr{connection}; # } - $state{handle}->starttls ("connect") if $rscheme eq "https"; + $state{handle}->starttls ("connect") if $rscheme eq "https"; - # handle actual, non-tunneled, request - my $handle_actual_request = sub { - $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls}; - - # send request - $state{handle}->push_write ( - "$method $rpath HTTP/1.0\015\012" - . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr) - . "\015\012" - . (delete $arg{body}) - ); - - # return if error occured during push_write() - return unless %state; - - %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use - - # status line and headers - $state{handle}->push_read (line => $qr_nlnl, sub { - for ("$_[1]") { - y/\015//d; # weed out any \015, as they show up in the weirdest of places. - - /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )? \015?\012/igxc - or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response", @pseudo })); - - push @pseudo, - HTTPVersion => $1, - Status => $2, - Reason => $3, - ; - - # things seen, not parsed: - # p3pP="NON CUR OTPi OUR NOR UNI" - - $hdr{lc $1} .= ",$2" - while /\G - ([^:\000-\037]*): - [\011\040]* - ((?: [^\012]+ | \012[\011\040] )*) - \012 - /gxc; - - /\G$/ - or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", @pseudo })); - } - - # remove the "," prefix we added to all headers above - substr $_, 0, 1, "" - for values %hdr; - - # patch in all pseudo headers - %hdr = (%hdr, @pseudo); - - # redirect handling - # microsoft and other shitheads don't give a shit for following standards, - # try to support some common forms of broken Location headers. - if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) { - $hdr{location} =~ s/^\.\/+//; - - my $url = "$rscheme://$uhost:$uport"; - - unless ($hdr{location} =~ s/^\///) { - $url .= $upath; - $url =~ s/\/[^\/]*$//; + # handle actual, non-tunneled, request + my $handle_actual_request = sub { + $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls}; + + # send request + $state{handle}->push_write ( + "$method $rpath HTTP/1.0\015\012" + . (join "", map "\u$_: $hdr{$_}\015\012", grep defined $hdr{$_}, keys %hdr) + . "\015\012" + . (delete $arg{body}) + ); + + # return if error occured during push_write() + return unless %state; + + %hdr = (); # reduce memory usage, save a kitten, also make it possible to re-use + + # status line and headers + $state{handle}->push_read (line => $qr_nlnl, sub { + for ("$_[1]") { + y/\015//d; # weed out any \015, as they show up in the weirdest of places. + + /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )? \015?\012/igxc + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response", @pseudo })); + + push @pseudo, + HTTPVersion => $1, + Status => $2, + Reason => $3, + ; + + # things seen, not parsed: + # p3pP="NON CUR OTPi OUR NOR UNI" + + $hdr{lc $1} .= ",$2" + while /\G + ([^:\000-\037]*): + [\011\040]* + ((?: [^\012]+ | \012[\011\040] )*) + \012 + /gxc; + + /\G$/ + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", @pseudo })); } - $hdr{location} = "$url/$hdr{location}"; - } + # remove the "," prefix we added to all headers above + substr $_, 0, 1, "" + for values %hdr; + + # patch in all pseudo headers + %hdr = (%hdr, @pseudo); + + # redirect handling + # microsoft and other shitheads don't give a shit for following standards, + # try to support some common forms of broken Location headers. + if ($hdr{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) { + $hdr{location} =~ s/^\.\/+//; + + my $url = "$rscheme://$uhost:$uport"; + + unless ($hdr{location} =~ s/^\///) { + $url .= $upath; + $url =~ s/\/[^\/]*$//; + } - my $redirect; + $hdr{location} = "$url/$hdr{location}"; + } + + my $redirect; - if ($recurse) { - my $status = $hdr{Status}; + if ($recurse) { + my $status = $hdr{Status}; - # industry standard is to redirect POST as GET for - # 301, 302 and 303, in contrast to http/1.0 and 1.1. - # also, the UA should ask the user for 301 and 307 and POST, - # industry standard seems to be to simply follow. - # we go with the industry standard. - if ($status == 301 or $status == 302 or $status == 303) { - # HTTP/1.1 is unclear on how to mutate the method - $method = "GET" unless $method eq "HEAD"; - $redirect = 1; - } elsif ($status == 307) { - $redirect = 1; + # industry standard is to redirect POST as GET for + # 301, 302 and 303, in contrast to http/1.0 and 1.1. + # also, the UA should ask the user for 301 and 307 and POST, + # industry standard seems to be to simply follow. + # we go with the industry standard. + if ($status == 301 or $status == 302 or $status == 303) { + # HTTP/1.1 is unclear on how to mutate the method + $method = "GET" unless $method eq "HEAD"; + $redirect = 1; + } elsif ($status == 307) { + $redirect = 1; + } } - } - my $finish = sub { - $state{handle}->destroy if $state{handle}; - %state = (); + my $finish = sub { + $state{handle}->destroy if $state{handle}; + %state = (); + + # set-cookie processing + if ($arg{cookie_jar}) { + for ($_[1]{"set-cookie"}) { + # parse NAME=VALUE + my @kv; + + while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) { + my $name = $1; + my $value = $3; + + unless ($value) { + $value = $2; + $value =~ s/\\(.)/$1/gs; + } - # set-cookie processing - if ($arg{cookie_jar}) { - for ($_[1]{"set-cookie"}) { - # parse NAME=VALUE - my @kv; - - while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) { - my $name = $1; - my $value = $3; - - unless ($value) { - $value = $2; - $value =~ s/\\(.)/$1/gs; - } + push @kv, $name => $value; - push @kv, $name => $value; - - last unless /\G\s*;/gc; - } + last unless /\G\s*;/gc; + } - last unless @kv; + last unless @kv; - my $name = shift @kv; - my %kv = (value => shift @kv, @kv); + my $name = shift @kv; + my %kv = (value => shift @kv, @kv); - my $cdom; - my $cpath = (delete $kv{path}) || "/"; + my $cdom; + my $cpath = (delete $kv{path}) || "/"; - if (exists $kv{domain}) { - $cdom = delete $kv{domain}; - - $cdom =~ s/^\.?/./; # make sure it starts with a "." + if (exists $kv{domain}) { + $cdom = delete $kv{domain}; + + $cdom =~ s/^\.?/./; # make sure it starts with a "." - next if $cdom =~ /\.$/; + next if $cdom =~ /\.$/; + + # this is not rfc-like and not netscape-like. go figure. + my $ndots = $cdom =~ y/.//; + next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2); + } else { + $cdom = $uhost; + } - # this is not rfc-like and not netscape-like. go figure. - my $ndots = $cdom =~ y/.//; - next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2); - } else { - $cdom = $uhost; + # store it + $arg{cookie_jar}{version} = 1; + $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; + + redo if /\G\s*,/gc; } - - # store it - $arg{cookie_jar}{version} = 1; - $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; + } - redo if /\G\s*,/gc; + if ($redirect && exists $hdr{location}) { + # we ignore any errors, as it is very common to receive + # Content-Length != 0 but no actual body + # we also access %hdr, as $_[1] might be an erro + http_request ( + $method => $hdr{location}, + %arg, + recurse => $recurse - 1, + Redirect => \@_, + $cb); + } else { + $cb->($_[0], $_[1]); } - } + }; + + my $len = $hdr{"content-length"}; - if ($redirect && exists $hdr{location}) { - # we ignore any errors, as it is very common to receive - # Content-Length != 0 but no actual body - # we also access %hdr, as $_[1] might be an erro - http_request ( - $method => $hdr{location}, - %arg, - recurse => $recurse - 1, - Redirect => \@_, - $cb); + if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) { + $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", @pseudo }); + } elsif ( + $hdr{Status} =~ /^(?:1..|[23]04)$/ + or $method eq "HEAD" + or (defined $len && !$len) + ) { + # no body + $finish->("", \%hdr); } else { - $cb->($_[0], $_[1]); - } - }; + # body handling, four different code paths + # for want_body_handle, on_body (2x), normal (2x) + # we might read too much here, but it does not matter yet (no pers. connections) + if (!$redirect && $arg{want_body_handle}) { + $_[0]->on_eof (undef); + $_[0]->on_error (undef); + $_[0]->on_read (undef); - my $len = $hdr{"content-length"}; + $finish->(delete $state{handle}, \%hdr); - if (!$redirect && $arg{on_header} && !$arg{on_header}(\%hdr)) { - $finish->(undef, { Status => 598, Reason => "Request cancelled by on_header", @pseudo }); - } elsif ( - $hdr{Status} =~ /^(?:1..|[23]04)$/ - or $method eq "HEAD" - or (defined $len && !$len) - ) { - # no body - $finish->("", \%hdr); - } else { - # body handling, four different code paths - # for want_body_handle, on_body (2x), normal (2x) - # we might read too much here, but it does not matter yet (no pers. connections) - if (!$redirect && $arg{want_body_handle}) { - $_[0]->on_eof (undef); - $_[0]->on_error (undef); - $_[0]->on_read (undef); - - $finish->(delete $state{handle}, \%hdr); - - } elsif ($arg{on_body}) { - $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) }); - if ($len) { + } elsif ($arg{on_body}) { + $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) }); + if ($len) { + $_[0]->on_eof (undef); + $_[0]->on_read (sub { + $len -= length $_[0]{rbuf}; + + $arg{on_body}(delete $_[0]{rbuf}, \%hdr) + or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo }); + + $len > 0 + or $finish->("", \%hdr); + }); + } else { + $_[0]->on_eof (sub { + $finish->("", \%hdr); + }); + $_[0]->on_read (sub { + $arg{on_body}(delete $_[0]{rbuf}, \%hdr) + or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo }); + }); + } + } else { $_[0]->on_eof (undef); - $_[0]->on_read (sub { - $len -= length $_[0]{rbuf}; - $arg{on_body}(delete $_[0]{rbuf}, \%hdr) - or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo }); - - $len > 0 - or $finish->("", \%hdr); - }); - } else { - $_[0]->on_eof (sub { - $finish->("", \%hdr); - }); - $_[0]->on_read (sub { - $arg{on_body}(delete $_[0]{rbuf}, \%hdr) - or $finish->(undef, { Status => 598, Reason => "Request cancelled by on_body", @pseudo }); - }); + if ($len) { + $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) }); + $_[0]->on_read (sub { + $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), \%hdr) + if $len <= length $_[0]{rbuf}; + }); + } else { + $_[0]->on_error (sub { + ($! == Errno::EPIPE || !$!) + ? $finish->(delete $_[0]{rbuf}, \%hdr) + : $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }); + }); + $_[0]->on_read (sub { }); + } } - } else { - $_[0]->on_eof (undef); + } + }); + }; - if ($len) { - $_[0]->on_error (sub { $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }) }); - $_[0]->on_read (sub { - $finish->((substr delete $_[0]{rbuf}, 0, $len, ""), \%hdr) - if $len <= length $_[0]{rbuf}; - }); - } else { - $_[0]->on_error (sub { - ($! == Errno::EPIPE || !$!) - ? $finish->(delete $_[0]{rbuf}, \%hdr) - : $finish->(undef, { Status => 599, Reason => $_[2], @pseudo }); - }); - $_[0]->on_read (sub { }); - } + # now handle proxy-CONNECT method + if ($proxy && $uscheme eq "https") { + # oh dear, we have to wrap it into a connect request + + # maybe re-use $uauthority with patched port? + $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012"); + $state{handle}->push_read (line => $qr_nlnl, sub { + $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", @pseudo })); + + if ($2 == 200) { + $rpath = $upath; + &$handle_actual_request; + } else { + %state = (); + $cb->(undef, { Status => $2, Reason => $3, @pseudo }); } - } - }); - }; - - # now handle proxy-CONNECT method - if ($proxy && $uscheme eq "https") { - # oh dear, we have to wrap it into a connect request - - # maybe re-use $uauthority with patched port? - $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012"); - $state{handle}->push_read (line => $qr_nlnl, sub { - $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix - or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", @pseudo })); - - if ($2 == 200) { - $rpath = $upath; - &$handle_actual_request; - } else { - %state = (); - $cb->(undef, { Status => $2, Reason => $3, @pseudo }); - } - }); - } else { - &$handle_actual_request; - } + }); + } else { + &$handle_actual_request; + } - }, $arg{on_prepare} || sub { $timeout }; + }, + $arg{on_prepare} || sub { $timeout } + ); }; defined wantarray && AnyEvent::Util::guard { %state = () } @@ -789,6 +808,16 @@ To clear an already-set proxy, use C. +=item $date = AnyEvent::HTTP::format_date $timestamp + +Takes a POSIX timestamp (seconds since the epoch) and formats it as a HTTP +Date (RFC 2616). + +=item $timestamp = AnyEvent::HTTP::parse_date $date + +Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX +timestamp, or C if the date cannot be parsed. + =item $AnyEvent::HTTP::MAX_RECURSE The default value for the C request parameter (default: C<10>). @@ -817,6 +846,49 @@ =cut +our @month = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); +our @weekday = qw(Sun Mon Tue Wed Thu Fri Sat); + +sub format_date($) { + my ($time) = @_; + + # RFC 822/1123 format + my ($S, $M, $H, $mday, $mon, $year, $wday, $yday, undef) = gmtime $time; + + sprintf "%s, %02d %s %04d %02d:%02d:%02d GMT", + $weekday[$wday], $mday, $month[$mon], $year + 1900, + $H, $M, $S; +} + +sub parse_date($) { + my ($date) = @_; + + my ($d, $m, $y, $H, $M, $S); + + if ($date =~ /^[A-Z][a-z][a-z], ([0-9][0-9]) ([A-Z][a-z][a-z]) ([0-9][0-9][0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) { + # RFC 822/1123, required by RFC 2616 + ($d, $m, $y, $H, $M, $S) = ($1, $2, $3, $4, $5, $6); + + } elsif ($date =~ /^[A-Z][a-z]+, ([0-9][0-9])-([A-Z][a-z][a-z])-([0-9][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) GMT$/) { + # RFC 850 + ($d, $m, $y, $H, $M, $S) = ($1, $2, $3 < 69 ? $3 + 2000 : $3 + 1900, $4, $5, $6); + + } elsif ($date =~ /^[A-Z][a-z][a-z] ([A-Z][a-z][a-z]) ([0-9 ][0-9]) ([0-9][0-9]):([0-9][0-9]):([0-9][0-9]) ([0-9][0-9][0-9][0-9])$/) { + # ISO C's asctime + ($d, $m, $y, $H, $M, $S) = ($2, $1, $6, $3, $4, $5); + } + # other formats fail in the loop below + + for (0..11) { + if ($m eq $month[$_]) { + require Time::Local; + return Time::Local::timegm ($S, $M, $H, $d, $_, $y); + } + } + + undef +} + sub set_proxy($) { if (length $_[0]) { $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix @@ -832,6 +904,62 @@ set_proxy $ENV{http_proxy}; }; +=head2 SOCKS PROXIES + +Socks proxies are not directly supported by AnyEvent::HTTP. You can +compile your perl to support socks, or use an external program such as +F (dante) or F to make your program use a socks proxy +transparently. + +Alternatively, for AnyEvent::HTTP only, you can use your own +C function that does the proxy handshake - here is an example +that works with socks4a proxies: + + use Errno; + use AnyEvent::Util; + use AnyEvent::Socket; + use AnyEvent::Handle; + + # host, port and username of/for your socks4a proxy + my $socks_host = "10.0.0.23"; + my $socks_port = 9050; + my $socks_user = ""; + + sub socks4a_connect { + my ($host, $port, $connect_cb, $prepare_cb) = @_; + + my $hdl = new AnyEvent::Handle + connect => [$socks_host, $socks_port], + on_prepare => sub { $prepare_cb->($_[0]{fh}) }, + on_error => sub { $connect_cb->() }, + ; + + $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host); + + $hdl->push_read (chunk => 8, sub { + my ($hdl, $chunk) = @_; + my ($status, $port, $ipn) = unpack "xCna4", $chunk; + + if ($status == 0x5a) { + $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port"); + } else { + $! = Errno::ENXIO; $connect_cb->(); + } + }); + + $hdl + } + +Use C instead of C when doing Cs, +possibly after switching off other proxy types: + + AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies + + http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub { + my ($data, $headers) = @_; + ... + }; + =head1 SEE ALSO L.