--- AnyEvent-HTTP/HTTP.pm 2008/06/09 13:02:13 1.19 +++ AnyEvent-HTTP/HTTP.pm 2009/07/05 01:45:01 1.40 @@ -43,18 +43,18 @@ use Carp; -use AnyEvent (); +use AnyEvent 4.452 (); use AnyEvent::Util (); use AnyEvent::Socket (); use AnyEvent::Handle (); use base Exporter::; -our $VERSION = '1.01'; +our $VERSION = '1.12'; our @EXPORT = qw(http_get http_post http_head http_request); -our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; +our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; our $MAX_RECURSE = 10; our $MAX_PERSISTENT = 8; our $PERSISTENT_TIMEOUT = 2; @@ -73,32 +73,41 @@ =item http_get $url, key => value..., $cb->($data, $headers) Executes an HTTP-GET request. See the http_request function for details on -additional parameters. +additional parameters and the return value. =item http_head $url, key => value..., $cb->($data, $headers) -Executes an HTTP-HEAD request. See the http_request function for details on -additional parameters. +Executes an HTTP-HEAD request. See the http_request function for details +on additional parameters and the return value. =item http_post $url, $body, key => value..., $cb->($data, $headers) -Executes an HTTP-POST request with a request body of C<$bod>. See the -http_request function for details on additional parameters. +Executes an HTTP-POST request with a request body of C<$body>. See the +http_request function for details on additional parameters and the return +value. =item http_request $method => $url, key => value..., $cb->($data, $headers) Executes a HTTP request of type C<$method> (e.g. C, C). The URL must be an absolute http or https URL. +When called in void context, nothing is returned. In other contexts, +C returns a "cancellation guard" - you have to keep the +object at least alive until the callback get called. If the object gets +destroyed before the callbakc is called, the request will be cancelled. + The callback will be called with the response data as first argument (or C if it wasn't available due to errors), and a hash-ref with response headers as second argument. All the headers in that hash are lowercased. In addition to the response -headers, the three "pseudo-headers" C, C and -C contain the three parts of the HTTP Status-Line of the same -name. If the server sends a header multiple lines, then their contents -will be joined together with C<\x00>. +headers, the "pseudo-headers" C, C and C +contain the three parts of the HTTP Status-Line of the same name. The +pseudo-header C contains the original URL (which can differ from the +requested URL when following redirects). + +If the server sends a header multiple times, then their contents will be +joined together with a comma (C<,>), as per the HTTP spec. If an internal error occurs, such as not being able to resolve a hostname, then C<$data> will be C, C<< $headers->{Status} >> will be C<599> @@ -158,7 +167,7 @@ The C<$hash_ref> must be an (initially empty) hash reference which will get updated automatically. It is possible to save the cookie_jar to persistent storage with something like JSON or Storable, but this is not -recommended, as expire times are currently being ignored. +recommended, as expiry times are currently being ignored. Note that this cookie implementation is not of very high quality, nor meant to be complete. If you want complete cookie management you have to @@ -166,6 +175,18 @@ cookie-using sites working. Cookies are a privacy disaster, do not use them unless required to. +=item tls_ctx => $scheme | $tls_ctx + +Specifies the AnyEvent::TLS context to be used for https connections. This +parameter follows the same rules as the C parameter to +L, but additionally, the two strings C or +C can be specified, which give you a predefined low-security (no +verification, highest compatibility) and high-security (CA and common-name +verification) TLS context. + +The default for this option is C, which could be interpreted as "give +me the page, no matter what". + =back Example: make a simple HTTP GET request for http://www.nethype.de/ @@ -188,6 +209,16 @@ } ; +Example: make another simple HTTP GET request, but immediately try to +cancel it. + + my $request = http_request GET => "http://www.nethype.de/", sub { + my ($body, $hdr) = @_; + print "$body\n"; + }; + + undef $request; + =cut sub _slot_schedule; @@ -220,12 +251,21 @@ _slot_schedule $_[0]; } +our $qr_nl = qr<\015?\012>; +our $qr_nlnl = qr<\015?\012\015?\012>; + +our $TLS_CTX_LOW = { cache => 1, dh => undef, sslv2 => 1 }; +our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_cn => "https", dh => "skip4096" }; + sub http_request($$@) { my $cb = pop; my ($method, $url, %arg) = @_; my %hdr; + $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx}; + $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high"; + $method = uc $method; if (my $hdr = $arg{headers}) { @@ -234,9 +274,9 @@ } } - my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE; + my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE; - return $cb->(undef, { Status => 599, Reason => "recursion limit reached", URL => $url }) + return $cb->(undef, { Status => 599, Reason => "Too many redirections", URL => $url }) if $recurse < 0; my $proxy = $arg{proxy} || $PROXY; @@ -244,19 +284,19 @@ $hdr{"user-agent"} ||= $USERAGENT; - my ($scheme, $authority, $upath, $query, $fragment) = + my ($uscheme, $uauthority, $upath, $query, $fragment) = $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|; - $scheme = lc $scheme; + $uscheme = lc $uscheme; - my $uport = $scheme eq "http" ? 80 - : $scheme eq "https" ? 443 - : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url }); + my $uport = $uscheme eq "http" ? 80 + : $uscheme eq "https" ? 443 + : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported (not '$uscheme')", URL => $url }); - $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic + $hdr{referer} ||= "$uscheme://$uauthority$upath"; # leave out fragment and query string, just a heuristic - $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x - or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url }); + $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x + or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", URL => $url }); my $uhost = $1; $uport = $2 if defined $2; @@ -268,20 +308,27 @@ # cookie processing if (my $jar = $arg{cookie_jar}) { - %$jar = () if $jar->{version} < 1; + %$jar = () if $jar->{version} != 1; my @cookie; while (my ($chost, $v) = each %$jar) { - next unless $chost eq substr $uhost, -length $chost; - next unless $chost =~ /^\./; + if ($chost =~ /^\./) { + next unless $chost eq substr $uhost, -length $chost; + } elsif ($chost =~ /\./) { + next unless $chost eq $uhost; + } else { + next; + } while (my ($cpath, $v) = each %$v) { next unless $cpath eq substr $upath, 0, length $cpath; while (my ($k, $v) = each %$v) { - next if $scheme ne "https" && exists $v->{secure}; - push @cookie, "$k=$v->{value}"; + next if $uscheme ne "https" && exists $v->{secure}; + my $value = $v->{value}; + $value =~ s/([\\"])/\\$1/g; + push @cookie, "$k=\"$value\""; } } } @@ -290,16 +337,19 @@ if @cookie; } - my ($rhost, $rport, $rpath); # request host, port, path + my ($rhost, $rport, $rscheme, $rpath); # request host, port, path if ($proxy) { - ($rhost, $rport, $scheme) = @$proxy; - $rpath = $url; + ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy); + + # don't support https requests over https-proxy transport, + # can't be done with tls as spec'ed, unless you double-encrypt. + $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https"; } else { - ($rhost, $rport, $rpath) = ($uhost, $uport, $upath); - $hdr{host} = $uhost; + ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath); } + $hdr{host} = $uhost; $hdr{"content-length"} = length $arg{body}; my %state = (connect_guard => 1); @@ -311,141 +361,225 @@ $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { $state{fh} = shift - or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url }); + or return (%state = (), $cb->(undef, { Status => 599, Reason => "$!", URL => $url })); + pop; # free memory, save a tree - delete $state{connect_guard}; # reduce memory usage, save a tree + return unless delete $state{connect_guard}; # get handle $state{handle} = new AnyEvent::Handle - fh => $state{fh}, - ($scheme eq "https" ? (tls => "connect") : ()); + fh => $state{fh}, + timeout => $timeout, + peername => $rhost, + tls_ctx => $arg{tls_ctx}; # limit the number of persistent connections + # keepalive not yet supported if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) { ++$KA_COUNT{$_[1]}; - $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} }; + $state{handle}{ka_count_guard} = AnyEvent::Util::guard { + --$KA_COUNT{$_[1]} + }; $hdr{connection} = "keep-alive"; - delete $hdr{connection}; # keep-alive not yet supported } else { delete $hdr{connection}; } # (re-)configure handle - $state{handle}->timeout ($timeout); $state{handle}->on_error (sub { - my $errno = "$!"; %state = (); - $cb->(undef, { Status => 599, Reason => $errno, URL => $url }); + $cb->(undef, { Status => 599, Reason => $_[2], URL => $url }); }); $state{handle}->on_eof (sub { %state = (); - $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url }); + $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", URL => $url }); }); - # send request - $state{handle}->push_write ( - "$method $rpath HTTP/1.0\015\012" - . (join "", map "$_: $hdr{$_}\015\012", keys %hdr) - . "\015\012" - . (delete $arg{body}) - ); - - %hdr = (); # reduce memory usage, save a kitten - - # status line - $state{handle}->push_read (line => qr/\015?\012/, sub { - $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix - or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url })); - - my %hdr = ( # response headers - HTTPVersion => "\x00$1", - Status => "\x00$2", - Reason => "\x00$3", - URL => "\x00$url" + $state{handle}->starttls ("connect") if $rscheme eq "https"; + + # handle actual, non-tunneled, request + my $handle_actual_request = sub { + $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls}; + + # send request + $state{handle}->push_write ( + "$method $rpath HTTP/1.0\015\012" + . (join "", map "\u$_: $hdr{$_}\015\012", keys %hdr) + . "\015\012" + . (delete $arg{body}) ); - # headers, could be optimized a bit - $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub { - for ("$_[1]\012") { - # we support spaces in field names, as lotus domino - # creates them. - $hdr{lc $1} .= "\x00$2" - while /\G - ([^:\000-\037]+): - [\011\040]* - ((?: [^\015\012]+ | \015?\012[\011\040] )*) - \015?\012 - /gxc; + %hdr = (); # reduce memory usage, save a kitten - /\G$/ - or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url })); - } + # status line + $state{handle}->push_read (line => $qr_nl, sub { + $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response ($_[1])", URL => $url })); + + my %hdr = ( # response headers + HTTPVersion => ",$1", + Status => ",$2", + Reason => ",$3", + URL => ",$url" + ); + + # headers, could be optimized a bit + $state{handle}->unshift_read (line => $qr_nlnl, sub { + for ("$_[1]\012") { + y/\015//d; # weed out any \015, as they show up in the weirdest of places. + + # things seen, not parsed: + # p3pP="NON CUR OTPi OUR NOR UNI" + + $hdr{lc $1} .= ",$2" + while /\G + ([^:\000-\037]+): + [\011\040]* + ((?: [^\012]+ | \012[\011\040] )*) + \012 + /gxc; - substr $_, 0, 1, "" - for values %hdr; + /\G$/ + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", URL => $url })); + } - my $finish = sub { - %state = (); + substr $_, 0, 1, "" + for values %hdr; + + my $finish = sub { + $state{handle}->destroy; + %state = (); + + # set-cookie processing + if ($arg{cookie_jar}) { + for ($hdr{"set-cookie"}) { + # parse NAME=VALUE + my @kv; + + while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) { + my $name = $1; + my $value = $3; + + unless ($value) { + $value = $2; + $value =~ s/\\(.)/$1/gs; + } + + push @kv, $name => $value; + + last unless /\G\s*;/gc; + } + + last unless @kv; + + my $name = shift @kv; + my %kv = (value => shift @kv, @kv); + + my $cdom; + my $cpath = (delete $kv{path}) || "/"; + + if (exists $kv{domain}) { + $cdom = delete $kv{domain}; + + $cdom =~ s/^\.?/./; # make sure it starts with a "." + + next if $cdom =~ /\.$/; + + # this is not rfc-like and not netscape-like. go figure. + my $ndots = $cdom =~ y/.//; + next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2); + } else { + $cdom = $uhost; + } + + # store it + $arg{cookie_jar}{version} = 1; + $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; - # set-cookie processing - if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) { - for (split /\x00/, $hdr{"set-cookie"}) { - my ($cookie, @arg) = split /;\s*/; - my ($name, $value) = split /=/, $cookie, 2; - my %kv = (value => $value, map { split /=/, $_, 2 } @arg); - - my $cdom = (delete $kv{domain}) || $uhost; - my $cpath = (delete $kv{path}) || "/"; - - $cdom =~ s/^.?/./; # make sure it starts with a "." - - next if $cdom =~ /\.$/; - - # this is not rfc-like and not netscape-like. go figure. - my $ndots = $cdom =~ y/.//; - next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2); - - # store it - $arg{cookie_jar}{version} = 1; - $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; + redo if /\G\s*,/gc; + } } - } - if ($_[1]{Status} =~ /^30[12]$/ && $recurse) { - # microsoft and other assholes don't give a shit for following standards, - # try to support a common form of broken Location header. - $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%; + # microsoft and other shitheads don't give a shit for following standards, + # try to support some common forms of broken Location headers. + if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) { + $_[1]{location} =~ s/^\.\/+//; + + my $url = "$rscheme://$uhost:$uport"; + + unless ($_[1]{location} =~ s/^\///) { + $url .= $upath; + $url =~ s/\/[^\/]*$//; + } - http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb); - } else { - $cb->($_[0], $_[1]); - } - }; + $_[1]{location} = "$url/$_[1]{location}"; + } - if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") { - $finish->(undef, \%hdr); - } else { - if (exists $hdr{"content-length"}) { - $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub { - # could cache persistent connection now - if ($hdr{connection} =~ /\bkeep-alive\b/i) { - # but we don't, due to misdesigns, this is annoyingly complex - }; + if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") { + # apparently, mozilla et al. just change POST to GET here + # more research is needed before we do the same + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } elsif ($_[1]{Status} == 303 && $recurse) { + # even http/1.1 is unclear on how to mutate the method + $method = "GET" unless $method eq "HEAD"; + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) { + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } else { + $cb->($_[0], $_[1]); + } + }; - $finish->($_[1], \%hdr); - }); + if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") { + $finish->(undef, \%hdr); } else { - # too bad, need to read until we get an error or EOF, - # no way to detect winged data. - $_[0]->on_error (sub { - $finish->($_[0]{rbuf}, \%hdr); - }); - $_[0]->on_eof (undef); - $_[0]->on_read (sub { }); + if (exists $hdr{"content-length"}) { + $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub { + # could cache persistent connection now + if ($hdr{connection} =~ /\bkeep-alive\b/i) { + # but we don't, due to misdesigns, this is annoyingly complex + }; + + $finish->($_[1], \%hdr); + }); + } else { + # too bad, need to read until we get an error or EOF, + # no way to detect winged data. + $_[0]->on_error (sub { + # delete ought to be more efficient, as we would have to make + # a copy otherwise as $_[0] gets destroyed. + $finish->(delete $_[0]{rbuf}, \%hdr); + }); + $_[0]->on_eof (undef); + $_[0]->on_read (sub { }); + } } + }); + }); + }; + + # now handle proxy-CONNECT method + if ($proxy && $uscheme eq "https") { + # oh dear, we have to wrap it into a connect request + + # maybe re-use $uauthority with patched port? + $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012"); + $state{handle}->push_read (line => $qr_nlnl, sub { + $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", URL => $url })); + + if ($2 == 200) { + $rpath = $upath; + &$handle_actual_request; + } else { + %state = (); + $cb->(undef, { Status => $2, Reason => $3, URL => $url }); } }); - }); + } else { + &$handle_actual_request; + } + }, sub { $timeout }; @@ -465,7 +599,8 @@ } sub http_post($$@) { - unshift @_, "POST", "body"; + my $url = shift; + unshift @_, "POST", $url, "body"; &http_request } @@ -487,7 +622,7 @@ =item $AnyEvent::HTTP::USERAGENT The default value for the C header (the default is -C). +C). =item $AnyEvent::HTTP::MAX_PERSISTENT @@ -527,6 +662,9 @@ Marc Lehmann http://home.schmorp.de/ +With many thanks to Дмитрий Шалашов, who provided countless +testcases and bugreports. + =cut 1