--- AnyEvent-HTTP/HTTP.pm 2008/06/05 13:06:43 1.10 +++ AnyEvent-HTTP/HTTP.pm 2009/07/05 01:45:01 1.40 @@ -6,11 +6,30 @@ use AnyEvent::HTTP; + http_get "http://www.nethype.de/", sub { print $_[1] }; + + # ... do something else here + =head1 DESCRIPTION This module is an L user, you need to make sure that you use and run a supported event loop. +This module implements a simple, stateless and non-blocking HTTP +client. It supports GET, POST and other request methods, cookies and more, +all on a very low level. It can follow redirects supports proxies and +automatically limits the number of connections to the values specified in +the RFC. + +It should generally be a "good client" that is enough for most HTTP +tasks. Simple tasks should be simple, but complex tasks should still be +possible as the user retains control over request and response headers. + +The caller is responsible for authentication management, cookies (if +the simplistic implementation in this module doesn't suffice), referer +and other high-level protocol details for which this module offers only +limited support. + =head2 METHODS =over 4 @@ -24,18 +43,18 @@ use Carp; -use AnyEvent (); +use AnyEvent 4.452 (); use AnyEvent::Util (); use AnyEvent::Socket (); use AnyEvent::Handle (); use base Exporter::; -our $VERSION = '1.0'; +our $VERSION = '1.12'; -our @EXPORT = qw(http_get http_request); +our @EXPORT = qw(http_get http_post http_head http_request); -our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; +our $USERAGENT = "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; our $MAX_RECURSE = 10; our $MAX_PERSISTENT = 8; our $PERSISTENT_TIMEOUT = 2; @@ -43,41 +62,52 @@ # changing these is evil our $MAX_PERSISTENT_PER_HOST = 2; -our $MAX_PER_HOST = 4; # not respected yet :( +our $MAX_PER_HOST = 4; our $PROXY; +our $ACTIVE = 0; my %KA_COUNT; # number of open keep-alive connections per host +my %CO_SLOT; # number of open connections, and wait queue, per host =item http_get $url, key => value..., $cb->($data, $headers) Executes an HTTP-GET request. See the http_request function for details on -additional parameters. +additional parameters and the return value. =item http_head $url, key => value..., $cb->($data, $headers) -Executes an HTTP-HEAD request. See the http_request function for details on -additional parameters. +Executes an HTTP-HEAD request. See the http_request function for details +on additional parameters and the return value. =item http_post $url, $body, key => value..., $cb->($data, $headers) -Executes an HTTP-POST request with a request body of C<$bod>. See the -http_request function for details on additional parameters. +Executes an HTTP-POST request with a request body of C<$body>. See the +http_request function for details on additional parameters and the return +value. =item http_request $method => $url, key => value..., $cb->($data, $headers) Executes a HTTP request of type C<$method> (e.g. C, C). The URL must be an absolute http or https URL. +When called in void context, nothing is returned. In other contexts, +C returns a "cancellation guard" - you have to keep the +object at least alive until the callback get called. If the object gets +destroyed before the callbakc is called, the request will be cancelled. + The callback will be called with the response data as first argument (or C if it wasn't available due to errors), and a hash-ref with response headers as second argument. All the headers in that hash are lowercased. In addition to the response -headers, the three "pseudo-headers" C, C and -C contain the three parts of the HTTP Status-Line of the same -name. If the server sends a header multiple lines, then their contents -will be joined together with C<\x00>. +headers, the "pseudo-headers" C, C and C +contain the three parts of the HTTP Status-Line of the same name. The +pseudo-header C contains the original URL (which can differ from the +requested URL when following redirects). + +If the server sends a header multiple times, then their contents will be +joined together with a comma (C<,>), as per the HTTP spec. If an internal error occurs, such as not being able to resolve a hostname, then C<$data> will be C, C<< $headers->{Status} >> will be C<599> @@ -107,7 +137,9 @@ =item headers => hashref -The request headers to use. +The request headers to use. Currently, C may provide its +own C, C, C and C headers +and will provide defaults for C and C. =item timeout => $seconds @@ -135,7 +167,7 @@ The C<$hash_ref> must be an (initially empty) hash reference which will get updated automatically. It is possible to save the cookie_jar to persistent storage with something like JSON or Storable, but this is not -recommended, as expire times are currently being ignored. +recommended, as expiry times are currently being ignored. Note that this cookie implementation is not of very high quality, nor meant to be complete. If you want complete cookie management you have to @@ -143,6 +175,18 @@ cookie-using sites working. Cookies are a privacy disaster, do not use them unless required to. +=item tls_ctx => $scheme | $tls_ctx + +Specifies the AnyEvent::TLS context to be used for https connections. This +parameter follows the same rules as the C parameter to +L, but additionally, the two strings C or +C can be specified, which give you a predefined low-security (no +verification, highest compatibility) and high-security (CA and common-name +verification) TLS context. + +The default for this option is C, which could be interpreted as "give +me the page, no matter what". + =back Example: make a simple HTTP GET request for http://www.nethype.de/ @@ -165,14 +209,63 @@ } ; +Example: make another simple HTTP GET request, but immediately try to +cancel it. + + my $request = http_request GET => "http://www.nethype.de/", sub { + my ($body, $hdr) = @_; + print "$body\n"; + }; + + undef $request; + =cut -sub http_request($$$;@) { +sub _slot_schedule; +sub _slot_schedule($) { + my $host = shift; + + while ($CO_SLOT{$host}[0] < $MAX_PER_HOST) { + if (my $cb = shift @{ $CO_SLOT{$host}[1] }) { + # somebody wants that slot + ++$CO_SLOT{$host}[0]; + ++$ACTIVE; + + $cb->(AnyEvent::Util::guard { + --$ACTIVE; + --$CO_SLOT{$host}[0]; + _slot_schedule $host; + }); + } else { + # nobody wants the slot, maybe we can forget about it + delete $CO_SLOT{$host} unless $CO_SLOT{$host}[0]; + last; + } + } +} + +# wait for a free slot on host, call callback +sub _get_slot($$) { + push @{ $CO_SLOT{$_[0]}[1] }, $_[1]; + + _slot_schedule $_[0]; +} + +our $qr_nl = qr<\015?\012>; +our $qr_nlnl = qr<\015?\012\015?\012>; + +our $TLS_CTX_LOW = { cache => 1, dh => undef, sslv2 => 1 }; +our $TLS_CTX_HIGH = { cache => 1, verify => 1, verify_cn => "https", dh => "skip4096" }; + +sub http_request($$@) { my $cb = pop; my ($method, $url, %arg) = @_; my %hdr; + $arg{tls_ctx} = $TLS_CTX_LOW if $arg{tls_ctx} eq "low" || !exists $arg{tls_ctx}; + $arg{tls_ctx} = $TLS_CTX_HIGH if $arg{tls_ctx} eq "high"; + $method = uc $method; if (my $hdr = $arg{headers}) { @@ -181,9 +274,9 @@ } } - my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE; + my $recurse = exists $arg{recurse} ? delete $arg{recurse} : $MAX_RECURSE; - return $cb->(undef, { Status => 599, Reason => "recursion limit reached" }) + return $cb->(undef, { Status => 599, Reason => "Too many redirections", URL => $url }) if $recurse < 0; my $proxy = $arg{proxy} || $PROXY; @@ -191,17 +284,19 @@ $hdr{"user-agent"} ||= $USERAGENT; - my ($scheme, $authority, $upath, $query, $fragment) = + my ($uscheme, $uauthority, $upath, $query, $fragment) = $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|; - $scheme = lc $scheme; + $uscheme = lc $uscheme; + + my $uport = $uscheme eq "http" ? 80 + : $uscheme eq "https" ? 443 + : return $cb->(undef, { Status => 599, Reason => "Only http and https URL schemes supported (not '$uscheme')", URL => $url }); - my $uport = $scheme eq "http" ? 80 - : $scheme eq "https" ? 443 - : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" }); + $hdr{referer} ||= "$uscheme://$uauthority$upath"; # leave out fragment and query string, just a heuristic - $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x - or return $cb->(undef, { Status => 599, Reason => "unparsable URL" }); + $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x + or return $cb->(undef, { Status => 599, Reason => "Unparsable URL", URL => $url }); my $uhost = $1; $uport = $2 if defined $2; @@ -213,20 +308,27 @@ # cookie processing if (my $jar = $arg{cookie_jar}) { - %$jar = () if $jar->{version} < 1; + %$jar = () if $jar->{version} != 1; my @cookie; while (my ($chost, $v) = each %$jar) { - next unless $chost eq substr $uhost, -length $chost; - next unless $chost =~ /^\./; + if ($chost =~ /^\./) { + next unless $chost eq substr $uhost, -length $chost; + } elsif ($chost =~ /\./) { + next unless $chost eq $uhost; + } else { + next; + } while (my ($cpath, $v) = each %$v) { next unless $cpath eq substr $upath, 0, length $cpath; while (my ($k, $v) = each %$v) { - next if $scheme ne "https" && exists $v->{secure}; - push @cookie, "$k=$v->{value}"; + next if $uscheme ne "https" && exists $v->{secure}; + my $value = $v->{value}; + $value =~ s/([\\"])/\\$1/g; + push @cookie, "$k=\"$value\""; } } } @@ -235,171 +337,270 @@ if @cookie; } - my ($rhost, $rport, $rpath); # request host, port, path + my ($rhost, $rport, $rscheme, $rpath); # request host, port, path if ($proxy) { - ($rhost, $rport, $scheme) = @$proxy; - $rpath = $url; + ($rpath, $rhost, $rport, $rscheme) = ($url, @$proxy); + + # don't support https requests over https-proxy transport, + # can't be done with tls as spec'ed, unless you double-encrypt. + $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https"; } else { - ($rhost, $rport, $rpath) = ($uhost, $uport, $upath); - $hdr{host} = $uhost; + ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath); } + $hdr{host} = $uhost; $hdr{"content-length"} = length $arg{body}; - my %state; + my %state = (connect_guard => 1); - $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { - $state{fh} = shift - or return $cb->(undef, { Status => 599, Reason => "$!" }); - - delete $state{connect_guard}; # reduce memory usage, save a tree - - # get handle - $state{handle} = new AnyEvent::Handle - fh => $state{fh}, - ($scheme eq "https" ? (tls => "connect") : ()); - - # limit the number of persistent connections - if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) { - ++$KA_COUNT{$_[1]}; - $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} }; - $hdr{connection} = "keep-alive"; - delete $hdr{connection}; # keep-alive not yet supported - } else { - delete $hdr{connection}; - } + _get_slot $uhost, sub { + $state{slot_guard} = shift; - # (re-)configure handle - $state{handle}->timeout ($timeout); - $state{handle}->on_error (sub { - %state = (); - $cb->(undef, { Status => 599, Reason => "$!" }); - }); - $state{handle}->on_eof (sub { - %state = (); - $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" }); - }); - - # send request - $state{handle}->push_write ( - "$method $rpath HTTP/1.0\015\012" - . (join "", map "$_: $hdr{$_}\015\012", keys %hdr) - . "\015\012" - . (delete $arg{body}) - ); - - %hdr = (); # reduce memory usage, save a kitten - - # status line - $state{handle}->push_read (line => qr/\015?\012/, sub { - $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix - or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" })); - - my %hdr = ( # response headers - HTTPVersion => "\x00$1", - Status => "\x00$2", - Reason => "\x00$3", - ); - - # headers, could be optimized a bit - $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub { - for ("$_[1]\012") { - # we support spaces in field names, as lotus domino - # creates them. - $hdr{lc $1} .= "\x00$2" - while /\G - ([^:\000-\037]+): - [\011\040]* - ((?: [^\015\012]+ | \015?\012[\011\040] )*) - \015?\012 - /gxc; + return unless $state{connect_guard}; - /\G$/ - or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" })); - } + $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { + $state{fh} = shift + or return (%state = (), $cb->(undef, { Status => 599, Reason => "$!", URL => $url })); + pop; # free memory, save a tree + + return unless delete $state{connect_guard}; + + # get handle + $state{handle} = new AnyEvent::Handle + fh => $state{fh}, + timeout => $timeout, + peername => $rhost, + tls_ctx => $arg{tls_ctx}; + + # limit the number of persistent connections + # keepalive not yet supported + if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) { + ++$KA_COUNT{$_[1]}; + $state{handle}{ka_count_guard} = AnyEvent::Util::guard { + --$KA_COUNT{$_[1]} + }; + $hdr{connection} = "keep-alive"; + } else { + delete $hdr{connection}; + } - substr $_, 0, 1, "" - for values %hdr; + # (re-)configure handle + $state{handle}->on_error (sub { + %state = (); + $cb->(undef, { Status => 599, Reason => $_[2], URL => $url }); + }); + $state{handle}->on_eof (sub { + %state = (); + $cb->(undef, { Status => 599, Reason => "Unexpected end-of-file", URL => $url }); + }); - my $finish = sub { - %state = (); + $state{handle}->starttls ("connect") if $rscheme eq "https"; - # set-cookie processing - if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) { - for (split /\x00/, $hdr{"set-cookie"}) { - my ($cookie, @arg) = split /;\s*/; - my ($name, $value) = split /=/, $cookie, 2; - my %kv = (value => $value, map { split /=/, $_, 2 } @arg); - - my $cdom = (delete $kv{domain}) || $uhost; - my $cpath = (delete $kv{path}) || "/"; - - $cdom =~ s/^.?/./; # make sure it starts with a "." - - my $ndots = $cdom =~ y/.//; - next if $ndots < ($cdom =~ /[^.]{3}$/ ? 2 : 3); - - # store it - $arg{cookie_jar}{version} = 1; - $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; + # handle actual, non-tunneled, request + my $handle_actual_request = sub { + $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls}; + + # send request + $state{handle}->push_write ( + "$method $rpath HTTP/1.0\015\012" + . (join "", map "\u$_: $hdr{$_}\015\012", keys %hdr) + . "\015\012" + . (delete $arg{body}) + ); + + %hdr = (); # reduce memory usage, save a kitten + + # status line + $state{handle}->push_read (line => $qr_nl, sub { + $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid server response ($_[1])", URL => $url })); + + my %hdr = ( # response headers + HTTPVersion => ",$1", + Status => ",$2", + Reason => ",$3", + URL => ",$url" + ); + + # headers, could be optimized a bit + $state{handle}->unshift_read (line => $qr_nlnl, sub { + for ("$_[1]\012") { + y/\015//d; # weed out any \015, as they show up in the weirdest of places. + + # things seen, not parsed: + # p3pP="NON CUR OTPi OUR NOR UNI" + + $hdr{lc $1} .= ",$2" + while /\G + ([^:\000-\037]+): + [\011\040]* + ((?: [^\012]+ | \012[\011\040] )*) + \012 + /gxc; + + /\G$/ + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Garbled response headers", URL => $url })); } - } - if ($_[1]{Status} =~ /^x30[12]$/ && $recurse) { - # microsoft and other assholes don't give a shit for following standards, - # try to support a common form of broken Location header. - $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%; + substr $_, 0, 1, "" + for values %hdr; - http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + my $finish = sub { + $state{handle}->destroy; + %state = (); + + # set-cookie processing + if ($arg{cookie_jar}) { + for ($hdr{"set-cookie"}) { + # parse NAME=VALUE + my @kv; + + while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) { + my $name = $1; + my $value = $3; + + unless ($value) { + $value = $2; + $value =~ s/\\(.)/$1/gs; + } + + push @kv, $name => $value; + + last unless /\G\s*;/gc; + } + + last unless @kv; + + my $name = shift @kv; + my %kv = (value => shift @kv, @kv); + + my $cdom; + my $cpath = (delete $kv{path}) || "/"; + + if (exists $kv{domain}) { + $cdom = delete $kv{domain}; + + $cdom =~ s/^\.?/./; # make sure it starts with a "." + + next if $cdom =~ /\.$/; + + # this is not rfc-like and not netscape-like. go figure. + my $ndots = $cdom =~ y/.//; + next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2); + } else { + $cdom = $uhost; + } + + # store it + $arg{cookie_jar}{version} = 1; + $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; + + redo if /\G\s*,/gc; + } + } + + # microsoft and other shitheads don't give a shit for following standards, + # try to support some common forms of broken Location headers. + if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) { + $_[1]{location} =~ s/^\.\/+//; + + my $url = "$rscheme://$uhost:$uport"; + + unless ($_[1]{location} =~ s/^\///) { + $url .= $upath; + $url =~ s/\/[^\/]*$//; + } + + $_[1]{location} = "$url/$_[1]{location}"; + } + + if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") { + # apparently, mozilla et al. just change POST to GET here + # more research is needed before we do the same + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } elsif ($_[1]{Status} == 303 && $recurse) { + # even http/1.1 is unclear on how to mutate the method + $method = "GET" unless $method eq "HEAD"; + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) { + http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb); + } else { + $cb->($_[0], $_[1]); + } + }; + + if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") { + $finish->(undef, \%hdr); + } else { + if (exists $hdr{"content-length"}) { + $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub { + # could cache persistent connection now + if ($hdr{connection} =~ /\bkeep-alive\b/i) { + # but we don't, due to misdesigns, this is annoyingly complex + }; + + $finish->($_[1], \%hdr); + }); + } else { + # too bad, need to read until we get an error or EOF, + # no way to detect winged data. + $_[0]->on_error (sub { + # delete ought to be more efficient, as we would have to make + # a copy otherwise as $_[0] gets destroyed. + $finish->(delete $_[0]{rbuf}, \%hdr); + }); + $_[0]->on_eof (undef); + $_[0]->on_read (sub { }); + } + } + }); + }); + }; + + # now handle proxy-CONNECT method + if ($proxy && $uscheme eq "https") { + # oh dear, we have to wrap it into a connect request + + # maybe re-use $uauthority with patched port? + $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012"); + $state{handle}->push_read (line => $qr_nlnl, sub { + $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix + or return (%state = (), $cb->(undef, { Status => 599, Reason => "Invalid proxy connect response ($_[1])", URL => $url })); + + if ($2 == 200) { + $rpath = $upath; + &$handle_actual_request; } else { - $cb->($_[0], $_[1]); + %state = (); + $cb->(undef, { Status => $2, Reason => $3, URL => $url }); } - }; - - if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") { - $finish->(undef, \%hdr); - } else { - if (exists $hdr{"content-length"}) { - $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub { - # could cache persistent connection now - if ($hdr{connection} =~ /\bkeep-alive\b/i) { - # but we don't, due to misdesigns, this is annoyingly complex - }; + }); + } else { + &$handle_actual_request; + } - $finish->($_[1], \%hdr); - }); - } else { - # too bad, need to read until we get an error or EOF, - # no way to detect winged data. - $_[0]->on_error (sub { - $finish->($_[0]{rbuf}, \%hdr); - }); - $_[0]->on_eof (undef); - $_[0]->on_read (sub { }); - } - } - }); - }); - }, sub { - $timeout + }, sub { + $timeout + }; }; defined wantarray && AnyEvent::Util::guard { %state = () } } -sub http_get($$;@) { +sub http_get($@) { unshift @_, "GET"; &http_request } -sub http_head($$;@) { +sub http_head($@) { unshift @_, "HEAD"; &http_request } -sub http_post($$$;@) { - unshift @_, "POST", "body"; +sub http_post($$@) { + my $url = shift; + unshift @_, "POST", $url, "body"; &http_request } @@ -421,7 +622,7 @@ =item $AnyEvent::HTTP::USERAGENT The default value for the C header (the default is -C). +C). =item $AnyEvent::HTTP::MAX_PERSISTENT @@ -435,6 +636,12 @@ Not implemented currently. +=item $AnyEvent::HTTP::ACTIVE + +The number of active connections. This is not the number of currently +running requests, but the number of currently open and non-idle TCP +connections. This number of can be useful for load-leveling. + =back =cut @@ -452,8 +659,11 @@ =head1 AUTHOR - Marc Lehmann - http://home.schmorp.de/ + Marc Lehmann + http://home.schmorp.de/ + +With many thanks to Дмитрий Шалашов, who provided countless +testcases and bugreports. =cut