--- AnyEvent-HTTP/README 2008/05/26 21:41:32 1.1 +++ AnyEvent-HTTP/README 2010/12/31 03:47:32 1.14 @@ -1,30 +1,389 @@ NAME - AnyEvent::AIO - truly asynchronous file and directrory I/O + AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client SYNOPSIS - use AnyEvent::AIO; - use IO::AIO; + use AnyEvent::HTTP; - # can now use any of the aio requests your IO::AIO module supports - # as long as you use an event loop supported by AnyEvent. + http_get "http://www.nethype.de/", sub { print $_[1] }; + + # ... do something else here DESCRIPTION This module is an AnyEvent user, you need to make sure that you use and run a supported event loop. - Loading this module will install the necessary magic to seamlessly - integrate IO::AIO into AnyEvent, i.e. you no longer need to concern - yourself with calling "IO::AIO::poll_cb" or any of that stuff (you still - can, but this module will do it in case you don't). - - The AnyEvent watcher can be disabled by executing "undef - $AnyEvent::AIO::WATCHER". Please notify the author of when and why you - think this was necessary. + This module implements a simple, stateless and non-blocking HTTP client. + It supports GET, POST and other request methods, cookies and more, all + on a very low level. It can follow redirects, supports proxies, and + automatically limits the number of connections to the values specified + in the RFC. + + It should generally be a "good client" that is enough for most HTTP + tasks. Simple tasks should be simple, but complex tasks should still be + possible as the user retains control over request and response headers. + + The caller is responsible for authentication management, cookies (if the + simplistic implementation in this module doesn't suffice), referer and + other high-level protocol details for which this module offers only + limited support. + + METHODS + http_get $url, key => value..., $cb->($data, $headers) + Executes an HTTP-GET request. 
See the http_request function for + details on additional parameters and the return value. + + http_head $url, key => value..., $cb->($data, $headers) + Executes an HTTP-HEAD request. See the http_request function for + details on additional parameters and the return value. + + http_post $url, $body, key => value..., $cb->($data, $headers) + Executes an HTTP-POST request with a request body of $body. See the + http_request function for details on additional parameters and the + return value. + + http_request $method => $url, key => value..., $cb->($data, $headers) + Executes a HTTP request of type $method (e.g. "GET", "POST"). The + URL must be an absolute http or https URL. + + When called in void context, nothing is returned. In other contexts, + "http_request" returns a "cancellation guard" - you have to keep the + object alive at least until the callback gets called. If the object + gets destroyed before the callback is called, the request will be + cancelled. + + The callback will be called with the response body data as first + argument (or "undef" if an error occurred), and a hash-ref with + response headers as second argument. + + All the headers in that hash are lowercased. In addition to the + response headers, the "pseudo-headers" (uppercase to avoid clashing + with possible response headers) "HTTPVersion", "Status" and "Reason" + contain the three parts of the HTTP Status-Line of the same name. If + an error occurs during the body phase of a request, then the + original "Status" and "Reason" values from the header are available + as "OrigStatus" and "OrigReason". + + The pseudo-header "URL" contains the actual URL (which can differ + from the requested URL when following redirects - for example, you + might get an error that your URL scheme is not supported even though + your URL is a valid http URL because it redirected to an ftp URL, in + which case you can look at the URL pseudo header). 
+ + The pseudo-header "Redirect" only exists when the request was a + result of an internal redirect. In that case it is an array + reference with the "($data, $headers)" from the redirect response. + Note that this response could in turn be the result of a redirect + itself, and "$headers->{Redirect}[1]{Redirect}" will then contain + the original response, and so on. + + If the server sends a header multiple times, then their contents + will be joined together with a comma (","), as per the HTTP spec. + + If an internal error occurs, such as not being able to resolve a + hostname, then $data will be "undef", "$headers->{Status}" will be + "59x" (usually 599) and the "Reason" pseudo-header will contain an + error message. + + A typical callback might look like this: + + sub { + my ($body, $hdr) = @_; + + if ($hdr->{Status} =~ /^2/) { + ... everything should be ok + } else { + print "error, $hdr->{Status} $hdr->{Reason}\n"; + } + } + + Additional parameters are key-value pairs, and are fully optional. + They include: + + recurse => $count (default: $MAX_RECURSE) + Whether to recurse requests or not, e.g. on redirects, + authentication retries and so on, and how often to do so. + + headers => hashref + The request headers to use. Currently, "http_request" may + provide its own "Host:", "Content-Length:", "Connection:" and + "Cookie:" headers and will provide defaults for "User-Agent:" + and "Referer:" (this can be suppressed by using "undef" for + these headers in which case they won't be sent at all). + + timeout => $seconds + The time-out to use for various stages - each connect attempt + will reset the timeout, as will read or write activity, i.e. + this is not an overall timeout. + + Default timeout is 5 minutes. + + proxy => [$host, $port[, $scheme]] or undef + Use the given http proxy for all requests. If not specified, + then the default proxy (as specified by $ENV{http_proxy}) is + used. 
+ + $scheme must be either missing, "http" for HTTP or "https" for + HTTPS. + + body => $string + The request body, usually empty. Will be sent as-is (future + versions of this module might offer more options). + + cookie_jar => $hash_ref + Passing this parameter enables (simplified) cookie-processing, + loosely based on the original netscape specification. + + The $hash_ref must be an (initially empty) hash reference which + will get updated automatically. It is possible to save the + cookie_jar to persistent storage with something like JSON or + Storable, but this is not recommended, as expiry times are + currently being ignored. + + Note that this cookie implementation is not of very high + quality, nor meant to be complete. If you want complete cookie + management you have to do that on your own. "cookie_jar" is + meant as a quick fix to get some cookie-using sites working. + Cookies are a privacy disaster, do not use them unless required + to. + + tls_ctx => $scheme | $tls_ctx + Specifies the AnyEvent::TLS context to be used for https + connections. This parameter follows the same rules as the + "tls_ctx" parameter to AnyEvent::Handle, but additionally, the + two strings "low" or "high" can be specified, which give you a + predefined low-security (no verification, highest compatibility) + and high-security (CA and common-name verification) TLS context. + + The default for this option is "low", which could be interpreted + as "give me the page, no matter what". + + on_prepare => $callback->($fh) + In rare cases you need to "tune" the socket before it is used to + connect (for example, to bind it on a given IP address). This + parameter overrides the prepare callback passed to + "AnyEvent::Socket::tcp_connect" and behaves exactly the same way + (e.g. it has to provide a timeout). See the description for the + $prepare_cb argument of "AnyEvent::Socket::tcp_connect" for + details. 
+ + tcp_connect => $callback->($host, $service, $connect_cb, + $prepare_cb) + In even rarer cases you want total control over how + AnyEvent::HTTP establishes connections. Normally it uses + AnyEvent::Socket::tcp_connect to do this, but you can provide + your own "tcp_connect" function - obviously, it has to follow + the same calling conventions, except that it may always return a + connection guard object. + + There are probably lots of weird uses for this function, + starting from tracing the hosts "http_request" actually tries to + connect, to (inexact but fast) host => IP address caching or + even socks protocol support. + + on_header => $callback->($headers) + When specified, this callback will be called with the header + hash as soon as headers have been successfully received from the + remote server (not on locally-generated errors). + + It has to return either true (in which case AnyEvent::HTTP will + continue), or false, in which case AnyEvent::HTTP will cancel + the download (and call the finish callback with an error code of + 598). + + This callback is useful, among other things, to quickly reject + unwanted content, which, if it is supposed to be rare, can be + faster than first doing a "HEAD" request. + + Example: cancel the request unless the content-type is + "text/html". + + on_header => sub { + $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/ + }, + + on_body => $callback->($partial_body, $headers) + When specified, all body data will be passed to this callback + instead of to the completion callback. The completion callback + will get the empty string instead of the body data. + + It has to return either true (in which case AnyEvent::HTTP will + continue), or false, in which case AnyEvent::HTTP will cancel + the download (and call the completion callback with an error + code of 598). 
+ + This callback is useful when the data is too large to be held in + memory (so the callback writes it to a file) or when only some + information should be extracted, or when the body should be + processed incrementally. + + It is usually preferred over doing your own body handling via + "want_body_handle", but in case of streaming APIs, where HTTP is + only used to create a connection, "want_body_handle" is the + better alternative, as it allows you to install your own event + handler, reducing resource usage. + + want_body_handle => $enable + When enabled (default is disabled), the behaviour of + AnyEvent::HTTP changes considerably: after parsing the headers, + and instead of downloading the body (if any), the completion + callback will be called. Instead of the $body argument + containing the body data, the callback will receive the + AnyEvent::Handle object associated with the connection. In error + cases, "undef" will be passed. When there is no body (e.g. + status 304), the empty string will be passed. + + The handle object might or might not be in TLS mode, might be + connected to a proxy, be a persistent connection etc., and + configured in unspecified ways. The user is responsible for this + handle (it will not be used by this module anymore). + + This is useful with some push-type services, where, after the + initial headers, an interactive protocol is used (typical + example would be the push-style twitter API which starts a + JSON/XML stream). + + If you think you need this, first have a look at "on_body", to + see if that doesn't solve your problem in a better way. + + Example: make a simple HTTP GET request for http://www.nethype.de/ + + http_request GET => "http://www.nethype.de/", sub { + my ($body, $hdr) = @_; + print "$body\n"; + }; + + Example: make a HTTP HEAD request on https://www.google.com/, use a + timeout of 30 seconds. 
+ + http_request + HEAD => "https://www.google.com", + timeout => 30, + sub { + my ($body, $hdr) = @_; + use Data::Dumper; + print Dumper $hdr; + } + ; + + Example: make another simple HTTP GET request, but immediately try + to cancel it. + + my $request = http_request GET => "http://www.nethype.de/", sub { + my ($body, $hdr) = @_; + print "$body\n"; + }; + + undef $request; + + DNS CACHING + AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for the + actual connection, which in turn uses AnyEvent::DNS to resolve + hostnames. The latter is a simple stub resolver and does no caching on + its own. If you want DNS caching, you currently have to provide your own + default resolver (by storing a suitable resolver object in + $AnyEvent::DNS::RESOLVER). + + GLOBAL FUNCTIONS AND VARIABLES + AnyEvent::HTTP::set_proxy "proxy-url" + Sets the default proxy server to use. The proxy-url must begin with + a string of the form "http://host:port" (optionally "https:..."), + croaks otherwise. + + To clear an already-set proxy, use "undef". + + $date = AnyEvent::HTTP::format_date $timestamp + Takes a POSIX timestamp (seconds since the epoch) and formats it as + a HTTP Date (RFC 2616). + + $timestamp = AnyEvent::HTTP::parse_date $date + Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX + timestamp, or "undef" if the date cannot be parsed. + + $AnyEvent::HTTP::MAX_RECURSE + The default value for the "recurse" request parameter (default: 10). + + $AnyEvent::HTTP::USERAGENT + The default value for the "User-Agent" header (the default is + "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; + +http://software.schmorp.de/pkg/AnyEvent)"). + + $AnyEvent::HTTP::MAX_PER_HOST + The maximum number of concurrent connections to the same host + (identified by the hostname). If the limit is exceeded, then the + additional requests are queued until previous connections are + closed. + + The default value for this is 4, and it is highly advisable to not + increase it. 
+ + $AnyEvent::HTTP::ACTIVE + The number of active connections. This is not the number of + currently running requests, but the number of currently open and + non-idle TCP connections. This number can be useful for + load-leveling. + + SOCKS PROXIES + Socks proxies are not directly supported by AnyEvent::HTTP. You can + compile your perl to support socks, or use an external program such as + socksify (dante) or tsocks to make your program use a socks proxy + transparently. + + Alternatively, for AnyEvent::HTTP only, you can use your own + "tcp_connect" function that does the proxy handshake - here is an + example that works with socks4a proxies: + + use Errno; + use AnyEvent::Util; + use AnyEvent::Socket; + use AnyEvent::Handle; + + # host, port and username of/for your socks4a proxy + my $socks_host = "10.0.0.23"; + my $socks_port = 9050; + my $socks_user = ""; + + sub socks4a_connect { + my ($host, $port, $connect_cb, $prepare_cb) = @_; + + my $hdl = new AnyEvent::Handle + connect => [$socks_host, $socks_port], + on_prepare => sub { $prepare_cb->($_[0]{fh}) }, + on_error => sub { $connect_cb->() }, + ; + + $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host); + + $hdl->push_read (chunk => 8, sub { + my ($hdl, $chunk) = @_; + my ($status, $port, $ipn) = unpack "xCna4", $chunk; + + if ($status == 0x5a) { + $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port"); + } else { + $! = Errno::ENXIO; $connect_cb->(); + } + }); + + $hdl + } + + Use "socks4a_connect" instead of "tcp_connect" when doing + "http_request"s, possibly after switching off other proxy types: + + AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies + + http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub { + my ($data, $headers) = @_; + ... + }; SEE ALSO - AnyEvent, Coro::AIO (for a more natural syntax). + AnyEvent. 
AUTHOR - Marc Lehmann - http://home.schmorp.de/ + Marc Lehmann + http://home.schmorp.de/ + + With many thanks to Дмитрий Шалашов, who provided + countless testcases and bugreports.