1 | NAME |
1 | NAME |
2 | AnyEvent::AIO - truly asynchronous file and directory I/O |
2 | AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client |
3 | |
3 | |
4 | SYNOPSIS |
4 | SYNOPSIS |
5 | use AnyEvent::AIO; |
5 | use AnyEvent::HTTP; |
6 | use IO::AIO; |
|
|
7 | |
6 | |
8 | # can now use any of the aio requests your IO::AIO module supports |
7 | http_get "http://www.nethype.de/", sub { print $_[1] }; |
9 | # as long as you use an event loop supported by AnyEvent. |
8 | |
|
|
9 | # ... do something else here |
10 | |
10 | |
11 | DESCRIPTION |
11 | DESCRIPTION |
12 | This module is an AnyEvent user, you need to make sure that you use and |
12 | This module is an AnyEvent user, you need to make sure that you use and |
13 | run a supported event loop. |
13 | run a supported event loop. |
14 | |
14 | |
15 | Loading this module will install the necessary magic to seamlessly |
15 | This module implements a simple, stateless and non-blocking HTTP client. |
16 | integrate IO::AIO into AnyEvent, i.e. you no longer need to concern |
16 | It supports GET, POST and other request methods, cookies and more, all |
17 | yourself with calling "IO::AIO::poll_cb" or any of that stuff (you still |
17 | on a very low level. It can follow redirects, supports proxies and |
18 | can, but this module will do it in case you don't). |
18 | automatically limits the number of connections to the values specified |
|
|
19 | in the RFC. |
19 | |
20 | |
20 | The AnyEvent watcher can be disabled by executing "undef |
21 | It should generally be a "good client" that is enough for most HTTP |
21 | $AnyEvent::AIO::WATCHER". Please notify the author of when and why you |
22 | tasks. Simple tasks should be simple, but complex tasks should still be |
22 | think this was necessary. |
23 | possible as the user retains control over request and response headers. |
|
|
24 | |
|
|
25 | The caller is responsible for authentication management, cookies (if the |
|
|
26 | simplistic implementation in this module doesn't suffice), referer and |
|
|
27 | other high-level protocol details for which this module offers only |
|
|
28 | limited support. |
|
|
29 | |
|
|
30 | METHODS |
|
|
31 | http_get $url, key => value..., $cb->($data, $headers) |
|
|
32 | Executes an HTTP-GET request. See the http_request function for |
|
|
33 | details on additional parameters and the return value. |
|
|
34 | |
|
|
35 | http_head $url, key => value..., $cb->($data, $headers) |
|
|
36 | Executes an HTTP-HEAD request. See the http_request function for |
|
|
37 | details on additional parameters and the return value. |
|
|
38 | |
|
|
39 | http_post $url, $body, key => value..., $cb->($data, $headers) |
|
|
40 | Executes an HTTP-POST request with a request body of $body. See the |
|
|
41 | http_request function for details on additional parameters and the |
|
|
42 | return value. |
|
|
43 | |
|
|
44 | http_request $method => $url, key => value..., $cb->($data, $headers) |
|
|
45 | Executes an HTTP request of type $method (e.g. "GET", "POST"). The |
|
|
46 | URL must be an absolute http or https URL. |
|
|
47 | |
|
|
48 | When called in void context, nothing is returned. In other contexts, |
|
|
49 | "http_request" returns a "cancellation guard" - you have to keep the |
|
|
50 | object alive at least until the callback gets called. If the object |
|
|
51 | gets destroyed before the callback is called, the request will be |
|
|
52 | cancelled. |
|
|
53 | |
|
|
54 | The callback will be called with the response body data as first |
|
|
55 | argument (or "undef" if an error occurred), and a hash-ref with |
|
|
56 | response headers as second argument. |
|
|
57 | |
|
|
58 | All the headers in that hash are lowercased. In addition to the |
|
|
59 | response headers, the "pseudo-headers" (uppercase to avoid clashing |
|
|
60 | with possible response headers) "HTTPVersion", "Status" and "Reason" |
|
|
61 | contain the three parts of the HTTP Status-Line of the same name. If |
|
|
62 | an error occurs during the body phase of a request, then the |
|
|
63 | original "Status" and "Reason" values from the header are available |
|
|
64 | as "OrigStatus" and "OrigReason". |
|
|
65 | |
|
|
66 | The pseudo-header "URL" contains the actual URL (which can differ |
|
|
67 | from the requested URL when following redirects - for example, you |
|
|
68 | might get an error that your URL scheme is not supported even though |
|
|
69 | your URL is a valid http URL because it redirected to an ftp URL, in |
|
|
70 | which case you can look at the URL pseudo header). |
|
|
71 | |
|
|
72 | The pseudo-header "Redirect" only exists when the request was a |
|
|
73 | result of an internal redirect. In that case it is an array |
|
|
74 | reference with the "($data, $headers)" from the redirect response. |
|
|
75 | Note that this response could in turn be the result of a redirect |
|
|
76 | itself, and "$headers->{Redirect}[1]{Redirect}" will then contain |
|
|
77 | the original response, and so on. |
|
|
78 | |
|
|
79 | If the server sends a header multiple times, then their contents |
|
|
80 | will be joined together with a comma (","), as per the HTTP spec. |
|
|
81 | |
|
|
82 | If an internal error occurs, such as not being able to resolve a |
|
|
83 | hostname, then $data will be "undef", "$headers->{Status}" will be |
|
|
84 | "59x" (usually 599) and the "Reason" pseudo-header will contain an |
|
|
85 | error message. |
|
|
86 | |
|
|
87 | A typical callback might look like this: |
|
|
88 | |
|
|
89 | sub { |
|
|
90 | my ($body, $hdr) = @_; |
|
|
91 | |
|
|
92 | if ($hdr->{Status} =~ /^2/) { |
|
|
93 | ... everything should be ok |
|
|
94 | } else { |
|
|
95 | print "error, $hdr->{Status} $hdr->{Reason}\n"; |
|
|
96 | } |
|
|
97 | } |
|
|
98 | |
|
|
99 | Additional parameters are key-value pairs, and are fully optional. |
|
|
100 | They include: |
|
|
101 | |
|
|
102 | recurse => $count (default: $MAX_RECURSE) |
|
|
103 | Whether to recurse requests or not, e.g. on redirects, |
|
|
104 | authentication retries and so on, and how often to do so. |
|
|
105 | |
|
|
106 | headers => hashref |
|
|
107 | The request headers to use. Currently, "http_request" may |
|
|
108 | provide its own "Host:", "Content-Length:", "Connection:" and |
|
|
109 | "Cookie:" headers and will provide defaults for "User-Agent:" |
|
|
110 | and "Referer:" (this can be suppressed by using "undef" for |
|
|
111 | these headers in which case they won't be sent at all). |
|
|
112 | |
|
|
113 | timeout => $seconds |
|
|
114 | The time-out to use for various stages - each connect attempt |
|
|
115 | will reset the timeout, as will read or write activity, i.e. |
|
|
116 | this is not an overall timeout. |
|
|
117 | |
|
|
118 | Default timeout is 5 minutes. |
|
|
119 | |
|
|
120 | proxy => [$host, $port[, $scheme]] or undef |
|
|
121 | Use the given http proxy for all requests. If not specified, |
|
|
122 | then the default proxy (as specified by $ENV{http_proxy}) is |
|
|
123 | used. |
|
|
124 | |
|
|
125 | $scheme must be either missing, "http" for HTTP or "https" for |
|
|
126 | HTTPS. |
|
|
127 | |
|
|
128 | body => $string |
|
|
129 | The request body, usually empty. Will be sent as-is (future |
|
|
130 | versions of this module might offer more options). |
|
|
131 | |
|
|
132 | cookie_jar => $hash_ref |
|
|
133 | Passing this parameter enables (simplified) cookie-processing, |
|
|
134 | loosely based on the original Netscape specification. |
|
|
135 | |
|
|
136 | The $hash_ref must be an (initially empty) hash reference which |
|
|
137 | will get updated automatically. It is possible to save the |
|
|
138 | cookie_jar to persistent storage with something like JSON or |
|
|
139 | Storable, but this is not recommended, as expiry times are |
|
|
140 | currently being ignored. |
|
|
141 | |
|
|
142 | Note that this cookie implementation is not of very high |
|
|
143 | quality, nor meant to be complete. If you want complete cookie |
|
|
144 | management you have to do that on your own. "cookie_jar" is |
|
|
145 | meant as a quick fix to get some cookie-using sites working. |
|
|
146 | Cookies are a privacy disaster, do not use them unless required |
|
|
147 | to. |
|
|
148 | |
|
|
149 | tls_ctx => $scheme | $tls_ctx |
|
|
150 | Specifies the AnyEvent::TLS context to be used for https |
|
|
151 | connections. This parameter follows the same rules as the |
|
|
152 | "tls_ctx" parameter to AnyEvent::Handle, but additionally, the |
|
|
153 | two strings "low" or "high" can be specified, which give you a |
|
|
154 | predefined low-security (no verification, highest compatibility) |
|
|
155 | and high-security (CA and common-name verification) TLS context. |
|
|
156 | |
|
|
157 | The default for this option is "low", which could be interpreted |
|
|
158 | as "give me the page, no matter what". |
|
|
159 | |
|
|
160 | on_prepare => $callback->($fh) |
|
|
161 | In rare cases you need to "tune" the socket before it is used to |
|
|
162 | connect (for example, to bind it on a given IP address). This |
|
|
163 | parameter overrides the prepare callback passed to |
|
|
164 | "AnyEvent::Socket::tcp_connect" and behaves exactly the same way |
|
|
165 | (e.g. it has to provide a timeout). See the description for the |
|
|
166 | $prepare_cb argument of "AnyEvent::Socket::tcp_connect" for |
|
|
167 | details. |
|
|
168 | |
|
|
169 | tcp_connect => $callback->($host, $service, $connect_cb, |
|
|
170 | $prepare_cb) |
|
|
171 | In even rarer cases you want total control over how |
|
|
172 | AnyEvent::HTTP establishes connections. Normally it uses |
|
|
173 | AnyEvent::Socket::tcp_connect to do this, but you can provide |
|
|
174 | your own "tcp_connect" function - obviously, it has to follow |
|
|
175 | the same calling conventions, except that it may always return a |
|
|
176 | connection guard object. |
|
|
177 | |
|
|
178 | There are probably lots of weird uses for this function, |
|
|
179 | starting from tracing the hosts "http_request" actually tries to |
|
|
180 | connect, to (inexact but fast) host => IP address caching or |
|
|
181 | even socks protocol support. |
|
|
182 | |
|
|
183 | on_header => $callback->($headers) |
|
|
184 | When specified, this callback will be called with the header |
|
|
185 | hash as soon as headers have been successfully received from the |
|
|
186 | remote server (not on locally-generated errors). |
|
|
187 | |
|
|
188 | It has to return either true (in which case AnyEvent::HTTP will |
|
|
189 | continue), or false, in which case AnyEvent::HTTP will cancel |
|
|
190 | the download (and call the finish callback with an error code of |
|
|
191 | 598). |
|
|
192 | |
|
|
193 | This callback is useful, among other things, to quickly reject |
|
|
194 | unwanted content, which, if it is supposed to be rare, can be |
|
|
195 | faster than first doing a "HEAD" request. |
|
|
196 | |
|
|
197 | Example: cancel the request unless the content-type is |
|
|
198 | "text/html". |
|
|
199 | |
|
|
200 | on_header => sub { |
|
|
201 | $_[0]{"content-type"} =~ /^text\/html\s*(?:;|$)/ |
|
|
202 | }, |
|
|
203 | |
|
|
204 | on_body => $callback->($partial_body, $headers) |
|
|
205 | When specified, all body data will be passed to this callback |
|
|
206 | instead of to the completion callback. The completion callback |
|
|
207 | will get the empty string instead of the body data. |
|
|
208 | |
|
|
209 | It has to return either true (in which case AnyEvent::HTTP will |
|
|
210 | continue), or false, in which case AnyEvent::HTTP will cancel |
|
|
211 | the download (and call the completion callback with an error |
|
|
212 | code of 598). |
|
|
213 | |
|
|
214 | This callback is useful when the data is too large to be held in |
|
|
215 | memory (so the callback writes it to a file) or when only some |
|
|
216 | information should be extracted, or when the body should be |
|
|
217 | processed incrementally. |
|
|
218 | |
|
|
219 | It is usually preferred over doing your own body handling via |
|
|
220 | "want_body_handle", but in case of streaming APIs, where HTTP is |
|
|
221 | only used to create a connection, "want_body_handle" is the |
|
|
222 | better alternative, as it allows you to install your own event |
|
|
223 | handler, reducing resource usage. |
|
|
224 | |
|
|
225 | want_body_handle => $enable |
|
|
226 | When enabled (default is disabled), the behaviour of |
|
|
227 | AnyEvent::HTTP changes considerably: after parsing the headers, |
|
|
228 | and instead of downloading the body (if any), the completion |
|
|
229 | callback will be called. Instead of the $body argument |
|
|
230 | containing the body data, the callback will receive the |
|
|
231 | AnyEvent::Handle object associated with the connection. In error |
|
|
232 | cases, "undef" will be passed. When there is no body (e.g. |
|
|
233 | status 304), the empty string will be passed. |
|
|
234 | |
|
|
235 | The handle object might or might not be in TLS mode, might be |
|
|
236 | connected to a proxy, be a persistent connection etc., and |
|
|
237 | configured in unspecified ways. The user is responsible for this |
|
|
238 | handle (it will not be used by this module anymore). |
|
|
239 | |
|
|
240 | This is useful with some push-type services, where, after the |
|
|
241 | initial headers, an interactive protocol is used (typical |
|
|
242 | example would be the push-style twitter API which starts a |
|
|
243 | JSON/XML stream). |
|
|
244 | |
|
|
245 | If you think you need this, first have a look at "on_body", to |
|
|
246 | see if that doesn't solve your problem in a better way. |
|
|
247 | |
|
|
248 | Example: make a simple HTTP GET request for http://www.nethype.de/ |
|
|
249 | |
|
|
250 | http_request GET => "http://www.nethype.de/", sub { |
|
|
251 | my ($body, $hdr) = @_; |
|
|
252 | print "$body\n"; |
|
|
253 | }; |
|
|
254 | |
|
|
255 | Example: make an HTTP HEAD request on https://www.google.com/, use a |
|
|
256 | timeout of 30 seconds. |
|
|
257 | |
|
|
258 | http_request |
|
|
259 | HEAD => "https://www.google.com", |
|
|
260 | timeout => 30, |
|
|
261 | sub { |
|
|
262 | my ($body, $hdr) = @_; |
|
|
263 | use Data::Dumper; |
|
|
264 | print Dumper $hdr; |
|
|
265 | } |
|
|
266 | ; |
|
|
267 | |
|
|
268 | Example: make another simple HTTP GET request, but immediately try |
|
|
269 | to cancel it. |
|
|
270 | |
|
|
271 | my $request = http_request GET => "http://www.nethype.de/", sub { |
|
|
272 | my ($body, $hdr) = @_; |
|
|
273 | print "$body\n"; |
|
|
274 | }; |
|
|
275 | |
|
|
276 | undef $request; |
|
|
277 | |
|
|
278 | DNS CACHING |
|
|
279 | AnyEvent::HTTP uses the AnyEvent::Socket::tcp_connect function for the |
|
|
280 | actual connection, which in turn uses AnyEvent::DNS to resolve |
|
|
281 | hostnames. The latter is a simple stub resolver and does no caching on |
|
|
282 | its own. If you want DNS caching, you currently have to provide your own |
|
|
283 | default resolver (by storing a suitable resolver object in |
|
|
284 | $AnyEvent::DNS::RESOLVER). |
|
|
285 | |
|
|
286 | GLOBAL FUNCTIONS AND VARIABLES |
|
|
287 | AnyEvent::HTTP::set_proxy "proxy-url" |
|
|
288 | Sets the default proxy server to use. The proxy-url must begin with |
|
|
289 | a string of the form "http://host:port" (optionally "https:..."), |
|
|
290 | croaks otherwise. |
|
|
291 | |
|
|
292 | To clear an already-set proxy, use "undef". |
|
|
293 | |
|
|
294 | $date = AnyEvent::HTTP::format_date $timestamp |
|
|
295 | Takes a POSIX timestamp (seconds since the epoch) and formats it as |
|
|
296 | a HTTP Date (RFC 2616). |
|
|
297 | |
|
|
298 | $timestamp = AnyEvent::HTTP::parse_date $date |
|
|
299 | Takes a HTTP Date (RFC 2616) and returns the corresponding POSIX |
|
|
300 | timestamp, or "undef" if the date cannot be parsed. |
|
|
301 | |
|
|
302 | $AnyEvent::HTTP::MAX_RECURSE |
|
|
303 | The default value for the "recurse" request parameter (default: 10). |
|
|
304 | |
|
|
305 | $AnyEvent::HTTP::USERAGENT |
|
|
306 | The default value for the "User-Agent" header (the default is |
|
|
307 | "Mozilla/5.0 (compatible; U; AnyEvent-HTTP/$VERSION; |
|
|
308 | +http://software.schmorp.de/pkg/AnyEvent)"). |
|
|
309 | |
|
|
310 | $AnyEvent::HTTP::MAX_PER_HOST |
|
|
311 | The maximum number of concurrent connections to the same host |
|
|
312 | (identified by the hostname). If the limit is exceeded, then the |
|
|
313 | additional requests are queued until previous connections are |
|
|
314 | closed. |
|
|
315 | |
|
|
316 | The default value for this is 4, and it is highly advisable to not |
|
|
317 | increase it. |
|
|
318 | |
|
|
319 | $AnyEvent::HTTP::ACTIVE |
|
|
320 | The number of active connections. This is not the number of |
|
|
321 | currently running requests, but the number of currently open and |
|
|
322 | non-idle TCP connections. This number can be useful for |
|
|
323 | load-leveling. |
|
|
324 | |
|
|
325 | SOCKS PROXIES |
|
|
326 | Socks proxies are not directly supported by AnyEvent::HTTP. You can |
|
|
327 | compile your perl to support socks, or use an external program such as |
|
|
328 | socksify (dante) or tsocks to make your program use a socks proxy |
|
|
329 | transparently. |
|
|
330 | |
|
|
331 | Alternatively, for AnyEvent::HTTP only, you can use your own |
|
|
332 | "tcp_connect" function that does the proxy handshake - here is an |
|
|
333 | example that works with socks4a proxies: |
|
|
334 | |
|
|
335 | use Errno; |
|
|
336 | use AnyEvent::Util; |
|
|
337 | use AnyEvent::Socket; |
|
|
338 | use AnyEvent::Handle; |
|
|
339 | |
|
|
340 | # host, port and username of/for your socks4a proxy |
|
|
341 | my $socks_host = "10.0.0.23"; |
|
|
342 | my $socks_port = 9050; |
|
|
343 | my $socks_user = ""; |
|
|
344 | |
|
|
345 | sub socks4a_connect { |
|
|
346 | my ($host, $port, $connect_cb, $prepare_cb) = @_; |
|
|
347 | |
|
|
348 | my $hdl = new AnyEvent::Handle |
|
|
349 | connect => [$socks_host, $socks_port], |
|
|
350 | on_prepare => sub { $prepare_cb->($_[0]{fh}) }, |
|
|
351 | on_error => sub { $connect_cb->() }, |
|
|
352 | ; |
|
|
353 | |
|
|
354 | $hdl->push_write (pack "CCnNZ*Z*", 4, 1, $port, 1, $socks_user, $host); |
|
|
355 | |
|
|
356 | $hdl->push_read (chunk => 8, sub { |
|
|
357 | my ($hdl, $chunk) = @_; |
|
|
358 | my ($status, $port, $ipn) = unpack "xCna4", $chunk; |
|
|
359 | |
|
|
360 | if ($status == 0x5a) { |
|
|
361 | $connect_cb->($hdl->{fh}, (format_address $ipn) . ":$port"); |
|
|
362 | } else { |
|
|
363 | $! = Errno::ENXIO; $connect_cb->(); |
|
|
364 | } |
|
|
365 | }); |
|
|
366 | |
|
|
367 | $hdl |
|
|
368 | } |
|
|
369 | |
|
|
370 | Use "socks4a_connect" instead of "tcp_connect" when doing |
|
|
371 | "http_request"s, possibly after switching off other proxy types: |
|
|
372 | |
|
|
373 | AnyEvent::HTTP::set_proxy undef; # usually you do not want other proxies |
|
|
374 | |
|
|
375 | http_get 'http://www.google.com', tcp_connect => \&socks4a_connect, sub { |
|
|
376 | my ($data, $headers) = @_; |
|
|
377 | ... |
|
|
378 | }; |
23 | |
379 | |
24 | SEE ALSO |
380 | SEE ALSO |
25 | AnyEvent, Coro::AIO (for a more natural syntax). |
381 | AnyEvent. |
26 | |
382 | |
27 | AUTHOR |
383 | AUTHOR |
28 | Marc Lehmann <schmorp@schmorp.de> |
384 | Marc Lehmann <schmorp@schmorp.de> |
29 | http://home.schmorp.de/ |
385 | http://home.schmorp.de/ |
30 | |
386 | |
|
|
387 | With many thanks to Дмитрий Шалашов, who provided |
|
|
388 | countless test cases and bug reports. |
|
|
389 | |