ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
Revision: 1.10
Committed: Thu Jun 5 13:06:43 2008 UTC (15 years, 11 months ago) by root
Branch: MAIN
Changes since 1.9: +102 -36 lines
Log Message:
*** empty log message ***

File Contents

# Content
1 =head1 NAME
2
3 AnyEvent::HTTP - simple but non-blocking HTTP/HTTPS client
4
5 =head1 SYNOPSIS
6
7 use AnyEvent::HTTP;
8
9 =head1 DESCRIPTION
10
11 This module is an L<AnyEvent> user, you need to make sure that you use and
12 run a supported event loop.
13
14 =head2 METHODS
15
16 =over 4
17
18 =cut
19
20 package AnyEvent::HTTP;
21
22 use strict;
23 no warnings;
24
25 use Carp;
26
27 use AnyEvent ();
28 use AnyEvent::Util ();
29 use AnyEvent::Socket ();
30 use AnyEvent::Handle ();
31
32 use base Exporter::;
33
34 our $VERSION = '1.0';
35
36 our @EXPORT = qw(http_get http_request);
37
38 our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
39 our $MAX_RECURSE = 10;
40 our $MAX_PERSISTENT = 8;
41 our $PERSISTENT_TIMEOUT = 2;
42 our $TIMEOUT = 300;
43
44 # changing these is evil
45 our $MAX_PERSISTENT_PER_HOST = 2;
46 our $MAX_PER_HOST = 4; # not respected yet :(
47
48 our $PROXY;
49
50 my %KA_COUNT; # number of open keep-alive connections per host
51
52 =item http_get $url, key => value..., $cb->($data, $headers)
53
54 Executes an HTTP-GET request. See the http_request function for details on
55 additional parameters.
56
57 =item http_head $url, key => value..., $cb->($data, $headers)
58
59 Executes an HTTP-HEAD request. See the http_request function for details on
60 additional parameters.
61
62 =item http_post $url, $body, key => value..., $cb->($data, $headers)
63
64 Executes an HTTP-POST request with a request body of C<$bod>. See the
65 http_request function for details on additional parameters.
66
67 =item http_request $method => $url, key => value..., $cb->($data, $headers)
68
69 Executes a HTTP request of type C<$method> (e.g. C<GET>, C<POST>). The URL
70 must be an absolute http or https URL.
71
72 The callback will be called with the response data as first argument
73 (or C<undef> if it wasn't available due to errors), and a hash-ref with
74 response headers as second argument.
75
76 All the headers in that hash are lowercased. In addition to the response
77 headers, the three "pseudo-headers" C<HTTPVersion>, C<Status> and
78 C<Reason> contain the three parts of the HTTP Status-Line of the same
79 name. If the server sends a header multiple lines, then their contents
80 will be joined together with C<\x00>.
81
82 If an internal error occurs, such as not being able to resolve a hostname,
83 then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
84 and the C<Reason> pseudo-header will contain an error message.
85
86 A typical callback might look like this:
87
88 sub {
89 my ($body, $hdr) = @_;
90
91 if ($hdr->{Status} =~ /^2/) {
92 ... everything should be ok
93 } else {
94 print "error, $hdr->{Status} $hdr->{Reason}\n";
95 }
96 }
97
98 Additional parameters are key-value pairs, and are fully optional. They
99 include:
100
101 =over 4
102
103 =item recurse => $count (default: $MAX_RECURSE)
104
105 Whether to recurse requests or not, e.g. on redirects, authentication
106 retries and so on, and how often to do so.
107
108 =item headers => hashref
109
110 The request headers to use.
111
112 =item timeout => $seconds
113
114 The time-out to use for various stages - each connect attempt will reset
115 the timeout, as will read or write activity. Default timeout is 5 minutes.
116
117 =item proxy => [$host, $port[, $scheme]] or undef
118
119 Use the given http proxy for all requests. If not specified, then the
120 default proxy (as specified by C<$ENV{http_proxy}>) is used.
121
122 C<$scheme> must be either missing or C<http> for HTTP, or C<https> for
123 HTTPS.
124
125 =item body => $string
126
127 The request body, usually empty. Will be-sent as-is (future versions of
128 this module might offer more options).
129
130 =item cookie_jar => $hash_ref
131
132 Passing this parameter enables (simplified) cookie-processing, loosely
133 based on the original netscape specification.
134
135 The C<$hash_ref> must be an (initially empty) hash reference which will
136 get updated automatically. It is possible to save the cookie_jar to
137 persistent storage with something like JSON or Storable, but this is not
138 recommended, as expire times are currently being ignored.
139
140 Note that this cookie implementation is not of very high quality, nor
141 meant to be complete. If you want complete cookie management you have to
142 do that on your own. C<cookie_jar> is meant as a quick fix to get some
143 cookie-using sites working. Cookies are a privacy disaster, do not use
144 them unless required to.
145
146 =back
147
148 Example: make a simple HTTP GET request for http://www.nethype.de/
149
150 http_request GET => "http://www.nethype.de/", sub {
151 my ($body, $hdr) = @_;
152 print "$body\n";
153 };
154
155 Example: make a HTTP HEAD request on https://www.google.com/, use a
156 timeout of 30 seconds.
157
158 http_request
159 GET => "https://www.google.com",
160 timeout => 30,
161 sub {
162 my ($body, $hdr) = @_;
163 use Data::Dumper;
164 print Dumper $hdr;
165 }
166 ;
167
168 =cut
169
170 sub http_request($$$;@) {
171 my $cb = pop;
172 my ($method, $url, %arg) = @_;
173
174 my %hdr;
175
176 $method = uc $method;
177
178 if (my $hdr = $arg{headers}) {
179 while (my ($k, $v) = each %$hdr) {
180 $hdr{lc $k} = $v;
181 }
182 }
183
184 my $recurse = exists $arg{recurse} ? $arg{recurse} : $MAX_RECURSE;
185
186 return $cb->(undef, { Status => 599, Reason => "recursion limit reached" })
187 if $recurse < 0;
188
189 my $proxy = $arg{proxy} || $PROXY;
190 my $timeout = $arg{timeout} || $TIMEOUT;
191
192 $hdr{"user-agent"} ||= $USERAGENT;
193
194 my ($scheme, $authority, $upath, $query, $fragment) =
195 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
196
197 $scheme = lc $scheme;
198
199 my $uport = $scheme eq "http" ? 80
200 : $scheme eq "https" ? 443
201 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported" });
202
203 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
204 or return $cb->(undef, { Status => 599, Reason => "unparsable URL" });
205
206 my $uhost = $1;
207 $uport = $2 if defined $2;
208
209 $uhost =~ s/^\[(.*)\]$/$1/;
210 $upath .= "?$query" if length $query;
211
212 $upath =~ s%^/?%/%;
213
214 # cookie processing
215 if (my $jar = $arg{cookie_jar}) {
216 %$jar = () if $jar->{version} < 1;
217
218 my @cookie;
219
220 while (my ($chost, $v) = each %$jar) {
221 next unless $chost eq substr $uhost, -length $chost;
222 next unless $chost =~ /^\./;
223
224 while (my ($cpath, $v) = each %$v) {
225 next unless $cpath eq substr $upath, 0, length $cpath;
226
227 while (my ($k, $v) = each %$v) {
228 next if $scheme ne "https" && exists $v->{secure};
229 push @cookie, "$k=$v->{value}";
230 }
231 }
232 }
233
234 $hdr{cookie} = join "; ", @cookie
235 if @cookie;
236 }
237
238 my ($rhost, $rport, $rpath); # request host, port, path
239
240 if ($proxy) {
241 ($rhost, $rport, $scheme) = @$proxy;
242 $rpath = $url;
243 } else {
244 ($rhost, $rport, $rpath) = ($uhost, $uport, $upath);
245 $hdr{host} = $uhost;
246 }
247
248 $hdr{"content-length"} = length $arg{body};
249
250 my %state;
251
252 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
253 $state{fh} = shift
254 or return $cb->(undef, { Status => 599, Reason => "$!" });
255
256 delete $state{connect_guard}; # reduce memory usage, save a tree
257
258 # get handle
259 $state{handle} = new AnyEvent::Handle
260 fh => $state{fh},
261 ($scheme eq "https" ? (tls => "connect") : ());
262
263 # limit the number of persistent connections
264 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
265 ++$KA_COUNT{$_[1]};
266 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} };
267 $hdr{connection} = "keep-alive";
268 delete $hdr{connection}; # keep-alive not yet supported
269 } else {
270 delete $hdr{connection};
271 }
272
273 # (re-)configure handle
274 $state{handle}->timeout ($timeout);
275 $state{handle}->on_error (sub {
276 %state = ();
277 $cb->(undef, { Status => 599, Reason => "$!" });
278 });
279 $state{handle}->on_eof (sub {
280 %state = ();
281 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file" });
282 });
283
284 # send request
285 $state{handle}->push_write (
286 "$method $rpath HTTP/1.0\015\012"
287 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr)
288 . "\015\012"
289 . (delete $arg{body})
290 );
291
292 %hdr = (); # reduce memory usage, save a kitten
293
294 # status line
295 $state{handle}->push_read (line => qr/\015?\012/, sub {
296 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) \s+ ([^\015\012]+)/ix
297 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])" }));
298
299 my %hdr = ( # response headers
300 HTTPVersion => "\x00$1",
301 Status => "\x00$2",
302 Reason => "\x00$3",
303 );
304
305 # headers, could be optimized a bit
306 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub {
307 for ("$_[1]\012") {
308 # we support spaces in field names, as lotus domino
309 # creates them.
310 $hdr{lc $1} .= "\x00$2"
311 while /\G
312 ([^:\000-\037]+):
313 [\011\040]*
314 ((?: [^\015\012]+ | \015?\012[\011\040] )*)
315 \015?\012
316 /gxc;
317
318 /\G$/
319 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers" }));
320 }
321
322 substr $_, 0, 1, ""
323 for values %hdr;
324
325 my $finish = sub {
326 %state = ();
327
328 # set-cookie processing
329 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) {
330 for (split /\x00/, $hdr{"set-cookie"}) {
331 my ($cookie, @arg) = split /;\s*/;
332 my ($name, $value) = split /=/, $cookie, 2;
333 my %kv = (value => $value, map { split /=/, $_, 2 } @arg);
334
335 my $cdom = (delete $kv{domain}) || $uhost;
336 my $cpath = (delete $kv{path}) || "/";
337
338 $cdom =~ s/^.?/./; # make sure it starts with a "."
339
340 my $ndots = $cdom =~ y/.//;
341 next if $ndots < ($cdom =~ /[^.]{3}$/ ? 2 : 3);
342
343 # store it
344 $arg{cookie_jar}{version} = 1;
345 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
346 }
347 }
348
349 if ($_[1]{Status} =~ /^x30[12]$/ && $recurse) {
350 # microsoft and other assholes don't give a shit for following standards,
351 # try to support a common form of broken Location header.
352 $_[1]{location} =~ s%^/%$scheme://$uhost:$uport/%;
353
354 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
355 } else {
356 $cb->($_[0], $_[1]);
357 }
358 };
359
360 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
361 $finish->(undef, \%hdr);
362 } else {
363 if (exists $hdr{"content-length"}) {
364 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
365 # could cache persistent connection now
366 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
367 # but we don't, due to misdesigns, this is annoyingly complex
368 };
369
370 $finish->($_[1], \%hdr);
371 });
372 } else {
373 # too bad, need to read until we get an error or EOF,
374 # no way to detect winged data.
375 $_[0]->on_error (sub {
376 $finish->($_[0]{rbuf}, \%hdr);
377 });
378 $_[0]->on_eof (undef);
379 $_[0]->on_read (sub { });
380 }
381 }
382 });
383 });
384 }, sub {
385 $timeout
386 };
387
388 defined wantarray && AnyEvent::Util::guard { %state = () }
389 }
390
391 sub http_get($$;@) {
392 unshift @_, "GET";
393 &http_request
394 }
395
396 sub http_head($$;@) {
397 unshift @_, "HEAD";
398 &http_request
399 }
400
401 sub http_post($$$;@) {
402 unshift @_, "POST", "body";
403 &http_request
404 }
405
406 =back
407
408 =head2 GLOBAL FUNCTIONS AND VARIABLES
409
410 =over 4
411
412 =item AnyEvent::HTTP::set_proxy "proxy-url"
413
414 Sets the default proxy server to use. The proxy-url must begin with a
415 string of the form C<http://host:port> (optionally C<https:...>).
416
417 =item $AnyEvent::HTTP::MAX_RECURSE
418
419 The default value for the C<recurse> request parameter (default: C<10>).
420
421 =item $AnyEvent::HTTP::USERAGENT
422
423 The default value for the C<User-Agent> header (the default is
424 C<Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)>).
425
426 =item $AnyEvent::HTTP::MAX_PERSISTENT
427
428 The maximum number of persistent connections to keep open (default: 8).
429
430 Not implemented currently.
431
432 =item $AnyEvent::HTTP::PERSISTENT_TIMEOUT
433
434 The maximum time to cache a persistent connection, in seconds (default: 2).
435
436 Not implemented currently.
437
438 =back
439
440 =cut
441
442 sub set_proxy($) {
443 $PROXY = [$2, $3 || 3128, $1] if $_[0] =~ m%^(https?):// ([^:/]+) (?: : (\d*) )?%ix;
444 }
445
446 # initialise proxy from environment
447 set_proxy $ENV{http_proxy};
448
449 =head1 SEE ALSO
450
451 L<AnyEvent>.
452
453 =head1 AUTHOR
454
455 Marc Lehmann <schmorp@schmorp.de>
456 http://home.schmorp.de/
457
458 =cut
459
460 1
461