ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
(Generate patch)

Comparing AnyEvent-HTTP/HTTP.pm (file contents):
Revision 1.30 by root, Thu Oct 23 02:46:20 2008 UTC vs.
Revision 1.36 by root, Fri Nov 21 06:56:05 2008 UTC

48use AnyEvent::Socket (); 48use AnyEvent::Socket ();
49use AnyEvent::Handle (); 49use AnyEvent::Handle ();
50 50
51use base Exporter::; 51use base Exporter::;
52 52
53our $VERSION = '1.05'; 53our $VERSION = '1.1';
54 54
55our @EXPORT = qw(http_get http_post http_head http_request); 55our @EXPORT = qw(http_get http_post http_head http_request);
56 56
57our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)"; 57our $USERAGENT = "Mozilla/5.0 (compatible; AnyEvent::HTTP/$VERSION; +http://software.schmorp.de/pkg/AnyEvent)";
58our $MAX_RECURSE = 10; 58our $MAX_RECURSE = 10;
104headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason> 104headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason>
105contain the three parts of the HTTP Status-Line of the same name. The 105contain the three parts of the HTTP Status-Line of the same name. The
106pseudo-header C<URL> contains the original URL (which can differ from the 106pseudo-header C<URL> contains the original URL (which can differ from the
107requested URL when following redirects). 107requested URL when following redirects).
108 108
109If the server sends a header multiple lines, then their contents will be 109If the server sends a header multiple times, then their contents will be
110joined together with C<\x00>. 110joined together with a comma (C<,>), as per the HTTP spec.
111 111
112If an internal error occurs, such as not being able to resolve a hostname, 112If an internal error occurs, such as not being able to resolve a hostname,
113then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599> 113then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
114and the C<Reason> pseudo-header will contain an error message. 114and the C<Reason> pseudo-header will contain an error message.
115 115
237 push @{ $CO_SLOT{$_[0]}[1] }, $_[1]; 237 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
238 238
239 _slot_schedule $_[0]; 239 _slot_schedule $_[0];
240} 240}
241 241
242our $qr_nl = qr<\015?\012>;
243our $qr_nlnl = qr<\015?\012\015?\012>;
244
242sub http_request($$@) { 245sub http_request($$@) {
243 my $cb = pop; 246 my $cb = pop;
244 my ($method, $url, %arg) = @_; 247 my ($method, $url, %arg) = @_;
245 248
246 my %hdr; 249 my %hdr;
261 my $proxy = $arg{proxy} || $PROXY; 264 my $proxy = $arg{proxy} || $PROXY;
262 my $timeout = $arg{timeout} || $TIMEOUT; 265 my $timeout = $arg{timeout} || $TIMEOUT;
263 266
264 $hdr{"user-agent"} ||= $USERAGENT; 267 $hdr{"user-agent"} ||= $USERAGENT;
265 268
266 my ($scheme, $authority, $upath, $query, $fragment) = 269 my ($uscheme, $uauthority, $upath, $query, $fragment) =
267 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|; 270 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
268 271
269 $scheme = lc $scheme; 272 $uscheme = lc $uscheme;
270 273
271 my $uport = $scheme eq "http" ? 80 274 my $uport = $uscheme eq "http" ? 80
272 : $scheme eq "https" ? 443 275 : $uscheme eq "https" ? 443
273 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url }); 276 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url });
274 277
275 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic 278 $hdr{referer} ||= "$uscheme://$uauthority$upath"; # leave out fragment and query string, just a heuristic
276 279
277 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x 280 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
278 or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url }); 281 or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url });
279 282
280 my $uhost = $1; 283 my $uhost = $1;
281 $uport = $2 if defined $2; 284 $uport = $2 if defined $2;
282 285
285 288
286 $upath =~ s%^/?%/%; 289 $upath =~ s%^/?%/%;
287 290
288 # cookie processing 291 # cookie processing
289 if (my $jar = $arg{cookie_jar}) { 292 if (my $jar = $arg{cookie_jar}) {
290 %$jar = () if $jar->{version} < 1; 293 %$jar = () if $jar->{version} != 1;
291 294
292 my @cookie; 295 my @cookie;
293 296
294 while (my ($chost, $v) = each %$jar) { 297 while (my ($chost, $v) = each %$jar) {
295 if ($chost =~ /^\./) { 298 if ($chost =~ /^\./) {
302 305
303 while (my ($cpath, $v) = each %$v) { 306 while (my ($cpath, $v) = each %$v) {
304 next unless $cpath eq substr $upath, 0, length $cpath; 307 next unless $cpath eq substr $upath, 0, length $cpath;
305 308
306 while (my ($k, $v) = each %$v) { 309 while (my ($k, $v) = each %$v) {
307 next if $scheme ne "https" && exists $v->{secure}; 310 next if $uscheme ne "https" && exists $v->{secure};
311 my $value = $v->{value};
312 $value =~ s/([\\"])/\\$1/g;
308 push @cookie, "$k=$v->{value}"; 313 push @cookie, "$k=\"$value\"";
309 } 314 }
310 } 315 }
311 } 316 }
312 317
313 $hdr{cookie} = join "; ", @cookie 318 $hdr{cookie} = join "; ", @cookie
314 if @cookie; 319 if @cookie;
315 } 320 }
316 321
317 my ($rhost, $rport, $rpath); # request host, port, path 322 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
318 323
319 if ($proxy) { 324 if ($proxy) {
320 ($rhost, $rport, $scheme) = @$proxy; 325 ($rhost, $rport, $rscheme, $rpath) = (@$proxy, $url);
321 $rpath = $url; 326
327 # don't support https requests over https-proxy transport,
328 # can't be done with tls as spec'ed.
329 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
322 } else { 330 } else {
323 ($rhost, $rport, $rpath) = ($uhost, $uport, $upath); 331 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
324 $hdr{host} = $uhost;
325 } 332 }
326 333
334 $hdr{host} = $uhost;
327 $hdr{"content-length"} = length $arg{body}; 335 $hdr{"content-length"} = length $arg{body};
328 336
329 my %state = (connect_guard => 1); 337 my %state = (connect_guard => 1);
330 338
331 _get_slot $uhost, sub { 339 _get_slot $uhost, sub {
334 return unless $state{connect_guard}; 342 return unless $state{connect_guard};
335 343
336 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { 344 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
337 $state{fh} = shift 345 $state{fh} = shift
338 or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url }); 346 or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url });
347 pop; # free memory, save a tree
339 348
340 delete $state{connect_guard}; # reduce memory usage, save a tree 349 return unless delete $state{connect_guard};
341 350
342 # get handle 351 # get handle
343 $state{handle} = new AnyEvent::Handle 352 $state{handle} = new AnyEvent::Handle
344 fh => $state{fh}, 353 fh => $state{fh},
345 ($scheme eq "https" ? (tls => "connect") : ()); 354 timeout => $timeout;
346 355
347 # limit the number of persistent connections 356 # limit the number of persistent connections
357 # keepalive not yet supported
348 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) { 358 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
349 ++$KA_COUNT{$_[1]}; 359 ++$KA_COUNT{$_[1]};
350 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} }; 360 $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
361 --$KA_COUNT{$_[1]}
362 };
351 $hdr{connection} = "keep-alive"; 363 $hdr{connection} = "keep-alive";
352 delete $hdr{connection}; # keep-alive not yet supported
353 } else { 364 } else {
354 delete $hdr{connection}; 365 delete $hdr{connection};
355 } 366 }
356 367
357 # (re-)configure handle 368 # (re-)configure handle
358 $state{handle}->timeout ($timeout);
359 $state{handle}->on_error (sub { 369 $state{handle}->on_error (sub {
360 my $errno = "$!"; 370 my $errno = "$!";
361 %state = (); 371 %state = ();
362 $cb->(undef, { Status => 599, Reason => $errno, URL => $url }); 372 $cb->(undef, { Status => 599, Reason => $errno, URL => $url });
363 }); 373 });
364 $state{handle}->on_eof (sub { 374 $state{handle}->on_eof (sub {
365 %state = (); 375 %state = ();
366 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url }); 376 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url });
367 }); 377 });
368 378
379 $state{handle}->starttls ("connect") if $rscheme eq "https";
380
381 # handle actual, non-tunneled, request
382 my $handle_actual_request = sub {
383 $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
384
369 # send request 385 # send request
370 $state{handle}->push_write ( 386 $state{handle}->push_write (
371 "$method $rpath HTTP/1.0\015\012" 387 "$method $rpath HTTP/1.0\015\012"
372 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr) 388 . (join "", map "\u$_: $hdr{$_}\015\012", keys %hdr)
373 . "\015\012" 389 . "\015\012"
374 . (delete $arg{body}) 390 . (delete $arg{body})
375 );
376
377 %hdr = (); # reduce memory usage, save a kitten
378
379 # status line
380 $state{handle}->push_read (line => qr/\015?\012/, sub {
381 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
382 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));
383
384 my %hdr = ( # response headers
385 HTTPVersion => "\x00$1",
386 Status => "\x00$2",
387 Reason => "\x00$3",
388 URL => "\x00$url"
389 ); 391 );
390 392
393 %hdr = (); # reduce memory usage, save a kitten
394
395 # status line
396 $state{handle}->push_read (line => $qr_nl, sub {
397 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
398 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));
399
400 my %hdr = ( # response headers
401 HTTPVersion => ",$1",
402 Status => ",$2",
403 Reason => ",$3",
404 URL => ",$url"
405 );
406
391 # headers, could be optimized a bit 407 # headers, could be optimized a bit
392 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub { 408 $state{handle}->unshift_read (line => $qr_nlnl, sub {
393 for ("$_[1]\012") { 409 for ("$_[1]\012") {
410 y/\015//d; # weed out any \015, as they show up in the weirdest of places.
411
394 # we support spaces in field names, as lotus domino 412 # we support spaces in field names, as lotus domino
395 # creates them (actually spaces around seperators 413 # creates them (actually spaces around seperators
396 # are strictly allowed in http, they are a security issue). 414 # are strictly allowed in http, they are a security issue).
397 $hdr{lc $1} .= "\x00$2" 415 $hdr{lc $1} .= ",$2"
398 while /\G 416 while /\G
399 ([^:\000-\037]+): 417 ([^:\000-\037]+):
400 [\011\040]* 418 [\011\040]*
401 ((?: [^\015\012]+ | \015?\012[\011\040] )*) 419 ((?: [^\012]+ | \012[\011\040] )*)
402 \015?\012 420 \012
403 /gxc; 421 /gxc;
404 422
405 /\G$/ 423 /\G$/
406 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url })); 424 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url }));
407 } 425 }
408 426
409 substr $_, 0, 1, "" 427 substr $_, 0, 1, ""
410 for values %hdr; 428 for values %hdr;
411 429
412 my $finish = sub { 430 my $finish = sub {
413 # TODO: use destroy method, when/if available
414 #$state{handle}->destroy; 431 $state{handle}->destroy;
415 $state{handle}->on_eof (undef);
416 $state{handle}->on_error (undef);
417 %state = (); 432 %state = ();
418 433
419 # set-cookie processing 434 # set-cookie processing
420 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) { 435 if ($arg{cookie_jar}) {
421 for (split /\x00/, $hdr{"set-cookie"}) { 436 for ($hdr{"set-cookie"}) {
422 my ($cookie, @arg) = split /;\s*/; 437 # parse NAME=VALUE
423 my ($name, $value) = split /=/, $cookie, 2; 438 my @kv;
424 my %kv = (value => $value, map { split /=/, $_, 2 } @arg); 439
425 440 while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
441 my $name = $1;
442 my $value = $3;
443
444 unless ($value) {
445 $value = $2;
446 $value =~ s/\\(.)/$1/gs;
447 }
448
449 push @kv, $name => $value;
450
451 last unless /\G\s*;/gc;
452 }
453
454 last unless @kv;
455
456 my $name = shift @kv;
457 my %kv = (value => shift @kv, @kv);
458
426 my $cdom; 459 my $cdom;
427 my $cpath = (delete $kv{path}) || "/"; 460 my $cpath = (delete $kv{path}) || "/";
428 461
429 if (exists $kv{domain}) { 462 if (exists $kv{domain}) {
430 $cdom = delete $kv{domain}; 463 $cdom = delete $kv{domain};
431
432 $cdom =~ s/^\.?/./; # make sure it starts with a "."
433
434 next if $cdom =~ /\.$/;
435 464
465 $cdom =~ s/^\.?/./; # make sure it starts with a "."
466
467 next if $cdom =~ /\.$/;
468
436 # this is not rfc-like and not netscape-like. go figure. 469 # this is not rfc-like and not netscape-like. go figure.
437 my $ndots = $cdom =~ y/.//; 470 my $ndots = $cdom =~ y/.//;
438 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2); 471 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
439 } else { 472 } else {
440 $cdom = $uhost; 473 $cdom = $uhost;
474 }
475
476 # store it
477 $arg{cookie_jar}{version} = 1;
478 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
479
480 redo if /\G\s*,/gc;
441 } 481 }
442 482 }
483
484 # microsoft and other shitheads don't give a shit for following standards,
485 # try to support some common forms of broken Location headers.
486 if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
487 $_[1]{location} =~ s/^\.\/+//;
488
489 my $url = "$rscheme://$uhost:$uport";
490
491 unless ($_[1]{location} =~ s/^\///) {
492 $url .= $upath;
493 $url =~ s/\/[^\/]*$//;
443 # store it 494 }
444 $arg{cookie_jar}{version} = 1; 495
445 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; 496 $_[1]{location} = "$url/$_[1]{location}";
497 }
498
499 if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
500 # apparently, mozilla et al. just change POST to GET here
501 # more research is needed before we do the same
502 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
503 } elsif ($_[1]{Status} == 303 && $recurse) {
504 # even http/1.1 is unclear on how to mutate the method
505 $method = "GET" unless $method eq "HEAD";
506 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
507 } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
508 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
509 } else {
510 $cb->($_[0], $_[1]);
511 }
512 };
513
514 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
515 $finish->(undef, \%hdr);
516 } else {
517 if (exists $hdr{"content-length"}) {
518 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
519 # could cache persistent connection now
520 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
521 # but we don't, due to misdesigns, this is annoyingly complex
522 };
523
524 $finish->($_[1], \%hdr);
525 });
526 } else {
527 # too bad, need to read until we get an error or EOF,
528 # no way to detect winged data.
529 $_[0]->on_error (sub {
530 $finish->($_[0]{rbuf}, \%hdr);
531 });
532 $_[0]->on_eof (undef);
533 $_[0]->on_read (sub { });
446 } 534 }
447 } 535 }
448
449 # microsoft and other shitheads don't give a shit for following standards,
450 # try to support some common forms of broken Location headers.
451 if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
452 $_[1]{location} =~ s/^\.\/+//;
453
454 my $url = "$scheme://$uhost:$uport";
455
456 unless ($_[1]{location} =~ s/^\///) {
457 $url .= $upath;
458 $url =~ s/\/[^\/]*$//;
459 }
460
461 $_[1]{location} = "$url/$_[1]{location}";
462 }
463
464 if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
465 # apparently, mozilla et al. just change POST to GET here
466 # more research is needed before we do the same
467 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
468 } elsif ($_[1]{Status} == 303 && $recurse) {
469 # even http/1.1 is unclear on how to mutate the method
470 $method = "GET" unless $method eq "HEAD";
471 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
472 } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
473 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
474 } else {
475 $cb->($_[0], $_[1]);
476 }
477 }; 536 });
537 });
538 };
478 539
479 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") { 540 # now handle proxy-CONNECT method
480 $finish->(undef, \%hdr); 541 if ($proxy && $uscheme eq "https") {
542 # oh dear, we have to wrap it into a connect request
543
544 # maybe re-use $uauthority with patched port?
545 $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
546 $state{handle}->push_read (line => $qr_nlnl, sub {
547 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
548 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid proxy connect response ($_[1])", URL => $url }));
549
550 if ($2 == 200) {
551 $rpath = $upath;
552 &$handle_actual_request;
481 } else { 553 } else {
482 if (exists $hdr{"content-length"}) { 554 %state = ();
483 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub { 555 $cb->(undef, { Status => $2, Reason => $3, URL => $url });
484 # could cache persistent connection now
485 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
486 # but we don't, due to misdesigns, this is annoyingly complex
487 };
488
489 $finish->($_[1], \%hdr);
490 });
491 } else {
492 # too bad, need to read until we get an error or EOF,
493 # no way to detect winged data.
494 $_[0]->on_error (sub {
495 $finish->($_[0]{rbuf}, \%hdr);
496 });
497 $_[0]->on_eof (undef);
498 $_[0]->on_read (sub { });
499 }
500 } 556 }
501 }); 557 });
558 } else {
559 &$handle_actual_request;
502 }); 560 }
561
503 }, sub { 562 }, sub {
504 $timeout 563 $timeout
505 }; 564 };
506 }; 565 };
507 566
580=head1 AUTHOR 639=head1 AUTHOR
581 640
582 Marc Lehmann <schmorp@schmorp.de> 641 Marc Lehmann <schmorp@schmorp.de>
583 http://home.schmorp.de/ 642 http://home.schmorp.de/
584 643
644With many thanks to Дмитрий Шалашов, who provided countless
645testcases and bugreports.
646
585=cut 647=cut
586 648
5871 6491
588 650

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines