ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/AnyEvent-HTTP/HTTP.pm
(Generate patch)

Comparing AnyEvent-HTTP/HTTP.pm (file contents):
Revision 1.29 by root, Wed Oct 22 23:28:11 2008 UTC vs.
Revision 1.34 by root, Wed Oct 29 14:51:07 2008 UTC

104headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason> 104headers, the "pseudo-headers" C<HTTPVersion>, C<Status> and C<Reason>
105contain the three parts of the HTTP Status-Line of the same name. The 105contain the three parts of the HTTP Status-Line of the same name. The
106pseudo-header C<URL> contains the original URL (which can differ from the 106pseudo-header C<URL> contains the original URL (which can differ from the
107requested URL when following redirects). 107requested URL when following redirects).
108 108
109If the server sends a header multiple lines, then their contents will be 109If the server sends a header multiple times, then their contents will be
110joined together with C<\x00>. 110joined together with a comma (C<,>), as per the HTTP spec.
111 111
112If an internal error occurs, such as not being able to resolve a hostname, 112If an internal error occurs, such as not being able to resolve a hostname,
113then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599> 113then C<$data> will be C<undef>, C<< $headers->{Status} >> will be C<599>
114and the C<Reason> pseudo-header will contain an error message. 114and the C<Reason> pseudo-header will contain an error message.
115 115
237 push @{ $CO_SLOT{$_[0]}[1] }, $_[1]; 237 push @{ $CO_SLOT{$_[0]}[1] }, $_[1];
238 238
239 _slot_schedule $_[0]; 239 _slot_schedule $_[0];
240} 240}
241 241
242our $qr_nl = qr<\015?\012>;
243our $qr_nlnl = qr<\015?\012\015?\012>;
244
242sub http_request($$@) { 245sub http_request($$@) {
243 my $cb = pop; 246 my $cb = pop;
244 my ($method, $url, %arg) = @_; 247 my ($method, $url, %arg) = @_;
245 248
246 my %hdr; 249 my %hdr;
261 my $proxy = $arg{proxy} || $PROXY; 264 my $proxy = $arg{proxy} || $PROXY;
262 my $timeout = $arg{timeout} || $TIMEOUT; 265 my $timeout = $arg{timeout} || $TIMEOUT;
263 266
264 $hdr{"user-agent"} ||= $USERAGENT; 267 $hdr{"user-agent"} ||= $USERAGENT;
265 268
266 my ($scheme, $authority, $upath, $query, $fragment) = 269 my ($uscheme, $uauthority, $upath, $query, $fragment) =
267 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|; 270 $url =~ m|(?:([^:/?#]+):)?(?://([^/?#]*))?([^?#]*)(?:\?([^#]*))?(?:#(.*))?|;
268 271
269 $scheme = lc $scheme; 272 $uscheme = lc $uscheme;
270 273
271 my $uport = $scheme eq "http" ? 80 274 my $uport = $uscheme eq "http" ? 80
272 : $scheme eq "https" ? 443 275 : $uscheme eq "https" ? 443
273 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url }); 276 : return $cb->(undef, { Status => 599, Reason => "only http and https URL schemes supported", URL => $url });
274 277
275 $hdr{referer} ||= "$scheme://$authority$upath"; # leave out fragment and query string, just a heuristic 278 $hdr{referer} ||= "$uscheme://$uauthority$upath"; # leave out fragment and query string, just a heuristic
276 279
277 $authority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x 280 $uauthority =~ /^(?: .*\@ )? ([^\@:]+) (?: : (\d+) )?$/x
278 or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url }); 281 or return $cb->(undef, { Status => 599, Reason => "unparsable URL", URL => $url });
279 282
280 my $uhost = $1; 283 my $uhost = $1;
281 $uport = $2 if defined $2; 284 $uport = $2 if defined $2;
282 285
285 288
286 $upath =~ s%^/?%/%; 289 $upath =~ s%^/?%/%;
287 290
288 # cookie processing 291 # cookie processing
289 if (my $jar = $arg{cookie_jar}) { 292 if (my $jar = $arg{cookie_jar}) {
290 %$jar = () if $jar->{version} < 1; 293 %$jar = () if $jar->{version} != 1;
291 294
292 my @cookie; 295 my @cookie;
293 296
294 while (my ($chost, $v) = each %$jar) { 297 while (my ($chost, $v) = each %$jar) {
298 if ($chost =~ /^\./) {
295 next unless $chost eq substr $uhost, -length $chost; 299 next unless $chost eq substr $uhost, -length $chost;
296 next unless $chost =~ /^\./; 300 } elsif ($chost =~ /\./) {
301 next unless $chost eq $uhost;
302 } else {
303 next;
304 }
297 305
298 while (my ($cpath, $v) = each %$v) { 306 while (my ($cpath, $v) = each %$v) {
299 next unless $cpath eq substr $upath, 0, length $cpath; 307 next unless $cpath eq substr $upath, 0, length $cpath;
300 308
301 while (my ($k, $v) = each %$v) { 309 while (my ($k, $v) = each %$v) {
302 next if $scheme ne "https" && exists $v->{secure}; 310 next if $uscheme ne "https" && exists $v->{secure};
311 my $value = $v->{value};
312 $value =~ s/([\\"])/\\$1/g;
303 push @cookie, "$k=$v->{value}"; 313 push @cookie, "$k=\"$value\"";
304 } 314 }
305 } 315 }
306 } 316 }
307 317
308 $hdr{cookie} = join "; ", @cookie 318 $hdr{cookie} = join "; ", @cookie
309 if @cookie; 319 if @cookie;
310 } 320 }
311 321
312 my ($rhost, $rport, $rpath); # request host, port, path 322 my ($rhost, $rport, $rscheme, $rpath); # request host, port, path
313 323
314 if ($proxy) { 324 if ($proxy) {
315 ($rhost, $rport, $scheme) = @$proxy; 325 ($rhost, $rport, $rscheme, $rpath) = (@$proxy, $url);
316 $rpath = $url; 326
327 # don't support https requests over https-proxy transport,
328 # can't be done with tls as spec'ed.
329 $rscheme = "http" if $uscheme eq "https" && $rscheme eq "https";
317 } else { 330 } else {
318 ($rhost, $rport, $rpath) = ($uhost, $uport, $upath); 331 ($rhost, $rport, $rscheme, $rpath) = ($uhost, $uport, $uscheme, $upath);
319 $hdr{host} = $uhost;
320 } 332 }
321 333
334 $hdr{host} = $uhost;
322 $hdr{"content-length"} = length $arg{body}; 335 $hdr{"content-length"} = length $arg{body};
323 336
324 my %state = (connect_guard => 1); 337 my %state = (connect_guard => 1);
325 338
326 _get_slot $uhost, sub { 339 _get_slot $uhost, sub {
329 return unless $state{connect_guard}; 342 return unless $state{connect_guard};
330 343
331 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub { 344 $state{connect_guard} = AnyEvent::Socket::tcp_connect $rhost, $rport, sub {
332 $state{fh} = shift 345 $state{fh} = shift
333 or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url }); 346 or return $cb->(undef, { Status => 599, Reason => "$!", URL => $url });
347 pop; # free memory, save a tree
334 348
335 delete $state{connect_guard}; # reduce memory usage, save a tree 349 return unless delete $state{connect_guard};
336 350
337 # get handle 351 # get handle
338 $state{handle} = new AnyEvent::Handle 352 $state{handle} = new AnyEvent::Handle
339 fh => $state{fh}, 353 fh => $state{fh},
340 ($scheme eq "https" ? (tls => "connect") : ()); 354 timeout => $timeout;
341 355
342 # limit the number of persistent connections 356 # limit the number of persistent connections
357 # keepalive not yet supported
343 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) { 358 if ($KA_COUNT{$_[1]} < $MAX_PERSISTENT_PER_HOST) {
344 ++$KA_COUNT{$_[1]}; 359 ++$KA_COUNT{$_[1]};
345 $state{handle}{ka_count_guard} = AnyEvent::Util::guard { --$KA_COUNT{$_[1]} }; 360 $state{handle}{ka_count_guard} = AnyEvent::Util::guard {
361 --$KA_COUNT{$_[1]}
362 };
346 $hdr{connection} = "keep-alive"; 363 $hdr{connection} = "keep-alive";
347 delete $hdr{connection}; # keep-alive not yet supported
348 } else { 364 } else {
349 delete $hdr{connection}; 365 delete $hdr{connection};
350 } 366 }
351 367
352 # (re-)configure handle 368 # (re-)configure handle
353 $state{handle}->timeout ($timeout);
354 $state{handle}->on_error (sub { 369 $state{handle}->on_error (sub {
355 my $errno = "$!"; 370 my $errno = "$!";
356 %state = (); 371 %state = ();
357 $cb->(undef, { Status => 599, Reason => $errno, URL => $url }); 372 $cb->(undef, { Status => 599, Reason => $errno, URL => $url });
358 }); 373 });
359 $state{handle}->on_eof (sub { 374 $state{handle}->on_eof (sub {
360 %state = (); 375 %state = ();
361 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url }); 376 $cb->(undef, { Status => 599, Reason => "unexpected end-of-file", URL => $url });
362 }); 377 });
363 378
379 $state{handle}->starttls ("connect") if $rscheme eq "https";
380
381 # handle actual, non-tunneled, request
382 my $handle_actual_request = sub {
383 $state{handle}->starttls ("connect") if $uscheme eq "https" && !exists $state{handle}{tls};
384
364 # send request 385 # send request
365 $state{handle}->push_write ( 386 $state{handle}->push_write (
366 "$method $rpath HTTP/1.0\015\012" 387 "$method $rpath HTTP/1.0\015\012"
367 . (join "", map "$_: $hdr{$_}\015\012", keys %hdr) 388 . (join "", map "\u$_: $hdr{$_}\015\012", keys %hdr)
368 . "\015\012" 389 . "\015\012"
369 . (delete $arg{body}) 390 . (delete $arg{body})
370 );
371
372 %hdr = (); # reduce memory usage, save a kitten
373
374 # status line
375 $state{handle}->push_read (line => qr/\015?\012/, sub {
376 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
377 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));
378
379 my %hdr = ( # response headers
380 HTTPVersion => "\x00$1",
381 Status => "\x00$2",
382 Reason => "\x00$3",
383 URL => "\x00$url"
384 ); 391 );
385 392
393 %hdr = (); # reduce memory usage, save a kitten
394
395 # status line
396 $state{handle}->push_read (line => $qr_nl, sub {
397 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
398 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid server response ($_[1])", URL => $url }));
399
400 my %hdr = ( # response headers
401 HTTPVersion => ",$1",
402 Status => ",$2",
403 Reason => ",$3",
404 URL => ",$url"
405 );
406
386 # headers, could be optimized a bit 407 # headers, could be optimized a bit
387 $state{handle}->unshift_read (line => qr/\015?\012\015?\012/, sub { 408 $state{handle}->unshift_read (line => $qr_nlnl, sub {
388 for ("$_[1]\012") { 409 for ("$_[1]\012") {
410 y/\015//d; # weed out any \015, as they show up in the weirdest of places.
411
389 # we support spaces in field names, as lotus domino 412 # we support spaces in field names, as lotus domino
390 # creates them. 413 # creates them (actually spaces around seperators
414 # are strictly allowed in http, they are a security issue).
391 $hdr{lc $1} .= "\x00$2" 415 $hdr{lc $1} .= ",$2"
392 while /\G 416 while /\G
393 ([^:\000-\037]+): 417 ([^:\000-\037]+):
394 [\011\040]* 418 [\011\040]*
395 ((?: [^\015\012]+ | \015?\012[\011\040] )*) 419 ((?: [^\012]+ | \012[\011\040] )*)
396 \015?\012 420 \012
397 /gxc; 421 /gxc;
398 422
399 /\G$/ 423 /\G$/
400 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url })); 424 or return (%state = (), $cb->(undef, { Status => 599, Reason => "garbled response headers", URL => $url }));
401 } 425 }
402 426
403 substr $_, 0, 1, "" 427 substr $_, 0, 1, ""
404 for values %hdr; 428 for values %hdr;
405 429
406 my $finish = sub { 430 my $finish = sub {
431 # TODO: use destroy method, when/if available
432 #$state{handle}->destroy;
433 $state{handle}->on_eof (undef);
434 $state{handle}->on_error (undef);
407 %state = (); 435 %state = ();
408 436
409 # set-cookie processing 437 # set-cookie processing
410 if ($arg{cookie_jar} && exists $hdr{"set-cookie"}) { 438 if ($arg{cookie_jar}) {
411 for (split /\x00/, $hdr{"set-cookie"}) { 439 for ($hdr{"set-cookie"}) {
412 my ($cookie, @arg) = split /;\s*/; 440 # parse NAME=VALUE
413 my ($name, $value) = split /=/, $cookie, 2; 441 my @kv;
414 my %kv = (value => $value, map { split /=/, $_, 2 } @arg); 442
415 443 while (/\G\s* ([^=;,[:space:]]+) \s*=\s* (?: "((?:[^\\"]+|\\.)*)" | ([^=;,[:space:]]*) )/gcxs) {
416 my $cdom = (delete $kv{domain}) || $uhost; 444 my $name = $1;
445 my $value = $3;
446
447 unless ($value) {
448 $value = $2;
449 $value =~ s/\\(.)/$1/gs;
450 }
451
452 push @kv, $name => $value;
453
454 last unless /\G\s*;/gc;
455 }
456
457 last unless @kv;
458
459 my $name = shift @kv;
460 my %kv = (value => shift @kv, @kv);
461
462 my $cdom;
417 my $cpath = (delete $kv{path}) || "/"; 463 my $cpath = (delete $kv{path}) || "/";
418 464
465 if (exists $kv{domain}) {
466 $cdom = delete $kv{domain};
467
419 $cdom =~ s/^\.?/./; # make sure it starts with a "." 468 $cdom =~ s/^\.?/./; # make sure it starts with a "."
420 469
421 next if $cdom =~ /\.$/; 470 next if $cdom =~ /\.$/;
422 471
423 # this is not rfc-like and not netscape-like. go figure. 472 # this is not rfc-like and not netscape-like. go figure.
424 my $ndots = $cdom =~ y/.//; 473 my $ndots = $cdom =~ y/.//;
425 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2); 474 next if $ndots < ($cdom =~ /\.[^.][^.]\.[^.][^.]$/ ? 3 : 2);
426 475 } else {
476 $cdom = $uhost;
477 }
478
427 # store it 479 # store it
428 $arg{cookie_jar}{version} = 1; 480 $arg{cookie_jar}{version} = 1;
429 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv; 481 $arg{cookie_jar}{$cdom}{$cpath}{$name} = \%kv;
482
483 redo if /\G\s*,/gc;
484 }
485 }
486
487 # microsoft and other shitheads don't give a shit for following standards,
488 # try to support some common forms of broken Location headers.
489 if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
490 $_[1]{location} =~ s/^\.\/+//;
491
492 my $url = "$rscheme://$uhost:$uport";
493
494 unless ($_[1]{location} =~ s/^\///) {
495 $url .= $upath;
496 $url =~ s/\/[^\/]*$//;
497 }
498
499 $_[1]{location} = "$url/$_[1]{location}";
500 }
501
502 if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
503 # apparently, mozilla et al. just change POST to GET here
504 # more research is needed before we do the same
505 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
506 } elsif ($_[1]{Status} == 303 && $recurse) {
507 # even http/1.1 is unclear on how to mutate the method
508 $method = "GET" unless $method eq "HEAD";
509 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
510 } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
511 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
512 } else {
513 $cb->($_[0], $_[1]);
514 }
515 };
516
517 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") {
518 $finish->(undef, \%hdr);
519 } else {
520 if (exists $hdr{"content-length"}) {
521 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub {
522 # could cache persistent connection now
523 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
524 # but we don't, due to misdesigns, this is annoyingly complex
525 };
526
527 $finish->($_[1], \%hdr);
528 });
529 } else {
530 # too bad, need to read until we get an error or EOF,
531 # no way to detect winged data.
532 $_[0]->on_error (sub {
533 $finish->($_[0]{rbuf}, \%hdr);
534 });
535 $_[0]->on_eof (undef);
536 $_[0]->on_read (sub { });
430 } 537 }
431 } 538 }
432
433 # microsoft and other shitheads don't give a shit for following standards,
434 # try to support some common forms of broken Location headers.
435 if ($_[1]{location} !~ /^(?: $ | [^:\/?\#]+ : )/x) {
436 $_[1]{location} =~ s/^\.\/+//;
437
438 my $url = "$scheme://$uhost:$uport";
439
440 unless ($_[1]{location} =~ s/^\///) {
441 $url .= $upath;
442 $url =~ s/\/[^\/]*$//;
443 }
444
445 $_[1]{location} = "$url/$_[1]{location}";
446 }
447
448 if ($_[1]{Status} =~ /^30[12]$/ && $recurse && $method ne "POST") {
449 # apparently, mozilla et al. just change POST to GET here
450 # more research is needed before we do the same
451 http_request ($method, $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
452 } elsif ($_[1]{Status} == 303 && $recurse) {
453 # even http/1.1 is unlear on how to mutate the method
454 $method = "GET" unless $method eq "HEAD";
455 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
456 } elsif ($_[1]{Status} == 307 && $recurse && $method =~ /^(?:GET|HEAD)$/) {
457 http_request ($method => $_[1]{location}, %arg, recurse => $recurse - 1, $cb);
458 } else {
459 $cb->($_[0], $_[1]);
460 }
461 }; 539 });
540 });
541 };
462 542
463 if ($hdr{Status} =~ /^(?:1..|204|304)$/ or $method eq "HEAD") { 543 # now handle proxy-CONNECT method
464 $finish->(undef, \%hdr); 544 if ($proxy && $uscheme eq "https") {
545 # oh dear, we have to wrap it into a connect request
546
547 # maybe re-use $uauthority with patched port?
548 $state{handle}->push_write ("CONNECT $uhost:$uport HTTP/1.0\015\012Host: $uhost\015\012\015\012");
549 $state{handle}->push_read (line => $qr_nlnl, sub {
550 $_[1] =~ /^HTTP\/([0-9\.]+) \s+ ([0-9]{3}) (?: \s+ ([^\015\012]*) )?/ix
551 or return (%state = (), $cb->(undef, { Status => 599, Reason => "invalid proxy connect response ($_[1])", URL => $url }));
552
553 if ($2 == 200) {
554 $rpath = $upath;
555 &$handle_actual_request;
465 } else { 556 } else {
466 if (exists $hdr{"content-length"}) { 557 %state = ();
467 $_[0]->unshift_read (chunk => $hdr{"content-length"}, sub { 558 $cb->(undef, { Status => $2, Reason => $3, URL => $url });
468 # could cache persistent connection now
469 if ($hdr{connection} =~ /\bkeep-alive\b/i) {
470 # but we don't, due to misdesigns, this is annoyingly complex
471 };
472
473 $finish->($_[1], \%hdr);
474 });
475 } else {
476 # too bad, need to read until we get an error or EOF,
477 # no way to detect winged data.
478 $_[0]->on_error (sub {
479 $finish->($_[0]{rbuf}, \%hdr);
480 });
481 $_[0]->on_eof (undef);
482 $_[0]->on_read (sub { });
483 }
484 } 559 }
485 }); 560 });
561 } else {
562 &$handle_actual_request;
486 }); 563 }
564
487 }, sub { 565 }, sub {
488 $timeout 566 $timeout
489 }; 567 };
490 }; 568 };
491 569

Diff Legend

Removed lines
+ Added lines
< Changed lines
> Changed lines