--- rxvt-unicode/src/perl/matcher 2021/07/14 12:39:57 1.36 +++ rxvt-unicode/src/perl/matcher 2022/12/09 05:06:46 1.40 @@ -21,7 +21,7 @@ mouse button specified in the C<matcher.button> resource (default 2, or middle), the program specified in the C<matcher.launcher> resource (default, the C<url-launcher> resource, C<sensible-browser>) will be started -with the matched text as first argument. The default configuration is +with the matched text as first argument. The default configuration is suitable for matching URLs and launching a web browser, like the former "mark-urls" extension. @@ -31,7 +31,7 @@ The launcher can also be overridden on a per-pattern basis. It is possible to activate the most recently seen match or a list of matches -from the keyboard. Simply bind a keysym to "matcher:last" or +from the keyboard. Simply bind a keysym to "matcher:last" or "matcher:list" as seen in the example below. The C<matcher:select> action enables a mode in which it is possible to @@ -39,7 +39,7 @@ or copy them to the clipboard. While the mode is active, normal terminal input/output is suspended and the following bindings are recognized: -=over 4 +=over =item C<Up> @@ -88,14 +88,44 @@ URxvt.matcher.pattern.2: \\B(/\\S+?):(\\d+)(?=:|$) URxvt.matcher.launcher.2: gvim +$2 $1 +=head2 Regex encoding/wide character matching + +Urxvt stores all text as unicode, in a special encoding that uses +one character/code point per column. For various reasons, the regular +expressions are matched directly against this encoding, which means there are a few things +you need to keep in mind: + +=over + +=item X resources/command line arguments are locale-encoded + +The regexes taken from the command line or resources will be converted +from locale encoding to unicode. This can change the number of code points +per character. + +=item Wide characters are column-padded with C<$urxvt::NOCHAR> + +Wide characters (such as kanji and sometimes tabs) are padded with +a special character value (C<$urxvt::NOCHAR>). 
That means that +constructs such as C<\w> or C<.> will only match part of a character, as +C<$urxvt::NOCHAR> is not matched by C<\w> and both only match the first +"column" of a wide character. + +That means you have to incorporate C<$urxvt::NOCHAR> into parts of regexes +that may match wide characters. For example, to match C<\w+> you might +want to use C<[\w$urxvt::NOCHAR]+> instead, and to match a single character +(C<.>) you might want to use C<.$urxvt::NOCHAR*> instead. + +=back + =cut my $url = qr{ (?:https?://|ftp://|news://|mailto:|file://|\bwww\.) - [\w\-\@;\/?:&=%\$.+!*\x27,~#]* + [\w\-\@;\/?:&=%\$.+!*\x27,~#$urxvt::NOCHAR]* ( - \([\w\-\@;\/?:&=%\$.+!*\x27,~#]*\)| # Allow a pair of matched parentheses + \([\w\-\@;\/?:&=%\$.+!*\x27,~#$urxvt::NOCHAR]*\)| # Allow a pair of matched parentheses [\w\-\@;\/?:&=%\$+*~] # exclude some trailing characters (heuristic) )+ }x; @@ -195,16 +225,19 @@ my ($self) = shift; my $row = $self->nrow - 1; my @exec; + while ($row >= $self->top_row) { my $line = $self->line ($row); - @exec = $self->command_for($row); - last if(@exec); + @exec = $self->command_for ($row); + last if @exec; $row = $line->beg - 1; } - if(@exec) { + + if (@exec) { return $self->exec_async (@exec); } + () } @@ -253,9 +286,7 @@ my @defaults = ($url); my @matchers; - for (my $idx = 0; defined (my $res = $self->my_resource ("pattern.$idx") || $defaults[$idx]); $idx++) { - $res = $self->locale_decode ($res); - utf8::encode $res; + for (my $idx = 0; defined (my $res = $self->locale_decode ($self->my_resource ("pattern.$idx")) || $defaults[$idx]); $idx++) { my $launcher = $self->my_resource ("launcher.$idx"); $launcher =~ s/\$&|\$\{&\}/\${0}/g if $launcher; my $rend = $self->parse_rend($self->my_resource ("rend.$idx")); @@ -315,15 +346,18 @@ my @end = @+; my @exec; - if (!defined($off) || ($-[0] <= $off && $+[0] >= $off)) { + if (!(defined $off) || ($-[0] <= $off && $+[0] >= $off)) { if ($launcher !~ /\$/) { @exec = ($launcher, $match); } else { # It'd be 
nice to just access a list like ($&,$1,$2...), # but alas, m//g behaves differently in list context. - @exec = map { s/\$(\d+)|\$\{(\d+)\}/ - substr $text, $begin[$1 || $2], $end[$1 || $2] - $begin[$1 || $2] - /egx; $_ } split /\s+/, $launcher; + @exec = map { + s{\$(\d+)|\$\{(\d+)\}}{ + substr $text, $begin[$1 || $2], $end[$1 || $2] - $begin[$1 || $2] + }egx; + $_ + } split /\s+/, $launcher; } push @matches, [ $line->coord_of ($begin[0]), $line->coord_of ($end[0]), $match, @exec ]; @@ -331,7 +365,7 @@ } } - @matches; + @matches } sub command_for { @@ -348,8 +382,11 @@ sub on_button_press { my ($self, $event) = @_; - if($self->valid_button($event) - && (my @exec = $self->command_for($event->{row},$event->{col}))) { + + if ( + $self->valid_button ($event) + && (my @exec = $self->command_for ($event->{row}, $event->{col})) + ) { $self->{row} = $event->{row}; $self->{col} = $event->{col}; $self->{cmd} = \@exec; @@ -372,12 +409,13 @@ return if !defined $row; - if($row == $event->{row} && abs($col-$event->{col}) < 2 - && join("\x00", @$cmd) eq join("\x00", $self->command_for($row,$col))) { - if($self->valid_button($event)) { - + if ( + $row == $event->{row} + && (abs $col-$event->{col}) < 2 + && (join "\x00", @$cmd) eq (join "\x00", $self->command_for ($row, $col)) + ) { + if ($self->valid_button ($event)) { $self->exec_async (@$cmd); - } }