… | |
… | |
20 | () |
20 | () |
21 | } |
21 | } |
22 | |
22 | |
23 | # "find interetsing things"-patterns |
23 | # "find interetsing things"-patterns |
24 | my @mark_patterns = ( |
24 | my @mark_patterns = ( |
25 | qr{([[:word:]]+)}, |
|
|
26 | |
|
|
27 | # common "parentheses" |
25 | # common types of "parentheses" |
28 | qr{(?:^|\s) ‘ ([^‘’]+?) ’ (?:\s|\)|$)}x, |
26 | qr{ (?<![^[:space:]]) ‘ ([^‘’]+) ’ (?![^[:space]]) }x, |
29 | qr{(?:^|\s) ` ([^`']+?) ' (?:\s|\)|$)}x, |
27 | qr{ (?<![^[:space:]]) ` ([^`']+) ' (?![^[:space]]) }x, |
30 | qr{ \{ ([^{}]+?) \} }x, |
28 | qr{ (?<![^[:space:]]) (" [^[:space:]] [^"]* ") }x, |
31 | qr{ \[ ([^{}]+?) \] }x, |
29 | qr{ (" [^"]* [^[:space:]] ") (?![^[:space]]) }x, |
32 | qr{ \( ([^()]+?) \) }x, |
30 | qr{ \< ([^<>[:space:]]+) \> }x, |
|
|
31 | qr{ \{ ([^{}[:space:]]+) \} }x, |
|
|
32 | qr{ \[ ([^{}[:space:]]+) \] }x, |
|
|
33 | qr{ \( ([^()[:space:]]+) \) }x, |
33 | |
34 | |
34 | # urls, just a heuristic |
35 | # urls, just a heuristic |
35 | qr{( |
36 | qr{( |
36 | (?:https?|ftp|news|mailto|file)://[ab-zA-Z0-9\-\@;\/?:&=%\$_.+!*\x27(),~]+ |
37 | (?:https?|ftp|news|mailto|file)://[ab-zA-Z0-9\-\@;\/?:&=%\$_.+!*\x27(),~]+ |
37 | [ab-zA-Z0-9\-\@;\/?:&=%\$_+!*\x27()~] # exclude some trailing characters (heuristic) |
38 | [ab-zA-Z0-9\-\@;\/?:&=%\$_+!*\x27()~] # exclude some trailing characters (heuristic) |
… | |
… | |
70 | while ($text =~ /$regex/g) { |
71 | while ($text =~ /$regex/g) { |
71 | if ($-[1] <= $markofs and $markofs <= $+[1]) { |
72 | if ($-[1] <= $markofs and $markofs <= $+[1]) { |
72 | my $ofs = $-[1]; |
73 | my $ofs = $-[1]; |
73 | my $match = $1; |
74 | my $match = $1; |
74 | |
75 | |
75 | push @matches, [$ofs, length $match]; |
|
|
76 | |
|
|
77 | for my $regex (@simplify_patterns) { |
76 | for my $regex (@simplify_patterns) { |
78 | if ($match =~ $regex) { |
77 | if ($match =~ $regex) { |
79 | $match = $1; |
78 | $match = $1; |
80 | $ofs += $-[1]; |
79 | $ofs += $-[1]; |
81 | } |
80 | } |