1 |
/* re-cmp.c |
2 |
* Pattern match a string, parsing some of the common RE-metacharacters. |
3 |
* |
4 |
* This code is public domain, but I would appreciate to hear of |
5 |
* improvements or even the fact that you use it in your program. |
6 |
* |
7 |
* Deliberate BUGS: |
8 |
* - These tokens are not supported: | ( ) |
9 |
* - You will get the longest expansion of the _first_ string which |
10 |
* matches the RE, not the longest string which would be the proper |
11 |
* behaviour for a RE-matcher. |
12 |
* |
13 |
* Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993 |
14 |
*/ |
15 |
|
16 |
#include <stdio.h> |
17 |
#include <stdlib.h> |
18 |
#include <memory.h> |
19 |
#include <limits.h> |
20 |
#include <re-cmp.h> |
21 |
#include <ctype.h> |
22 |
|
23 |
/* Get prototype functions to prevent warnings. */ |
24 |
#if defined (__sun__) && defined(StupidSunHeaders) |
25 |
# include <sys/types.h> |
26 |
# include <sys/time.h> |
27 |
# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */ |
28 |
#endif |
29 |
|
30 |
|
31 |
/* P r o t o t y p e s |
32 |
*/ |
33 |
const char *re_cmp (const char *, const char *); |
34 |
static bool re_cmp_step (const char *, const char *, unsigned, int); |
35 |
static void re_init (void); |
36 |
static bool re_match_token (unsigned char, selection *); |
37 |
static const char *re_get_token (selection *, const char *); |
38 |
|
39 |
#ifdef DEBUG2 |
40 |
static void re_dump_sel (selection *); |
41 |
#endif |
42 |
|
43 |
/* G l o b a l v a r i a b l e s |
44 |
*/ |
45 |
static bool re_init_done = false; |
46 |
static selection *re_token[RE_TOKEN_MAX]; |
47 |
static const char *re_substr[RE_TOKEN_MAX]; |
48 |
static unsigned int re_token_depth; |
49 |
|
50 |
/* E x t e r n a l f u n c t i o n |
51 |
*/ |
52 |
|
53 |
/* re-cmp - get regular expression match. |
54 |
* Return values: NULL - no match or error in regexp. |
55 |
* pointer to beginning of matching string |
56 |
*/ |
57 |
const char * |
58 |
re_cmp (const char *str, const char *regexp) |
59 |
{ |
60 |
const char *next_regexp; |
61 |
bool once = false; |
62 |
bool matched; |
63 |
|
64 |
if (re_init_done == false) |
65 |
re_init (); |
66 |
|
67 |
#ifdef SAFE_CHECKS |
68 |
if (regexp == NULL || str == NULL) |
69 |
return NULL; |
70 |
#endif |
71 |
if (*regexp == '^') |
72 |
{ |
73 |
once = true; |
74 |
++regexp; |
75 |
} |
76 |
if (*regexp == 0) |
77 |
{ |
78 |
/* // or /^/ matches any string */ |
79 |
return str; |
80 |
} |
81 |
|
82 |
next_regexp = re_get_token (re_token[0], regexp); |
83 |
re_token_depth = 0; |
84 |
re_substr[0] = next_regexp; |
85 |
|
86 |
matched = false; |
87 |
while (*str != '\0' && !(matched = re_match_token (*str, re_token[0]))) |
88 |
str++; |
89 |
|
90 |
if (matched && *next_regexp == 0) |
91 |
return str; |
92 |
|
93 |
/* Apologies for the nearly duplicated code below, hopefully it |
94 |
* speeds things up. |
95 |
*/ |
96 |
if (once) |
97 |
{ |
98 |
switch (re_token[0]->repeat) |
99 |
{ |
100 |
case rep_once: |
101 |
if (matched == false) |
102 |
return NULL; |
103 |
break; |
104 |
case rep_once_or_more: |
105 |
if (matched == false) |
106 |
return NULL; |
107 |
|
108 |
if (re_cmp_step (str + 1, regexp, 0, 1)) |
109 |
return str; |
110 |
break; |
111 |
case rep_null_or_once: |
112 |
if (matched == false) |
113 |
return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL; |
114 |
break; |
115 |
case rep_null_or_more: |
116 |
if (matched) |
117 |
{ |
118 |
if (re_cmp_step (str + 1, regexp, 0, 1)) |
119 |
return str; |
120 |
} |
121 |
else |
122 |
{ |
123 |
return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL; |
124 |
} |
125 |
break; |
126 |
} |
127 |
return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL; |
128 |
} |
129 |
|
130 |
if (matched) |
131 |
{ |
132 |
switch (re_token[0]->repeat) |
133 |
{ |
134 |
case rep_once: |
135 |
case rep_null_or_once: |
136 |
break; |
137 |
case rep_once_or_more: |
138 |
case rep_null_or_more: |
139 |
if (re_cmp_step (str + 1, regexp, 0, 1)) |
140 |
return str; |
141 |
break; |
142 |
} |
143 |
/* The logic here is that re_match_token only sees |
144 |
* if the one letter matches. Thus, if the |
145 |
* regex is like '@match eureca', and the |
146 |
* the user enters anything with an e, re_match_token |
147 |
* returns true, but they really need to match the |
148 |
* entire regexp, which re_cmp_step will do. |
149 |
* However, what happens is that there can be a case |
150 |
* where the string being match is something like |
151 |
* 'where is eureca'. In this case, the re_match_token |
152 |
* matches that first e, but the re_cmp_step below, |
153 |
* fails because the next character (r) doesn't match |
154 |
* the u. So we call re_cmp with the string |
155 |
* after the first r, so that it should hopefully match |
156 |
* up properly. |
157 |
*/ |
158 |
if (re_cmp_step (str + 1, next_regexp, 1, 0)) |
159 |
return str; |
160 |
else if (*(str + 1) != 0) |
161 |
return re_cmp (str + 1, regexp); |
162 |
} |
163 |
return NULL; |
164 |
} |
165 |
|
166 |
/* A u x i l l i a r y f u n c t i o n s |
167 |
*/ |
168 |
|
169 |
static bool |
170 |
re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches) |
171 |
{ |
172 |
/* str - string to match |
173 |
* regexp - pattern |
174 |
* slot - number of the token which under consideration |
175 |
* matches - how many times the token has matched |
176 |
*/ |
177 |
const char *next_regexp; |
178 |
bool matched; |
179 |
|
180 |
#ifdef DEBUG |
181 |
|
182 |
/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/ |
183 |
#endif |
184 |
|
185 |
if (*regexp == 0) |
186 |
{ |
187 |
/* When we reach the end of the regexp, the match is a success |
188 |
*/ |
189 |
return true; |
190 |
} |
191 |
|
192 |
/* This chunk of code makes sure that the regexp-tokenising happens |
193 |
* only once. We only tokenise as much as we need. |
194 |
*/ |
195 |
if (slot > re_token_depth) |
196 |
{ |
197 |
re_token_depth = slot; |
198 |
if (re_token[slot] == NULL) |
199 |
re_token[slot] = (selection *) malloc (sizeof (selection)); |
200 |
next_regexp = re_get_token (re_token[slot], regexp); |
201 |
if (next_regexp == NULL) |
202 |
{ |
203 |
/* Syntax error, what else can we do? */ |
204 |
return false; |
205 |
} |
206 |
re_substr[slot] = next_regexp; |
207 |
} |
208 |
else |
209 |
{ |
210 |
next_regexp = re_substr[slot]; |
211 |
} |
212 |
|
213 |
matched = re_match_token (*str, re_token[slot]); |
214 |
if (matched) |
215 |
++matches; |
216 |
|
217 |
if (*str == 0) |
218 |
return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched; |
219 |
|
220 |
switch (re_token[slot]->repeat) |
221 |
{ |
222 |
case rep_once: |
223 |
if (matches == 1) |
224 |
{ /* (matches == 1) => (matched == true) */ |
225 |
return re_cmp_step (str + 1, next_regexp, slot + 1, 0); |
226 |
} |
227 |
return false; |
228 |
case rep_once_or_more: |
229 |
if (matched) |
230 |
{ /* (matched == true) => (matches >= 1) */ |
231 |
/* First check if the current token repeats more */ |
232 |
if (re_cmp_step (str + 1, regexp, slot, matches)) |
233 |
return true; |
234 |
return re_cmp_step (str + 1, next_regexp, slot + 1, 0); |
235 |
} |
236 |
return false; |
237 |
case rep_null_or_once: |
238 |
/* We must go on to the next token, but should we advance str? */ |
239 |
if (matches == 0) |
240 |
{ |
241 |
return re_cmp_step (str, next_regexp, slot + 1, 0); |
242 |
} |
243 |
else if (matches == 1) |
244 |
{ |
245 |
return re_cmp_step (str + 1, next_regexp, slot + 1, 0); |
246 |
} |
247 |
return false; /* Not reached */ |
248 |
case rep_null_or_more: |
249 |
if (matched) |
250 |
{ |
251 |
/* Look for further repeats, advance str */ |
252 |
if (re_cmp_step (str + 1, regexp, slot, matches)) |
253 |
return true; |
254 |
return re_cmp_step (str, next_regexp, slot + 1, 0); |
255 |
} |
256 |
return re_cmp_step (str, next_regexp, slot + 1, 0); |
257 |
} |
258 |
return false; |
259 |
} |
260 |
|
261 |
static void |
262 |
re_init (void) |
263 |
{ |
264 |
int i; |
265 |
|
266 |
re_token[0] = (selection *) malloc (sizeof (selection)); |
267 |
for (i = 1; i < RE_TOKEN_MAX; i++) |
268 |
re_token[i] = NULL; |
269 |
|
270 |
re_init_done = true; |
271 |
} |
272 |
|
273 |
static bool |
274 |
re_match_token (unsigned char c, selection * sel) |
275 |
{ |
276 |
switch (sel->type) |
277 |
{ |
278 |
case sel_any: |
279 |
return true; |
280 |
case sel_end: |
281 |
return (c == 0); |
282 |
case sel_single: |
283 |
return (tolower (c) == tolower (sel->u.single)); |
284 |
case sel_range: |
285 |
return (c >= sel->u.range.low && c <= sel->u.range.high); |
286 |
case sel_array: |
287 |
return (sel->u.array[c]); |
288 |
case sel_not_single: |
289 |
return (tolower (c) != tolower (sel->u.single)); |
290 |
case sel_not_range: |
291 |
return (c < sel->u.range.low && c > sel->u.range.high); |
292 |
} |
293 |
return false; |
294 |
} |
295 |
|
296 |
/* re_get_token - get regular expression token |
297 |
* Returns the first token found in <regexp> in <sel> |
298 |
* Return values: NULL syntax error |
299 |
* pointer to first character past token. |
300 |
*/ |
301 |
static const char * |
302 |
re_get_token (selection * sel, const char *regexp) |
303 |
{ |
304 |
|
305 |
#ifdef SAFE_CHECKS |
306 |
# define exit_if_null if (*regexp == 0) return NULL |
307 |
#else |
308 |
# define exit_if_null |
309 |
#endif |
310 |
|
311 |
bool quoted = false; |
312 |
unsigned char looking_at; |
313 |
|
314 |
#ifdef SAFE_CHECKS |
315 |
if (sel == NULL || regexp == NULL || *regexp == 0) |
316 |
return NULL; |
317 |
#endif |
318 |
|
319 |
do |
320 |
{ |
321 |
looking_at = *regexp++; |
322 |
switch (looking_at) |
323 |
{ |
324 |
case '$': |
325 |
if (quoted) |
326 |
{ |
327 |
quoted = false; |
328 |
sel->type = sel_single; |
329 |
sel->u.single = looking_at; |
330 |
} |
331 |
else |
332 |
{ |
333 |
sel->type = sel_end; |
334 |
} |
335 |
break; |
336 |
case '.': |
337 |
if (quoted) |
338 |
{ |
339 |
quoted = false; |
340 |
sel->type = sel_single; |
341 |
sel->u.single = looking_at; |
342 |
} |
343 |
else |
344 |
{ |
345 |
sel->type = sel_any; |
346 |
} |
347 |
break; |
348 |
case '[': |
349 |
/* The fun stuff... perhaps a little obfuscated since I |
350 |
* don't trust the compiler to analyse liveness. |
351 |
*/ |
352 |
if (quoted) |
353 |
{ |
354 |
quoted = false; |
355 |
sel->type = sel_single; |
356 |
sel->u.single = looking_at; |
357 |
} |
358 |
else |
359 |
{ |
360 |
bool neg = false; |
361 |
unsigned char first, last = 0; |
362 |
|
363 |
exit_if_null; |
364 |
looking_at = *regexp++; |
365 |
|
366 |
if (looking_at == '^') |
367 |
{ |
368 |
neg = true; |
369 |
exit_if_null; |
370 |
looking_at = *regexp++; |
371 |
} |
372 |
first = looking_at; |
373 |
exit_if_null; |
374 |
looking_at = *regexp++; |
375 |
if (looking_at == ']') |
376 |
{ |
377 |
/* On the form [q] or [^q] */ |
378 |
sel->type = neg ? sel_not_single : sel_single; |
379 |
sel->u.single = first; |
380 |
break; |
381 |
} |
382 |
else if (looking_at == '-') |
383 |
{ |
384 |
exit_if_null; |
385 |
last = *regexp++; |
386 |
if (last == ']') |
387 |
{ |
388 |
/* On the form [A-] or [^A-]. Checking for |
389 |
* [,-] and making it a range is probably not |
390 |
* worth it :-) |
391 |
*/ |
392 |
sel->type = sel_array; |
393 |
memset (sel->u.array, neg, sizeof (sel->u.array)); |
394 |
sel->u.array[first] = sel->u.array['-'] = !neg; |
395 |
break; |
396 |
} |
397 |
else |
398 |
{ |
399 |
exit_if_null; |
400 |
looking_at = *regexp++; |
401 |
if (looking_at == ']') |
402 |
{ |
403 |
/* On the form [A-G] or [^A-G]. Note that [G-A] |
404 |
* is a syntax error. Fair enough, I think. |
405 |
*/ |
406 |
#ifdef SAFE_CHECK |
407 |
if (first > last) |
408 |
return NULL; |
409 |
#endif |
410 |
sel->type = neg ? sel_not_range : sel_range; |
411 |
sel->u.range.low = first; |
412 |
sel->u.range.high = last; |
413 |
break; |
414 |
} |
415 |
} |
416 |
} |
417 |
{ |
418 |
/* The datastructure can only represent a RE this |
419 |
* complex with an array. |
420 |
*/ |
421 |
int i; |
422 |
unsigned char previous; |
423 |
|
424 |
sel->type = sel_array; |
425 |
memset (sel->u.array, neg, sizeof (sel->u.array)); |
426 |
if (last) |
427 |
{ |
428 |
/* It starts with a range */ |
429 |
#ifdef SAFE_CHECK |
430 |
if (first > last) |
431 |
return NULL; |
432 |
#endif |
433 |
for (i = first; i <= last; i++) |
434 |
{ |
435 |
sel->u.array[i] = !neg; |
436 |
} |
437 |
} |
438 |
else |
439 |
{ |
440 |
/* It begins with a "random" character */ |
441 |
sel->u.array[first] = !neg; |
442 |
} |
443 |
sel->u.array[looking_at] = !neg; |
444 |
|
445 |
exit_if_null; |
446 |
previous = looking_at; |
447 |
looking_at = *regexp++; |
448 |
|
449 |
/* Add more characters to the array until we reach |
450 |
* ]. Quoting doesn't and shouldn't work in here. |
451 |
* ("]" should be put first, and "-" last if they |
452 |
* are needed inside this construct.) |
453 |
* Look for ranges as we go along. |
454 |
*/ |
455 |
while (looking_at != ']') |
456 |
{ |
457 |
if (looking_at == '-') |
458 |
{ |
459 |
exit_if_null; |
460 |
looking_at = *regexp++; |
461 |
if (looking_at != ']') |
462 |
{ |
463 |
#ifdef SAFE_CHECK |
464 |
if (previous > looking_at) |
465 |
return NULL; |
466 |
#endif |
467 |
for (i = previous + 1; i < looking_at; i++) |
468 |
{ |
469 |
/* previous has already been set and |
470 |
* looking_at is set below. |
471 |
*/ |
472 |
sel->u.array[i] = !neg; |
473 |
} |
474 |
exit_if_null; |
475 |
} |
476 |
else |
477 |
{ |
478 |
sel->u.array['-'] = !neg; |
479 |
break; |
480 |
} |
481 |
} |
482 |
sel->u.array[looking_at] = !neg; |
483 |
previous = looking_at; |
484 |
exit_if_null; |
485 |
looking_at = *regexp++; |
486 |
} |
487 |
} |
488 |
} |
489 |
break; |
490 |
case '\\': |
491 |
if (quoted) |
492 |
{ |
493 |
quoted = false; |
494 |
sel->type = sel_single; |
495 |
sel->u.single = looking_at; |
496 |
} |
497 |
else |
498 |
{ |
499 |
quoted = true; |
500 |
} |
501 |
break; |
502 |
default: |
503 |
quoted = false; |
504 |
sel->type = sel_single; |
505 |
sel->u.single = looking_at; |
506 |
break; |
507 |
} |
508 |
} |
509 |
while (quoted); |
510 |
|
511 |
if (*regexp == '*') |
512 |
{ |
513 |
sel->repeat = rep_null_or_more; |
514 |
++regexp; |
515 |
} |
516 |
else if (*regexp == '?') |
517 |
{ |
518 |
sel->repeat = rep_null_or_once; |
519 |
++regexp; |
520 |
} |
521 |
else if (*regexp == '+') |
522 |
{ |
523 |
sel->repeat = rep_once_or_more; |
524 |
++regexp; |
525 |
} |
526 |
else |
527 |
{ |
528 |
sel->repeat = rep_once; |
529 |
} |
530 |
|
531 |
return regexp; |
532 |
} |
533 |
|
534 |
/* D e b u g c o d e |
535 |
*/ |
536 |
#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */ |
537 |
static void |
538 |
re_dump_sel (selection * sel) |
539 |
{ |
540 |
switch (sel->type) |
541 |
{ |
542 |
case sel_any: |
543 |
printf ("."); |
544 |
break; |
545 |
case sel_end: |
546 |
printf ("$"); |
547 |
break; |
548 |
case sel_single: |
549 |
printf ("<%c>", sel->u.single); |
550 |
break; |
551 |
case sel_range: |
552 |
printf ("[%c-%c]", sel->u.range.low, sel->u.range.high); |
553 |
break; |
554 |
case sel_array: |
555 |
{ |
556 |
int i; |
557 |
|
558 |
printf ("["); |
559 |
for (i = 0; i < uchar_MAX; i++) |
560 |
{ |
561 |
if (sel->u.array[i]) |
562 |
{ |
563 |
printf ("%c", i); |
564 |
} |
565 |
} |
566 |
printf ("]"); |
567 |
} |
568 |
break; |
569 |
case sel_not_single: |
570 |
printf ("[^%c]", sel->u.single); |
571 |
break; |
572 |
case sel_not_range: |
573 |
printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high); |
574 |
break; |
575 |
default: |
576 |
printf ("<UNKNOWN TOKEN!>"); |
577 |
break; |
578 |
} |
579 |
switch (sel->repeat) |
580 |
{ |
581 |
case rep_once: |
582 |
break; |
583 |
case rep_null_or_once: |
584 |
printf ("?"); |
585 |
break; |
586 |
case rep_null_or_more: |
587 |
printf ("*"); |
588 |
break; |
589 |
case rep_once_or_more: |
590 |
printf ("+"); |
591 |
break; |
592 |
default: |
593 |
printf ("<UNKNOWN REP-TOKEN!>"); |
594 |
break; |
595 |
} |
596 |
} |
597 |
|
598 |
int |
599 |
main (int argc, char *argv[]) |
600 |
{ |
601 |
char *re, *m; |
602 |
selection sel; |
603 |
|
604 |
re = re_get_token (&sel, argv[1]); |
605 |
|
606 |
printf ("'%s' -> '%s'\n", argv[1], re); |
607 |
re_dump_sel (&sel); |
608 |
printf ("\n"); |
609 |
m = re_cmp (argv[2], argv[1]); |
610 |
if (m) |
611 |
printf ("MATCH! -> '%s'\n", m); |
612 |
return 0; |
613 |
} |
614 |
#endif |