1 |
/* |
2 |
* static char *rcsid_player_c = |
3 |
* "$Id: re-cmp.C,v 1.2 2006-08-29 08:01:35 root Exp $"; |
4 |
*/ |
5 |
|
6 |
|
7 |
/* re-cmp.c |
8 |
* Pattern match a string, parsing some of the common RE-metacharacters. |
9 |
* |
10 |
* This code is public domain, but I would appreciate to hear of |
11 |
* improvements or even the fact that you use it in your program. |
12 |
* |
13 |
* Deliberate BUGS: |
14 |
* - These tokens are not supported: | ( ) |
15 |
* - You will get the longest expansion of the _first_ string which |
16 |
* matches the RE, not the longest string which would be the proper |
17 |
* behaviour for a RE-matcher. |
18 |
* |
19 |
* Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993 |
20 |
*/ |
21 |
|
22 |
#include <stdio.h> |
23 |
#include <stdlib.h> |
24 |
#include <memory.h> |
25 |
#include <limits.h> |
26 |
#include <re-cmp.h> |
27 |
#include <ctype.h> |
28 |
|
29 |
/* Get prototype functions to prevent warnings. */ |
30 |
#if defined (__sun__) && defined(StupidSunHeaders) |
31 |
# include <sys/types.h> |
32 |
# include <sys/time.h> |
33 |
# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */ |
34 |
#endif |
35 |
|
36 |
|
37 |
/* P r o t o t y p e s |
38 |
*/ |
39 |
const char *re_cmp(const char *, const char *); |
40 |
static bool re_cmp_step(const char *, const char *, unsigned, int); |
41 |
static void re_init(void); |
42 |
static bool re_match_token(unsigned char, selection *); |
43 |
static const char *re_get_token(selection *, const char *); |
44 |
#ifdef DEBUG2 |
45 |
static void re_dump_sel(selection *); |
46 |
#endif |
47 |
|
48 |
/* G l o b a l v a r i a b l e s |
49 |
*/ |
50 |
static bool re_init_done = false; |
51 |
static selection *re_token[RE_TOKEN_MAX]; |
52 |
static const char *re_substr[RE_TOKEN_MAX]; |
53 |
static unsigned int re_token_depth; |
54 |
|
55 |
/* E x t e r n a l f u n c t i o n |
56 |
*/ |
57 |
|
58 |
/* re-cmp - get regular expression match. |
59 |
* Return values: NULL - no match or error in regexp. |
60 |
* pointer to beginning of matching string |
61 |
*/ |
62 |
const char * |
63 |
re_cmp(const char *str, const char *regexp) { |
64 |
const char *next_regexp; |
65 |
bool once = false; |
66 |
bool matched; |
67 |
|
68 |
if (re_init_done == false) |
69 |
re_init(); |
70 |
|
71 |
#ifdef SAFE_CHECKS |
72 |
if (regexp == NULL || str == NULL) |
73 |
return NULL; |
74 |
#endif |
75 |
if (*regexp == '^') { |
76 |
once = true; |
77 |
++regexp; |
78 |
} |
79 |
if (*regexp == 0) { |
80 |
/* // or /^/ matches any string */ |
81 |
return str; |
82 |
} |
83 |
|
84 |
next_regexp = re_get_token(re_token[0], regexp); |
85 |
re_token_depth = 0; |
86 |
re_substr[0] = next_regexp; |
87 |
|
88 |
matched = false; |
89 |
while (*str != '\0' && !(matched = re_match_token(*str, re_token[0]))) |
90 |
str++; |
91 |
|
92 |
if (matched && *next_regexp == 0) |
93 |
return str; |
94 |
|
95 |
/* Apologies for the nearly duplicated code below, hopefully it |
96 |
* speeds things up. |
97 |
*/ |
98 |
if (once) { |
99 |
switch (re_token[0]->repeat) { |
100 |
case rep_once: |
101 |
if (matched == false) |
102 |
return NULL; |
103 |
break; |
104 |
case rep_once_or_more: |
105 |
if (matched == false) |
106 |
return NULL; |
107 |
|
108 |
if (re_cmp_step(str+1, regexp, 0, 1)) |
109 |
return str; |
110 |
break; |
111 |
case rep_null_or_once: |
112 |
if (matched == false) |
113 |
return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; |
114 |
break; |
115 |
case rep_null_or_more: |
116 |
if (matched) { |
117 |
if (re_cmp_step(str+1, regexp, 0, 1)) |
118 |
return str; |
119 |
} else { |
120 |
return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; |
121 |
} |
122 |
break; |
123 |
} |
124 |
return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL; |
125 |
} |
126 |
|
127 |
if (matched) { |
128 |
switch (re_token[0]->repeat) { |
129 |
case rep_once: |
130 |
case rep_null_or_once: |
131 |
break; |
132 |
case rep_once_or_more: |
133 |
case rep_null_or_more: |
134 |
if (re_cmp_step(str+1, regexp, 0, 1)) |
135 |
return str; |
136 |
break; |
137 |
} |
138 |
/* The logic here is that re_match_token only sees |
139 |
* if the one letter matches. Thus, if the |
140 |
* regex is like '@match eureca', and the |
141 |
* the user enters anything with an e, re_match_token |
142 |
* returns true, but they really need to match the |
143 |
* entire regexp, which re_cmp_step will do. |
144 |
* However, what happens is that there can be a case |
145 |
* where the string being match is something like |
146 |
* 'where is eureca'. In this case, the re_match_token |
147 |
* matches that first e, but the re_cmp_step below, |
148 |
* fails because the next character (r) doesn't match |
149 |
* the u. So we call re_cmp with the string |
150 |
* after the first r, so that it should hopefully match |
151 |
* up properly. |
152 |
*/ |
153 |
if (re_cmp_step(str+1, next_regexp, 1, 0)) |
154 |
return str; |
155 |
else if (*(str+1) != 0) |
156 |
return re_cmp(str+1, regexp); |
157 |
} |
158 |
return NULL; |
159 |
} |
160 |
|
161 |
/* A u x i l l i a r y f u n c t i o n s |
162 |
*/ |
163 |
|
164 |
static bool |
165 |
re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) { |
166 |
/* str - string to match |
167 |
* regexp - pattern |
168 |
* slot - number of the token which under consideration |
169 |
* matches - how many times the token has matched |
170 |
*/ |
171 |
const char *next_regexp; |
172 |
bool matched; |
173 |
|
174 |
#ifdef DEBUG |
175 |
/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/ |
176 |
#endif |
177 |
|
178 |
if (*regexp == 0) { |
179 |
/* When we reach the end of the regexp, the match is a success |
180 |
*/ |
181 |
return true; |
182 |
} |
183 |
|
184 |
/* This chunk of code makes sure that the regexp-tokenising happens |
185 |
* only once. We only tokenise as much as we need. |
186 |
*/ |
187 |
if (slot > re_token_depth) { |
188 |
re_token_depth = slot; |
189 |
if (re_token[slot] == NULL) |
190 |
re_token[slot] = (selection *) malloc(sizeof(selection)); |
191 |
next_regexp = re_get_token(re_token[slot], regexp); |
192 |
if (next_regexp == NULL) { |
193 |
/* Syntax error, what else can we do? */ |
194 |
return false; |
195 |
} |
196 |
re_substr[slot] = next_regexp; |
197 |
} else { |
198 |
next_regexp = re_substr[slot]; |
199 |
} |
200 |
|
201 |
matched = re_match_token(*str, re_token[slot]); |
202 |
if (matched) |
203 |
++matches; |
204 |
|
205 |
if (*str == 0) |
206 |
return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched; |
207 |
|
208 |
switch (re_token[slot]->repeat) { |
209 |
case rep_once: |
210 |
if (matches == 1) { /* (matches == 1) => (matched == true) */ |
211 |
return re_cmp_step(str+1, next_regexp, slot+1, 0); |
212 |
} |
213 |
return false; |
214 |
case rep_once_or_more: |
215 |
if (matched) { /* (matched == true) => (matches >= 1) */ |
216 |
/* First check if the current token repeats more */ |
217 |
if (re_cmp_step(str+1, regexp, slot, matches)) |
218 |
return true; |
219 |
return re_cmp_step(str+1, next_regexp, slot+1, 0); |
220 |
} |
221 |
return false; |
222 |
case rep_null_or_once: |
223 |
/* We must go on to the next token, but should we advance str? */ |
224 |
if (matches == 0) { |
225 |
return re_cmp_step(str, next_regexp, slot+1, 0); |
226 |
} else if (matches == 1) { |
227 |
return re_cmp_step(str+1, next_regexp, slot+1, 0); |
228 |
} |
229 |
return false; /* Not reached */ |
230 |
case rep_null_or_more: |
231 |
if (matched) { |
232 |
/* Look for further repeats, advance str */ |
233 |
if (re_cmp_step(str+1, regexp, slot, matches)) |
234 |
return true; |
235 |
return re_cmp_step(str, next_regexp, slot+1, 0); |
236 |
} |
237 |
return re_cmp_step(str, next_regexp, slot+1, 0); |
238 |
} |
239 |
return false; |
240 |
} |
241 |
|
242 |
static void |
243 |
re_init(void) { |
244 |
int i; |
245 |
|
246 |
re_token[0] = (selection *) malloc(sizeof(selection)); |
247 |
for (i = 1; i < RE_TOKEN_MAX; i++) |
248 |
re_token[i] = NULL; |
249 |
|
250 |
re_init_done = true; |
251 |
} |
252 |
|
253 |
static bool |
254 |
re_match_token(unsigned char c, selection *sel) { |
255 |
switch (sel->type) { |
256 |
case sel_any: |
257 |
return true; |
258 |
case sel_end: |
259 |
return (c == 0); |
260 |
case sel_single: |
261 |
return (tolower(c) == tolower(sel->u.single)); |
262 |
case sel_range: |
263 |
return (c >= sel->u.range.low && c <= sel->u.range.high); |
264 |
case sel_array: |
265 |
return (sel->u.array[c]); |
266 |
case sel_not_single: |
267 |
return (tolower(c) != tolower(sel->u.single)); |
268 |
case sel_not_range: |
269 |
return (c < sel->u.range.low && c > sel->u.range.high); |
270 |
} |
271 |
return false; |
272 |
} |
273 |
|
274 |
/* re_get_token - get regular expression token |
275 |
* Returns the first token found in <regexp> in <sel> |
276 |
* Return values: NULL syntax error |
277 |
* pointer to first character past token. |
278 |
*/ |
279 |
static const char * |
280 |
re_get_token(selection *sel, const char *regexp) { |
281 |
|
282 |
#ifdef SAFE_CHECKS |
283 |
# define exit_if_null if (*regexp == 0) return NULL |
284 |
#else |
285 |
# define exit_if_null |
286 |
#endif |
287 |
|
288 |
bool quoted = false; |
289 |
unsigned char looking_at; |
290 |
|
291 |
#ifdef SAFE_CHECKS |
292 |
if (sel == NULL || regexp == NULL || *regexp == 0) |
293 |
return NULL; |
294 |
#endif |
295 |
|
296 |
do { |
297 |
looking_at = *regexp++; |
298 |
switch (looking_at) { |
299 |
case '$': |
300 |
if (quoted) { |
301 |
quoted = false; |
302 |
sel->type = sel_single; |
303 |
sel->u.single = looking_at; |
304 |
} else { |
305 |
sel->type = sel_end; |
306 |
} |
307 |
break; |
308 |
case '.': |
309 |
if (quoted) { |
310 |
quoted = false; |
311 |
sel->type = sel_single; |
312 |
sel->u.single = looking_at; |
313 |
} else { |
314 |
sel->type = sel_any; |
315 |
} |
316 |
break; |
317 |
case '[': |
318 |
/* The fun stuff... perhaps a little obfuscated since I |
319 |
* don't trust the compiler to analyse liveness. |
320 |
*/ |
321 |
if (quoted) { |
322 |
quoted = false; |
323 |
sel->type = sel_single; |
324 |
sel->u.single = looking_at; |
325 |
} else { |
326 |
bool neg = false; |
327 |
unsigned char first, last = 0; |
328 |
|
329 |
exit_if_null; |
330 |
looking_at = *regexp++; |
331 |
|
332 |
if (looking_at == '^') { |
333 |
neg = true; |
334 |
exit_if_null; |
335 |
looking_at = *regexp++; |
336 |
} |
337 |
first = looking_at; |
338 |
exit_if_null; |
339 |
looking_at = *regexp++; |
340 |
if (looking_at == ']') { |
341 |
/* On the form [q] or [^q] */ |
342 |
sel->type = neg ? sel_not_single : sel_single; |
343 |
sel->u.single = first; |
344 |
break; |
345 |
} else if (looking_at == '-') { |
346 |
exit_if_null; |
347 |
last = *regexp++; |
348 |
if (last == ']') { |
349 |
/* On the form [A-] or [^A-]. Checking for |
350 |
* [,-] and making it a range is probably not |
351 |
* worth it :-) |
352 |
*/ |
353 |
sel->type = sel_array; |
354 |
memset(sel->u.array, neg, sizeof(sel->u.array)); |
355 |
sel->u.array[first] = sel->u.array['-'] = !neg; |
356 |
break; |
357 |
} else { |
358 |
exit_if_null; |
359 |
looking_at = *regexp++; |
360 |
if (looking_at == ']') { |
361 |
/* On the form [A-G] or [^A-G]. Note that [G-A] |
362 |
* is a syntax error. Fair enough, I think. |
363 |
*/ |
364 |
#ifdef SAFE_CHECK |
365 |
if (first > last) |
366 |
return NULL; |
367 |
#endif |
368 |
sel->type = neg ? sel_not_range : sel_range; |
369 |
sel->u.range.low = first; |
370 |
sel->u.range.high = last; |
371 |
break; |
372 |
} |
373 |
} |
374 |
} |
375 |
{ |
376 |
/* The datastructure can only represent a RE this |
377 |
* complex with an array. |
378 |
*/ |
379 |
int i; |
380 |
unsigned char previous; |
381 |
|
382 |
sel->type = sel_array; |
383 |
memset(sel->u.array, neg, sizeof(sel->u.array)); |
384 |
if (last) { |
385 |
/* It starts with a range */ |
386 |
#ifdef SAFE_CHECK |
387 |
if (first > last) |
388 |
return NULL; |
389 |
#endif |
390 |
for (i = first; i <= last; i++) { |
391 |
sel->u.array[i] = !neg; |
392 |
} |
393 |
} else { |
394 |
/* It begins with a "random" character */ |
395 |
sel->u.array[first] = !neg; |
396 |
} |
397 |
sel->u.array[looking_at] = !neg; |
398 |
|
399 |
exit_if_null; |
400 |
previous = looking_at; |
401 |
looking_at = *regexp++; |
402 |
|
403 |
/* Add more characters to the array until we reach |
404 |
* ]. Quoting doesn't and shouldn't work in here. |
405 |
* ("]" should be put first, and "-" last if they |
406 |
* are needed inside this construct.) |
407 |
* Look for ranges as we go along. |
408 |
*/ |
409 |
while (looking_at != ']') { |
410 |
if (looking_at == '-') { |
411 |
exit_if_null; |
412 |
looking_at = *regexp++; |
413 |
if (looking_at != ']') { |
414 |
#ifdef SAFE_CHECK |
415 |
if (previous > looking_at) |
416 |
return NULL; |
417 |
#endif |
418 |
for (i = previous+1; i < looking_at; i++) { |
419 |
/* previous has already been set and |
420 |
* looking_at is set below. |
421 |
*/ |
422 |
sel->u.array[i] = !neg; |
423 |
} |
424 |
exit_if_null; |
425 |
} else { |
426 |
sel->u.array['-'] = !neg; |
427 |
break; |
428 |
} |
429 |
} |
430 |
sel->u.array[looking_at] = !neg; |
431 |
previous = looking_at; |
432 |
exit_if_null; |
433 |
looking_at = *regexp++; |
434 |
} |
435 |
} |
436 |
} |
437 |
break; |
438 |
case '\\': |
439 |
if (quoted) { |
440 |
quoted = false; |
441 |
sel->type = sel_single; |
442 |
sel->u.single = looking_at; |
443 |
} else { |
444 |
quoted = true; |
445 |
} |
446 |
break; |
447 |
default: |
448 |
quoted = false; |
449 |
sel->type = sel_single; |
450 |
sel->u.single = looking_at; |
451 |
break; |
452 |
} |
453 |
} while (quoted); |
454 |
|
455 |
if (*regexp == '*') { |
456 |
sel->repeat = rep_null_or_more; |
457 |
++regexp; |
458 |
} else if (*regexp == '?') { |
459 |
sel->repeat = rep_null_or_once; |
460 |
++regexp; |
461 |
} else if (*regexp == '+') { |
462 |
sel->repeat = rep_once_or_more; |
463 |
++regexp; |
464 |
} else { |
465 |
sel->repeat = rep_once; |
466 |
} |
467 |
|
468 |
return regexp; |
469 |
} |
470 |
|
471 |
/* D e b u g c o d e |
472 |
*/ |
473 |
#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */ |
474 |
static void |
475 |
re_dump_sel(selection *sel) { |
476 |
switch(sel->type) { |
477 |
case sel_any: |
478 |
printf("."); |
479 |
break; |
480 |
case sel_end: |
481 |
printf("$"); |
482 |
break; |
483 |
case sel_single: |
484 |
printf("<%c>", sel->u.single); |
485 |
break; |
486 |
case sel_range: |
487 |
printf("[%c-%c]", sel->u.range.low, sel->u.range.high); |
488 |
break; |
489 |
case sel_array: |
490 |
{ |
491 |
int i; |
492 |
printf("["); |
493 |
for (i = 0; i < uchar_MAX; i++) { |
494 |
if (sel->u.array[i]) { |
495 |
printf("%c", i); |
496 |
} |
497 |
} |
498 |
printf("]"); |
499 |
} |
500 |
break; |
501 |
case sel_not_single: |
502 |
printf("[^%c]", sel->u.single); |
503 |
break; |
504 |
case sel_not_range: |
505 |
printf("[^%c-%c]", sel->u.range.low, sel->u.range.high); |
506 |
break; |
507 |
default: |
508 |
printf("<UNKNOWN TOKEN!>"); |
509 |
break; |
510 |
} |
511 |
switch(sel->repeat) { |
512 |
case rep_once: |
513 |
break; |
514 |
case rep_null_or_once: |
515 |
printf("?"); |
516 |
break; |
517 |
case rep_null_or_more: |
518 |
printf("*"); |
519 |
break; |
520 |
case rep_once_or_more: |
521 |
printf("+"); |
522 |
break; |
523 |
default: |
524 |
printf("<UNKNOWN REP-TOKEN!>"); |
525 |
break; |
526 |
} |
527 |
} |
528 |
|
529 |
int |
530 |
main(int argc, char *argv[]) { |
531 |
char *re, *m; |
532 |
selection sel; |
533 |
|
534 |
re = re_get_token(&sel, argv[1]); |
535 |
|
536 |
printf("'%s' -> '%s'\n", argv[1], re); |
537 |
re_dump_sel(&sel); |
538 |
printf("\n"); |
539 |
m = re_cmp(argv[2], argv[1]); |
540 |
if (m) |
541 |
printf("MATCH! -> '%s'\n", m); |
542 |
return 0; |
543 |
} |
544 |
#endif |