ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/deliantra/server/common/re-cmp.C
Revision: 1.4
Committed: Sun Sep 10 16:00:23 2006 UTC (17 years, 8 months ago) by root
Content type: text/plain
Branch: MAIN
Changes since 1.3: +445 -368 lines
Log Message:
indent

File Contents

# Content
1
2 /*
3 * static char *rcsid_player_c =
4 * "$Id: re-cmp.C,v 1.3 2006-09-08 04:51:08 pippijn Exp $";
5 */
6
7
8 /* re-cmp.c
9 * Pattern match a string, parsing some of the common RE-metacharacters.
10 *
11 * This code is public domain, but I would appreciate to hear of
12 * improvements or even the fact that you use it in your program.
13 *
14 * Deliberate BUGS:
15 * - These tokens are not supported: | ( )
16 * - You will get the longest expansion of the _first_ string which
17 * matches the RE, not the longest string which would be the proper
18 * behaviour for a RE-matcher.
19 *
20 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
21 */
22
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <memory.h>
26 #include <limits.h>
27 #include <re-cmp.h>
28 #include <ctype.h>
29
30 /* Get prototype functions to prevent warnings. */
31 #if defined (__sun__) && defined(StupidSunHeaders)
32 # include <sys/types.h>
33 # include <sys/time.h>
34 # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
35 #endif
36
37
38 /* P r o t o t y p e s
39 */
40 const char *re_cmp (const char *, const char *);
41 static bool re_cmp_step (const char *, const char *, unsigned, int);
42 static void re_init (void);
43 static bool re_match_token (unsigned char, selection *);
44 static const char *re_get_token (selection *, const char *);
45
46 #ifdef DEBUG2
47 static void re_dump_sel (selection *);
48 #endif
49
50 /* G l o b a l v a r i a b l e s
51 */
52 static bool re_init_done = false;
53 static selection *re_token[RE_TOKEN_MAX];
54 static const char *re_substr[RE_TOKEN_MAX];
55 static unsigned int re_token_depth;
56
57 /* E x t e r n a l f u n c t i o n
58 */
59
60 /* re-cmp - get regular expression match.
61 * Return values: NULL - no match or error in regexp.
62 * pointer to beginning of matching string
63 */
64 const char *
65 re_cmp (const char *str, const char *regexp)
66 {
67 const char *next_regexp;
68 bool once = false;
69 bool matched;
70
71 if (re_init_done == false)
72 re_init ();
73
74 #ifdef SAFE_CHECKS
75 if (regexp == NULL || str == NULL)
76 return NULL;
77 #endif
78 if (*regexp == '^')
79 {
80 once = true;
81 ++regexp;
82 }
83 if (*regexp == 0)
84 {
85 /* // or /^/ matches any string */
86 return str;
87 }
88
89 next_regexp = re_get_token (re_token[0], regexp);
90 re_token_depth = 0;
91 re_substr[0] = next_regexp;
92
93 matched = false;
94 while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
95 str++;
96
97 if (matched && *next_regexp == 0)
98 return str;
99
100 /* Apologies for the nearly duplicated code below, hopefully it
101 * speeds things up.
102 */
103 if (once)
104 {
105 switch (re_token[0]->repeat)
106 {
107 case rep_once:
108 if (matched == false)
109 return NULL;
110 break;
111 case rep_once_or_more:
112 if (matched == false)
113 return NULL;
114
115 if (re_cmp_step (str + 1, regexp, 0, 1))
116 return str;
117 break;
118 case rep_null_or_once:
119 if (matched == false)
120 return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
121 break;
122 case rep_null_or_more:
123 if (matched)
124 {
125 if (re_cmp_step (str + 1, regexp, 0, 1))
126 return str;
127 }
128 else
129 {
130 return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
131 }
132 break;
133 }
134 return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
135 }
136
137 if (matched)
138 {
139 switch (re_token[0]->repeat)
140 {
141 case rep_once:
142 case rep_null_or_once:
143 break;
144 case rep_once_or_more:
145 case rep_null_or_more:
146 if (re_cmp_step (str + 1, regexp, 0, 1))
147 return str;
148 break;
149 }
150 /* The logic here is that re_match_token only sees
151 * if the one letter matches. Thus, if the
152 * regex is like '@match eureca', and the
153 * the user enters anything with an e, re_match_token
154 * returns true, but they really need to match the
155 * entire regexp, which re_cmp_step will do.
156 * However, what happens is that there can be a case
157 * where the string being match is something like
158 * 'where is eureca'. In this case, the re_match_token
159 * matches that first e, but the re_cmp_step below,
160 * fails because the next character (r) doesn't match
161 * the u. So we call re_cmp with the string
162 * after the first r, so that it should hopefully match
163 * up properly.
164 */
165 if (re_cmp_step (str + 1, next_regexp, 1, 0))
166 return str;
167 else if (*(str + 1) != 0)
168 return re_cmp (str + 1, regexp);
169 }
170 return NULL;
171 }
172
173 /* A u x i l l i a r y f u n c t i o n s
174 */
175
176 static bool
177 re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
178 {
179 /* str - string to match
180 * regexp - pattern
181 * slot - number of the token which under consideration
182 * matches - how many times the token has matched
183 */
184 const char *next_regexp;
185 bool matched;
186
187 #ifdef DEBUG
188
189 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
190 #endif
191
192 if (*regexp == 0)
193 {
194 /* When we reach the end of the regexp, the match is a success
195 */
196 return true;
197 }
198
199 /* This chunk of code makes sure that the regexp-tokenising happens
200 * only once. We only tokenise as much as we need.
201 */
202 if (slot > re_token_depth)
203 {
204 re_token_depth = slot;
205 if (re_token[slot] == NULL)
206 re_token[slot] = (selection *) malloc (sizeof (selection));
207 next_regexp = re_get_token (re_token[slot], regexp);
208 if (next_regexp == NULL)
209 {
210 /* Syntax error, what else can we do? */
211 return false;
212 }
213 re_substr[slot] = next_regexp;
214 }
215 else
216 {
217 next_regexp = re_substr[slot];
218 }
219
220 matched = re_match_token (*str, re_token[slot]);
221 if (matched)
222 ++matches;
223
224 if (*str == 0)
225 return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
226
227 switch (re_token[slot]->repeat)
228 {
229 case rep_once:
230 if (matches == 1)
231 { /* (matches == 1) => (matched == true) */
232 return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
233 }
234 return false;
235 case rep_once_or_more:
236 if (matched)
237 { /* (matched == true) => (matches >= 1) */
238 /* First check if the current token repeats more */
239 if (re_cmp_step (str + 1, regexp, slot, matches))
240 return true;
241 return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
242 }
243 return false;
244 case rep_null_or_once:
245 /* We must go on to the next token, but should we advance str? */
246 if (matches == 0)
247 {
248 return re_cmp_step (str, next_regexp, slot + 1, 0);
249 }
250 else if (matches == 1)
251 {
252 return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
253 }
254 return false; /* Not reached */
255 case rep_null_or_more:
256 if (matched)
257 {
258 /* Look for further repeats, advance str */
259 if (re_cmp_step (str + 1, regexp, slot, matches))
260 return true;
261 return re_cmp_step (str, next_regexp, slot + 1, 0);
262 }
263 return re_cmp_step (str, next_regexp, slot + 1, 0);
264 }
265 return false;
266 }
267
268 static void
269 re_init (void)
270 {
271 int i;
272
273 re_token[0] = (selection *) malloc (sizeof (selection));
274 for (i = 1; i < RE_TOKEN_MAX; i++)
275 re_token[i] = NULL;
276
277 re_init_done = true;
278 }
279
280 static bool
281 re_match_token (unsigned char c, selection * sel)
282 {
283 switch (sel->type)
284 {
285 case sel_any:
286 return true;
287 case sel_end:
288 return (c == 0);
289 case sel_single:
290 return (tolower (c) == tolower (sel->u.single));
291 case sel_range:
292 return (c >= sel->u.range.low && c <= sel->u.range.high);
293 case sel_array:
294 return (sel->u.array[c]);
295 case sel_not_single:
296 return (tolower (c) != tolower (sel->u.single));
297 case sel_not_range:
298 return (c < sel->u.range.low && c > sel->u.range.high);
299 }
300 return false;
301 }
302
303 /* re_get_token - get regular expression token
304 * Returns the first token found in <regexp> in <sel>
305 * Return values: NULL syntax error
306 * pointer to first character past token.
307 */
308 static const char *
309 re_get_token (selection * sel, const char *regexp)
310 {
311
312 #ifdef SAFE_CHECKS
313 # define exit_if_null if (*regexp == 0) return NULL
314 #else
315 # define exit_if_null
316 #endif
317
318 bool quoted = false;
319 unsigned char looking_at;
320
321 #ifdef SAFE_CHECKS
322 if (sel == NULL || regexp == NULL || *regexp == 0)
323 return NULL;
324 #endif
325
326 do
327 {
328 looking_at = *regexp++;
329 switch (looking_at)
330 {
331 case '$':
332 if (quoted)
333 {
334 quoted = false;
335 sel->type = sel_single;
336 sel->u.single = looking_at;
337 }
338 else
339 {
340 sel->type = sel_end;
341 }
342 break;
343 case '.':
344 if (quoted)
345 {
346 quoted = false;
347 sel->type = sel_single;
348 sel->u.single = looking_at;
349 }
350 else
351 {
352 sel->type = sel_any;
353 }
354 break;
355 case '[':
356 /* The fun stuff... perhaps a little obfuscated since I
357 * don't trust the compiler to analyse liveness.
358 */
359 if (quoted)
360 {
361 quoted = false;
362 sel->type = sel_single;
363 sel->u.single = looking_at;
364 }
365 else
366 {
367 bool neg = false;
368 unsigned char first, last = 0;
369
370 exit_if_null;
371 looking_at = *regexp++;
372
373 if (looking_at == '^')
374 {
375 neg = true;
376 exit_if_null;
377 looking_at = *regexp++;
378 }
379 first = looking_at;
380 exit_if_null;
381 looking_at = *regexp++;
382 if (looking_at == ']')
383 {
384 /* On the form [q] or [^q] */
385 sel->type = neg ? sel_not_single : sel_single;
386 sel->u.single = first;
387 break;
388 }
389 else if (looking_at == '-')
390 {
391 exit_if_null;
392 last = *regexp++;
393 if (last == ']')
394 {
395 /* On the form [A-] or [^A-]. Checking for
396 * [,-] and making it a range is probably not
397 * worth it :-)
398 */
399 sel->type = sel_array;
400 memset (sel->u.array, neg, sizeof (sel->u.array));
401 sel->u.array[first] = sel->u.array['-'] = !neg;
402 break;
403 }
404 else
405 {
406 exit_if_null;
407 looking_at = *regexp++;
408 if (looking_at == ']')
409 {
410 /* On the form [A-G] or [^A-G]. Note that [G-A]
411 * is a syntax error. Fair enough, I think.
412 */
413 #ifdef SAFE_CHECK
414 if (first > last)
415 return NULL;
416 #endif
417 sel->type = neg ? sel_not_range : sel_range;
418 sel->u.range.low = first;
419 sel->u.range.high = last;
420 break;
421 }
422 }
423 }
424 {
425 /* The datastructure can only represent a RE this
426 * complex with an array.
427 */
428 int i;
429 unsigned char previous;
430
431 sel->type = sel_array;
432 memset (sel->u.array, neg, sizeof (sel->u.array));
433 if (last)
434 {
435 /* It starts with a range */
436 #ifdef SAFE_CHECK
437 if (first > last)
438 return NULL;
439 #endif
440 for (i = first; i <= last; i++)
441 {
442 sel->u.array[i] = !neg;
443 }
444 }
445 else
446 {
447 /* It begins with a "random" character */
448 sel->u.array[first] = !neg;
449 }
450 sel->u.array[looking_at] = !neg;
451
452 exit_if_null;
453 previous = looking_at;
454 looking_at = *regexp++;
455
456 /* Add more characters to the array until we reach
457 * ]. Quoting doesn't and shouldn't work in here.
458 * ("]" should be put first, and "-" last if they
459 * are needed inside this construct.)
460 * Look for ranges as we go along.
461 */
462 while (looking_at != ']')
463 {
464 if (looking_at == '-')
465 {
466 exit_if_null;
467 looking_at = *regexp++;
468 if (looking_at != ']')
469 {
470 #ifdef SAFE_CHECK
471 if (previous > looking_at)
472 return NULL;
473 #endif
474 for (i = previous + 1; i < looking_at; i++)
475 {
476 /* previous has already been set and
477 * looking_at is set below.
478 */
479 sel->u.array[i] = !neg;
480 }
481 exit_if_null;
482 }
483 else
484 {
485 sel->u.array['-'] = !neg;
486 break;
487 }
488 }
489 sel->u.array[looking_at] = !neg;
490 previous = looking_at;
491 exit_if_null;
492 looking_at = *regexp++;
493 }
494 }
495 }
496 break;
497 case '\\':
498 if (quoted)
499 {
500 quoted = false;
501 sel->type = sel_single;
502 sel->u.single = looking_at;
503 }
504 else
505 {
506 quoted = true;
507 }
508 break;
509 default:
510 quoted = false;
511 sel->type = sel_single;
512 sel->u.single = looking_at;
513 break;
514 }
515 }
516 while (quoted);
517
518 if (*regexp == '*')
519 {
520 sel->repeat = rep_null_or_more;
521 ++regexp;
522 }
523 else if (*regexp == '?')
524 {
525 sel->repeat = rep_null_or_once;
526 ++regexp;
527 }
528 else if (*regexp == '+')
529 {
530 sel->repeat = rep_once_or_more;
531 ++regexp;
532 }
533 else
534 {
535 sel->repeat = rep_once;
536 }
537
538 return regexp;
539 }
540
541 /* D e b u g c o d e
542 */
543 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
544 static void
545 re_dump_sel (selection * sel)
546 {
547 switch (sel->type)
548 {
549 case sel_any:
550 printf (".");
551 break;
552 case sel_end:
553 printf ("$");
554 break;
555 case sel_single:
556 printf ("<%c>", sel->u.single);
557 break;
558 case sel_range:
559 printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
560 break;
561 case sel_array:
562 {
563 int i;
564
565 printf ("[");
566 for (i = 0; i < uchar_MAX; i++)
567 {
568 if (sel->u.array[i])
569 {
570 printf ("%c", i);
571 }
572 }
573 printf ("]");
574 }
575 break;
576 case sel_not_single:
577 printf ("[^%c]", sel->u.single);
578 break;
579 case sel_not_range:
580 printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
581 break;
582 default:
583 printf ("<UNKNOWN TOKEN!>");
584 break;
585 }
586 switch (sel->repeat)
587 {
588 case rep_once:
589 break;
590 case rep_null_or_once:
591 printf ("?");
592 break;
593 case rep_null_or_more:
594 printf ("*");
595 break;
596 case rep_once_or_more:
597 printf ("+");
598 break;
599 default:
600 printf ("<UNKNOWN REP-TOKEN!>");
601 break;
602 }
603 }
604
605 int
606 main (int argc, char *argv[])
607 {
608 char *re, *m;
609 selection sel;
610
611 re = re_get_token (&sel, argv[1]);
612
613 printf ("'%s' -> '%s'\n", argv[1], re);
614 re_dump_sel (&sel);
615 printf ("\n");
616 m = re_cmp (argv[2], argv[1]);
617 if (m)
618 printf ("MATCH! -> '%s'\n", m);
619 return 0;
620 }
621 #endif