ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/deliantra/server/common/re-cmp.C
Revision: 1.6
Committed: Sat Dec 9 17:28:37 2006 UTC (17 years, 5 months ago) by pippijn
Content type: text/plain
Branch: MAIN
CVS Tags: HEAD
Changes since 1.5: +0 -0 lines
State: FILE REMOVED
Log Message:
removed regex comparison. this is now done with perl

File Contents

# Content
1 /* re-cmp.c
2 * Pattern match a string, parsing some of the common RE-metacharacters.
3 *
4 * This code is public domain, but I would appreciate to hear of
5 * improvements or even the fact that you use it in your program.
6 *
7 * Deliberate BUGS:
8 * - These tokens are not supported: | ( )
9 * - You will get the longest expansion of the _first_ string which
10 * matches the RE, not the longest string which would be the proper
11 * behaviour for a RE-matcher.
12 *
13 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
14 */
15
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <memory.h>
19 #include <limits.h>
20 #include <re-cmp.h>
21 #include <ctype.h>
22
23 /* Get prototype functions to prevent warnings. */
24 #if defined (__sun__) && defined(StupidSunHeaders)
25 # include <sys/types.h>
26 # include <sys/time.h>
27 # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
28 #endif
29
30
31 /* P r o t o t y p e s
32 */
33 const char *re_cmp (const char *, const char *);
34 static bool re_cmp_step (const char *, const char *, unsigned, int);
35 static void re_init (void);
36 static bool re_match_token (unsigned char, selection *);
37 static const char *re_get_token (selection *, const char *);
38
39 #ifdef DEBUG2
40 static void re_dump_sel (selection *);
41 #endif
42
43 /* G l o b a l v a r i a b l e s
44 */
45 static bool re_init_done = false;
46 static selection *re_token[RE_TOKEN_MAX];
47 static const char *re_substr[RE_TOKEN_MAX];
48 static unsigned int re_token_depth;
49
50 /* E x t e r n a l f u n c t i o n
51 */
52
53 /* re-cmp - get regular expression match.
54 * Return values: NULL - no match or error in regexp.
55 * pointer to beginning of matching string
56 */
57 const char *
58 re_cmp (const char *str, const char *regexp)
59 {
60 const char *next_regexp;
61 bool once = false;
62 bool matched;
63
64 if (re_init_done == false)
65 re_init ();
66
67 #ifdef SAFE_CHECKS
68 if (regexp == NULL || str == NULL)
69 return NULL;
70 #endif
71 if (*regexp == '^')
72 {
73 once = true;
74 ++regexp;
75 }
76 if (*regexp == 0)
77 {
78 /* // or /^/ matches any string */
79 return str;
80 }
81
82 next_regexp = re_get_token (re_token[0], regexp);
83 re_token_depth = 0;
84 re_substr[0] = next_regexp;
85
86 matched = false;
87 while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
88 str++;
89
90 if (matched && *next_regexp == 0)
91 return str;
92
93 /* Apologies for the nearly duplicated code below, hopefully it
94 * speeds things up.
95 */
96 if (once)
97 {
98 switch (re_token[0]->repeat)
99 {
100 case rep_once:
101 if (matched == false)
102 return NULL;
103 break;
104 case rep_once_or_more:
105 if (matched == false)
106 return NULL;
107
108 if (re_cmp_step (str + 1, regexp, 0, 1))
109 return str;
110 break;
111 case rep_null_or_once:
112 if (matched == false)
113 return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
114 break;
115 case rep_null_or_more:
116 if (matched)
117 {
118 if (re_cmp_step (str + 1, regexp, 0, 1))
119 return str;
120 }
121 else
122 {
123 return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
124 }
125 break;
126 }
127 return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
128 }
129
130 if (matched)
131 {
132 switch (re_token[0]->repeat)
133 {
134 case rep_once:
135 case rep_null_or_once:
136 break;
137 case rep_once_or_more:
138 case rep_null_or_more:
139 if (re_cmp_step (str + 1, regexp, 0, 1))
140 return str;
141 break;
142 }
143 /* The logic here is that re_match_token only sees
144 * if the one letter matches. Thus, if the
145 * regex is like '@match eureca', and the
146 * the user enters anything with an e, re_match_token
147 * returns true, but they really need to match the
148 * entire regexp, which re_cmp_step will do.
149 * However, what happens is that there can be a case
150 * where the string being match is something like
151 * 'where is eureca'. In this case, the re_match_token
152 * matches that first e, but the re_cmp_step below,
153 * fails because the next character (r) doesn't match
154 * the u. So we call re_cmp with the string
155 * after the first r, so that it should hopefully match
156 * up properly.
157 */
158 if (re_cmp_step (str + 1, next_regexp, 1, 0))
159 return str;
160 else if (*(str + 1) != 0)
161 return re_cmp (str + 1, regexp);
162 }
163 return NULL;
164 }
165
166 /* A u x i l l i a r y f u n c t i o n s
167 */
168
169 static bool
170 re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
171 {
172 /* str - string to match
173 * regexp - pattern
174 * slot - number of the token which under consideration
175 * matches - how many times the token has matched
176 */
177 const char *next_regexp;
178 bool matched;
179
180 #ifdef DEBUG
181
182 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
183 #endif
184
185 if (*regexp == 0)
186 {
187 /* When we reach the end of the regexp, the match is a success
188 */
189 return true;
190 }
191
192 /* This chunk of code makes sure that the regexp-tokenising happens
193 * only once. We only tokenise as much as we need.
194 */
195 if (slot > re_token_depth)
196 {
197 re_token_depth = slot;
198 if (re_token[slot] == NULL)
199 re_token[slot] = (selection *) malloc (sizeof (selection));
200 next_regexp = re_get_token (re_token[slot], regexp);
201 if (next_regexp == NULL)
202 {
203 /* Syntax error, what else can we do? */
204 return false;
205 }
206 re_substr[slot] = next_regexp;
207 }
208 else
209 {
210 next_regexp = re_substr[slot];
211 }
212
213 matched = re_match_token (*str, re_token[slot]);
214 if (matched)
215 ++matches;
216
217 if (*str == 0)
218 return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
219
220 switch (re_token[slot]->repeat)
221 {
222 case rep_once:
223 if (matches == 1)
224 { /* (matches == 1) => (matched == true) */
225 return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
226 }
227 return false;
228 case rep_once_or_more:
229 if (matched)
230 { /* (matched == true) => (matches >= 1) */
231 /* First check if the current token repeats more */
232 if (re_cmp_step (str + 1, regexp, slot, matches))
233 return true;
234 return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
235 }
236 return false;
237 case rep_null_or_once:
238 /* We must go on to the next token, but should we advance str? */
239 if (matches == 0)
240 {
241 return re_cmp_step (str, next_regexp, slot + 1, 0);
242 }
243 else if (matches == 1)
244 {
245 return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
246 }
247 return false; /* Not reached */
248 case rep_null_or_more:
249 if (matched)
250 {
251 /* Look for further repeats, advance str */
252 if (re_cmp_step (str + 1, regexp, slot, matches))
253 return true;
254 return re_cmp_step (str, next_regexp, slot + 1, 0);
255 }
256 return re_cmp_step (str, next_regexp, slot + 1, 0);
257 }
258 return false;
259 }
260
261 static void
262 re_init (void)
263 {
264 int i;
265
266 re_token[0] = (selection *) malloc (sizeof (selection));
267 for (i = 1; i < RE_TOKEN_MAX; i++)
268 re_token[i] = NULL;
269
270 re_init_done = true;
271 }
272
273 static bool
274 re_match_token (unsigned char c, selection * sel)
275 {
276 switch (sel->type)
277 {
278 case sel_any:
279 return true;
280 case sel_end:
281 return (c == 0);
282 case sel_single:
283 return (tolower (c) == tolower (sel->u.single));
284 case sel_range:
285 return (c >= sel->u.range.low && c <= sel->u.range.high);
286 case sel_array:
287 return (sel->u.array[c]);
288 case sel_not_single:
289 return (tolower (c) != tolower (sel->u.single));
290 case sel_not_range:
291 return (c < sel->u.range.low && c > sel->u.range.high);
292 }
293 return false;
294 }
295
296 /* re_get_token - get regular expression token
297 * Returns the first token found in <regexp> in <sel>
298 * Return values: NULL syntax error
299 * pointer to first character past token.
300 */
301 static const char *
302 re_get_token (selection * sel, const char *regexp)
303 {
304
305 #ifdef SAFE_CHECKS
306 # define exit_if_null if (*regexp == 0) return NULL
307 #else
308 # define exit_if_null
309 #endif
310
311 bool quoted = false;
312 unsigned char looking_at;
313
314 #ifdef SAFE_CHECKS
315 if (sel == NULL || regexp == NULL || *regexp == 0)
316 return NULL;
317 #endif
318
319 do
320 {
321 looking_at = *regexp++;
322 switch (looking_at)
323 {
324 case '$':
325 if (quoted)
326 {
327 quoted = false;
328 sel->type = sel_single;
329 sel->u.single = looking_at;
330 }
331 else
332 {
333 sel->type = sel_end;
334 }
335 break;
336 case '.':
337 if (quoted)
338 {
339 quoted = false;
340 sel->type = sel_single;
341 sel->u.single = looking_at;
342 }
343 else
344 {
345 sel->type = sel_any;
346 }
347 break;
348 case '[':
349 /* The fun stuff... perhaps a little obfuscated since I
350 * don't trust the compiler to analyse liveness.
351 */
352 if (quoted)
353 {
354 quoted = false;
355 sel->type = sel_single;
356 sel->u.single = looking_at;
357 }
358 else
359 {
360 bool neg = false;
361 unsigned char first, last = 0;
362
363 exit_if_null;
364 looking_at = *regexp++;
365
366 if (looking_at == '^')
367 {
368 neg = true;
369 exit_if_null;
370 looking_at = *regexp++;
371 }
372 first = looking_at;
373 exit_if_null;
374 looking_at = *regexp++;
375 if (looking_at == ']')
376 {
377 /* On the form [q] or [^q] */
378 sel->type = neg ? sel_not_single : sel_single;
379 sel->u.single = first;
380 break;
381 }
382 else if (looking_at == '-')
383 {
384 exit_if_null;
385 last = *regexp++;
386 if (last == ']')
387 {
388 /* On the form [A-] or [^A-]. Checking for
389 * [,-] and making it a range is probably not
390 * worth it :-)
391 */
392 sel->type = sel_array;
393 memset (sel->u.array, neg, sizeof (sel->u.array));
394 sel->u.array[first] = sel->u.array['-'] = !neg;
395 break;
396 }
397 else
398 {
399 exit_if_null;
400 looking_at = *regexp++;
401 if (looking_at == ']')
402 {
403 /* On the form [A-G] or [^A-G]. Note that [G-A]
404 * is a syntax error. Fair enough, I think.
405 */
406 #ifdef SAFE_CHECK
407 if (first > last)
408 return NULL;
409 #endif
410 sel->type = neg ? sel_not_range : sel_range;
411 sel->u.range.low = first;
412 sel->u.range.high = last;
413 break;
414 }
415 }
416 }
417 {
418 /* The datastructure can only represent a RE this
419 * complex with an array.
420 */
421 int i;
422 unsigned char previous;
423
424 sel->type = sel_array;
425 memset (sel->u.array, neg, sizeof (sel->u.array));
426 if (last)
427 {
428 /* It starts with a range */
429 #ifdef SAFE_CHECK
430 if (first > last)
431 return NULL;
432 #endif
433 for (i = first; i <= last; i++)
434 {
435 sel->u.array[i] = !neg;
436 }
437 }
438 else
439 {
440 /* It begins with a "random" character */
441 sel->u.array[first] = !neg;
442 }
443 sel->u.array[looking_at] = !neg;
444
445 exit_if_null;
446 previous = looking_at;
447 looking_at = *regexp++;
448
449 /* Add more characters to the array until we reach
450 * ]. Quoting doesn't and shouldn't work in here.
451 * ("]" should be put first, and "-" last if they
452 * are needed inside this construct.)
453 * Look for ranges as we go along.
454 */
455 while (looking_at != ']')
456 {
457 if (looking_at == '-')
458 {
459 exit_if_null;
460 looking_at = *regexp++;
461 if (looking_at != ']')
462 {
463 #ifdef SAFE_CHECK
464 if (previous > looking_at)
465 return NULL;
466 #endif
467 for (i = previous + 1; i < looking_at; i++)
468 {
469 /* previous has already been set and
470 * looking_at is set below.
471 */
472 sel->u.array[i] = !neg;
473 }
474 exit_if_null;
475 }
476 else
477 {
478 sel->u.array['-'] = !neg;
479 break;
480 }
481 }
482 sel->u.array[looking_at] = !neg;
483 previous = looking_at;
484 exit_if_null;
485 looking_at = *regexp++;
486 }
487 }
488 }
489 break;
490 case '\\':
491 if (quoted)
492 {
493 quoted = false;
494 sel->type = sel_single;
495 sel->u.single = looking_at;
496 }
497 else
498 {
499 quoted = true;
500 }
501 break;
502 default:
503 quoted = false;
504 sel->type = sel_single;
505 sel->u.single = looking_at;
506 break;
507 }
508 }
509 while (quoted);
510
511 if (*regexp == '*')
512 {
513 sel->repeat = rep_null_or_more;
514 ++regexp;
515 }
516 else if (*regexp == '?')
517 {
518 sel->repeat = rep_null_or_once;
519 ++regexp;
520 }
521 else if (*regexp == '+')
522 {
523 sel->repeat = rep_once_or_more;
524 ++regexp;
525 }
526 else
527 {
528 sel->repeat = rep_once;
529 }
530
531 return regexp;
532 }
533
534 /* D e b u g c o d e
535 */
536 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
537 static void
538 re_dump_sel (selection * sel)
539 {
540 switch (sel->type)
541 {
542 case sel_any:
543 printf (".");
544 break;
545 case sel_end:
546 printf ("$");
547 break;
548 case sel_single:
549 printf ("<%c>", sel->u.single);
550 break;
551 case sel_range:
552 printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
553 break;
554 case sel_array:
555 {
556 int i;
557
558 printf ("[");
559 for (i = 0; i < uchar_MAX; i++)
560 {
561 if (sel->u.array[i])
562 {
563 printf ("%c", i);
564 }
565 }
566 printf ("]");
567 }
568 break;
569 case sel_not_single:
570 printf ("[^%c]", sel->u.single);
571 break;
572 case sel_not_range:
573 printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
574 break;
575 default:
576 printf ("<UNKNOWN TOKEN!>");
577 break;
578 }
579 switch (sel->repeat)
580 {
581 case rep_once:
582 break;
583 case rep_null_or_once:
584 printf ("?");
585 break;
586 case rep_null_or_more:
587 printf ("*");
588 break;
589 case rep_once_or_more:
590 printf ("+");
591 break;
592 default:
593 printf ("<UNKNOWN REP-TOKEN!>");
594 break;
595 }
596 }
597
598 int
599 main (int argc, char *argv[])
600 {
601 char *re, *m;
602 selection sel;
603
604 re = re_get_token (&sel, argv[1]);
605
606 printf ("'%s' -> '%s'\n", argv[1], re);
607 re_dump_sel (&sel);
608 printf ("\n");
609 m = re_cmp (argv[2], argv[1]);
610 if (m)
611 printf ("MATCH! -> '%s'\n", m);
612 return 0;
613 }
614 #endif