ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/deliantra/server/common/re-cmp.C
Revision: 1.4
Committed: Sun Sep 10 16:00:23 2006 UTC (17 years, 8 months ago) by root
Content type: text/plain
Branch: MAIN
Changes since 1.3: +445 -368 lines
Log Message:
indent

File Contents

# User Rev Content
1 root 1.4
2 elmex 1.1 /*
3     * static char *rcsid_player_c =
4 root 1.4 * "$Id: re-cmp.C,v 1.3 2006-09-08 04:51:08 pippijn Exp $";
5 elmex 1.1 */
6    
7    
8     /* re-cmp.c
9     * Pattern match a string, parsing some of the common RE-metacharacters.
10     *
11     * This code is public domain, but I would appreciate to hear of
12     * improvements or even the fact that you use it in your program.
13     *
14     * Deliberate BUGS:
15     * - These tokens are not supported: | ( )
16     * - You will get the longest expansion of the _first_ string which
17     * matches the RE, not the longest string which would be the proper
18     * behaviour for a RE-matcher.
19     *
20     * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
21     */
22    
23     #include <stdio.h>
24     #include <stdlib.h>
25     #include <memory.h>
26     #include <limits.h>
27     #include <re-cmp.h>
28     #include <ctype.h>
29    
30     /* Get prototype functions to prevent warnings. */
31     #if defined (__sun__) && defined(StupidSunHeaders)
32 root 1.4 # include <sys/types.h>
33     # include <sys/time.h>
34     # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
35 elmex 1.1 #endif
36    
37    
38     /* P r o t o t y p e s
39     */
40 root 1.4 const char *re_cmp (const char *, const char *);
41     static bool re_cmp_step (const char *, const char *, unsigned, int);
42     static void re_init (void);
43     static bool re_match_token (unsigned char, selection *);
44     static const char *re_get_token (selection *, const char *);
45    
46 elmex 1.1 #ifdef DEBUG2
47 root 1.4 static void re_dump_sel (selection *);
48 elmex 1.1 #endif
49    
50     /* G l o b a l v a r i a b l e s
51     */
52 root 1.4 static bool re_init_done = false;
53     static selection *re_token[RE_TOKEN_MAX];
54     static const char *re_substr[RE_TOKEN_MAX];
55     static unsigned int re_token_depth;
56 elmex 1.1
57     /* E x t e r n a l f u n c t i o n
58     */
59    
60     /* re-cmp - get regular expression match.
61     * Return values: NULL - no match or error in regexp.
62     * pointer to beginning of matching string
63     */
64     const char *
65 root 1.4 re_cmp (const char *str, const char *regexp)
66     {
67     const char *next_regexp;
68     bool once = false;
69     bool matched;
70 elmex 1.1
71 root 1.4 if (re_init_done == false)
72     re_init ();
73 elmex 1.1
74     #ifdef SAFE_CHECKS
75 root 1.4 if (regexp == NULL || str == NULL)
76     return NULL;
77 elmex 1.1 #endif
78 root 1.4 if (*regexp == '^')
79     {
80     once = true;
81     ++regexp;
82     }
83     if (*regexp == 0)
84     {
85     /* // or /^/ matches any string */
86     return str;
87     }
88    
89     next_regexp = re_get_token (re_token[0], regexp);
90     re_token_depth = 0;
91     re_substr[0] = next_regexp;
92    
93     matched = false;
94     while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
95     str++;
96    
97     if (matched && *next_regexp == 0)
98     return str;
99    
100     /* Apologies for the nearly duplicated code below, hopefully it
101     * speeds things up.
102     */
103     if (once)
104     {
105     switch (re_token[0]->repeat)
106     {
107 root 1.2 case rep_once:
108 root 1.4 if (matched == false)
109     return NULL;
110     break;
111 root 1.2 case rep_once_or_more:
112 root 1.4 if (matched == false)
113     return NULL;
114 root 1.2
115 root 1.4 if (re_cmp_step (str + 1, regexp, 0, 1))
116     return str;
117     break;
118 root 1.2 case rep_null_or_once:
119 root 1.4 if (matched == false)
120     return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
121     break;
122 root 1.2 case rep_null_or_more:
123 root 1.4 if (matched)
124     {
125     if (re_cmp_step (str + 1, regexp, 0, 1))
126     return str;
127     }
128     else
129     {
130     return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
131 root 1.2 }
132 root 1.4 break;
133 root 1.2 }
134 root 1.4 return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
135 elmex 1.1 }
136    
137 root 1.4 if (matched)
138     {
139     switch (re_token[0]->repeat)
140     {
141 root 1.2 case rep_once:
142     case rep_null_or_once:
143 root 1.4 break;
144 root 1.2 case rep_once_or_more:
145     case rep_null_or_more:
146 root 1.4 if (re_cmp_step (str + 1, regexp, 0, 1))
147     return str;
148     break;
149 root 1.2 }
150 root 1.4 /* The logic here is that re_match_token only sees
151     * if the one letter matches. Thus, if the
152     * regex is like '@match eureca', and the
153     * the user enters anything with an e, re_match_token
154     * returns true, but they really need to match the
155     * entire regexp, which re_cmp_step will do.
156     * However, what happens is that there can be a case
157     * where the string being match is something like
158     * 'where is eureca'. In this case, the re_match_token
159     * matches that first e, but the re_cmp_step below,
160     * fails because the next character (r) doesn't match
161     * the u. So we call re_cmp with the string
162     * after the first r, so that it should hopefully match
163     * up properly.
164     */
165     if (re_cmp_step (str + 1, next_regexp, 1, 0))
166     return str;
167     else if (*(str + 1) != 0)
168     return re_cmp (str + 1, regexp);
169 elmex 1.1 }
170 root 1.4 return NULL;
171 elmex 1.1 }
172    
173     /* A u x i l l i a r y f u n c t i o n s
174     */
175    
176 pippijn 1.3 static bool
177 root 1.4 re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
178     {
179     /* str - string to match
180     * regexp - pattern
181     * slot - number of the token which under consideration
182     * matches - how many times the token has matched
183     */
184     const char *next_regexp;
185     bool matched;
186 elmex 1.1
187     #ifdef DEBUG
188 root 1.4
189 elmex 1.1 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
190     #endif
191    
192 root 1.4 if (*regexp == 0)
193     {
194     /* When we reach the end of the regexp, the match is a success
195     */
196     return true;
197     }
198    
199     /* This chunk of code makes sure that the regexp-tokenising happens
200     * only once. We only tokenise as much as we need.
201     */
202     if (slot > re_token_depth)
203     {
204     re_token_depth = slot;
205     if (re_token[slot] == NULL)
206     re_token[slot] = (selection *) malloc (sizeof (selection));
207     next_regexp = re_get_token (re_token[slot], regexp);
208     if (next_regexp == NULL)
209     {
210     /* Syntax error, what else can we do? */
211     return false;
212 root 1.2 }
213 root 1.4 re_substr[slot] = next_regexp;
214     }
215     else
216     {
217     next_regexp = re_substr[slot];
218 elmex 1.1 }
219    
220 root 1.4 matched = re_match_token (*str, re_token[slot]);
221     if (matched)
222     ++matches;
223 elmex 1.1
224 root 1.4 if (*str == 0)
225     return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
226 elmex 1.1
227 root 1.4 switch (re_token[slot]->repeat)
228     {
229 root 1.2 case rep_once:
230 root 1.4 if (matches == 1)
231     { /* (matches == 1) => (matched == true) */
232     return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
233 root 1.2 }
234 root 1.4 return false;
235 root 1.2 case rep_once_or_more:
236 root 1.4 if (matched)
237     { /* (matched == true) => (matches >= 1) */
238     /* First check if the current token repeats more */
239     if (re_cmp_step (str + 1, regexp, slot, matches))
240     return true;
241     return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
242 root 1.2 }
243 root 1.4 return false;
244 root 1.2 case rep_null_or_once:
245 root 1.4 /* We must go on to the next token, but should we advance str? */
246     if (matches == 0)
247     {
248     return re_cmp_step (str, next_regexp, slot + 1, 0);
249     }
250     else if (matches == 1)
251     {
252     return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
253 root 1.2 }
254 root 1.4 return false; /* Not reached */
255 root 1.2 case rep_null_or_more:
256 root 1.4 if (matched)
257     {
258     /* Look for further repeats, advance str */
259     if (re_cmp_step (str + 1, regexp, slot, matches))
260     return true;
261     return re_cmp_step (str, next_regexp, slot + 1, 0);
262 root 1.2 }
263 root 1.4 return re_cmp_step (str, next_regexp, slot + 1, 0);
264 elmex 1.1 }
265 root 1.4 return false;
266 elmex 1.1 }
267    
268     static void
269 root 1.4 re_init (void)
270     {
271     int i;
272    
273     re_token[0] = (selection *) malloc (sizeof (selection));
274     for (i = 1; i < RE_TOKEN_MAX; i++)
275     re_token[i] = NULL;
276 elmex 1.1
277 root 1.4 re_init_done = true;
278 elmex 1.1 }
279    
280 pippijn 1.3 static bool
281 root 1.4 re_match_token (unsigned char c, selection * sel)
282     {
283     switch (sel->type)
284     {
285 root 1.2 case sel_any:
286 root 1.4 return true;
287 root 1.2 case sel_end:
288 root 1.4 return (c == 0);
289 root 1.2 case sel_single:
290 root 1.4 return (tolower (c) == tolower (sel->u.single));
291 root 1.2 case sel_range:
292 root 1.4 return (c >= sel->u.range.low && c <= sel->u.range.high);
293 root 1.2 case sel_array:
294 root 1.4 return (sel->u.array[c]);
295 root 1.2 case sel_not_single:
296 root 1.4 return (tolower (c) != tolower (sel->u.single));
297 root 1.2 case sel_not_range:
298 root 1.4 return (c < sel->u.range.low && c > sel->u.range.high);
299 elmex 1.1 }
300 root 1.4 return false;
301 elmex 1.1 }
302    
303     /* re_get_token - get regular expression token
304     * Returns the first token found in <regexp> in <sel>
305     * Return values: NULL syntax error
306     * pointer to first character past token.
307     */
308     static const char *
309 root 1.4 re_get_token (selection * sel, const char *regexp)
310     {
311 elmex 1.1
312     #ifdef SAFE_CHECKS
313 root 1.4 # define exit_if_null if (*regexp == 0) return NULL
314 elmex 1.1 #else
315 root 1.4 # define exit_if_null
316 elmex 1.1 #endif
317    
318 root 1.4 bool quoted = false;
319     unsigned char looking_at;
320 elmex 1.1
321     #ifdef SAFE_CHECKS
322 root 1.4 if (sel == NULL || regexp == NULL || *regexp == 0)
323     return NULL;
324 elmex 1.1 #endif
325    
326 root 1.4 do
327     {
328     looking_at = *regexp++;
329     switch (looking_at)
330     {
331 root 1.2 case '$':
332 root 1.4 if (quoted)
333     {
334     quoted = false;
335     sel->type = sel_single;
336     sel->u.single = looking_at;
337     }
338     else
339     {
340     sel->type = sel_end;
341 root 1.2 }
342 root 1.4 break;
343 root 1.2 case '.':
344 root 1.4 if (quoted)
345     {
346     quoted = false;
347     sel->type = sel_single;
348     sel->u.single = looking_at;
349 root 1.2 }
350 root 1.4 else
351     {
352     sel->type = sel_any;
353     }
354     break;
355 root 1.2 case '[':
356 root 1.4 /* The fun stuff... perhaps a little obfuscated since I
357     * don't trust the compiler to analyse liveness.
358     */
359     if (quoted)
360     {
361     quoted = false;
362     sel->type = sel_single;
363     sel->u.single = looking_at;
364     }
365     else
366     {
367     bool neg = false;
368     unsigned char first, last = 0;
369 root 1.2
370 root 1.4 exit_if_null;
371     looking_at = *regexp++;
372 root 1.2
373 root 1.4 if (looking_at == '^')
374     {
375     neg = true;
376     exit_if_null;
377     looking_at = *regexp++;
378     }
379     first = looking_at;
380     exit_if_null;
381     looking_at = *regexp++;
382     if (looking_at == ']')
383     {
384     /* On the form [q] or [^q] */
385     sel->type = neg ? sel_not_single : sel_single;
386     sel->u.single = first;
387     break;
388 root 1.2 }
389 root 1.4 else if (looking_at == '-')
390     {
391     exit_if_null;
392     last = *regexp++;
393     if (last == ']')
394     {
395     /* On the form [A-] or [^A-]. Checking for
396     * [,-] and making it a range is probably not
397     * worth it :-)
398     */
399     sel->type = sel_array;
400     memset (sel->u.array, neg, sizeof (sel->u.array));
401     sel->u.array[first] = sel->u.array['-'] = !neg;
402     break;
403     }
404     else
405     {
406     exit_if_null;
407     looking_at = *regexp++;
408     if (looking_at == ']')
409     {
410     /* On the form [A-G] or [^A-G]. Note that [G-A]
411     * is a syntax error. Fair enough, I think.
412     */
413 elmex 1.1 #ifdef SAFE_CHECK
414 root 1.4 if (first > last)
415     return NULL;
416 elmex 1.1 #endif
417 root 1.4 sel->type = neg ? sel_not_range : sel_range;
418     sel->u.range.low = first;
419     sel->u.range.high = last;
420     break;
421 root 1.2 }
422     }
423     }
424 root 1.4 {
425     /* The datastructure can only represent a RE this
426     * complex with an array.
427     */
428     int i;
429     unsigned char previous;
430    
431     sel->type = sel_array;
432     memset (sel->u.array, neg, sizeof (sel->u.array));
433     if (last)
434     {
435     /* It starts with a range */
436 elmex 1.1 #ifdef SAFE_CHECK
437 root 1.4 if (first > last)
438     return NULL;
439 elmex 1.1 #endif
440 root 1.4 for (i = first; i <= last; i++)
441     {
442     sel->u.array[i] = !neg;
443     }
444     }
445     else
446     {
447     /* It begins with a "random" character */
448     sel->u.array[first] = !neg;
449     }
450     sel->u.array[looking_at] = !neg;
451 root 1.2
452 root 1.4 exit_if_null;
453     previous = looking_at;
454     looking_at = *regexp++;
455 root 1.2
456 root 1.4 /* Add more characters to the array until we reach
457     * ]. Quoting doesn't and shouldn't work in here.
458     * ("]" should be put first, and "-" last if they
459     * are needed inside this construct.)
460     * Look for ranges as we go along.
461     */
462     while (looking_at != ']')
463     {
464     if (looking_at == '-')
465     {
466     exit_if_null;
467     looking_at = *regexp++;
468     if (looking_at != ']')
469     {
470 elmex 1.1 #ifdef SAFE_CHECK
471 root 1.4 if (previous > looking_at)
472     return NULL;
473 elmex 1.1 #endif
474 root 1.4 for (i = previous + 1; i < looking_at; i++)
475     {
476     /* previous has already been set and
477     * looking_at is set below.
478     */
479     sel->u.array[i] = !neg;
480     }
481     exit_if_null;
482     }
483     else
484     {
485     sel->u.array['-'] = !neg;
486     break;
487     }
488     }
489     sel->u.array[looking_at] = !neg;
490     previous = looking_at;
491     exit_if_null;
492     looking_at = *regexp++;
493     }
494     }
495 root 1.2 }
496 root 1.4 break;
497 root 1.2 case '\\':
498 root 1.4 if (quoted)
499     {
500     quoted = false;
501     sel->type = sel_single;
502     sel->u.single = looking_at;
503     }
504     else
505     {
506     quoted = true;
507 root 1.2 }
508 root 1.4 break;
509 root 1.2 default:
510 root 1.4 quoted = false;
511     sel->type = sel_single;
512     sel->u.single = looking_at;
513     break;
514 root 1.2 }
515 root 1.4 }
516     while (quoted);
517 elmex 1.1
518 root 1.4 if (*regexp == '*')
519     {
520     sel->repeat = rep_null_or_more;
521     ++regexp;
522     }
523     else if (*regexp == '?')
524     {
525     sel->repeat = rep_null_or_once;
526     ++regexp;
527     }
528     else if (*regexp == '+')
529     {
530     sel->repeat = rep_once_or_more;
531     ++regexp;
532     }
533     else
534     {
535     sel->repeat = rep_once;
536 elmex 1.1 }
537    
538 root 1.4 return regexp;
539 elmex 1.1 }
540    
541     /* D e b u g c o d e
542     */
543 root 1.4 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
544 elmex 1.1 static void
545 root 1.4 re_dump_sel (selection * sel)
546     {
547     switch (sel->type)
548     {
549 root 1.2 case sel_any:
550 root 1.4 printf (".");
551     break;
552 root 1.2 case sel_end:
553 root 1.4 printf ("$");
554     break;
555 root 1.2 case sel_single:
556 root 1.4 printf ("<%c>", sel->u.single);
557     break;
558 root 1.2 case sel_range:
559 root 1.4 printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
560     break;
561 root 1.2 case sel_array:
562 root 1.4 {
563     int i;
564    
565     printf ("[");
566     for (i = 0; i < uchar_MAX; i++)
567     {
568     if (sel->u.array[i])
569     {
570     printf ("%c", i);
571     }
572     }
573     printf ("]");
574     }
575     break;
576 root 1.2 case sel_not_single:
577 root 1.4 printf ("[^%c]", sel->u.single);
578     break;
579 root 1.2 case sel_not_range:
580 root 1.4 printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
581     break;
582 root 1.2 default:
583 root 1.4 printf ("<UNKNOWN TOKEN!>");
584     break;
585 elmex 1.1 }
586 root 1.4 switch (sel->repeat)
587     {
588 root 1.2 case rep_once:
589 root 1.4 break;
590 root 1.2 case rep_null_or_once:
591 root 1.4 printf ("?");
592     break;
593 root 1.2 case rep_null_or_more:
594 root 1.4 printf ("*");
595     break;
596 root 1.2 case rep_once_or_more:
597 root 1.4 printf ("+");
598     break;
599 root 1.2 default:
600 root 1.4 printf ("<UNKNOWN REP-TOKEN!>");
601     break;
602 elmex 1.1 }
603     }
604    
605     int
606 root 1.4 main (int argc, char *argv[])
607     {
608     char *re, *m;
609     selection sel;
610    
611     re = re_get_token (&sel, argv[1]);
612    
613     printf ("'%s' -> '%s'\n", argv[1], re);
614     re_dump_sel (&sel);
615     printf ("\n");
616     m = re_cmp (argv[2], argv[1]);
617     if (m)
618     printf ("MATCH! -> '%s'\n", m);
619     return 0;
620 elmex 1.1 }
621     #endif