ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/deliantra/server/common/re-cmp.C
Revision: 1.6
Committed: Sat Dec 9 17:28:37 2006 UTC (17 years, 5 months ago) by pippijn
Content type: text/plain
Branch: MAIN
CVS Tags: HEAD
Changes since 1.5: +0 -0 lines
State: FILE REMOVED
Log Message:
removed regex comparison. this is now done with perl

File Contents

# User Rev Content
1 elmex 1.1 /* re-cmp.c
2     * Pattern match a string, parsing some of the common RE-metacharacters.
3     *
4     * This code is public domain, but I would appreciate to hear of
5     * improvements or even the fact that you use it in your program.
6     *
7     * Deliberate BUGS:
8     * - These tokens are not supported: | ( )
9     * - You will get the longest expansion of the _first_ string which
10     * matches the RE, not the longest string which would be the proper
11     * behaviour for a RE-matcher.
12     *
13     * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
14     */
15    
16     #include <stdio.h>
17     #include <stdlib.h>
18     #include <memory.h>
19     #include <limits.h>
20     #include <re-cmp.h>
21     #include <ctype.h>
22    
23     /* Get prototype functions to prevent warnings. */
24     #if defined (__sun__) && defined(StupidSunHeaders)
25 root 1.4 # include <sys/types.h>
26     # include <sys/time.h>
27     # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
28 elmex 1.1 #endif
29    
30    
31     /* P r o t o t y p e s
32     */
33 root 1.4 const char *re_cmp (const char *, const char *);
34     static bool re_cmp_step (const char *, const char *, unsigned, int);
35     static void re_init (void);
36     static bool re_match_token (unsigned char, selection *);
37     static const char *re_get_token (selection *, const char *);
38    
39 elmex 1.1 #ifdef DEBUG2
40 root 1.4 static void re_dump_sel (selection *);
41 elmex 1.1 #endif
42    
43     /* G l o b a l v a r i a b l e s
44     */
45 root 1.4 static bool re_init_done = false;
46     static selection *re_token[RE_TOKEN_MAX];
47     static const char *re_substr[RE_TOKEN_MAX];
48     static unsigned int re_token_depth;
49 elmex 1.1
50     /* E x t e r n a l f u n c t i o n
51     */
52    
53     /* re-cmp - get regular expression match.
54     * Return values: NULL - no match or error in regexp.
55     * pointer to beginning of matching string
56     */
57     const char *
58 root 1.4 re_cmp (const char *str, const char *regexp)
59     {
60     const char *next_regexp;
61     bool once = false;
62     bool matched;
63 elmex 1.1
64 root 1.4 if (re_init_done == false)
65     re_init ();
66 elmex 1.1
67     #ifdef SAFE_CHECKS
68 root 1.4 if (regexp == NULL || str == NULL)
69     return NULL;
70 elmex 1.1 #endif
71 root 1.4 if (*regexp == '^')
72     {
73     once = true;
74     ++regexp;
75     }
76     if (*regexp == 0)
77     {
78     /* // or /^/ matches any string */
79     return str;
80     }
81    
82     next_regexp = re_get_token (re_token[0], regexp);
83     re_token_depth = 0;
84     re_substr[0] = next_regexp;
85    
86     matched = false;
87     while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
88     str++;
89    
90     if (matched && *next_regexp == 0)
91     return str;
92    
93     /* Apologies for the nearly duplicated code below, hopefully it
94     * speeds things up.
95     */
96     if (once)
97     {
98     switch (re_token[0]->repeat)
99     {
100 root 1.2 case rep_once:
101 root 1.4 if (matched == false)
102     return NULL;
103     break;
104 root 1.2 case rep_once_or_more:
105 root 1.4 if (matched == false)
106     return NULL;
107 root 1.2
108 root 1.4 if (re_cmp_step (str + 1, regexp, 0, 1))
109     return str;
110     break;
111 root 1.2 case rep_null_or_once:
112 root 1.4 if (matched == false)
113     return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
114     break;
115 root 1.2 case rep_null_or_more:
116 root 1.4 if (matched)
117     {
118     if (re_cmp_step (str + 1, regexp, 0, 1))
119     return str;
120     }
121     else
122     {
123     return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
124 root 1.2 }
125 root 1.4 break;
126 root 1.2 }
127 root 1.4 return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
128 elmex 1.1 }
129    
130 root 1.4 if (matched)
131     {
132     switch (re_token[0]->repeat)
133     {
134 root 1.2 case rep_once:
135     case rep_null_or_once:
136 root 1.4 break;
137 root 1.2 case rep_once_or_more:
138     case rep_null_or_more:
139 root 1.4 if (re_cmp_step (str + 1, regexp, 0, 1))
140     return str;
141     break;
142 root 1.2 }
143 root 1.4 /* The logic here is that re_match_token only sees
144     * if the one letter matches. Thus, if the
145     * regex is like '@match eureca', and the
146     * the user enters anything with an e, re_match_token
147     * returns true, but they really need to match the
148     * entire regexp, which re_cmp_step will do.
149     * However, what happens is that there can be a case
150     * where the string being match is something like
151     * 'where is eureca'. In this case, the re_match_token
152     * matches that first e, but the re_cmp_step below,
153     * fails because the next character (r) doesn't match
154     * the u. So we call re_cmp with the string
155     * after the first r, so that it should hopefully match
156     * up properly.
157     */
158     if (re_cmp_step (str + 1, next_regexp, 1, 0))
159     return str;
160     else if (*(str + 1) != 0)
161     return re_cmp (str + 1, regexp);
162 elmex 1.1 }
163 root 1.4 return NULL;
164 elmex 1.1 }
165    
166     /* A u x i l l i a r y f u n c t i o n s
167     */
168    
169 pippijn 1.3 static bool
170 root 1.4 re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
171     {
172     /* str - string to match
173     * regexp - pattern
174     * slot - number of the token which under consideration
175     * matches - how many times the token has matched
176     */
177     const char *next_regexp;
178     bool matched;
179 elmex 1.1
180     #ifdef DEBUG
181 root 1.4
182 elmex 1.1 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
183     #endif
184    
185 root 1.4 if (*regexp == 0)
186     {
187     /* When we reach the end of the regexp, the match is a success
188     */
189     return true;
190     }
191    
192     /* This chunk of code makes sure that the regexp-tokenising happens
193     * only once. We only tokenise as much as we need.
194     */
195     if (slot > re_token_depth)
196     {
197     re_token_depth = slot;
198     if (re_token[slot] == NULL)
199     re_token[slot] = (selection *) malloc (sizeof (selection));
200     next_regexp = re_get_token (re_token[slot], regexp);
201     if (next_regexp == NULL)
202     {
203     /* Syntax error, what else can we do? */
204     return false;
205 root 1.2 }
206 root 1.4 re_substr[slot] = next_regexp;
207     }
208     else
209     {
210     next_regexp = re_substr[slot];
211 elmex 1.1 }
212    
213 root 1.4 matched = re_match_token (*str, re_token[slot]);
214     if (matched)
215     ++matches;
216 elmex 1.1
217 root 1.4 if (*str == 0)
218     return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
219 elmex 1.1
220 root 1.4 switch (re_token[slot]->repeat)
221     {
222 root 1.2 case rep_once:
223 root 1.4 if (matches == 1)
224     { /* (matches == 1) => (matched == true) */
225     return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
226 root 1.2 }
227 root 1.4 return false;
228 root 1.2 case rep_once_or_more:
229 root 1.4 if (matched)
230     { /* (matched == true) => (matches >= 1) */
231     /* First check if the current token repeats more */
232     if (re_cmp_step (str + 1, regexp, slot, matches))
233     return true;
234     return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
235 root 1.2 }
236 root 1.4 return false;
237 root 1.2 case rep_null_or_once:
238 root 1.4 /* We must go on to the next token, but should we advance str? */
239     if (matches == 0)
240     {
241     return re_cmp_step (str, next_regexp, slot + 1, 0);
242     }
243     else if (matches == 1)
244     {
245     return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
246 root 1.2 }
247 root 1.4 return false; /* Not reached */
248 root 1.2 case rep_null_or_more:
249 root 1.4 if (matched)
250     {
251     /* Look for further repeats, advance str */
252     if (re_cmp_step (str + 1, regexp, slot, matches))
253     return true;
254     return re_cmp_step (str, next_regexp, slot + 1, 0);
255 root 1.2 }
256 root 1.4 return re_cmp_step (str, next_regexp, slot + 1, 0);
257 elmex 1.1 }
258 root 1.4 return false;
259 elmex 1.1 }
260    
261     static void
262 root 1.4 re_init (void)
263     {
264     int i;
265    
266     re_token[0] = (selection *) malloc (sizeof (selection));
267     for (i = 1; i < RE_TOKEN_MAX; i++)
268     re_token[i] = NULL;
269 elmex 1.1
270 root 1.4 re_init_done = true;
271 elmex 1.1 }
272    
273 pippijn 1.3 static bool
274 root 1.4 re_match_token (unsigned char c, selection * sel)
275     {
276     switch (sel->type)
277     {
278 root 1.2 case sel_any:
279 root 1.4 return true;
280 root 1.2 case sel_end:
281 root 1.4 return (c == 0);
282 root 1.2 case sel_single:
283 root 1.4 return (tolower (c) == tolower (sel->u.single));
284 root 1.2 case sel_range:
285 root 1.4 return (c >= sel->u.range.low && c <= sel->u.range.high);
286 root 1.2 case sel_array:
287 root 1.4 return (sel->u.array[c]);
288 root 1.2 case sel_not_single:
289 root 1.4 return (tolower (c) != tolower (sel->u.single));
290 root 1.2 case sel_not_range:
291 root 1.4 return (c < sel->u.range.low && c > sel->u.range.high);
292 elmex 1.1 }
293 root 1.4 return false;
294 elmex 1.1 }
295    
296     /* re_get_token - get regular expression token
297     * Returns the first token found in <regexp> in <sel>
298     * Return values: NULL syntax error
299     * pointer to first character past token.
300     */
301     static const char *
302 root 1.4 re_get_token (selection * sel, const char *regexp)
303     {
304 elmex 1.1
305     #ifdef SAFE_CHECKS
306 root 1.4 # define exit_if_null if (*regexp == 0) return NULL
307 elmex 1.1 #else
308 root 1.4 # define exit_if_null
309 elmex 1.1 #endif
310    
311 root 1.4 bool quoted = false;
312     unsigned char looking_at;
313 elmex 1.1
314     #ifdef SAFE_CHECKS
315 root 1.4 if (sel == NULL || regexp == NULL || *regexp == 0)
316     return NULL;
317 elmex 1.1 #endif
318    
319 root 1.4 do
320     {
321     looking_at = *regexp++;
322     switch (looking_at)
323     {
324 root 1.2 case '$':
325 root 1.4 if (quoted)
326     {
327     quoted = false;
328     sel->type = sel_single;
329     sel->u.single = looking_at;
330     }
331     else
332     {
333     sel->type = sel_end;
334 root 1.2 }
335 root 1.4 break;
336 root 1.2 case '.':
337 root 1.4 if (quoted)
338     {
339     quoted = false;
340     sel->type = sel_single;
341     sel->u.single = looking_at;
342 root 1.2 }
343 root 1.4 else
344     {
345     sel->type = sel_any;
346     }
347     break;
348 root 1.2 case '[':
349 root 1.4 /* The fun stuff... perhaps a little obfuscated since I
350     * don't trust the compiler to analyse liveness.
351     */
352     if (quoted)
353     {
354     quoted = false;
355     sel->type = sel_single;
356     sel->u.single = looking_at;
357     }
358     else
359     {
360     bool neg = false;
361     unsigned char first, last = 0;
362 root 1.2
363 root 1.4 exit_if_null;
364     looking_at = *regexp++;
365 root 1.2
366 root 1.4 if (looking_at == '^')
367     {
368     neg = true;
369     exit_if_null;
370     looking_at = *regexp++;
371     }
372     first = looking_at;
373     exit_if_null;
374     looking_at = *regexp++;
375     if (looking_at == ']')
376     {
377     /* On the form [q] or [^q] */
378     sel->type = neg ? sel_not_single : sel_single;
379     sel->u.single = first;
380     break;
381 root 1.2 }
382 root 1.4 else if (looking_at == '-')
383     {
384     exit_if_null;
385     last = *regexp++;
386     if (last == ']')
387     {
388     /* On the form [A-] or [^A-]. Checking for
389     * [,-] and making it a range is probably not
390     * worth it :-)
391     */
392     sel->type = sel_array;
393     memset (sel->u.array, neg, sizeof (sel->u.array));
394     sel->u.array[first] = sel->u.array['-'] = !neg;
395     break;
396     }
397     else
398     {
399     exit_if_null;
400     looking_at = *regexp++;
401     if (looking_at == ']')
402     {
403     /* On the form [A-G] or [^A-G]. Note that [G-A]
404     * is a syntax error. Fair enough, I think.
405     */
406 elmex 1.1 #ifdef SAFE_CHECK
407 root 1.4 if (first > last)
408     return NULL;
409 elmex 1.1 #endif
410 root 1.4 sel->type = neg ? sel_not_range : sel_range;
411     sel->u.range.low = first;
412     sel->u.range.high = last;
413     break;
414 root 1.2 }
415     }
416     }
417 root 1.4 {
418     /* The datastructure can only represent a RE this
419     * complex with an array.
420     */
421     int i;
422     unsigned char previous;
423    
424     sel->type = sel_array;
425     memset (sel->u.array, neg, sizeof (sel->u.array));
426     if (last)
427     {
428     /* It starts with a range */
429 elmex 1.1 #ifdef SAFE_CHECK
430 root 1.4 if (first > last)
431     return NULL;
432 elmex 1.1 #endif
433 root 1.4 for (i = first; i <= last; i++)
434     {
435     sel->u.array[i] = !neg;
436     }
437     }
438     else
439     {
440     /* It begins with a "random" character */
441     sel->u.array[first] = !neg;
442     }
443     sel->u.array[looking_at] = !neg;
444 root 1.2
445 root 1.4 exit_if_null;
446     previous = looking_at;
447     looking_at = *regexp++;
448 root 1.2
449 root 1.4 /* Add more characters to the array until we reach
450     * ]. Quoting doesn't and shouldn't work in here.
451     * ("]" should be put first, and "-" last if they
452     * are needed inside this construct.)
453     * Look for ranges as we go along.
454     */
455     while (looking_at != ']')
456     {
457     if (looking_at == '-')
458     {
459     exit_if_null;
460     looking_at = *regexp++;
461     if (looking_at != ']')
462     {
463 elmex 1.1 #ifdef SAFE_CHECK
464 root 1.4 if (previous > looking_at)
465     return NULL;
466 elmex 1.1 #endif
467 root 1.4 for (i = previous + 1; i < looking_at; i++)
468     {
469     /* previous has already been set and
470     * looking_at is set below.
471     */
472     sel->u.array[i] = !neg;
473     }
474     exit_if_null;
475     }
476     else
477     {
478     sel->u.array['-'] = !neg;
479     break;
480     }
481     }
482     sel->u.array[looking_at] = !neg;
483     previous = looking_at;
484     exit_if_null;
485     looking_at = *regexp++;
486     }
487     }
488 root 1.2 }
489 root 1.4 break;
490 root 1.2 case '\\':
491 root 1.4 if (quoted)
492     {
493     quoted = false;
494     sel->type = sel_single;
495     sel->u.single = looking_at;
496     }
497     else
498     {
499     quoted = true;
500 root 1.2 }
501 root 1.4 break;
502 root 1.2 default:
503 root 1.4 quoted = false;
504     sel->type = sel_single;
505     sel->u.single = looking_at;
506     break;
507 root 1.2 }
508 root 1.4 }
509     while (quoted);
510 elmex 1.1
511 root 1.4 if (*regexp == '*')
512     {
513     sel->repeat = rep_null_or_more;
514     ++regexp;
515     }
516     else if (*regexp == '?')
517     {
518     sel->repeat = rep_null_or_once;
519     ++regexp;
520     }
521     else if (*regexp == '+')
522     {
523     sel->repeat = rep_once_or_more;
524     ++regexp;
525     }
526     else
527     {
528     sel->repeat = rep_once;
529 elmex 1.1 }
530    
531 root 1.4 return regexp;
532 elmex 1.1 }
533    
534     /* D e b u g c o d e
535     */
536 root 1.4 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
537 elmex 1.1 static void
538 root 1.4 re_dump_sel (selection * sel)
539     {
540     switch (sel->type)
541     {
542 root 1.2 case sel_any:
543 root 1.4 printf (".");
544     break;
545 root 1.2 case sel_end:
546 root 1.4 printf ("$");
547     break;
548 root 1.2 case sel_single:
549 root 1.4 printf ("<%c>", sel->u.single);
550     break;
551 root 1.2 case sel_range:
552 root 1.4 printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
553     break;
554 root 1.2 case sel_array:
555 root 1.4 {
556     int i;
557    
558     printf ("[");
559     for (i = 0; i < uchar_MAX; i++)
560     {
561     if (sel->u.array[i])
562     {
563     printf ("%c", i);
564     }
565     }
566     printf ("]");
567     }
568     break;
569 root 1.2 case sel_not_single:
570 root 1.4 printf ("[^%c]", sel->u.single);
571     break;
572 root 1.2 case sel_not_range:
573 root 1.4 printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
574     break;
575 root 1.2 default:
576 root 1.4 printf ("<UNKNOWN TOKEN!>");
577     break;
578 elmex 1.1 }
579 root 1.4 switch (sel->repeat)
580     {
581 root 1.2 case rep_once:
582 root 1.4 break;
583 root 1.2 case rep_null_or_once:
584 root 1.4 printf ("?");
585     break;
586 root 1.2 case rep_null_or_more:
587 root 1.4 printf ("*");
588     break;
589 root 1.2 case rep_once_or_more:
590 root 1.4 printf ("+");
591     break;
592 root 1.2 default:
593 root 1.4 printf ("<UNKNOWN REP-TOKEN!>");
594     break;
595 elmex 1.1 }
596     }
597    
598     int
599 root 1.4 main (int argc, char *argv[])
600     {
601     char *re, *m;
602     selection sel;
603    
604     re = re_get_token (&sel, argv[1]);
605    
606     printf ("'%s' -> '%s'\n", argv[1], re);
607     re_dump_sel (&sel);
608     printf ("\n");
609     m = re_cmp (argv[2], argv[1]);
610     if (m)
611     printf ("MATCH! -> '%s'\n", m);
612     return 0;
613 elmex 1.1 }
614     #endif