ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/deliantra/server/common/re-cmp.C
Revision: 1.3
Committed: Fri Sep 8 04:51:08 2006 UTC (17 years, 9 months ago) by pippijn
Content type: text/plain
Branch: MAIN
Changes since 1.2: +42 -42 lines
Log Message:
Converted the custom type Boolean to C++ bool.

File Contents

# User Rev Content
1 elmex 1.1 /*
2     * static char *rcsid_player_c =
3 pippijn 1.3 * "$Id: re-cmp.C,v 1.2 2006-08-29 08:01:35 root Exp $";
4 elmex 1.1 */
5    
6    
7     /* re-cmp.c
8     * Pattern match a string, parsing some of the common RE-metacharacters.
9     *
10     * This code is public domain, but I would appreciate to hear of
11     * improvements or even the fact that you use it in your program.
12     *
13     * Deliberate BUGS:
14     * - These tokens are not supported: | ( )
15     * - You will get the longest expansion of the _first_ string which
16     * matches the RE, not the longest string which would be the proper
17     * behaviour for a RE-matcher.
18     *
19     * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
20     */
21    
22     #include <stdio.h>
23     #include <stdlib.h>
24     #include <memory.h>
25     #include <limits.h>
26     #include <re-cmp.h>
27     #include <ctype.h>
28    
29     /* Get prototype functions to prevent warnings. */
30     #if defined (__sun__) && defined(StupidSunHeaders)
31     # include <sys/types.h>
32     # include <sys/time.h>
33     # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
34     #endif
35    
36    
37     /* P r o t o t y p e s
38     */
39     const char *re_cmp(const char *, const char *);
40 pippijn 1.3 static bool re_cmp_step(const char *, const char *, unsigned, int);
41 elmex 1.1 static void re_init(void);
42 pippijn 1.3 static bool re_match_token(unsigned char, selection *);
43 elmex 1.1 static const char *re_get_token(selection *, const char *);
44     #ifdef DEBUG2
45     static void re_dump_sel(selection *);
46     #endif
47    
48     /* G l o b a l v a r i a b l e s
49     */
50 pippijn 1.3 static bool re_init_done = false;
51 elmex 1.1 static selection *re_token[RE_TOKEN_MAX];
52     static const char *re_substr[RE_TOKEN_MAX];
53     static unsigned int re_token_depth;
54    
55     /* E x t e r n a l f u n c t i o n
56     */
57    
58     /* re-cmp - get regular expression match.
59     * Return values: NULL - no match or error in regexp.
60     * pointer to beginning of matching string
61     */
62     const char *
63     re_cmp(const char *str, const char *regexp) {
64     const char *next_regexp;
65 pippijn 1.3 bool once = false;
66     bool matched;
67 elmex 1.1
68 pippijn 1.3 if (re_init_done == false)
69 root 1.2 re_init();
70 elmex 1.1
71     #ifdef SAFE_CHECKS
72     if (regexp == NULL || str == NULL)
73 root 1.2 return NULL;
74 elmex 1.1 #endif
75     if (*regexp == '^') {
76 pippijn 1.3 once = true;
77 root 1.2 ++regexp;
78 elmex 1.1 }
79     if (*regexp == 0) {
80 root 1.2 /* // or /^/ matches any string */
81     return str;
82 elmex 1.1 }
83    
84     next_regexp = re_get_token(re_token[0], regexp);
85     re_token_depth = 0;
86     re_substr[0] = next_regexp;
87    
88 pippijn 1.3 matched = false;
89 elmex 1.1 while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
90 root 1.2 str++;
91 elmex 1.1
92     if (matched && *next_regexp == 0)
93 root 1.2 return str;
94 elmex 1.1
95     /* Apologies for the nearly duplicated code below, hopefully it
96     * speeds things up.
97     */
98     if (once) {
99 root 1.2 switch (re_token[0]->repeat) {
100     case rep_once:
101 pippijn 1.3 if (matched == false)
102 root 1.2 return NULL;
103     break;
104     case rep_once_or_more:
105 pippijn 1.3 if (matched == false)
106 root 1.2 return NULL;
107    
108     if (re_cmp_step(str+1, regexp, 0, 1))
109     return str;
110     break;
111     case rep_null_or_once:
112 pippijn 1.3 if (matched == false)
113 root 1.2 return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
114     break;
115     case rep_null_or_more:
116     if (matched) {
117     if (re_cmp_step(str+1, regexp, 0, 1))
118     return str;
119     } else {
120     return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
121     }
122     break;
123     }
124     return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
125 elmex 1.1 }
126    
127     if (matched) {
128 root 1.2 switch (re_token[0]->repeat) {
129     case rep_once:
130     case rep_null_or_once:
131     break;
132     case rep_once_or_more:
133     case rep_null_or_more:
134     if (re_cmp_step(str+1, regexp, 0, 1))
135     return str;
136     break;
137     }
138     /* The logic here is that re_match_token only sees
139     * if the one letter matches. Thus, if the
140     * regex is like '@match eureca', and the
141     * the user enters anything with an e, re_match_token
142     * returns true, but they really need to match the
143     * entire regexp, which re_cmp_step will do.
144     * However, what happens is that there can be a case
145     * where the string being match is something like
146     * 'where is eureca'. In this case, the re_match_token
147     * matches that first e, but the re_cmp_step below,
148     * fails because the next character (r) doesn't match
149     * the u. So we call re_cmp with the string
150     * after the first r, so that it should hopefully match
151     * up properly.
152     */
153     if (re_cmp_step(str+1, next_regexp, 1, 0))
154     return str;
155     else if (*(str+1) != 0)
156     return re_cmp(str+1, regexp);
157 elmex 1.1 }
158     return NULL;
159     }
160    
161     /* A u x i l l i a r y f u n c t i o n s
162     */
163    
164 pippijn 1.3 static bool
165 elmex 1.1 re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
166     /* str - string to match
167     * regexp - pattern
168     * slot - number of the token which under consideration
169     * matches - how many times the token has matched
170     */
171     const char *next_regexp;
172 pippijn 1.3 bool matched;
173 elmex 1.1
174     #ifdef DEBUG
175     /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
176     #endif
177    
178     if (*regexp == 0) {
179 root 1.2 /* When we reach the end of the regexp, the match is a success
180     */
181 pippijn 1.3 return true;
182 elmex 1.1 }
183    
184     /* This chunk of code makes sure that the regexp-tokenising happens
185     * only once. We only tokenise as much as we need.
186     */
187     if (slot > re_token_depth) {
188 root 1.2 re_token_depth = slot;
189     if (re_token[slot] == NULL)
190     re_token[slot] = (selection *) malloc(sizeof(selection));
191     next_regexp = re_get_token(re_token[slot], regexp);
192     if (next_regexp == NULL) {
193     /* Syntax error, what else can we do? */
194 pippijn 1.3 return false;
195 root 1.2 }
196     re_substr[slot] = next_regexp;
197 elmex 1.1 } else {
198 root 1.2 next_regexp = re_substr[slot];
199 elmex 1.1 }
200    
201     matched = re_match_token(*str, re_token[slot]);
202     if (matched)
203 root 1.2 ++matches;
204 elmex 1.1
205     if (*str == 0)
206 root 1.2 return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
207 elmex 1.1
208     switch (re_token[slot]->repeat) {
209 root 1.2 case rep_once:
210 pippijn 1.3 if (matches == 1) { /* (matches == 1) => (matched == true) */
211 root 1.2 return re_cmp_step(str+1, next_regexp, slot+1, 0);
212     }
213 pippijn 1.3 return false;
214 root 1.2 case rep_once_or_more:
215 pippijn 1.3 if (matched) { /* (matched == true) => (matches >= 1) */
216 root 1.2 /* First check if the current token repeats more */
217     if (re_cmp_step(str+1, regexp, slot, matches))
218 pippijn 1.3 return true;
219 root 1.2 return re_cmp_step(str+1, next_regexp, slot+1, 0);
220     }
221 pippijn 1.3 return false;
222 root 1.2 case rep_null_or_once:
223     /* We must go on to the next token, but should we advance str? */
224     if (matches == 0) {
225     return re_cmp_step(str, next_regexp, slot+1, 0);
226     } else if (matches == 1) {
227     return re_cmp_step(str+1, next_regexp, slot+1, 0);
228     }
229 pippijn 1.3 return false; /* Not reached */
230 root 1.2 case rep_null_or_more:
231     if (matched) {
232     /* Look for further repeats, advance str */
233     if (re_cmp_step(str+1, regexp, slot, matches))
234 pippijn 1.3 return true;
235 root 1.2 return re_cmp_step(str, next_regexp, slot+1, 0);
236     }
237     return re_cmp_step(str, next_regexp, slot+1, 0);
238 elmex 1.1 }
239 pippijn 1.3 return false;
240 elmex 1.1 }
241    
242     static void
243     re_init(void) {
244     int i;
245    
246     re_token[0] = (selection *) malloc(sizeof(selection));
247     for (i = 1; i < RE_TOKEN_MAX; i++)
248 root 1.2 re_token[i] = NULL;
249 elmex 1.1
250 pippijn 1.3 re_init_done = true;
251 elmex 1.1 }
252    
253 pippijn 1.3 static bool
254     re_match_token(unsigned char c, selection *sel) {
255 elmex 1.1 switch (sel->type) {
256 root 1.2 case sel_any:
257 pippijn 1.3 return true;
258 root 1.2 case sel_end:
259     return (c == 0);
260     case sel_single:
261     return (tolower(c) == tolower(sel->u.single));
262     case sel_range:
263     return (c >= sel->u.range.low && c <= sel->u.range.high);
264     case sel_array:
265     return (sel->u.array[c]);
266     case sel_not_single:
267     return (tolower(c) != tolower(sel->u.single));
268     case sel_not_range:
269     return (c < sel->u.range.low && c > sel->u.range.high);
270 elmex 1.1 }
271 pippijn 1.3 return false;
272 elmex 1.1 }
273    
274     /* re_get_token - get regular expression token
275     * Returns the first token found in <regexp> in <sel>
276     * Return values: NULL syntax error
277     * pointer to first character past token.
278     */
279     static const char *
280     re_get_token(selection *sel, const char *regexp) {
281    
282     #ifdef SAFE_CHECKS
283     # define exit_if_null if (*regexp == 0) return NULL
284     #else
285     # define exit_if_null
286     #endif
287    
288 pippijn 1.3 bool quoted = false;
289     unsigned char looking_at;
290 elmex 1.1
291     #ifdef SAFE_CHECKS
292     if (sel == NULL || regexp == NULL || *regexp == 0)
293 root 1.2 return NULL;
294 elmex 1.1 #endif
295    
296     do {
297 root 1.2 looking_at = *regexp++;
298     switch (looking_at) {
299     case '$':
300     if (quoted) {
301 pippijn 1.3 quoted = false;
302 root 1.2 sel->type = sel_single;
303     sel->u.single = looking_at;
304     } else {
305     sel->type = sel_end;
306     }
307     break;
308     case '.':
309     if (quoted) {
310 pippijn 1.3 quoted = false;
311 root 1.2 sel->type = sel_single;
312     sel->u.single = looking_at;
313     } else {
314     sel->type = sel_any;
315     }
316     break;
317     case '[':
318     /* The fun stuff... perhaps a little obfuscated since I
319     * don't trust the compiler to analyse liveness.
320     */
321     if (quoted) {
322 pippijn 1.3 quoted = false;
323 root 1.2 sel->type = sel_single;
324     sel->u.single = looking_at;
325     } else {
326 pippijn 1.3 bool neg = false;
327     unsigned char first, last = 0;
328 root 1.2
329     exit_if_null;
330     looking_at = *regexp++;
331    
332     if (looking_at == '^') {
333 pippijn 1.3 neg = true;
334 root 1.2 exit_if_null;
335     looking_at = *regexp++;
336     }
337     first = looking_at;
338     exit_if_null;
339     looking_at = *regexp++;
340     if (looking_at == ']') {
341     /* On the form [q] or [^q] */
342     sel->type = neg ? sel_not_single : sel_single;
343     sel->u.single = first;
344     break;
345     } else if (looking_at == '-') {
346     exit_if_null;
347     last = *regexp++;
348     if (last == ']') {
349     /* On the form [A-] or [^A-]. Checking for
350     * [,-] and making it a range is probably not
351     * worth it :-)
352     */
353     sel->type = sel_array;
354     memset(sel->u.array, neg, sizeof(sel->u.array));
355     sel->u.array[first] = sel->u.array['-'] = !neg;
356     break;
357     } else {
358     exit_if_null;
359     looking_at = *regexp++;
360     if (looking_at == ']') {
361     /* On the form [A-G] or [^A-G]. Note that [G-A]
362     * is a syntax error. Fair enough, I think.
363     */
364 elmex 1.1 #ifdef SAFE_CHECK
365 root 1.2 if (first > last)
366     return NULL;
367 elmex 1.1 #endif
368 root 1.2 sel->type = neg ? sel_not_range : sel_range;
369     sel->u.range.low = first;
370     sel->u.range.high = last;
371     break;
372     }
373     }
374     }
375     {
376     /* The datastructure can only represent a RE this
377     * complex with an array.
378     */
379     int i;
380 pippijn 1.3 unsigned char previous;
381 root 1.2
382     sel->type = sel_array;
383     memset(sel->u.array, neg, sizeof(sel->u.array));
384     if (last) {
385     /* It starts with a range */
386 elmex 1.1 #ifdef SAFE_CHECK
387 root 1.2 if (first > last)
388     return NULL;
389 elmex 1.1 #endif
390 root 1.2 for (i = first; i <= last; i++) {
391     sel->u.array[i] = !neg;
392     }
393     } else {
394     /* It begins with a "random" character */
395     sel->u.array[first] = !neg;
396     }
397     sel->u.array[looking_at] = !neg;
398    
399     exit_if_null;
400     previous = looking_at;
401     looking_at = *regexp++;
402    
403     /* Add more characters to the array until we reach
404     * ]. Quoting doesn't and shouldn't work in here.
405     * ("]" should be put first, and "-" last if they
406     * are needed inside this construct.)
407     * Look for ranges as we go along.
408     */
409     while (looking_at != ']') {
410     if (looking_at == '-') {
411     exit_if_null;
412     looking_at = *regexp++;
413     if (looking_at != ']') {
414 elmex 1.1 #ifdef SAFE_CHECK
415 root 1.2 if (previous > looking_at)
416     return NULL;
417 elmex 1.1 #endif
418 root 1.2 for (i = previous+1; i < looking_at; i++) {
419     /* previous has already been set and
420     * looking_at is set below.
421     */
422     sel->u.array[i] = !neg;
423     }
424     exit_if_null;
425     } else {
426     sel->u.array['-'] = !neg;
427     break;
428     }
429     }
430     sel->u.array[looking_at] = !neg;
431     previous = looking_at;
432     exit_if_null;
433     looking_at = *regexp++;
434     }
435     }
436     }
437     break;
438     case '\\':
439     if (quoted) {
440 pippijn 1.3 quoted = false;
441 root 1.2 sel->type = sel_single;
442     sel->u.single = looking_at;
443     } else {
444 pippijn 1.3 quoted = true;
445 root 1.2 }
446     break;
447     default:
448 pippijn 1.3 quoted = false;
449 root 1.2 sel->type = sel_single;
450     sel->u.single = looking_at;
451     break;
452     }
453 elmex 1.1 } while (quoted);
454    
455     if (*regexp == '*') {
456 root 1.2 sel->repeat = rep_null_or_more;
457     ++regexp;
458 elmex 1.1 } else if (*regexp == '?') {
459 root 1.2 sel->repeat = rep_null_or_once;
460     ++regexp;
461 elmex 1.1 } else if (*regexp == '+') {
462 root 1.2 sel->repeat = rep_once_or_more;
463     ++regexp;
464 elmex 1.1 } else {
465 root 1.2 sel->repeat = rep_once;
466 elmex 1.1 }
467    
468     return regexp;
469     }
470    
471     /* D e b u g c o d e
472     */
473     #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
474     static void
475     re_dump_sel(selection *sel) {
476     switch(sel->type) {
477 root 1.2 case sel_any:
478     printf(".");
479     break;
480     case sel_end:
481     printf("$");
482     break;
483     case sel_single:
484     printf("<%c>", sel->u.single);
485     break;
486     case sel_range:
487     printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
488     break;
489     case sel_array:
490     {
491     int i;
492     printf("[");
493 pippijn 1.3 for (i = 0; i < uchar_MAX; i++) {
494 root 1.2 if (sel->u.array[i]) {
495     printf("%c", i);
496     }
497     }
498     printf("]");
499     }
500     break;
501     case sel_not_single:
502     printf("[^%c]", sel->u.single);
503     break;
504     case sel_not_range:
505     printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
506     break;
507     default:
508     printf("<UNKNOWN TOKEN!>");
509     break;
510 elmex 1.1 }
511     switch(sel->repeat) {
512 root 1.2 case rep_once:
513     break;
514     case rep_null_or_once:
515     printf("?");
516     break;
517     case rep_null_or_more:
518     printf("*");
519     break;
520     case rep_once_or_more:
521     printf("+");
522     break;
523     default:
524     printf("<UNKNOWN REP-TOKEN!>");
525     break;
526 elmex 1.1 }
527     }
528    
529     int
530     main(int argc, char *argv[]) {
531     char *re, *m;
532     selection sel;
533    
534     re = re_get_token(&sel, argv[1]);
535    
536     printf("'%s' -> '%s'\n", argv[1], re);
537     re_dump_sel(&sel);
538     printf("\n");
539     m = re_cmp(argv[2], argv[1]);
540     if (m)
541 root 1.2 printf("MATCH! -> '%s'\n", m);
542 elmex 1.1 return 0;
543     }
544     #endif