ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/deliantra/server/common/re-cmp.c
Revision: 1.1.1.1 (vendor branch)
Committed: Fri Feb 3 07:11:39 2006 UTC (18 years, 3 months ago) by root
Content type: text/plain
Branch: UPSTREAM
CVS Tags: UPSTREAM_2006_03_15, LAST_C_VERSION, UPSTREAM_2006_02_22, difficulty_fix_merge_060810_2300, UPSTREAM_2006_02_03
Branch point for: difficulty_fix
Changes since 1.1: +0 -0 lines
Log Message:
initial import

File Contents

# User Rev Content
1 root 1.1 /*
2     * static char *rcsid_player_c =
3     * "$Id: re-cmp.c,v 1.13 2005/10/28 19:08:53 akirschbaum Exp $";
4     */
5    
6    
7     /* re-cmp.c
8     * Pattern match a string, parsing some of the common RE-metacharacters.
9     *
10     * This code is public domain, but I would appreciate to hear of
11     * improvements or even the fact that you use it in your program.
12     *
13     * Deliberate BUGS:
14     * - These tokens are not supported: | ( )
15     * - You will get the longest expansion of the _first_ string which
16     * matches the RE, not the longest string which would be the proper
17     * behaviour for a RE-matcher.
18     *
19     * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
20     */
21    
22     #include <stdio.h>
23     #include <stdlib.h>
24     #include <memory.h>
25     #include <limits.h>
26     #include <re-cmp.h>
27     #include <ctype.h>
28    
29     /* Get prototype functions to prevent warnings. */
30     #if defined (__sun__) && defined(StupidSunHeaders)
31     # include <sys/types.h>
32     # include <sys/time.h>
33     # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
34     #endif
35    
36    
37     /* P r o t o t y p e s
38     */
39     const char *re_cmp(const char *, const char *);
40     static Boolean re_cmp_step(const char *, const char *, unsigned, int);
41     static void re_init(void);
42     static Boolean re_match_token(uchar, selection *);
43     static const char *re_get_token(selection *, const char *);
44     #ifdef DEBUG2
45     static void re_dump_sel(selection *);
46     #endif
47    
48     /* G l o b a l v a r i a b l e s
49     */
50     static Boolean re_init_done = False;
51     static selection *re_token[RE_TOKEN_MAX];
52     static const char *re_substr[RE_TOKEN_MAX];
53     static unsigned int re_token_depth;
54    
55     /* E x t e r n a l f u n c t i o n
56     */
57    
58     /* re-cmp - get regular expression match.
59     * Return values: NULL - no match or error in regexp.
60     * pointer to beginning of matching string
61     */
62     const char *
63     re_cmp(const char *str, const char *regexp) {
64     const char *next_regexp;
65     Boolean once = False;
66     Boolean matched;
67    
68     if (re_init_done == False)
69     re_init();
70    
71     #ifdef SAFE_CHECKS
72     if (regexp == NULL || str == NULL)
73     return NULL;
74     #endif
75     if (*regexp == '^') {
76     once = True;
77     ++regexp;
78     }
79     if (*regexp == 0) {
80     /* // or /^/ matches any string */
81     return str;
82     }
83    
84     next_regexp = re_get_token(re_token[0], regexp);
85     re_token_depth = 0;
86     re_substr[0] = next_regexp;
87    
88     matched = False;
89     while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
90     str++;
91    
92     if (matched && *next_regexp == 0)
93     return str;
94    
95     /* Apologies for the nearly duplicated code below, hopefully it
96     * speeds things up.
97     */
98     if (once) {
99     switch (re_token[0]->repeat) {
100     case rep_once:
101     if (matched == False)
102     return NULL;
103     break;
104     case rep_once_or_more:
105     if (matched == False)
106     return NULL;
107    
108     if (re_cmp_step(str+1, regexp, 0, 1))
109     return str;
110     break;
111     case rep_null_or_once:
112     if (matched == False)
113     return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
114     break;
115     case rep_null_or_more:
116     if (matched) {
117     if (re_cmp_step(str+1, regexp, 0, 1))
118     return str;
119     } else {
120     return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
121     }
122     break;
123     }
124     return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
125     }
126    
127     if (matched) {
128     switch (re_token[0]->repeat) {
129     case rep_once:
130     case rep_null_or_once:
131     break;
132     case rep_once_or_more:
133     case rep_null_or_more:
134     if (re_cmp_step(str+1, regexp, 0, 1))
135     return str;
136     break;
137     }
138     /* The logic here is that re_match_token only sees
139     * if the one letter matches. Thus, if the
140     * regex is like '@match eureca', and the
141     * the user enters anything with an e, re_match_token
142     * returns true, but they really need to match the
143     * entire regexp, which re_cmp_step will do.
144     * However, what happens is that there can be a case
145     * where the string being match is something like
146     * 'where is eureca'. In this case, the re_match_token
147     * matches that first e, but the re_cmp_step below,
148     * fails because the next character (r) doesn't match
149     * the u. So we call re_cmp with the string
150     * after the first r, so that it should hopefully match
151     * up properly.
152     */
153     if (re_cmp_step(str+1, next_regexp, 1, 0))
154     return str;
155     else if (*(str+1) != 0)
156     return re_cmp(str+1, regexp);
157     }
158     return NULL;
159     }
160    
161     /* A u x i l l i a r y f u n c t i o n s
162     */
163    
164     static Boolean
165     re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
166     /* str - string to match
167     * regexp - pattern
168     * slot - number of the token which under consideration
169     * matches - how many times the token has matched
170     */
171     const char *next_regexp;
172     Boolean matched;
173    
174     #ifdef DEBUG
175     /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
176     #endif
177    
178     if (*regexp == 0) {
179     /* When we reach the end of the regexp, the match is a success
180     */
181     return True;
182     }
183    
184     /* This chunk of code makes sure that the regexp-tokenising happens
185     * only once. We only tokenise as much as we need.
186     */
187     if (slot > re_token_depth) {
188     re_token_depth = slot;
189     if (re_token[slot] == NULL)
190     re_token[slot] = (selection *) malloc(sizeof(selection));
191     next_regexp = re_get_token(re_token[slot], regexp);
192     if (next_regexp == NULL) {
193     /* Syntax error, what else can we do? */
194     return False;
195     }
196     re_substr[slot] = next_regexp;
197     } else {
198     next_regexp = re_substr[slot];
199     }
200    
201     matched = re_match_token(*str, re_token[slot]);
202     if (matched)
203     ++matches;
204    
205     if (*str == 0)
206     return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
207    
208     switch (re_token[slot]->repeat) {
209     case rep_once:
210     if (matches == 1) { /* (matches == 1) => (matched == True) */
211     return re_cmp_step(str+1, next_regexp, slot+1, 0);
212     }
213     return False;
214     case rep_once_or_more:
215     if (matched) { /* (matched == True) => (matches >= 1) */
216     /* First check if the current token repeats more */
217     if (re_cmp_step(str+1, regexp, slot, matches))
218     return True;
219     return re_cmp_step(str+1, next_regexp, slot+1, 0);
220     }
221     return False;
222     case rep_null_or_once:
223     /* We must go on to the next token, but should we advance str? */
224     if (matches == 0) {
225     return re_cmp_step(str, next_regexp, slot+1, 0);
226     } else if (matches == 1) {
227     return re_cmp_step(str+1, next_regexp, slot+1, 0);
228     }
229     return False; /* Not reached */
230     case rep_null_or_more:
231     if (matched) {
232     /* Look for further repeats, advance str */
233     if (re_cmp_step(str+1, regexp, slot, matches))
234     return True;
235     return re_cmp_step(str, next_regexp, slot+1, 0);
236     }
237     return re_cmp_step(str, next_regexp, slot+1, 0);
238     }
239     return False;
240     }
241    
242     static void
243     re_init(void) {
244     int i;
245    
246     re_token[0] = (selection *) malloc(sizeof(selection));
247     for (i = 1; i < RE_TOKEN_MAX; i++)
248     re_token[i] = NULL;
249    
250     re_init_done = True;
251     }
252    
253     static Boolean
254     re_match_token(uchar c, selection *sel) {
255     switch (sel->type) {
256     case sel_any:
257     return True;
258     case sel_end:
259     return (c == 0);
260     case sel_single:
261     return (tolower(c) == tolower(sel->u.single));
262     case sel_range:
263     return (c >= sel->u.range.low && c <= sel->u.range.high);
264     case sel_array:
265     return (sel->u.array[c]);
266     case sel_not_single:
267     return (tolower(c) != tolower(sel->u.single));
268     case sel_not_range:
269     return (c < sel->u.range.low && c > sel->u.range.high);
270     }
271     return False;
272     }
273    
274     /* re_get_token - get regular expression token
275     * Returns the first token found in <regexp> in <sel>
276     * Return values: NULL syntax error
277     * pointer to first character past token.
278     */
279     static const char *
280     re_get_token(selection *sel, const char *regexp) {
281    
282     #ifdef SAFE_CHECKS
283     # define exit_if_null if (*regexp == 0) return NULL
284     #else
285     # define exit_if_null
286     #endif
287    
288     Boolean quoted = False;
289     uchar looking_at;
290    
291     #ifdef SAFE_CHECKS
292     if (sel == NULL || regexp == NULL || *regexp == 0)
293     return NULL;
294     #endif
295    
296     do {
297     looking_at = *regexp++;
298     switch (looking_at) {
299     case '$':
300     if (quoted) {
301     quoted = False;
302     sel->type = sel_single;
303     sel->u.single = looking_at;
304     } else {
305     sel->type = sel_end;
306     }
307     break;
308     case '.':
309     if (quoted) {
310     quoted = False;
311     sel->type = sel_single;
312     sel->u.single = looking_at;
313     } else {
314     sel->type = sel_any;
315     }
316     break;
317     case '[':
318     /* The fun stuff... perhaps a little obfuscated since I
319     * don't trust the compiler to analyse liveness.
320     */
321     if (quoted) {
322     quoted = False;
323     sel->type = sel_single;
324     sel->u.single = looking_at;
325     } else {
326     Boolean neg = False;
327     uchar first, last = 0;
328    
329     exit_if_null;
330     looking_at = *regexp++;
331    
332     if (looking_at == '^') {
333     neg = True;
334     exit_if_null;
335     looking_at = *regexp++;
336     }
337     first = looking_at;
338     exit_if_null;
339     looking_at = *regexp++;
340     if (looking_at == ']') {
341     /* On the form [q] or [^q] */
342     sel->type = neg ? sel_not_single : sel_single;
343     sel->u.single = first;
344     break;
345     } else if (looking_at == '-') {
346     exit_if_null;
347     last = *regexp++;
348     if (last == ']') {
349     /* On the form [A-] or [^A-]. Checking for
350     * [,-] and making it a range is probably not
351     * worth it :-)
352     */
353     sel->type = sel_array;
354     memset(sel->u.array, neg, sizeof(sel->u.array));
355     sel->u.array[first] = sel->u.array['-'] = !neg;
356     break;
357     } else {
358     exit_if_null;
359     looking_at = *regexp++;
360     if (looking_at == ']') {
361     /* On the form [A-G] or [^A-G]. Note that [G-A]
362     * is a syntax error. Fair enough, I think.
363     */
364     #ifdef SAFE_CHECK
365     if (first > last)
366     return NULL;
367     #endif
368     sel->type = neg ? sel_not_range : sel_range;
369     sel->u.range.low = first;
370     sel->u.range.high = last;
371     break;
372     }
373     }
374     }
375     {
376     /* The datastructure can only represent a RE this
377     * complex with an array.
378     */
379     int i;
380     uchar previous;
381    
382     sel->type = sel_array;
383     memset(sel->u.array, neg, sizeof(sel->u.array));
384     if (last) {
385     /* It starts with a range */
386     #ifdef SAFE_CHECK
387     if (first > last)
388     return NULL;
389     #endif
390     for (i = first; i <= last; i++) {
391     sel->u.array[i] = !neg;
392     }
393     } else {
394     /* It begins with a "random" character */
395     sel->u.array[first] = !neg;
396     }
397     sel->u.array[looking_at] = !neg;
398    
399     exit_if_null;
400     previous = looking_at;
401     looking_at = *regexp++;
402    
403     /* Add more characters to the array until we reach
404     * ]. Quoting doesn't and shouldn't work in here.
405     * ("]" should be put first, and "-" last if they
406     * are needed inside this construct.)
407     * Look for ranges as we go along.
408     */
409     while (looking_at != ']') {
410     if (looking_at == '-') {
411     exit_if_null;
412     looking_at = *regexp++;
413     if (looking_at != ']') {
414     #ifdef SAFE_CHECK
415     if (previous > looking_at)
416     return NULL;
417     #endif
418     for (i = previous+1; i < looking_at; i++) {
419     /* previous has already been set and
420     * looking_at is set below.
421     */
422     sel->u.array[i] = !neg;
423     }
424     exit_if_null;
425     } else {
426     sel->u.array['-'] = !neg;
427     break;
428     }
429     }
430     sel->u.array[looking_at] = !neg;
431     previous = looking_at;
432     exit_if_null;
433     looking_at = *regexp++;
434     }
435     }
436     }
437     break;
438     case '\\':
439     if (quoted) {
440     quoted = False;
441     sel->type = sel_single;
442     sel->u.single = looking_at;
443     } else {
444     quoted = True;
445     }
446     break;
447     default:
448     quoted = False;
449     sel->type = sel_single;
450     sel->u.single = looking_at;
451     break;
452     }
453     } while (quoted);
454    
455     if (*regexp == '*') {
456     sel->repeat = rep_null_or_more;
457     ++regexp;
458     } else if (*regexp == '?') {
459     sel->repeat = rep_null_or_once;
460     ++regexp;
461     } else if (*regexp == '+') {
462     sel->repeat = rep_once_or_more;
463     ++regexp;
464     } else {
465     sel->repeat = rep_once;
466     }
467    
468     return regexp;
469     }
470    
471     /* D e b u g c o d e
472     */
473     #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
474     static void
475     re_dump_sel(selection *sel) {
476     switch(sel->type) {
477     case sel_any:
478     printf(".");
479     break;
480     case sel_end:
481     printf("$");
482     break;
483     case sel_single:
484     printf("<%c>", sel->u.single);
485     break;
486     case sel_range:
487     printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
488     break;
489     case sel_array:
490     {
491     int i;
492     printf("[");
493     for (i = 0; i < UCHAR_MAX; i++) {
494     if (sel->u.array[i]) {
495     printf("%c", i);
496     }
497     }
498     printf("]");
499     }
500     break;
501     case sel_not_single:
502     printf("[^%c]", sel->u.single);
503     break;
504     case sel_not_range:
505     printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
506     break;
507     default:
508     printf("<UNKNOWN TOKEN!>");
509     break;
510     }
511     switch(sel->repeat) {
512     case rep_once:
513     break;
514     case rep_null_or_once:
515     printf("?");
516     break;
517     case rep_null_or_more:
518     printf("*");
519     break;
520     case rep_once_or_more:
521     printf("+");
522     break;
523     default:
524     printf("<UNKNOWN REP-TOKEN!>");
525     break;
526     }
527     }
528    
529     int
530     main(int argc, char *argv[]) {
531     char *re, *m;
532     selection sel;
533    
534     re = re_get_token(&sel, argv[1]);
535    
536     printf("'%s' -> '%s'\n", argv[1], re);
537     re_dump_sel(&sel);
538     printf("\n");
539     m = re_cmp(argv[2], argv[1]);
540     if (m)
541     printf("MATCH! -> '%s'\n", m);
542     return 0;
543     }
544     #endif