ViewVC Help
View File | Revision Log | Show Annotations | Download File
/cvs/deliantra/server/common/re-cmp.c
Revision: 1.2
Committed: Sun Aug 13 17:16:00 2006 UTC (17 years, 9 months ago) by elmex
Content type: text/plain
Branch: MAIN
CVS Tags: HEAD
Changes since 1.1: +1 -1 lines
State: FILE REMOVED
Log Message:
Made server compile with C++.
Removed cfanim plugin and crossedit.
C++ here we come.

File Contents

# Content
1 /*
2 * static char *rcsid_player_c =
3 * "$Id$";
4 */
5
6
7 /* re-cmp.c
8 * Pattern match a string, parsing some of the common RE-metacharacters.
9 *
10 * This code is public domain, but I would appreciate to hear of
11 * improvements or even the fact that you use it in your program.
12 *
13 * Deliberate BUGS:
14 * - These tokens are not supported: | ( )
15 * - You will get the longest expansion of the _first_ string which
16 * matches the RE, not the longest string which would be the proper
17 * behaviour for a RE-matcher.
18 *
19 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <memory.h>
25 #include <limits.h>
26 #include <re-cmp.h>
27 #include <ctype.h>
28
29 /* Get prototype functions to prevent warnings. */
30 #if defined (__sun__) && defined(StupidSunHeaders)
31 # include <sys/types.h>
32 # include <sys/time.h>
33 # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
34 #endif
35
36
37 /* P r o t o t y p e s
38 */
39 const char *re_cmp(const char *, const char *);
40 static Boolean re_cmp_step(const char *, const char *, unsigned, int);
41 static void re_init(void);
42 static Boolean re_match_token(uchar, selection *);
43 static const char *re_get_token(selection *, const char *);
44 #ifdef DEBUG2
45 static void re_dump_sel(selection *);
46 #endif
47
48 /* G l o b a l v a r i a b l e s
49 */
50 static Boolean re_init_done = False;
51 static selection *re_token[RE_TOKEN_MAX];
52 static const char *re_substr[RE_TOKEN_MAX];
53 static unsigned int re_token_depth;
54
55 /* E x t e r n a l f u n c t i o n
56 */
57
58 /* re-cmp - get regular expression match.
59 * Return values: NULL - no match or error in regexp.
60 * pointer to beginning of matching string
61 */
62 const char *
63 re_cmp(const char *str, const char *regexp) {
64 const char *next_regexp;
65 Boolean once = False;
66 Boolean matched;
67
68 if (re_init_done == False)
69 re_init();
70
71 #ifdef SAFE_CHECKS
72 if (regexp == NULL || str == NULL)
73 return NULL;
74 #endif
75 if (*regexp == '^') {
76 once = True;
77 ++regexp;
78 }
79 if (*regexp == 0) {
80 /* // or /^/ matches any string */
81 return str;
82 }
83
84 next_regexp = re_get_token(re_token[0], regexp);
85 re_token_depth = 0;
86 re_substr[0] = next_regexp;
87
88 matched = False;
89 while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
90 str++;
91
92 if (matched && *next_regexp == 0)
93 return str;
94
95 /* Apologies for the nearly duplicated code below, hopefully it
96 * speeds things up.
97 */
98 if (once) {
99 switch (re_token[0]->repeat) {
100 case rep_once:
101 if (matched == False)
102 return NULL;
103 break;
104 case rep_once_or_more:
105 if (matched == False)
106 return NULL;
107
108 if (re_cmp_step(str+1, regexp, 0, 1))
109 return str;
110 break;
111 case rep_null_or_once:
112 if (matched == False)
113 return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
114 break;
115 case rep_null_or_more:
116 if (matched) {
117 if (re_cmp_step(str+1, regexp, 0, 1))
118 return str;
119 } else {
120 return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
121 }
122 break;
123 }
124 return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
125 }
126
127 if (matched) {
128 switch (re_token[0]->repeat) {
129 case rep_once:
130 case rep_null_or_once:
131 break;
132 case rep_once_or_more:
133 case rep_null_or_more:
134 if (re_cmp_step(str+1, regexp, 0, 1))
135 return str;
136 break;
137 }
138 /* The logic here is that re_match_token only sees
139 * if the one letter matches. Thus, if the
140 * regex is like '@match eureca', and the
141 * the user enters anything with an e, re_match_token
142 * returns true, but they really need to match the
143 * entire regexp, which re_cmp_step will do.
144 * However, what happens is that there can be a case
145 * where the string being match is something like
146 * 'where is eureca'. In this case, the re_match_token
147 * matches that first e, but the re_cmp_step below,
148 * fails because the next character (r) doesn't match
149 * the u. So we call re_cmp with the string
150 * after the first r, so that it should hopefully match
151 * up properly.
152 */
153 if (re_cmp_step(str+1, next_regexp, 1, 0))
154 return str;
155 else if (*(str+1) != 0)
156 return re_cmp(str+1, regexp);
157 }
158 return NULL;
159 }
160
161 /* A u x i l l i a r y f u n c t i o n s
162 */
163
164 static Boolean
165 re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
166 /* str - string to match
167 * regexp - pattern
168 * slot - number of the token which under consideration
169 * matches - how many times the token has matched
170 */
171 const char *next_regexp;
172 Boolean matched;
173
174 #ifdef DEBUG
175 /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
176 #endif
177
178 if (*regexp == 0) {
179 /* When we reach the end of the regexp, the match is a success
180 */
181 return True;
182 }
183
184 /* This chunk of code makes sure that the regexp-tokenising happens
185 * only once. We only tokenise as much as we need.
186 */
187 if (slot > re_token_depth) {
188 re_token_depth = slot;
189 if (re_token[slot] == NULL)
190 re_token[slot] = (selection *) malloc(sizeof(selection));
191 next_regexp = re_get_token(re_token[slot], regexp);
192 if (next_regexp == NULL) {
193 /* Syntax error, what else can we do? */
194 return False;
195 }
196 re_substr[slot] = next_regexp;
197 } else {
198 next_regexp = re_substr[slot];
199 }
200
201 matched = re_match_token(*str, re_token[slot]);
202 if (matched)
203 ++matches;
204
205 if (*str == 0)
206 return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;
207
208 switch (re_token[slot]->repeat) {
209 case rep_once:
210 if (matches == 1) { /* (matches == 1) => (matched == True) */
211 return re_cmp_step(str+1, next_regexp, slot+1, 0);
212 }
213 return False;
214 case rep_once_or_more:
215 if (matched) { /* (matched == True) => (matches >= 1) */
216 /* First check if the current token repeats more */
217 if (re_cmp_step(str+1, regexp, slot, matches))
218 return True;
219 return re_cmp_step(str+1, next_regexp, slot+1, 0);
220 }
221 return False;
222 case rep_null_or_once:
223 /* We must go on to the next token, but should we advance str? */
224 if (matches == 0) {
225 return re_cmp_step(str, next_regexp, slot+1, 0);
226 } else if (matches == 1) {
227 return re_cmp_step(str+1, next_regexp, slot+1, 0);
228 }
229 return False; /* Not reached */
230 case rep_null_or_more:
231 if (matched) {
232 /* Look for further repeats, advance str */
233 if (re_cmp_step(str+1, regexp, slot, matches))
234 return True;
235 return re_cmp_step(str, next_regexp, slot+1, 0);
236 }
237 return re_cmp_step(str, next_regexp, slot+1, 0);
238 }
239 return False;
240 }
241
242 static void
243 re_init(void) {
244 int i;
245
246 re_token[0] = (selection *) malloc(sizeof(selection));
247 for (i = 1; i < RE_TOKEN_MAX; i++)
248 re_token[i] = NULL;
249
250 re_init_done = True;
251 }
252
253 static Boolean
254 re_match_token(uchar c, selection *sel) {
255 switch (sel->type) {
256 case sel_any:
257 return True;
258 case sel_end:
259 return (c == 0);
260 case sel_single:
261 return (tolower(c) == tolower(sel->u.single));
262 case sel_range:
263 return (c >= sel->u.range.low && c <= sel->u.range.high);
264 case sel_array:
265 return (sel->u.array[c]);
266 case sel_not_single:
267 return (tolower(c) != tolower(sel->u.single));
268 case sel_not_range:
269 return (c < sel->u.range.low && c > sel->u.range.high);
270 }
271 return False;
272 }
273
274 /* re_get_token - get regular expression token
275 * Returns the first token found in <regexp> in <sel>
276 * Return values: NULL syntax error
277 * pointer to first character past token.
278 */
279 static const char *
280 re_get_token(selection *sel, const char *regexp) {
281
282 #ifdef SAFE_CHECKS
283 # define exit_if_null if (*regexp == 0) return NULL
284 #else
285 # define exit_if_null
286 #endif
287
288 Boolean quoted = False;
289 uchar looking_at;
290
291 #ifdef SAFE_CHECKS
292 if (sel == NULL || regexp == NULL || *regexp == 0)
293 return NULL;
294 #endif
295
296 do {
297 looking_at = *regexp++;
298 switch (looking_at) {
299 case '$':
300 if (quoted) {
301 quoted = False;
302 sel->type = sel_single;
303 sel->u.single = looking_at;
304 } else {
305 sel->type = sel_end;
306 }
307 break;
308 case '.':
309 if (quoted) {
310 quoted = False;
311 sel->type = sel_single;
312 sel->u.single = looking_at;
313 } else {
314 sel->type = sel_any;
315 }
316 break;
317 case '[':
318 /* The fun stuff... perhaps a little obfuscated since I
319 * don't trust the compiler to analyse liveness.
320 */
321 if (quoted) {
322 quoted = False;
323 sel->type = sel_single;
324 sel->u.single = looking_at;
325 } else {
326 Boolean neg = False;
327 uchar first, last = 0;
328
329 exit_if_null;
330 looking_at = *regexp++;
331
332 if (looking_at == '^') {
333 neg = True;
334 exit_if_null;
335 looking_at = *regexp++;
336 }
337 first = looking_at;
338 exit_if_null;
339 looking_at = *regexp++;
340 if (looking_at == ']') {
341 /* On the form [q] or [^q] */
342 sel->type = neg ? sel_not_single : sel_single;
343 sel->u.single = first;
344 break;
345 } else if (looking_at == '-') {
346 exit_if_null;
347 last = *regexp++;
348 if (last == ']') {
349 /* On the form [A-] or [^A-]. Checking for
350 * [,-] and making it a range is probably not
351 * worth it :-)
352 */
353 sel->type = sel_array;
354 memset(sel->u.array, neg, sizeof(sel->u.array));
355 sel->u.array[first] = sel->u.array['-'] = !neg;
356 break;
357 } else {
358 exit_if_null;
359 looking_at = *regexp++;
360 if (looking_at == ']') {
361 /* On the form [A-G] or [^A-G]. Note that [G-A]
362 * is a syntax error. Fair enough, I think.
363 */
364 #ifdef SAFE_CHECK
365 if (first > last)
366 return NULL;
367 #endif
368 sel->type = neg ? sel_not_range : sel_range;
369 sel->u.range.low = first;
370 sel->u.range.high = last;
371 break;
372 }
373 }
374 }
375 {
376 /* The datastructure can only represent a RE this
377 * complex with an array.
378 */
379 int i;
380 uchar previous;
381
382 sel->type = sel_array;
383 memset(sel->u.array, neg, sizeof(sel->u.array));
384 if (last) {
385 /* It starts with a range */
386 #ifdef SAFE_CHECK
387 if (first > last)
388 return NULL;
389 #endif
390 for (i = first; i <= last; i++) {
391 sel->u.array[i] = !neg;
392 }
393 } else {
394 /* It begins with a "random" character */
395 sel->u.array[first] = !neg;
396 }
397 sel->u.array[looking_at] = !neg;
398
399 exit_if_null;
400 previous = looking_at;
401 looking_at = *regexp++;
402
403 /* Add more characters to the array until we reach
404 * ]. Quoting doesn't and shouldn't work in here.
405 * ("]" should be put first, and "-" last if they
406 * are needed inside this construct.)
407 * Look for ranges as we go along.
408 */
409 while (looking_at != ']') {
410 if (looking_at == '-') {
411 exit_if_null;
412 looking_at = *regexp++;
413 if (looking_at != ']') {
414 #ifdef SAFE_CHECK
415 if (previous > looking_at)
416 return NULL;
417 #endif
418 for (i = previous+1; i < looking_at; i++) {
419 /* previous has already been set and
420 * looking_at is set below.
421 */
422 sel->u.array[i] = !neg;
423 }
424 exit_if_null;
425 } else {
426 sel->u.array['-'] = !neg;
427 break;
428 }
429 }
430 sel->u.array[looking_at] = !neg;
431 previous = looking_at;
432 exit_if_null;
433 looking_at = *regexp++;
434 }
435 }
436 }
437 break;
438 case '\\':
439 if (quoted) {
440 quoted = False;
441 sel->type = sel_single;
442 sel->u.single = looking_at;
443 } else {
444 quoted = True;
445 }
446 break;
447 default:
448 quoted = False;
449 sel->type = sel_single;
450 sel->u.single = looking_at;
451 break;
452 }
453 } while (quoted);
454
455 if (*regexp == '*') {
456 sel->repeat = rep_null_or_more;
457 ++regexp;
458 } else if (*regexp == '?') {
459 sel->repeat = rep_null_or_once;
460 ++regexp;
461 } else if (*regexp == '+') {
462 sel->repeat = rep_once_or_more;
463 ++regexp;
464 } else {
465 sel->repeat = rep_once;
466 }
467
468 return regexp;
469 }
470
471 /* D e b u g c o d e
472 */
473 #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
474 static void
475 re_dump_sel(selection *sel) {
476 switch(sel->type) {
477 case sel_any:
478 printf(".");
479 break;
480 case sel_end:
481 printf("$");
482 break;
483 case sel_single:
484 printf("<%c>", sel->u.single);
485 break;
486 case sel_range:
487 printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
488 break;
489 case sel_array:
490 {
491 int i;
492 printf("[");
493 for (i = 0; i < UCHAR_MAX; i++) {
494 if (sel->u.array[i]) {
495 printf("%c", i);
496 }
497 }
498 printf("]");
499 }
500 break;
501 case sel_not_single:
502 printf("[^%c]", sel->u.single);
503 break;
504 case sel_not_range:
505 printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
506 break;
507 default:
508 printf("<UNKNOWN TOKEN!>");
509 break;
510 }
511 switch(sel->repeat) {
512 case rep_once:
513 break;
514 case rep_null_or_once:
515 printf("?");
516 break;
517 case rep_null_or_more:
518 printf("*");
519 break;
520 case rep_once_or_more:
521 printf("+");
522 break;
523 default:
524 printf("<UNKNOWN REP-TOKEN!>");
525 break;
526 }
527 }
528
529 int
530 main(int argc, char *argv[]) {
531 char *re, *m;
532 selection sel;
533
534 re = re_get_token(&sel, argv[1]);
535
536 printf("'%s' -> '%s'\n", argv[1], re);
537 re_dump_sel(&sel);
538 printf("\n");
539 m = re_cmp(argv[2], argv[1]);
540 if (m)
541 printf("MATCH! -> '%s'\n", m);
542 return 0;
543 }
544 #endif