/* * static char *rcsid_player_c = * "$Id: re-cmp.C,v 1.3 2006/09/08 04:51:08 pippijn Exp $"; */ /* re-cmp.c * Pattern match a string, parsing some of the common RE-metacharacters. * * This code is public domain, but I would appreciate to hear of * improvements or even the fact that you use it in your program. * * Deliberate BUGS: * - These tokens are not supported: | ( ) * - You will get the longest expansion of the _first_ string which * matches the RE, not the longest string which would be the proper * behaviour for a RE-matcher. * * Author: Kjetil T. Homme May 1993 */ #include #include #include #include #include #include /* Get prototype functions to prevent warnings. */ #if defined (__sun__) && defined(StupidSunHeaders) # include # include # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */ #endif /* P r o t o t y p e s */ const char *re_cmp(const char *, const char *); static bool re_cmp_step(const char *, const char *, unsigned, int); static void re_init(void); static bool re_match_token(unsigned char, selection *); static const char *re_get_token(selection *, const char *); #ifdef DEBUG2 static void re_dump_sel(selection *); #endif /* G l o b a l v a r i a b l e s */ static bool re_init_done = false; static selection *re_token[RE_TOKEN_MAX]; static const char *re_substr[RE_TOKEN_MAX]; static unsigned int re_token_depth; /* E x t e r n a l f u n c t i o n */ /* re-cmp - get regular expression match. * Return values: NULL - no match or error in regexp. * pointer to beginning of matching string */ const char * re_cmp(const char *str, const char *regexp) { const char *next_regexp; bool once = false; bool matched; if (re_init_done == false) re_init(); #ifdef SAFE_CHECKS if (regexp == NULL || str == NULL) return NULL; #endif if (*regexp == '^') { once = true; ++regexp; } if (*regexp == 0) { /* // or /^/ matches any string */ return str; } next_regexp = re_get_token(re_token[0], regexp); re_token_depth = 0; re_substr[0] = next_regexp; matched = false; while (*str != '\0' && !(matched = re_match_token(*str, re_token[0]))) str++; if (matched && *next_regexp == 0) return str; /* Apologies for the nearly duplicated code below, hopefully it * speeds things up. */ if (once) { switch (re_token[0]->repeat) { case rep_once: if (matched == false) return NULL; break; case rep_once_or_more: if (matched == false) return NULL; if (re_cmp_step(str+1, regexp, 0, 1)) return str; break; case rep_null_or_once: if (matched == false) return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; break; case rep_null_or_more: if (matched) { if (re_cmp_step(str+1, regexp, 0, 1)) return str; } else { return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; } break; } return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL; } if (matched) { switch (re_token[0]->repeat) { case rep_once: case rep_null_or_once: break; case rep_once_or_more: case rep_null_or_more: if (re_cmp_step(str+1, regexp, 0, 1)) return str; break; } /* The logic here is that re_match_token only sees * if the one letter matches. Thus, if the * regex is like '@match eureca', and the * the user enters anything with an e, re_match_token * returns true, but they really need to match the * entire regexp, which re_cmp_step will do. * However, what happens is that there can be a case * where the string being match is something like * 'where is eureca'. In this case, the re_match_token * matches that first e, but the re_cmp_step below, * fails because the next character (r) doesn't match * the u. So we call re_cmp with the string * after the first r, so that it should hopefully match * up properly. */ if (re_cmp_step(str+1, next_regexp, 1, 0)) return str; else if (*(str+1) != 0) return re_cmp(str+1, regexp); } return NULL; } /* A u x i l l i a r y f u n c t i o n s */ static bool re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) { /* str - string to match * regexp - pattern * slot - number of the token which under consideration * matches - how many times the token has matched */ const char *next_regexp; bool matched; #ifdef DEBUG /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/ #endif if (*regexp == 0) { /* When we reach the end of the regexp, the match is a success */ return true; } /* This chunk of code makes sure that the regexp-tokenising happens * only once. We only tokenise as much as we need. */ if (slot > re_token_depth) { re_token_depth = slot; if (re_token[slot] == NULL) re_token[slot] = (selection *) malloc(sizeof(selection)); next_regexp = re_get_token(re_token[slot], regexp); if (next_regexp == NULL) { /* Syntax error, what else can we do? */ return false; } re_substr[slot] = next_regexp; } else { next_regexp = re_substr[slot]; } matched = re_match_token(*str, re_token[slot]); if (matched) ++matches; if (*str == 0) return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched; switch (re_token[slot]->repeat) { case rep_once: if (matches == 1) { /* (matches == 1) => (matched == true) */ return re_cmp_step(str+1, next_regexp, slot+1, 0); } return false; case rep_once_or_more: if (matched) { /* (matched == true) => (matches >= 1) */ /* First check if the current token repeats more */ if (re_cmp_step(str+1, regexp, slot, matches)) return true; return re_cmp_step(str+1, next_regexp, slot+1, 0); } return false; case rep_null_or_once: /* We must go on to the next token, but should we advance str? */ if (matches == 0) { return re_cmp_step(str, next_regexp, slot+1, 0); } else if (matches == 1) { return re_cmp_step(str+1, next_regexp, slot+1, 0); } return false; /* Not reached */ case rep_null_or_more: if (matched) { /* Look for further repeats, advance str */ if (re_cmp_step(str+1, regexp, slot, matches)) return true; return re_cmp_step(str, next_regexp, slot+1, 0); } return re_cmp_step(str, next_regexp, slot+1, 0); } return false; } static void re_init(void) { int i; re_token[0] = (selection *) malloc(sizeof(selection)); for (i = 1; i < RE_TOKEN_MAX; i++) re_token[i] = NULL; re_init_done = true; } static bool re_match_token(unsigned char c, selection *sel) { switch (sel->type) { case sel_any: return true; case sel_end: return (c == 0); case sel_single: return (tolower(c) == tolower(sel->u.single)); case sel_range: return (c >= sel->u.range.low && c <= sel->u.range.high); case sel_array: return (sel->u.array[c]); case sel_not_single: return (tolower(c) != tolower(sel->u.single)); case sel_not_range: return (c < sel->u.range.low && c > sel->u.range.high); } return false; } /* re_get_token - get regular expression token * Returns the first token found in in * Return values: NULL syntax error * pointer to first character past token. */ static const char * re_get_token(selection *sel, const char *regexp) { #ifdef SAFE_CHECKS # define exit_if_null if (*regexp == 0) return NULL #else # define exit_if_null #endif bool quoted = false; unsigned char looking_at; #ifdef SAFE_CHECKS if (sel == NULL || regexp == NULL || *regexp == 0) return NULL; #endif do { looking_at = *regexp++; switch (looking_at) { case '$': if (quoted) { quoted = false; sel->type = sel_single; sel->u.single = looking_at; } else { sel->type = sel_end; } break; case '.': if (quoted) { quoted = false; sel->type = sel_single; sel->u.single = looking_at; } else { sel->type = sel_any; } break; case '[': /* The fun stuff... perhaps a little obfuscated since I * don't trust the compiler to analyse liveness. */ if (quoted) { quoted = false; sel->type = sel_single; sel->u.single = looking_at; } else { bool neg = false; unsigned char first, last = 0; exit_if_null; looking_at = *regexp++; if (looking_at == '^') { neg = true; exit_if_null; looking_at = *regexp++; } first = looking_at; exit_if_null; looking_at = *regexp++; if (looking_at == ']') { /* On the form [q] or [^q] */ sel->type = neg ? sel_not_single : sel_single; sel->u.single = first; break; } else if (looking_at == '-') { exit_if_null; last = *regexp++; if (last == ']') { /* On the form [A-] or [^A-]. Checking for * [,-] and making it a range is probably not * worth it :-) */ sel->type = sel_array; memset(sel->u.array, neg, sizeof(sel->u.array)); sel->u.array[first] = sel->u.array['-'] = !neg; break; } else { exit_if_null; looking_at = *regexp++; if (looking_at == ']') { /* On the form [A-G] or [^A-G]. Note that [G-A] * is a syntax error. Fair enough, I think. */ #ifdef SAFE_CHECK if (first > last) return NULL; #endif sel->type = neg ? sel_not_range : sel_range; sel->u.range.low = first; sel->u.range.high = last; break; } } } { /* The datastructure can only represent a RE this * complex with an array. */ int i; unsigned char previous; sel->type = sel_array; memset(sel->u.array, neg, sizeof(sel->u.array)); if (last) { /* It starts with a range */ #ifdef SAFE_CHECK if (first > last) return NULL; #endif for (i = first; i <= last; i++) { sel->u.array[i] = !neg; } } else { /* It begins with a "random" character */ sel->u.array[first] = !neg; } sel->u.array[looking_at] = !neg; exit_if_null; previous = looking_at; looking_at = *regexp++; /* Add more characters to the array until we reach * ]. Quoting doesn't and shouldn't work in here. * ("]" should be put first, and "-" last if they * are needed inside this construct.) * Look for ranges as we go along. */ while (looking_at != ']') { if (looking_at == '-') { exit_if_null; looking_at = *regexp++; if (looking_at != ']') { #ifdef SAFE_CHECK if (previous > looking_at) return NULL; #endif for (i = previous+1; i < looking_at; i++) { /* previous has already been set and * looking_at is set below. */ sel->u.array[i] = !neg; } exit_if_null; } else { sel->u.array['-'] = !neg; break; } } sel->u.array[looking_at] = !neg; previous = looking_at; exit_if_null; looking_at = *regexp++; } } } break; case '\\': if (quoted) { quoted = false; sel->type = sel_single; sel->u.single = looking_at; } else { quoted = true; } break; default: quoted = false; sel->type = sel_single; sel->u.single = looking_at; break; } } while (quoted); if (*regexp == '*') { sel->repeat = rep_null_or_more; ++regexp; } else if (*regexp == '?') { sel->repeat = rep_null_or_once; ++regexp; } else if (*regexp == '+') { sel->repeat = rep_once_or_more; ++regexp; } else { sel->repeat = rep_once; } return regexp; } /* D e b u g c o d e */ #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */ static void re_dump_sel(selection *sel) { switch(sel->type) { case sel_any: printf("."); break; case sel_end: printf("$"); break; case sel_single: printf("<%c>", sel->u.single); break; case sel_range: printf("[%c-%c]", sel->u.range.low, sel->u.range.high); break; case sel_array: { int i; printf("["); for (i = 0; i < uchar_MAX; i++) { if (sel->u.array[i]) { printf("%c", i); } } printf("]"); } break; case sel_not_single: printf("[^%c]", sel->u.single); break; case sel_not_range: printf("[^%c-%c]", sel->u.range.low, sel->u.range.high); break; default: printf(""); break; } switch(sel->repeat) { case rep_once: break; case rep_null_or_once: printf("?"); break; case rep_null_or_more: printf("*"); break; case rep_once_or_more: printf("+"); break; default: printf(""); break; } } int main(int argc, char *argv[]) { char *re, *m; selection sel; re = re_get_token(&sel, argv[1]); printf("'%s' -> '%s'\n", argv[1], re); re_dump_sel(&sel); printf("\n"); m = re_cmp(argv[2], argv[1]); if (m) printf("MATCH! -> '%s'\n", m); return 0; } #endif