server/common/re-cmp.C

/*
 * static char *rcsid_player_c =
 *   "$Id: re-cmp.C,v 1.2 2006-08-29 08:01:35 root Exp $";
 */


/* re-cmp.c
 * Pattern match a string, parsing some of the common RE-metacharacters.
 *
 * This code is public domain, but I would appreciate to hear of
 * improvements or even the fact that you use it in your program.
 *
 * Deliberate BUGS:
 *    - These tokens are not supported: | ( )
 *    - You will get the longest expansion of the _first_ string which
 *      matches the RE, not the longest string which would be the proper
 *      behaviour for a RE-matcher.
 *
 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <limits.h>
#include <re-cmp.h>
#include <ctype.h>

/* Get prototype functions to prevent warnings. */
#if defined (__sun__) && defined(StupidSunHeaders)
#  include <sys/types.h>
#  include <sys/time.h>
#  include "sunos.h"   /* Prototypes for standard libraries, sunos lack those */
#endif


/*   P r o t o t y p e s
 */
const char *re_cmp(const char *, const char *);
static bool re_cmp_step(const char *, const char *, unsigned, int);
static void re_init(void);
static bool re_match_token(unsigned char, selection *);
static const char *re_get_token(selection *, const char *);
#ifdef DEBUG2
static void re_dump_sel(selection *);
#endif

/*   G l o b a l   v a r i a b l e s
 */
static bool             re_init_done = false;
static selection        *re_token[RE_TOKEN_MAX];
static const char       *re_substr[RE_TOKEN_MAX];
static unsigned int     re_token_depth;

/*   E x t e r n a l   f u n c t i o n
 */

/* re-cmp - get regular expression match.
 * Return values: NULL - no match or error in regexp.
 *                pointer to beginning of matching string
 */
const char *
re_cmp(const char *str, const char *regexp) {
    const char *next_regexp;
    bool once = false;
    bool matched;

    if (re_init_done == false)
        re_init();

#ifdef SAFE_CHECKS
    if (regexp == NULL || str == NULL)
        return NULL;
#endif
    if (*regexp == '^') {
        once = true;
        ++regexp;
    }
    if (*regexp == 0) {
        /* // or /^/ matches any string */
        return str;
    }

    next_regexp = re_get_token(re_token[0], regexp);
    re_token_depth = 0;
    re_substr[0] = next_regexp;

    matched = false;
    while (*str != '\0' && !(matched = re_match_token(*str, re_token[0])))
        str++;
    
    if (matched && *next_regexp == 0)
        return str;

    /* Apologies for the nearly duplicated code below, hopefully it
     * speeds things up.
     */
    if (once) {
        switch (re_token[0]->repeat) {
            case rep_once:
                if (matched == false)
                    return NULL;
                break;
            case rep_once_or_more:
                if (matched == false)
                    return NULL;

                if (re_cmp_step(str+1, regexp, 0, 1))
                    return str;
                break;
            case rep_null_or_once:
                if (matched == false)
                    return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
                break;
            case rep_null_or_more:
                if (matched) {
                    if (re_cmp_step(str+1, regexp, 0, 1))
                        return str;
                } else {
                    return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
                }
                break;
        }
        return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
    }

    if (matched) {
        switch (re_token[0]->repeat) {
            case rep_once:
            case rep_null_or_once:
                break;
            case rep_once_or_more:
            case rep_null_or_more:
                if (re_cmp_step(str+1, regexp, 0, 1))
                    return str;
                break;
        }
        /* The logic here is that re_match_token only sees
         * if the one letter matches.  Thus, if the
         * regex is like '@match eureca', and the
         * the user enters anything with an e, re_match_token
         * returns true, but they really need to match the
         * entire regexp, which re_cmp_step will do.
         * However, what happens is that there can be a case
         * where the string being match is something like
         * 'where is eureca'.  In this case, the re_match_token
         * matches that first e, but the re_cmp_step below,
         * fails because the next character (r) doesn't match
         * the u.  So we call re_cmp with the string
         * after the first r, so that it should hopefully match
         * up properly.
         */
        if (re_cmp_step(str+1, next_regexp, 1, 0))
            return str;
        else if (*(str+1) != 0)
            return re_cmp(str+1, regexp);
    }
    return NULL;
}

/*   A u x i l l i a r y   f u n c t i o n s
 */

static bool
re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) {
    /* str      - string to match
     * regexp   - pattern
     * slot     - number of the token which under consideration
     * matches  - how many times the token has matched
     */
    const char *next_regexp;
    bool matched;

#ifdef DEBUG
/*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
#endif

    if (*regexp == 0) {
        /* When we reach the end of the regexp, the match is a success
         */
        return true;
    }

    /* This chunk of code makes sure that the regexp-tokenising happens
     * only once. We only tokenise as much as we need.
     */
    if (slot > re_token_depth) {
        re_token_depth = slot;
        if (re_token[slot] == NULL)
            re_token[slot] = (selection *) malloc(sizeof(selection));
        next_regexp = re_get_token(re_token[slot], regexp);
        if (next_regexp == NULL) {
            /* Syntax error, what else can we do? */
            return false;
        }
        re_substr[slot] = next_regexp;
    } else {
        next_regexp = re_substr[slot];
    }

    matched = re_match_token(*str, re_token[slot]);
    if (matched)
        ++matches;

    if (*str == 0)
        return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;

    switch (re_token[slot]->repeat) {
        case rep_once:
            if (matches == 1) { /* (matches == 1) => (matched == true) */
                return re_cmp_step(str+1, next_regexp, slot+1, 0);
            }
            return false;
        case rep_once_or_more:
            if (matched) {      /* (matched == true) => (matches >= 1) */
                /* First check if the current token repeats more */
                if (re_cmp_step(str+1, regexp, slot, matches))
                    return true;
                return re_cmp_step(str+1, next_regexp, slot+1, 0);
            }
            return false;
        case rep_null_or_once:
            /* We must go on to the next token, but should we advance str? */
            if (matches == 0) {
                return re_cmp_step(str, next_regexp, slot+1, 0);
            } else if (matches == 1) {
                return re_cmp_step(str+1, next_regexp, slot+1, 0);
            }
            return false;       /* Not reached */
        case rep_null_or_more:
            if (matched) {
                /* Look for further repeats, advance str */
                if (re_cmp_step(str+1, regexp, slot, matches))
                    return true;
                return re_cmp_step(str, next_regexp, slot+1, 0);
            }
            return re_cmp_step(str, next_regexp, slot+1, 0);
    }
    return false;
}

static void
re_init(void) {
    int i;

    re_token[0] = (selection *) malloc(sizeof(selection));
    for (i = 1; i < RE_TOKEN_MAX; i++)
        re_token[i] = NULL;

    re_init_done = true;
}

static bool
re_match_token(unsigned char c, selection *sel) {
    switch (sel->type) {
        case sel_any:
            return true;
        case sel_end:
            return (c == 0);
        case sel_single:
            return (tolower(c) == tolower(sel->u.single));
        case sel_range:
            return (c >= sel->u.range.low && c <= sel->u.range.high);
        case sel_array:
            return (sel->u.array[c]);
        case sel_not_single:
            return (tolower(c) != tolower(sel->u.single));
        case sel_not_range:
            return (c < sel->u.range.low && c > sel->u.range.high);
    }
    return false;
}

/* re_get_token - get regular expression token
 * Returns the first token found in <regexp> in <sel>
 * Return values: NULL  syntax error
 *                pointer to first character past token.
 */
static const char *
re_get_token(selection *sel, const char *regexp) {

#ifdef SAFE_CHECKS
#   define exit_if_null if (*regexp == 0) return NULL
#else
#   define exit_if_null
#endif

    bool quoted = false;
    unsigned char looking_at;

#ifdef SAFE_CHECKS
    if (sel == NULL || regexp == NULL || *regexp == 0)
        return NULL;
#endif

    do {
        looking_at = *regexp++;
        switch (looking_at) {
            case '$':
                if (quoted) {
                    quoted = false;
                    sel->type = sel_single;
                    sel->u.single = looking_at;
                } else {
                    sel->type = sel_end;
                }
                break;
            case '.':
                if (quoted) {
                    quoted = false;
                    sel->type = sel_single;
                    sel->u.single = looking_at;
                } else {
                    sel->type = sel_any;
                }
                break;
            case '[':
                /* The fun stuff... perhaps a little obfuscated since I
                 * don't trust the compiler to analyse liveness.
                 */
                if (quoted) {
                    quoted = false;
                    sel->type = sel_single;
                    sel->u.single = looking_at;
                } else {
                    bool neg = false;
                    unsigned char first, last = 0;

                    exit_if_null;
                    looking_at = *regexp++;

                    if (looking_at == '^') {
                        neg = true;
                        exit_if_null;
                        looking_at = *regexp++;
                    }
                    first = looking_at;
                    exit_if_null;
                    looking_at = *regexp++;
                    if (looking_at == ']') {
                        /* On the form [q] or [^q] */
                        sel->type = neg ? sel_not_single : sel_single;
                        sel->u.single = first;
                        break;
                    } else if (looking_at == '-') {
                        exit_if_null;
                        last = *regexp++;
                        if (last == ']') {
                            /* On the form [A-] or [^A-]. Checking for
                             * [,-] and making it a range is probably not
                             * worth it :-)
                             */
                            sel->type = sel_array;
                            memset(sel->u.array, neg, sizeof(sel->u.array));
                            sel->u.array[first] = sel->u.array['-'] = !neg;
                            break;
                        } else {
                            exit_if_null;
                            looking_at = *regexp++;
                            if (looking_at == ']') {
                                /* On the form [A-G] or [^A-G]. Note that [G-A]
                                 * is a syntax error. Fair enough, I think.
                                 */
#ifdef SAFE_CHECK
                                if (first > last)
                                    return NULL;
#endif
                                sel->type = neg ? sel_not_range : sel_range;
                                sel->u.range.low = first;
                                sel->u.range.high = last;
                                break;
                            }
                        }
                    }
                    {
                        /* The datastructure can only represent a RE this
                         * complex with an array.
                         */
                        int i;
                        unsigned char previous;

                        sel->type = sel_array;
                        memset(sel->u.array, neg, sizeof(sel->u.array));
                        if (last) {
                            /* It starts with a range */
#ifdef SAFE_CHECK
                            if (first > last)
                                return NULL;
#endif
                            for (i = first; i <= last; i++) {
                                sel->u.array[i] = !neg;
                            }
                        } else {
                            /* It begins with a "random" character */
                            sel->u.array[first] = !neg;
                        }
                        sel->u.array[looking_at] = !neg;

                        exit_if_null;
                        previous = looking_at;
                        looking_at = *regexp++;

                        /* Add more characters to the array until we reach
                         * ]. Quoting doesn't and shouldn't work in here.
                         * ("]" should be put first, and "-" last if they
                         * are needed inside this construct.)
                         * Look for ranges as we go along.
                         */
                        while (looking_at != ']') {
                            if (looking_at == '-') {
                                exit_if_null;
                                looking_at = *regexp++;
                                if (looking_at != ']') {
#ifdef SAFE_CHECK
                                    if (previous > looking_at)
                                        return NULL;
#endif
                                    for (i = previous+1; i < looking_at; i++) {
                                        /* previous has already been set and
                                         * looking_at is set below.
                                         */
                                        sel->u.array[i] = !neg;
                                    }
                                    exit_if_null;
                                } else {
                                    sel->u.array['-'] = !neg;
                                    break;
                                }
                            }
                            sel->u.array[looking_at] = !neg;
                            previous = looking_at;
                            exit_if_null;
                            looking_at = *regexp++;
                        }
                    }
                }
                break;
            case '\\':
                if (quoted) {
                    quoted = false;
                    sel->type = sel_single;
                    sel->u.single = looking_at;
                } else {
                    quoted = true;
                }
                break;
            default:
                quoted = false;
                sel->type = sel_single;
                sel->u.single = looking_at;
                break;
        }
    } while (quoted);

    if (*regexp == '*') {
        sel->repeat = rep_null_or_more;
        ++regexp;
    } else if (*regexp == '?') {
        sel->repeat = rep_null_or_once;
        ++regexp;
    } else if (*regexp == '+') {
        sel->repeat = rep_once_or_more;
        ++regexp;
    } else {
        sel->repeat = rep_once;
    }

    return regexp;
}

/*   D e b u g   c o d e
 */
#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
static void
re_dump_sel(selection *sel) {
    switch(sel->type) {
        case sel_any:
            printf(".");
            break;
        case sel_end:
            printf("$");
            break;
        case sel_single:
            printf("<%c>", sel->u.single);
            break;
        case sel_range:
            printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
            break;
        case sel_array:
            {
                int i;
                printf("[");
                for (i = 0; i < uchar_MAX; i++) {
                    if (sel->u.array[i]) {
                        printf("%c", i);
                    }
                }
                printf("]");
            }
            break;
        case sel_not_single:
            printf("[^%c]", sel->u.single);
            break;
        case sel_not_range:
            printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
            break;
        default:
            printf("<UNKNOWN TOKEN!>");
            break;
    }
    switch(sel->repeat) {
        case rep_once:
            break;
        case rep_null_or_once:
            printf("?");
            break;
        case rep_null_or_more:
            printf("*");
            break;
        case rep_once_or_more:
            printf("+");
            break;
        default:
            printf("<UNKNOWN REP-TOKEN!>");
            break;
    }
}

int
main(int argc, char *argv[]) {
    char *re, *m;
    selection sel;

    re = re_get_token(&sel, argv[1]);

    printf("'%s' -> '%s'\n", argv[1], re);
    re_dump_sel(&sel);
    printf("\n");
    m = re_cmp(argv[2], argv[1]);
    if (m)
        printf("MATCH! -> '%s'\n", m);
    return 0;
}
#endif
Revision:	1.3
Committed:	Fri Sep 8 04:51:08 2006 UTC (17 years, 9 months ago) by pippijn
Content type:	text/plain
Branch:	MAIN
Changes since 1.2:	+42 -42 lines
Log Message:	Converted the custom type Boolean to C++ bool.
#	User	Rev	Content
1	elmex	1.1	/*
2			* static char *rcsid_player_c =
3	pippijn	1.3	* "$Id: re-cmp.C,v 1.2 2006-08-29 08:01:35 root Exp $";
4	elmex	1.1	*/
5
6
7			/* re-cmp.c
8			* Pattern match a string, parsing some of the common RE-metacharacters.
9			*
10			* This code is public domain, but I would appreciate to hear of
11			* improvements or even the fact that you use it in your program.
12			*
13			* Deliberate BUGS:
14			* - These tokens are not supported: \| ( )
15			* - You will get the longest expansion of the _first_ string which
16			* matches the RE, not the longest string which would be the proper
17			* behaviour for a RE-matcher.
18			*
19			* Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
20			*/
21
22			#include <stdio.h>
23			#include <stdlib.h>
24			#include <memory.h>
25			#include <limits.h>
26			#include <re-cmp.h>
27			#include <ctype.h>
28
29			/* Get prototype functions to prevent warnings. */
30			#if defined (__sun__) && defined(StupidSunHeaders)
31			# include <sys/types.h>
32			# include <sys/time.h>
33			# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
34			#endif
35
36
37			/* P r o t o t y p e s
38			*/
39			const char re_cmp(const char , const char *);
40	pippijn	1.3	static bool re_cmp_step(const char , const char , unsigned, int);
41	elmex	1.1	static void re_init(void);
42	pippijn	1.3	static bool re_match_token(unsigned char, selection *);
43	elmex	1.1	static const char re_get_token(selection , const char *);
44			#ifdef DEBUG2
45			static void re_dump_sel(selection *);
46			#endif
47
48			/* G l o b a l v a r i a b l e s
49			*/
50	pippijn	1.3	static bool re_init_done = false;
51	elmex	1.1	static selection *re_token[RE_TOKEN_MAX];
52			static const char *re_substr[RE_TOKEN_MAX];
53			static unsigned int re_token_depth;
54
55			/* E x t e r n a l f u n c t i o n
56			*/
57
58			/* re-cmp - get regular expression match.
59			* Return values: NULL - no match or error in regexp.
60			* pointer to beginning of matching string
61			*/
62			const char *
63			re_cmp(const char str, const char regexp) {
64			const char *next_regexp;
65	pippijn	1.3	bool once = false;
66			bool matched;
67	elmex	1.1
68	pippijn	1.3	if (re_init_done == false)
69	root	1.2	re_init();
70	elmex	1.1
71			#ifdef SAFE_CHECKS
72			if (regexp == NULL \|\| str == NULL)
73	root	1.2	return NULL;
74	elmex	1.1	#endif
75			if (*regexp == '^') {
76	pippijn	1.3	once = true;
77	root	1.2	++regexp;
78	elmex	1.1	}
79			if (*regexp == 0) {
80	root	1.2	/* // or /^/ matches any string */
81			return str;
82	elmex	1.1	}
83
84			next_regexp = re_get_token(re_token[0], regexp);
85			re_token_depth = 0;
86			re_substr[0] = next_regexp;
87
88	pippijn	1.3	matched = false;
89	elmex	1.1	while (str != '\0' && !(matched = re_match_token(str, re_token[0])))
90	root	1.2	str++;
91	elmex	1.1
92			if (matched && *next_regexp == 0)
93	root	1.2	return str;
94	elmex	1.1
95			/* Apologies for the nearly duplicated code below, hopefully it
96			* speeds things up.
97			*/
98			if (once) {
99	root	1.2	switch (re_token[0]->repeat) {
100			case rep_once:
101	pippijn	1.3	if (matched == false)
102	root	1.2	return NULL;
103			break;
104			case rep_once_or_more:
105	pippijn	1.3	if (matched == false)
106	root	1.2	return NULL;
107
108			if (re_cmp_step(str+1, regexp, 0, 1))
109			return str;
110			break;
111			case rep_null_or_once:
112	pippijn	1.3	if (matched == false)
113	root	1.2	return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
114			break;
115			case rep_null_or_more:
116			if (matched) {
117			if (re_cmp_step(str+1, regexp, 0, 1))
118			return str;
119			} else {
120			return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL;
121			}
122			break;
123			}
124			return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL;
125	elmex	1.1	}
126
127			if (matched) {
128	root	1.2	switch (re_token[0]->repeat) {
129			case rep_once:
130			case rep_null_or_once:
131			break;
132			case rep_once_or_more:
133			case rep_null_or_more:
134			if (re_cmp_step(str+1, regexp, 0, 1))
135			return str;
136			break;
137			}
138			/* The logic here is that re_match_token only sees
139			* if the one letter matches. Thus, if the
140			* regex is like '@match eureca', and the
141			* the user enters anything with an e, re_match_token
142			* returns true, but they really need to match the
143			* entire regexp, which re_cmp_step will do.
144			* However, what happens is that there can be a case
145			* where the string being match is something like
146			* 'where is eureca'. In this case, the re_match_token
147			* matches that first e, but the re_cmp_step below,
148			* fails because the next character (r) doesn't match
149			* the u. So we call re_cmp with the string
150			* after the first r, so that it should hopefully match
151			* up properly.
152			*/
153			if (re_cmp_step(str+1, next_regexp, 1, 0))
154			return str;
155			else if (*(str+1) != 0)
156			return re_cmp(str+1, regexp);
157	elmex	1.1	}
158			return NULL;
159			}
160
161			/* A u x i l l i a r y f u n c t i o n s
162			*/
163
164	pippijn	1.3	static bool
165	elmex	1.1	re_cmp_step(const char str, const char regexp, unsigned slot, int matches) {
166			/* str - string to match
167			* regexp - pattern
168			* slot - number of the token which under consideration
169			* matches - how many times the token has matched
170			*/
171			const char *next_regexp;
172	pippijn	1.3	bool matched;
173	elmex	1.1
174			#ifdef DEBUG
175			/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
176			#endif
177
178			if (*regexp == 0) {
179	root	1.2	/* When we reach the end of the regexp, the match is a success
180			*/
181	pippijn	1.3	return true;
182	elmex	1.1	}
183
184			/* This chunk of code makes sure that the regexp-tokenising happens
185			* only once. We only tokenise as much as we need.
186			*/
187			if (slot > re_token_depth) {
188	root	1.2	re_token_depth = slot;
189			if (re_token[slot] == NULL)
190			re_token[slot] = (selection *) malloc(sizeof(selection));
191			next_regexp = re_get_token(re_token[slot], regexp);
192			if (next_regexp == NULL) {
193			/* Syntax error, what else can we do? */
194	pippijn	1.3	return false;
195	root	1.2	}
196			re_substr[slot] = next_regexp;
197	elmex	1.1	} else {
198	root	1.2	next_regexp = re_substr[slot];
199	elmex	1.1	}
200
201			matched = re_match_token(*str, re_token[slot]);
202			if (matched)
203	root	1.2	++matches;
204	elmex	1.1
205			if (*str == 0)
206	root	1.2	return (*next_regexp == 0 \|\| re_token[slot]->type == sel_end) && matched;
207	elmex	1.1
208			switch (re_token[slot]->repeat) {
209	root	1.2	case rep_once:
210	pippijn	1.3	if (matches == 1) { /* (matches == 1) => (matched == true) */
211	root	1.2	return re_cmp_step(str+1, next_regexp, slot+1, 0);
212			}
213	pippijn	1.3	return false;
214	root	1.2	case rep_once_or_more:
215	pippijn	1.3	if (matched) { /* (matched == true) => (matches >= 1) */
216	root	1.2	/* First check if the current token repeats more */
217			if (re_cmp_step(str+1, regexp, slot, matches))
218	pippijn	1.3	return true;
219	root	1.2	return re_cmp_step(str+1, next_regexp, slot+1, 0);
220			}
221	pippijn	1.3	return false;
222	root	1.2	case rep_null_or_once:
223			/* We must go on to the next token, but should we advance str? */
224			if (matches == 0) {
225			return re_cmp_step(str, next_regexp, slot+1, 0);
226			} else if (matches == 1) {
227			return re_cmp_step(str+1, next_regexp, slot+1, 0);
228			}
229	pippijn	1.3	return false; /* Not reached */
230	root	1.2	case rep_null_or_more:
231			if (matched) {
232			/* Look for further repeats, advance str */
233			if (re_cmp_step(str+1, regexp, slot, matches))
234	pippijn	1.3	return true;
235	root	1.2	return re_cmp_step(str, next_regexp, slot+1, 0);
236			}
237			return re_cmp_step(str, next_regexp, slot+1, 0);
238	elmex	1.1	}
239	pippijn	1.3	return false;
240	elmex	1.1	}
241
242			static void
243			re_init(void) {
244			int i;
245
246			re_token[0] = (selection *) malloc(sizeof(selection));
247			for (i = 1; i < RE_TOKEN_MAX; i++)
248	root	1.2	re_token[i] = NULL;
249	elmex	1.1
250	pippijn	1.3	re_init_done = true;
251	elmex	1.1	}
252
253	pippijn	1.3	static bool
254			re_match_token(unsigned char c, selection *sel) {
255	elmex	1.1	switch (sel->type) {
256	root	1.2	case sel_any:
257	pippijn	1.3	return true;
258	root	1.2	case sel_end:
259			return (c == 0);
260			case sel_single:
261			return (tolower(c) == tolower(sel->u.single));
262			case sel_range:
263			return (c >= sel->u.range.low && c <= sel->u.range.high);
264			case sel_array:
265			return (sel->u.array[c]);
266			case sel_not_single:
267			return (tolower(c) != tolower(sel->u.single));
268			case sel_not_range:
269			return (c < sel->u.range.low && c > sel->u.range.high);
270	elmex	1.1	}
271	pippijn	1.3	return false;
272	elmex	1.1	}
273
274			/* re_get_token - get regular expression token
275			* Returns the first token found in <regexp> in <sel>
276			* Return values: NULL syntax error
277			* pointer to first character past token.
278			*/
279			static const char *
280			re_get_token(selection sel, const char regexp) {
281
282			#ifdef SAFE_CHECKS
283			# define exit_if_null if (*regexp == 0) return NULL
284			#else
285			# define exit_if_null
286			#endif
287
288	pippijn	1.3	bool quoted = false;
289			unsigned char looking_at;
290	elmex	1.1
291			#ifdef SAFE_CHECKS
292			if (sel == NULL \|\| regexp == NULL \|\| *regexp == 0)
293	root	1.2	return NULL;
294	elmex	1.1	#endif
295
296			do {
297	root	1.2	looking_at = *regexp++;
298			switch (looking_at) {
299			case '$':
300			if (quoted) {
301	pippijn	1.3	quoted = false;
302	root	1.2	sel->type = sel_single;
303			sel->u.single = looking_at;
304			} else {
305			sel->type = sel_end;
306			}
307			break;
308			case '.':
309			if (quoted) {
310	pippijn	1.3	quoted = false;
311	root	1.2	sel->type = sel_single;
312			sel->u.single = looking_at;
313			} else {
314			sel->type = sel_any;
315			}
316			break;
317			case '[':
318			/* The fun stuff... perhaps a little obfuscated since I
319			* don't trust the compiler to analyse liveness.
320			*/
321			if (quoted) {
322	pippijn	1.3	quoted = false;
323	root	1.2	sel->type = sel_single;
324			sel->u.single = looking_at;
325			} else {
326	pippijn	1.3	bool neg = false;
327			unsigned char first, last = 0;
328	root	1.2
329			exit_if_null;
330			looking_at = *regexp++;
331
332			if (looking_at == '^') {
333	pippijn	1.3	neg = true;
334	root	1.2	exit_if_null;
335			looking_at = *regexp++;
336			}
337			first = looking_at;
338			exit_if_null;
339			looking_at = *regexp++;
340			if (looking_at == ']') {
341			/* On the form [q] or [^q] */
342			sel->type = neg ? sel_not_single : sel_single;
343			sel->u.single = first;
344			break;
345			} else if (looking_at == '-') {
346			exit_if_null;
347			last = *regexp++;
348			if (last == ']') {
349			/* On the form [A-] or [^A-]. Checking for
350			* [,-] and making it a range is probably not
351			* worth it :-)
352			*/
353			sel->type = sel_array;
354			memset(sel->u.array, neg, sizeof(sel->u.array));
355			sel->u.array[first] = sel->u.array['-'] = !neg;
356			break;
357			} else {
358			exit_if_null;
359			looking_at = *regexp++;
360			if (looking_at == ']') {
361			/* On the form [A-G] or [^A-G]. Note that [G-A]
362			* is a syntax error. Fair enough, I think.
363			*/
364	elmex	1.1	#ifdef SAFE_CHECK
365	root	1.2	if (first > last)
366			return NULL;
367	elmex	1.1	#endif
368	root	1.2	sel->type = neg ? sel_not_range : sel_range;
369			sel->u.range.low = first;
370			sel->u.range.high = last;
371			break;
372			}
373			}
374			}
375			{
376			/* The datastructure can only represent a RE this
377			* complex with an array.
378			*/
379			int i;
380	pippijn	1.3	unsigned char previous;
381	root	1.2
382			sel->type = sel_array;
383			memset(sel->u.array, neg, sizeof(sel->u.array));
384			if (last) {
385			/* It starts with a range */
386	elmex	1.1	#ifdef SAFE_CHECK
387	root	1.2	if (first > last)
388			return NULL;
389	elmex	1.1	#endif
390	root	1.2	for (i = first; i <= last; i++) {
391			sel->u.array[i] = !neg;
392			}
393			} else {
394			/* It begins with a "random" character */
395			sel->u.array[first] = !neg;
396			}
397			sel->u.array[looking_at] = !neg;
398
399			exit_if_null;
400			previous = looking_at;
401			looking_at = *regexp++;
402
403			/* Add more characters to the array until we reach
404			* ]. Quoting doesn't and shouldn't work in here.
405			* ("]" should be put first, and "-" last if they
406			* are needed inside this construct.)
407			* Look for ranges as we go along.
408			*/
409			while (looking_at != ']') {
410			if (looking_at == '-') {
411			exit_if_null;
412			looking_at = *regexp++;
413			if (looking_at != ']') {
414	elmex	1.1	#ifdef SAFE_CHECK
415	root	1.2	if (previous > looking_at)
416			return NULL;
417	elmex	1.1	#endif
418	root	1.2	for (i = previous+1; i < looking_at; i++) {
419			/* previous has already been set and
420			* looking_at is set below.
421			*/
422			sel->u.array[i] = !neg;
423			}
424			exit_if_null;
425			} else {
426			sel->u.array['-'] = !neg;
427			break;
428			}
429			}
430			sel->u.array[looking_at] = !neg;
431			previous = looking_at;
432			exit_if_null;
433			looking_at = *regexp++;
434			}
435			}
436			}
437			break;
438			case '\\':
439			if (quoted) {
440	pippijn	1.3	quoted = false;
441	root	1.2	sel->type = sel_single;
442			sel->u.single = looking_at;
443			} else {
444	pippijn	1.3	quoted = true;
445	root	1.2	}
446			break;
447			default:
448	pippijn	1.3	quoted = false;
449	root	1.2	sel->type = sel_single;
450			sel->u.single = looking_at;
451			break;
452			}
453	elmex	1.1	} while (quoted);
454
455			if (regexp == '') {
456	root	1.2	sel->repeat = rep_null_or_more;
457			++regexp;
458	elmex	1.1	} else if (*regexp == '?') {
459	root	1.2	sel->repeat = rep_null_or_once;
460			++regexp;
461	elmex	1.1	} else if (*regexp == '+') {
462	root	1.2	sel->repeat = rep_once_or_more;
463			++regexp;
464	elmex	1.1	} else {
465	root	1.2	sel->repeat = rep_once;
466	elmex	1.1	}
467
468			return regexp;
469			}
470
471			/* D e b u g c o d e
472			*/
473			#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
474			static void
475			re_dump_sel(selection *sel) {
476			switch(sel->type) {
477	root	1.2	case sel_any:
478			printf(".");
479			break;
480			case sel_end:
481			printf("$");
482			break;
483			case sel_single:
484			printf("<%c>", sel->u.single);
485			break;
486			case sel_range:
487			printf("[%c-%c]", sel->u.range.low, sel->u.range.high);
488			break;
489			case sel_array:
490			{
491			int i;
492			printf("[");
493	pippijn	1.3	for (i = 0; i < uchar_MAX; i++) {
494	root	1.2	if (sel->u.array[i]) {
495			printf("%c", i);
496			}
497			}
498			printf("]");
499			}
500			break;
501			case sel_not_single:
502			printf("[^%c]", sel->u.single);
503			break;
504			case sel_not_range:
505			printf("[^%c-%c]", sel->u.range.low, sel->u.range.high);
506			break;
507			default:
508			printf("<UNKNOWN TOKEN!>");
509			break;
510	elmex	1.1	}
511			switch(sel->repeat) {
512	root	1.2	case rep_once:
513			break;
514			case rep_null_or_once:
515			printf("?");
516			break;
517			case rep_null_or_more:
518			printf("*");
519			break;
520			case rep_once_or_more:
521			printf("+");
522			break;
523			default:
524			printf("<UNKNOWN REP-TOKEN!>");
525			break;
526	elmex	1.1	}
527			}
528
529			int
530			main(int argc, char *argv[]) {
531			char re, m;
532			selection sel;
533
534			re = re_get_token(&sel, argv[1]);
535
536			printf("'%s' -> '%s'\n", argv[1], re);
537			re_dump_sel(&sel);
538			printf("\n");
539			m = re_cmp(argv[2], argv[1]);
540			if (m)
541	root	1.2	printf("MATCH! -> '%s'\n", m);
542	elmex	1.1	return 0;
543			}
544			#endif