server/common/re-cmp.C

/* re-cmp.c
 * Pattern match a string, parsing some of the common RE-metacharacters.
 *
 * This code is public domain, but I would appreciate to hear of
 * improvements or even the fact that you use it in your program.
 *
 * Deliberate BUGS:
 *    - These tokens are not supported: | ( )
 *    - You will get the longest expansion of the _first_ string which
 *      matches the RE, not the longest string which would be the proper
 *      behaviour for a RE-matcher.
 *
 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <limits.h>
#include <re-cmp.h>
#include <ctype.h>

/* Get prototype functions to prevent warnings. */
#if defined (__sun__) && defined(StupidSunHeaders)
# include <sys/types.h>
# include <sys/time.h>
# include "sunos.h"             /* Prototypes for standard libraries, sunos lack those */
#endif


/*   P r o t o t y p e s
 */
const char *re_cmp (const char *, const char *);
static bool re_cmp_step (const char *, const char *, unsigned, int);
static void re_init (void);
static bool re_match_token (unsigned char, selection *);
static const char *re_get_token (selection *, const char *);

#ifdef DEBUG2
static void re_dump_sel (selection *);
#endif

/*   G l o b a l   v a r i a b l e s
 */
static bool re_init_done = false;
static selection *re_token[RE_TOKEN_MAX];
static const char *re_substr[RE_TOKEN_MAX];
static unsigned int re_token_depth;

/*   E x t e r n a l   f u n c t i o n
 */

/* re-cmp - get regular expression match.
 * Return values: NULL - no match or error in regexp.
 *                pointer to beginning of matching string
 */
const char *
re_cmp (const char *str, const char *regexp)
{
  const char *next_regexp;
  bool once = false;
  bool matched;

  if (re_init_done == false)
    re_init ();

#ifdef SAFE_CHECKS
  if (regexp == NULL || str == NULL)
    return NULL;
#endif
  if (*regexp == '^')
    {
      once = true;
      ++regexp;
    }
  if (*regexp == 0)
    {
      /* // or /^/ matches any string */
      return str;
    }

  next_regexp = re_get_token (re_token[0], regexp);
  re_token_depth = 0;
  re_substr[0] = next_regexp;

  matched = false;
  while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
    str++;

  if (matched && *next_regexp == 0)
    return str;

  /* Apologies for the nearly duplicated code below, hopefully it
   * speeds things up.
   */
  if (once)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
              if (matched == false)
                return NULL;
              break;
            case rep_once_or_more:
              if (matched == false)
                return NULL;

              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
            case rep_null_or_once:
              if (matched == false)
                return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
              break;
            case rep_null_or_more:
              if (matched)
                {
                  if (re_cmp_step (str + 1, regexp, 0, 1))
                    return str;
                }
              else
                {
                  return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
                }
              break;
        }
      return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
    }

  if (matched)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
            case rep_null_or_once:
              break;
            case rep_once_or_more:
            case rep_null_or_more:
              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
        }
      /* The logic here is that re_match_token only sees
       * if the one letter matches.  Thus, if the
       * regex is like '@match eureca', and the
       * the user enters anything with an e, re_match_token
       * returns true, but they really need to match the
       * entire regexp, which re_cmp_step will do.
       * However, what happens is that there can be a case
       * where the string being match is something like
       * 'where is eureca'.  In this case, the re_match_token
       * matches that first e, but the re_cmp_step below,
       * fails because the next character (r) doesn't match
       * the u.  So we call re_cmp with the string
       * after the first r, so that it should hopefully match
       * up properly.
       */
      if (re_cmp_step (str + 1, next_regexp, 1, 0))
        return str;
      else if (*(str + 1) != 0)
        return re_cmp (str + 1, regexp);
    }
  return NULL;
}

/*   A u x i l l i a r y   f u n c t i o n s
 */

static bool
re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
{
  /* str      - string to match
   * regexp   - pattern
   * slot     - number of the token which under consideration
   * matches  - how many times the token has matched
   */
  const char *next_regexp;
  bool matched;

#ifdef DEBUG

/*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
#endif

  if (*regexp == 0)
    {
      /* When we reach the end of the regexp, the match is a success
       */
      return true;
    }

  /* This chunk of code makes sure that the regexp-tokenising happens
   * only once. We only tokenise as much as we need.
   */
  if (slot > re_token_depth)
    {
      re_token_depth = slot;
      if (re_token[slot] == NULL)
        re_token[slot] = (selection *) malloc (sizeof (selection));
      next_regexp = re_get_token (re_token[slot], regexp);
      if (next_regexp == NULL)
        {
          /* Syntax error, what else can we do? */
          return false;
        }
      re_substr[slot] = next_regexp;
    }
  else
    {
      next_regexp = re_substr[slot];
    }

  matched = re_match_token (*str, re_token[slot]);
  if (matched)
    ++matches;

  if (*str == 0)
    return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;

  switch (re_token[slot]->repeat)
    {
        case rep_once:
          if (matches == 1)
            {                   /* (matches == 1) => (matched == true) */
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_once_or_more:
          if (matched)
            {                   /* (matched == true) => (matches >= 1) */
              /* First check if the current token repeats more */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_null_or_once:
          /* We must go on to the next token, but should we advance str? */
          if (matches == 0)
            {
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          else if (matches == 1)
            {
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;         /* Not reached */
        case rep_null_or_more:
          if (matched)
            {
              /* Look for further repeats, advance str */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          return re_cmp_step (str, next_regexp, slot + 1, 0);
    }
  return false;
}

static void
re_init (void)
{
  int i;

  re_token[0] = (selection *) malloc (sizeof (selection));
  for (i = 1; i < RE_TOKEN_MAX; i++)
    re_token[i] = NULL;

  re_init_done = true;
}

static bool
re_match_token (unsigned char c, selection * sel)
{
  switch (sel->type)
    {
        case sel_any:
          return true;
        case sel_end:
          return (c == 0);
        case sel_single:
          return (tolower (c) == tolower (sel->u.single));
        case sel_range:
          return (c >= sel->u.range.low && c <= sel->u.range.high);
        case sel_array:
          return (sel->u.array[c]);
        case sel_not_single:
          return (tolower (c) != tolower (sel->u.single));
        case sel_not_range:
          return (c < sel->u.range.low && c > sel->u.range.high);
    }
  return false;
}

/* re_get_token - get regular expression token
 * Returns the first token found in <regexp> in <sel>
 * Return values: NULL  syntax error
 *                pointer to first character past token.
 */
static const char *
re_get_token (selection * sel, const char *regexp)
{

#ifdef SAFE_CHECKS
# define exit_if_null   if (*regexp == 0) return NULL
#else
# define exit_if_null
#endif

  bool quoted = false;
  unsigned char looking_at;

#ifdef SAFE_CHECKS
  if (sel == NULL || regexp == NULL || *regexp == 0)
    return NULL;
#endif

  do
    {
      looking_at = *regexp++;
      switch (looking_at)
        {
            case '$':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_end;
                }
              break;
            case '.':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_any;
                }
              break;
            case '[':
              /* The fun stuff... perhaps a little obfuscated since I
               * don't trust the compiler to analyse liveness.
               */
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  bool neg = false;
                  unsigned char first, last = 0;

                  exit_if_null;
                  looking_at = *regexp++;

                  if (looking_at == '^')
                    {
                      neg = true;
                      exit_if_null;
                      looking_at = *regexp++;
                    }
                  first = looking_at;
                  exit_if_null;
                  looking_at = *regexp++;
                  if (looking_at == ']')
                    {
                      /* On the form [q] or [^q] */
                      sel->type = neg ? sel_not_single : sel_single;
                      sel->u.single = first;
                      break;
                    }
                  else if (looking_at == '-')
                    {
                      exit_if_null;
                      last = *regexp++;
                      if (last == ']')
                        {
                          /* On the form [A-] or [^A-]. Checking for
                           * [,-] and making it a range is probably not
                           * worth it :-)
                           */
                          sel->type = sel_array;
                          memset (sel->u.array, neg, sizeof (sel->u.array));
                          sel->u.array[first] = sel->u.array['-'] = !neg;
                          break;
                        }
                      else
                        {
                          exit_if_null;
                          looking_at = *regexp++;
                          if (looking_at == ']')
                            {
                              /* On the form [A-G] or [^A-G]. Note that [G-A]
                               * is a syntax error. Fair enough, I think.
                               */
#ifdef SAFE_CHECK
                              if (first > last)
                                return NULL;
#endif
                              sel->type = neg ? sel_not_range : sel_range;
                              sel->u.range.low = first;
                              sel->u.range.high = last;
                              break;
                            }
                        }
                    }
                  {
                    /* The datastructure can only represent a RE this
                     * complex with an array.
                     */
                    int i;
                    unsigned char previous;

                    sel->type = sel_array;
                    memset (sel->u.array, neg, sizeof (sel->u.array));
                    if (last)
                      {
                        /* It starts with a range */
#ifdef SAFE_CHECK
                        if (first > last)
                          return NULL;
#endif
                        for (i = first; i <= last; i++)
                          {
                            sel->u.array[i] = !neg;
                          }
                      }
                    else
                      {
                        /* It begins with a "random" character */
                        sel->u.array[first] = !neg;
                      }
                    sel->u.array[looking_at] = !neg;

                    exit_if_null;
                    previous = looking_at;
                    looking_at = *regexp++;

                    /* Add more characters to the array until we reach
                     * ]. Quoting doesn't and shouldn't work in here.
                     * ("]" should be put first, and "-" last if they
                     * are needed inside this construct.)
                     * Look for ranges as we go along.
                     */
                    while (looking_at != ']')
                      {
                        if (looking_at == '-')
                          {
                            exit_if_null;
                            looking_at = *regexp++;
                            if (looking_at != ']')
                              {
#ifdef SAFE_CHECK
                                if (previous > looking_at)
                                  return NULL;
#endif
                                for (i = previous + 1; i < looking_at; i++)
                                  {
                                    /* previous has already been set and
                                     * looking_at is set below.
                                     */
                                    sel->u.array[i] = !neg;
                                  }
                                exit_if_null;
                              }
                            else
                              {
                                sel->u.array['-'] = !neg;
                                break;
                              }
                          }
                        sel->u.array[looking_at] = !neg;
                        previous = looking_at;
                        exit_if_null;
                        looking_at = *regexp++;
                      }
                  }
                }
              break;
            case '\\':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  quoted = true;
                }
              break;
            default:
              quoted = false;
              sel->type = sel_single;
              sel->u.single = looking_at;
              break;
        }
    }
  while (quoted);

  if (*regexp == '*')
    {
      sel->repeat = rep_null_or_more;
      ++regexp;
    }
  else if (*regexp == '?')
    {
      sel->repeat = rep_null_or_once;
      ++regexp;
    }
  else if (*regexp == '+')
    {
      sel->repeat = rep_once_or_more;
      ++regexp;
    }
  else
    {
      sel->repeat = rep_once;
    }

  return regexp;
}

/*   D e b u g   c o d e
 */
#ifdef DEBUG2                   /* compile all with DEBUG also ? hevi@lut.fi */
static void
re_dump_sel (selection * sel)
{
  switch (sel->type)
    {
        case sel_any:
          printf (".");
          break;
        case sel_end:
          printf ("$");
          break;
        case sel_single:
          printf ("<%c>", sel->u.single);
          break;
        case sel_range:
          printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
          break;
        case sel_array:
          {
            int i;

            printf ("[");
            for (i = 0; i < uchar_MAX; i++)
              {
                if (sel->u.array[i])
                  {
                    printf ("%c", i);
                  }
              }
            printf ("]");
          }
          break;
        case sel_not_single:
          printf ("[^%c]", sel->u.single);
          break;
        case sel_not_range:
          printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
          break;
        default:
          printf ("<UNKNOWN TOKEN!>");
          break;
    }
  switch (sel->repeat)
    {
        case rep_once:
          break;
        case rep_null_or_once:
          printf ("?");
          break;
        case rep_null_or_more:
          printf ("*");
          break;
        case rep_once_or_more:
          printf ("+");
          break;
        default:
          printf ("<UNKNOWN REP-TOKEN!>");
          break;
    }
}

int
main (int argc, char *argv[])
{
  char *re, *m;
  selection sel;

  re = re_get_token (&sel, argv[1]);

  printf ("'%s' -> '%s'\n", argv[1], re);
  re_dump_sel (&sel);
  printf ("\n");
  m = re_cmp (argv[2], argv[1]);
  if (m)
    printf ("MATCH! -> '%s'\n", m);
  return 0;
}
#endif
Revision:	1.6
Committed:	Sat Dec 9 17:28:37 2006 UTC (17 years, 6 months ago) by pippijn
Content type:	text/plain
Branch:	MAIN
CVS Tags:	HEAD
Changes since 1.5:	+0 -0 lines
State:	*FILE REMOVED*
Log Message:	removed regex comparison. this is now done with perl
#	Content
1	/* re-cmp.c
2	* Pattern match a string, parsing some of the common RE-metacharacters.
3	*
4	* This code is public domain, but I would appreciate to hear of
5	* improvements or even the fact that you use it in your program.
6	*
7	* Deliberate BUGS:
8	* - These tokens are not supported: \| ( )
9	* - You will get the longest expansion of the _first_ string which
10	* matches the RE, not the longest string which would be the proper
11	* behaviour for a RE-matcher.
12	*
13	* Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
14	*/
15
16	#include <stdio.h>
17	#include <stdlib.h>
18	#include <memory.h>
19	#include <limits.h>
20	#include <re-cmp.h>
21	#include <ctype.h>
22
23	/* Get prototype functions to prevent warnings. */
24	#if defined (__sun__) && defined(StupidSunHeaders)
25	# include <sys/types.h>
26	# include <sys/time.h>
27	# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
28	#endif
29
30
31	/* P r o t o t y p e s
32	*/
33	const char re_cmp (const char , const char *);
34	static bool re_cmp_step (const char , const char , unsigned, int);
35	static void re_init (void);
36	static bool re_match_token (unsigned char, selection *);
37	static const char re_get_token (selection , const char *);
38
39	#ifdef DEBUG2
40	static void re_dump_sel (selection *);
41	#endif
42
43	/* G l o b a l v a r i a b l e s
44	*/
45	static bool re_init_done = false;
46	static selection *re_token[RE_TOKEN_MAX];
47	static const char *re_substr[RE_TOKEN_MAX];
48	static unsigned int re_token_depth;
49
50	/* E x t e r n a l f u n c t i o n
51	*/
52
53	/* re-cmp - get regular expression match.
54	* Return values: NULL - no match or error in regexp.
55	* pointer to beginning of matching string
56	*/
57	const char *
58	re_cmp (const char str, const char regexp)
59	{
60	const char *next_regexp;
61	bool once = false;
62	bool matched;
63
64	if (re_init_done == false)
65	re_init ();
66
67	#ifdef SAFE_CHECKS
68	if (regexp == NULL \|\| str == NULL)
69	return NULL;
70	#endif
71	if (*regexp == '^')
72	{
73	once = true;
74	++regexp;
75	}
76	if (*regexp == 0)
77	{
78	/* // or /^/ matches any string */
79	return str;
80	}
81
82	next_regexp = re_get_token (re_token[0], regexp);
83	re_token_depth = 0;
84	re_substr[0] = next_regexp;
85
86	matched = false;
87	while (str != '\0' && !(matched = re_match_token (str, re_token[0])))
88	str++;
89
90	if (matched && *next_regexp == 0)
91	return str;
92
93	/* Apologies for the nearly duplicated code below, hopefully it
94	* speeds things up.
95	*/
96	if (once)
97	{
98	switch (re_token[0]->repeat)
99	{
100	case rep_once:
101	if (matched == false)
102	return NULL;
103	break;
104	case rep_once_or_more:
105	if (matched == false)
106	return NULL;
107
108	if (re_cmp_step (str + 1, regexp, 0, 1))
109	return str;
110	break;
111	case rep_null_or_once:
112	if (matched == false)
113	return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
114	break;
115	case rep_null_or_more:
116	if (matched)
117	{
118	if (re_cmp_step (str + 1, regexp, 0, 1))
119	return str;
120	}
121	else
122	{
123	return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
124	}
125	break;
126	}
127	return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
128	}
129
130	if (matched)
131	{
132	switch (re_token[0]->repeat)
133	{
134	case rep_once:
135	case rep_null_or_once:
136	break;
137	case rep_once_or_more:
138	case rep_null_or_more:
139	if (re_cmp_step (str + 1, regexp, 0, 1))
140	return str;
141	break;
142	}
143	/* The logic here is that re_match_token only sees
144	* if the one letter matches. Thus, if the
145	* regex is like '@match eureca', and the
146	* the user enters anything with an e, re_match_token
147	* returns true, but they really need to match the
148	* entire regexp, which re_cmp_step will do.
149	* However, what happens is that there can be a case
150	* where the string being match is something like
151	* 'where is eureca'. In this case, the re_match_token
152	* matches that first e, but the re_cmp_step below,
153	* fails because the next character (r) doesn't match
154	* the u. So we call re_cmp with the string
155	* after the first r, so that it should hopefully match
156	* up properly.
157	*/
158	if (re_cmp_step (str + 1, next_regexp, 1, 0))
159	return str;
160	else if (*(str + 1) != 0)
161	return re_cmp (str + 1, regexp);
162	}
163	return NULL;
164	}
165
166	/* A u x i l l i a r y f u n c t i o n s
167	*/
168
169	static bool
170	re_cmp_step (const char str, const char regexp, unsigned slot, int matches)
171	{
172	/* str - string to match
173	* regexp - pattern
174	* slot - number of the token which under consideration
175	* matches - how many times the token has matched
176	*/
177	const char *next_regexp;
178	bool matched;
179
180	#ifdef DEBUG
181
182	/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
183	#endif
184
185	if (*regexp == 0)
186	{
187	/* When we reach the end of the regexp, the match is a success
188	*/
189	return true;
190	}
191
192	/* This chunk of code makes sure that the regexp-tokenising happens
193	* only once. We only tokenise as much as we need.
194	*/
195	if (slot > re_token_depth)
196	{
197	re_token_depth = slot;
198	if (re_token[slot] == NULL)
199	re_token[slot] = (selection *) malloc (sizeof (selection));
200	next_regexp = re_get_token (re_token[slot], regexp);
201	if (next_regexp == NULL)
202	{
203	/* Syntax error, what else can we do? */
204	return false;
205	}
206	re_substr[slot] = next_regexp;
207	}
208	else
209	{
210	next_regexp = re_substr[slot];
211	}
212
213	matched = re_match_token (*str, re_token[slot]);
214	if (matched)
215	++matches;
216
217	if (*str == 0)
218	return (*next_regexp == 0 \|\| re_token[slot]->type == sel_end) && matched;
219
220	switch (re_token[slot]->repeat)
221	{
222	case rep_once:
223	if (matches == 1)
224	{ /* (matches == 1) => (matched == true) */
225	return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
226	}
227	return false;
228	case rep_once_or_more:
229	if (matched)
230	{ /* (matched == true) => (matches >= 1) */
231	/* First check if the current token repeats more */
232	if (re_cmp_step (str + 1, regexp, slot, matches))
233	return true;
234	return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
235	}
236	return false;
237	case rep_null_or_once:
238	/* We must go on to the next token, but should we advance str? */
239	if (matches == 0)
240	{
241	return re_cmp_step (str, next_regexp, slot + 1, 0);
242	}
243	else if (matches == 1)
244	{
245	return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
246	}
247	return false; /* Not reached */
248	case rep_null_or_more:
249	if (matched)
250	{
251	/* Look for further repeats, advance str */
252	if (re_cmp_step (str + 1, regexp, slot, matches))
253	return true;
254	return re_cmp_step (str, next_regexp, slot + 1, 0);
255	}
256	return re_cmp_step (str, next_regexp, slot + 1, 0);
257	}
258	return false;
259	}
260
261	static void
262	re_init (void)
263	{
264	int i;
265
266	re_token[0] = (selection *) malloc (sizeof (selection));
267	for (i = 1; i < RE_TOKEN_MAX; i++)
268	re_token[i] = NULL;
269
270	re_init_done = true;
271	}
272
273	static bool
274	re_match_token (unsigned char c, selection * sel)
275	{
276	switch (sel->type)
277	{
278	case sel_any:
279	return true;
280	case sel_end:
281	return (c == 0);
282	case sel_single:
283	return (tolower (c) == tolower (sel->u.single));
284	case sel_range:
285	return (c >= sel->u.range.low && c <= sel->u.range.high);
286	case sel_array:
287	return (sel->u.array[c]);
288	case sel_not_single:
289	return (tolower (c) != tolower (sel->u.single));
290	case sel_not_range:
291	return (c < sel->u.range.low && c > sel->u.range.high);
292	}
293	return false;
294	}
295
296	/* re_get_token - get regular expression token
297	* Returns the first token found in <regexp> in <sel>
298	* Return values: NULL syntax error
299	* pointer to first character past token.
300	*/
301	static const char *
302	re_get_token (selection * sel, const char *regexp)
303	{
304
305	#ifdef SAFE_CHECKS
306	# define exit_if_null if (*regexp == 0) return NULL
307	#else
308	# define exit_if_null
309	#endif
310
311	bool quoted = false;
312	unsigned char looking_at;
313
314	#ifdef SAFE_CHECKS
315	if (sel == NULL \|\| regexp == NULL \|\| *regexp == 0)
316	return NULL;
317	#endif
318
319	do
320	{
321	looking_at = *regexp++;
322	switch (looking_at)
323	{
324	case '$':
325	if (quoted)
326	{
327	quoted = false;
328	sel->type = sel_single;
329	sel->u.single = looking_at;
330	}
331	else
332	{
333	sel->type = sel_end;
334	}
335	break;
336	case '.':
337	if (quoted)
338	{
339	quoted = false;
340	sel->type = sel_single;
341	sel->u.single = looking_at;
342	}
343	else
344	{
345	sel->type = sel_any;
346	}
347	break;
348	case '[':
349	/* The fun stuff... perhaps a little obfuscated since I
350	* don't trust the compiler to analyse liveness.
351	*/
352	if (quoted)
353	{
354	quoted = false;
355	sel->type = sel_single;
356	sel->u.single = looking_at;
357	}
358	else
359	{
360	bool neg = false;
361	unsigned char first, last = 0;
362
363	exit_if_null;
364	looking_at = *regexp++;
365
366	if (looking_at == '^')
367	{
368	neg = true;
369	exit_if_null;
370	looking_at = *regexp++;
371	}
372	first = looking_at;
373	exit_if_null;
374	looking_at = *regexp++;
375	if (looking_at == ']')
376	{
377	/* On the form [q] or [^q] */
378	sel->type = neg ? sel_not_single : sel_single;
379	sel->u.single = first;
380	break;
381	}
382	else if (looking_at == '-')
383	{
384	exit_if_null;
385	last = *regexp++;
386	if (last == ']')
387	{
388	/* On the form [A-] or [^A-]. Checking for
389	* [,-] and making it a range is probably not
390	* worth it :-)
391	*/
392	sel->type = sel_array;
393	memset (sel->u.array, neg, sizeof (sel->u.array));
394	sel->u.array[first] = sel->u.array['-'] = !neg;
395	break;
396	}
397	else
398	{
399	exit_if_null;
400	looking_at = *regexp++;
401	if (looking_at == ']')
402	{
403	/* On the form [A-G] or [^A-G]. Note that [G-A]
404	* is a syntax error. Fair enough, I think.
405	*/
406	#ifdef SAFE_CHECK
407	if (first > last)
408	return NULL;
409	#endif
410	sel->type = neg ? sel_not_range : sel_range;
411	sel->u.range.low = first;
412	sel->u.range.high = last;
413	break;
414	}
415	}
416	}
417	{
418	/* The datastructure can only represent a RE this
419	* complex with an array.
420	*/
421	int i;
422	unsigned char previous;
423
424	sel->type = sel_array;
425	memset (sel->u.array, neg, sizeof (sel->u.array));
426	if (last)
427	{
428	/* It starts with a range */
429	#ifdef SAFE_CHECK
430	if (first > last)
431	return NULL;
432	#endif
433	for (i = first; i <= last; i++)
434	{
435	sel->u.array[i] = !neg;
436	}
437	}
438	else
439	{
440	/* It begins with a "random" character */
441	sel->u.array[first] = !neg;
442	}
443	sel->u.array[looking_at] = !neg;
444
445	exit_if_null;
446	previous = looking_at;
447	looking_at = *regexp++;
448
449	/* Add more characters to the array until we reach
450	* ]. Quoting doesn't and shouldn't work in here.
451	* ("]" should be put first, and "-" last if they
452	* are needed inside this construct.)
453	* Look for ranges as we go along.
454	*/
455	while (looking_at != ']')
456	{
457	if (looking_at == '-')
458	{
459	exit_if_null;
460	looking_at = *regexp++;
461	if (looking_at != ']')
462	{
463	#ifdef SAFE_CHECK
464	if (previous > looking_at)
465	return NULL;
466	#endif
467	for (i = previous + 1; i < looking_at; i++)
468	{
469	/* previous has already been set and
470	* looking_at is set below.
471	*/
472	sel->u.array[i] = !neg;
473	}
474	exit_if_null;
475	}
476	else
477	{
478	sel->u.array['-'] = !neg;
479	break;
480	}
481	}
482	sel->u.array[looking_at] = !neg;
483	previous = looking_at;
484	exit_if_null;
485	looking_at = *regexp++;
486	}
487	}
488	}
489	break;
490	case '\\':
491	if (quoted)
492	{
493	quoted = false;
494	sel->type = sel_single;
495	sel->u.single = looking_at;
496	}
497	else
498	{
499	quoted = true;
500	}
501	break;
502	default:
503	quoted = false;
504	sel->type = sel_single;
505	sel->u.single = looking_at;
506	break;
507	}
508	}
509	while (quoted);
510
511	if (regexp == '')
512	{
513	sel->repeat = rep_null_or_more;
514	++regexp;
515	}
516	else if (*regexp == '?')
517	{
518	sel->repeat = rep_null_or_once;
519	++regexp;
520	}
521	else if (*regexp == '+')
522	{
523	sel->repeat = rep_once_or_more;
524	++regexp;
525	}
526	else
527	{
528	sel->repeat = rep_once;
529	}
530
531	return regexp;
532	}
533
534	/* D e b u g c o d e
535	*/
536	#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
537	static void
538	re_dump_sel (selection * sel)
539	{
540	switch (sel->type)
541	{
542	case sel_any:
543	printf (".");
544	break;
545	case sel_end:
546	printf ("$");
547	break;
548	case sel_single:
549	printf ("<%c>", sel->u.single);
550	break;
551	case sel_range:
552	printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
553	break;
554	case sel_array:
555	{
556	int i;
557
558	printf ("[");
559	for (i = 0; i < uchar_MAX; i++)
560	{
561	if (sel->u.array[i])
562	{
563	printf ("%c", i);
564	}
565	}
566	printf ("]");
567	}
568	break;
569	case sel_not_single:
570	printf ("[^%c]", sel->u.single);
571	break;
572	case sel_not_range:
573	printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
574	break;
575	default:
576	printf ("<UNKNOWN TOKEN!>");
577	break;
578	}
579	switch (sel->repeat)
580	{
581	case rep_once:
582	break;
583	case rep_null_or_once:
584	printf ("?");
585	break;
586	case rep_null_or_more:
587	printf ("*");
588	break;
589	case rep_once_or_more:
590	printf ("+");
591	break;
592	default:
593	printf ("<UNKNOWN REP-TOKEN!>");
594	break;
595	}
596	}
597
598	int
599	main (int argc, char *argv[])
600	{
601	char re, m;
602	selection sel;
603
604	re = re_get_token (&sel, argv[1]);
605
606	printf ("'%s' -> '%s'\n", argv[1], re);
607	re_dump_sel (&sel);
608	printf ("\n");
609	m = re_cmp (argv[2], argv[1]);
610	if (m)
611	printf ("MATCH! -> '%s'\n", m);
612	return 0;
613	}
614	#endif