server/common/re-cmp.C


/*
 * static char *rcsid_player_c =
 *   "$Id: re-cmp.C,v 1.3 2006-09-08 04:51:08 pippijn Exp $";
 */


/* re-cmp.c
 * Pattern match a string, parsing some of the common RE-metacharacters.
 *
 * This code is public domain, but I would appreciate to hear of
 * improvements or even the fact that you use it in your program.
 *
 * Deliberate BUGS:
 *    - These tokens are not supported: | ( )
 *    - You will get the longest expansion of the _first_ string which
 *      matches the RE, not the longest string which would be the proper
 *      behaviour for a RE-matcher.
 *
 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <limits.h>
#include <re-cmp.h>
#include <ctype.h>

/* Get prototype functions to prevent warnings. */
#if defined (__sun__) && defined(StupidSunHeaders)
# include <sys/types.h>
# include <sys/time.h>
# include "sunos.h"             /* Prototypes for standard libraries, sunos lack those */
#endif


/*   P r o t o t y p e s
 */
const char *re_cmp (const char *, const char *);
static bool re_cmp_step (const char *, const char *, unsigned, int);
static void re_init (void);
static bool re_match_token (unsigned char, selection *);
static const char *re_get_token (selection *, const char *);

#ifdef DEBUG2
static void re_dump_sel (selection *);
#endif

/*   G l o b a l   v a r i a b l e s
 */
static bool re_init_done = false;
static selection *re_token[RE_TOKEN_MAX];
static const char *re_substr[RE_TOKEN_MAX];
static unsigned int re_token_depth;

/*   E x t e r n a l   f u n c t i o n
 */

/* re-cmp - get regular expression match.
 * Return values: NULL - no match or error in regexp.
 *                pointer to beginning of matching string
 */
const char *
re_cmp (const char *str, const char *regexp)
{
  const char *next_regexp;
  bool once = false;
  bool matched;

  if (re_init_done == false)
    re_init ();

#ifdef SAFE_CHECKS
  if (regexp == NULL || str == NULL)
    return NULL;
#endif
  if (*regexp == '^')
    {
      once = true;
      ++regexp;
    }
  if (*regexp == 0)
    {
      /* // or /^/ matches any string */
      return str;
    }

  next_regexp = re_get_token (re_token[0], regexp);
  re_token_depth = 0;
  re_substr[0] = next_regexp;

  matched = false;
  while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
    str++;

  if (matched && *next_regexp == 0)
    return str;

  /* Apologies for the nearly duplicated code below, hopefully it
   * speeds things up.
   */
  if (once)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
              if (matched == false)
                return NULL;
              break;
            case rep_once_or_more:
              if (matched == false)
                return NULL;

              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
            case rep_null_or_once:
              if (matched == false)
                return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
              break;
            case rep_null_or_more:
              if (matched)
                {
                  if (re_cmp_step (str + 1, regexp, 0, 1))
                    return str;
                }
              else
                {
                  return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
                }
              break;
        }
      return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
    }

  if (matched)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
            case rep_null_or_once:
              break;
            case rep_once_or_more:
            case rep_null_or_more:
              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
        }
      /* The logic here is that re_match_token only sees
       * if the one letter matches.  Thus, if the
       * regex is like '@match eureca', and the
       * the user enters anything with an e, re_match_token
       * returns true, but they really need to match the
       * entire regexp, which re_cmp_step will do.
       * However, what happens is that there can be a case
       * where the string being match is something like
       * 'where is eureca'.  In this case, the re_match_token
       * matches that first e, but the re_cmp_step below,
       * fails because the next character (r) doesn't match
       * the u.  So we call re_cmp with the string
       * after the first r, so that it should hopefully match
       * up properly.
       */
      if (re_cmp_step (str + 1, next_regexp, 1, 0))
        return str;
      else if (*(str + 1) != 0)
        return re_cmp (str + 1, regexp);
    }
  return NULL;
}

/*   A u x i l l i a r y   f u n c t i o n s
 */

static bool
re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
{
  /* str      - string to match
   * regexp   - pattern
   * slot     - number of the token which under consideration
   * matches  - how many times the token has matched
   */
  const char *next_regexp;
  bool matched;

#ifdef DEBUG

/*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
#endif

  if (*regexp == 0)
    {
      /* When we reach the end of the regexp, the match is a success
       */
      return true;
    }

  /* This chunk of code makes sure that the regexp-tokenising happens
   * only once. We only tokenise as much as we need.
   */
  if (slot > re_token_depth)
    {
      re_token_depth = slot;
      if (re_token[slot] == NULL)
        re_token[slot] = (selection *) malloc (sizeof (selection));
      next_regexp = re_get_token (re_token[slot], regexp);
      if (next_regexp == NULL)
        {
          /* Syntax error, what else can we do? */
          return false;
        }
      re_substr[slot] = next_regexp;
    }
  else
    {
      next_regexp = re_substr[slot];
    }

  matched = re_match_token (*str, re_token[slot]);
  if (matched)
    ++matches;

  if (*str == 0)
    return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;

  switch (re_token[slot]->repeat)
    {
        case rep_once:
          if (matches == 1)
            {                   /* (matches == 1) => (matched == true) */
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_once_or_more:
          if (matched)
            {                   /* (matched == true) => (matches >= 1) */
              /* First check if the current token repeats more */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_null_or_once:
          /* We must go on to the next token, but should we advance str? */
          if (matches == 0)
            {
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          else if (matches == 1)
            {
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;         /* Not reached */
        case rep_null_or_more:
          if (matched)
            {
              /* Look for further repeats, advance str */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          return re_cmp_step (str, next_regexp, slot + 1, 0);
    }
  return false;
}

static void
re_init (void)
{
  int i;

  re_token[0] = (selection *) malloc (sizeof (selection));
  for (i = 1; i < RE_TOKEN_MAX; i++)
    re_token[i] = NULL;

  re_init_done = true;
}

static bool
re_match_token (unsigned char c, selection * sel)
{
  switch (sel->type)
    {
        case sel_any:
          return true;
        case sel_end:
          return (c == 0);
        case sel_single:
          return (tolower (c) == tolower (sel->u.single));
        case sel_range:
          return (c >= sel->u.range.low && c <= sel->u.range.high);
        case sel_array:
          return (sel->u.array[c]);
        case sel_not_single:
          return (tolower (c) != tolower (sel->u.single));
        case sel_not_range:
          return (c < sel->u.range.low && c > sel->u.range.high);
    }
  return false;
}

/* re_get_token - get regular expression token
 * Returns the first token found in <regexp> in <sel>
 * Return values: NULL  syntax error
 *                pointer to first character past token.
 */
static const char *
re_get_token (selection * sel, const char *regexp)
{

#ifdef SAFE_CHECKS
# define exit_if_null   if (*regexp == 0) return NULL
#else
# define exit_if_null
#endif

  bool quoted = false;
  unsigned char looking_at;

#ifdef SAFE_CHECKS
  if (sel == NULL || regexp == NULL || *regexp == 0)
    return NULL;
#endif

  do
    {
      looking_at = *regexp++;
      switch (looking_at)
        {
            case '$':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_end;
                }
              break;
            case '.':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_any;
                }
              break;
            case '[':
              /* The fun stuff... perhaps a little obfuscated since I
               * don't trust the compiler to analyse liveness.
               */
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  bool neg = false;
                  unsigned char first, last = 0;

                  exit_if_null;
                  looking_at = *regexp++;

                  if (looking_at == '^')
                    {
                      neg = true;
                      exit_if_null;
                      looking_at = *regexp++;
                    }
                  first = looking_at;
                  exit_if_null;
                  looking_at = *regexp++;
                  if (looking_at == ']')
                    {
                      /* On the form [q] or [^q] */
                      sel->type = neg ? sel_not_single : sel_single;
                      sel->u.single = first;
                      break;
                    }
                  else if (looking_at == '-')
                    {
                      exit_if_null;
                      last = *regexp++;
                      if (last == ']')
                        {
                          /* On the form [A-] or [^A-]. Checking for
                           * [,-] and making it a range is probably not
                           * worth it :-)
                           */
                          sel->type = sel_array;
                          memset (sel->u.array, neg, sizeof (sel->u.array));
                          sel->u.array[first] = sel->u.array['-'] = !neg;
                          break;
                        }
                      else
                        {
                          exit_if_null;
                          looking_at = *regexp++;
                          if (looking_at == ']')
                            {
                              /* On the form [A-G] or [^A-G]. Note that [G-A]
                               * is a syntax error. Fair enough, I think.
                               */
#ifdef SAFE_CHECK
                              if (first > last)
                                return NULL;
#endif
                              sel->type = neg ? sel_not_range : sel_range;
                              sel->u.range.low = first;
                              sel->u.range.high = last;
                              break;
                            }
                        }
                    }
                  {
                    /* The datastructure can only represent a RE this
                     * complex with an array.
                     */
                    int i;
                    unsigned char previous;

                    sel->type = sel_array;
                    memset (sel->u.array, neg, sizeof (sel->u.array));
                    if (last)
                      {
                        /* It starts with a range */
#ifdef SAFE_CHECK
                        if (first > last)
                          return NULL;
#endif
                        for (i = first; i <= last; i++)
                          {
                            sel->u.array[i] = !neg;
                          }
                      }
                    else
                      {
                        /* It begins with a "random" character */
                        sel->u.array[first] = !neg;
                      }
                    sel->u.array[looking_at] = !neg;

                    exit_if_null;
                    previous = looking_at;
                    looking_at = *regexp++;

                    /* Add more characters to the array until we reach
                     * ]. Quoting doesn't and shouldn't work in here.
                     * ("]" should be put first, and "-" last if they
                     * are needed inside this construct.)
                     * Look for ranges as we go along.
                     */
                    while (looking_at != ']')
                      {
                        if (looking_at == '-')
                          {
                            exit_if_null;
                            looking_at = *regexp++;
                            if (looking_at != ']')
                              {
#ifdef SAFE_CHECK
                                if (previous > looking_at)
                                  return NULL;
#endif
                                for (i = previous + 1; i < looking_at; i++)
                                  {
                                    /* previous has already been set and
                                     * looking_at is set below.
                                     */
                                    sel->u.array[i] = !neg;
                                  }
                                exit_if_null;
                              }
                            else
                              {
                                sel->u.array['-'] = !neg;
                                break;
                              }
                          }
                        sel->u.array[looking_at] = !neg;
                        previous = looking_at;
                        exit_if_null;
                        looking_at = *regexp++;
                      }
                  }
                }
              break;
            case '\\':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  quoted = true;
                }
              break;
            default:
              quoted = false;
              sel->type = sel_single;
              sel->u.single = looking_at;
              break;
        }
    }
  while (quoted);

  if (*regexp == '*')
    {
      sel->repeat = rep_null_or_more;
      ++regexp;
    }
  else if (*regexp == '?')
    {
      sel->repeat = rep_null_or_once;
      ++regexp;
    }
  else if (*regexp == '+')
    {
      sel->repeat = rep_once_or_more;
      ++regexp;
    }
  else
    {
      sel->repeat = rep_once;
    }

  return regexp;
}

/*   D e b u g   c o d e
 */
#ifdef DEBUG2                   /* compile all with DEBUG also ? hevi@lut.fi */
static void
re_dump_sel (selection * sel)
{
  switch (sel->type)
    {
        case sel_any:
          printf (".");
          break;
        case sel_end:
          printf ("$");
          break;
        case sel_single:
          printf ("<%c>", sel->u.single);
          break;
        case sel_range:
          printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
          break;
        case sel_array:
          {
            int i;

            printf ("[");
            for (i = 0; i < uchar_MAX; i++)
              {
                if (sel->u.array[i])
                  {
                    printf ("%c", i);
                  }
              }
            printf ("]");
          }
          break;
        case sel_not_single:
          printf ("[^%c]", sel->u.single);
          break;
        case sel_not_range:
          printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
          break;
        default:
          printf ("<UNKNOWN TOKEN!>");
          break;
    }
  switch (sel->repeat)
    {
        case rep_once:
          break;
        case rep_null_or_once:
          printf ("?");
          break;
        case rep_null_or_more:
          printf ("*");
          break;
        case rep_once_or_more:
          printf ("+");
          break;
        default:
          printf ("<UNKNOWN REP-TOKEN!>");
          break;
    }
}

int
main (int argc, char *argv[])
{
  char *re, *m;
  selection sel;

  re = re_get_token (&sel, argv[1]);

  printf ("'%s' -> '%s'\n", argv[1], re);
  re_dump_sel (&sel);
  printf ("\n");
  m = re_cmp (argv[2], argv[1]);
  if (m)
    printf ("MATCH! -> '%s'\n", m);
  return 0;
}
#endif
Revision:	1.4
Committed:	Sun Sep 10 16:00:23 2006 UTC (17 years, 8 months ago) by root
Content type:	text/plain
Branch:	MAIN
Changes since 1.3:	+445 -368 lines
Log Message:	indent
#	Content
1
2	/*
3	* static char *rcsid_player_c =
4	* "$Id: re-cmp.C,v 1.3 2006-09-08 04:51:08 pippijn Exp $";
5	*/
6
7
8	/* re-cmp.c
9	* Pattern match a string, parsing some of the common RE-metacharacters.
10	*
11	* This code is public domain, but I would appreciate to hear of
12	* improvements or even the fact that you use it in your program.
13	*
14	* Deliberate BUGS:
15	* - These tokens are not supported: \| ( )
16	* - You will get the longest expansion of the _first_ string which
17	* matches the RE, not the longest string which would be the proper
18	* behaviour for a RE-matcher.
19	*
20	* Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
21	*/
22
23	#include <stdio.h>
24	#include <stdlib.h>
25	#include <memory.h>
26	#include <limits.h>
27	#include <re-cmp.h>
28	#include <ctype.h>
29
30	/* Get prototype functions to prevent warnings. */
31	#if defined (__sun__) && defined(StupidSunHeaders)
32	# include <sys/types.h>
33	# include <sys/time.h>
34	# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
35	#endif
36
37
38	/* P r o t o t y p e s
39	*/
40	const char re_cmp (const char , const char *);
41	static bool re_cmp_step (const char , const char , unsigned, int);
42	static void re_init (void);
43	static bool re_match_token (unsigned char, selection *);
44	static const char re_get_token (selection , const char *);
45
46	#ifdef DEBUG2
47	static void re_dump_sel (selection *);
48	#endif
49
50	/* G l o b a l v a r i a b l e s
51	*/
52	static bool re_init_done = false;
53	static selection *re_token[RE_TOKEN_MAX];
54	static const char *re_substr[RE_TOKEN_MAX];
55	static unsigned int re_token_depth;
56
57	/* E x t e r n a l f u n c t i o n
58	*/
59
60	/* re-cmp - get regular expression match.
61	* Return values: NULL - no match or error in regexp.
62	* pointer to beginning of matching string
63	*/
64	const char *
65	re_cmp (const char str, const char regexp)
66	{
67	const char *next_regexp;
68	bool once = false;
69	bool matched;
70
71	if (re_init_done == false)
72	re_init ();
73
74	#ifdef SAFE_CHECKS
75	if (regexp == NULL \|\| str == NULL)
76	return NULL;
77	#endif
78	if (*regexp == '^')
79	{
80	once = true;
81	++regexp;
82	}
83	if (*regexp == 0)
84	{
85	/* // or /^/ matches any string */
86	return str;
87	}
88
89	next_regexp = re_get_token (re_token[0], regexp);
90	re_token_depth = 0;
91	re_substr[0] = next_regexp;
92
93	matched = false;
94	while (str != '\0' && !(matched = re_match_token (str, re_token[0])))
95	str++;
96
97	if (matched && *next_regexp == 0)
98	return str;
99
100	/* Apologies for the nearly duplicated code below, hopefully it
101	* speeds things up.
102	*/
103	if (once)
104	{
105	switch (re_token[0]->repeat)
106	{
107	case rep_once:
108	if (matched == false)
109	return NULL;
110	break;
111	case rep_once_or_more:
112	if (matched == false)
113	return NULL;
114
115	if (re_cmp_step (str + 1, regexp, 0, 1))
116	return str;
117	break;
118	case rep_null_or_once:
119	if (matched == false)
120	return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
121	break;
122	case rep_null_or_more:
123	if (matched)
124	{
125	if (re_cmp_step (str + 1, regexp, 0, 1))
126	return str;
127	}
128	else
129	{
130	return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
131	}
132	break;
133	}
134	return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
135	}
136
137	if (matched)
138	{
139	switch (re_token[0]->repeat)
140	{
141	case rep_once:
142	case rep_null_or_once:
143	break;
144	case rep_once_or_more:
145	case rep_null_or_more:
146	if (re_cmp_step (str + 1, regexp, 0, 1))
147	return str;
148	break;
149	}
150	/* The logic here is that re_match_token only sees
151	* if the one letter matches. Thus, if the
152	* regex is like '@match eureca', and the
153	* the user enters anything with an e, re_match_token
154	* returns true, but they really need to match the
155	* entire regexp, which re_cmp_step will do.
156	* However, what happens is that there can be a case
157	* where the string being match is something like
158	* 'where is eureca'. In this case, the re_match_token
159	* matches that first e, but the re_cmp_step below,
160	* fails because the next character (r) doesn't match
161	* the u. So we call re_cmp with the string
162	* after the first r, so that it should hopefully match
163	* up properly.
164	*/
165	if (re_cmp_step (str + 1, next_regexp, 1, 0))
166	return str;
167	else if (*(str + 1) != 0)
168	return re_cmp (str + 1, regexp);
169	}
170	return NULL;
171	}
172
173	/* A u x i l l i a r y f u n c t i o n s
174	*/
175
176	static bool
177	re_cmp_step (const char str, const char regexp, unsigned slot, int matches)
178	{
179	/* str - string to match
180	* regexp - pattern
181	* slot - number of the token which under consideration
182	* matches - how many times the token has matched
183	*/
184	const char *next_regexp;
185	bool matched;
186
187	#ifdef DEBUG
188
189	/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
190	#endif
191
192	if (*regexp == 0)
193	{
194	/* When we reach the end of the regexp, the match is a success
195	*/
196	return true;
197	}
198
199	/* This chunk of code makes sure that the regexp-tokenising happens
200	* only once. We only tokenise as much as we need.
201	*/
202	if (slot > re_token_depth)
203	{
204	re_token_depth = slot;
205	if (re_token[slot] == NULL)
206	re_token[slot] = (selection *) malloc (sizeof (selection));
207	next_regexp = re_get_token (re_token[slot], regexp);
208	if (next_regexp == NULL)
209	{
210	/* Syntax error, what else can we do? */
211	return false;
212	}
213	re_substr[slot] = next_regexp;
214	}
215	else
216	{
217	next_regexp = re_substr[slot];
218	}
219
220	matched = re_match_token (*str, re_token[slot]);
221	if (matched)
222	++matches;
223
224	if (*str == 0)
225	return (*next_regexp == 0 \|\| re_token[slot]->type == sel_end) && matched;
226
227	switch (re_token[slot]->repeat)
228	{
229	case rep_once:
230	if (matches == 1)
231	{ /* (matches == 1) => (matched == true) */
232	return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
233	}
234	return false;
235	case rep_once_or_more:
236	if (matched)
237	{ /* (matched == true) => (matches >= 1) */
238	/* First check if the current token repeats more */
239	if (re_cmp_step (str + 1, regexp, slot, matches))
240	return true;
241	return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
242	}
243	return false;
244	case rep_null_or_once:
245	/* We must go on to the next token, but should we advance str? */
246	if (matches == 0)
247	{
248	return re_cmp_step (str, next_regexp, slot + 1, 0);
249	}
250	else if (matches == 1)
251	{
252	return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
253	}
254	return false; /* Not reached */
255	case rep_null_or_more:
256	if (matched)
257	{
258	/* Look for further repeats, advance str */
259	if (re_cmp_step (str + 1, regexp, slot, matches))
260	return true;
261	return re_cmp_step (str, next_regexp, slot + 1, 0);
262	}
263	return re_cmp_step (str, next_regexp, slot + 1, 0);
264	}
265	return false;
266	}
267
268	static void
269	re_init (void)
270	{
271	int i;
272
273	re_token[0] = (selection *) malloc (sizeof (selection));
274	for (i = 1; i < RE_TOKEN_MAX; i++)
275	re_token[i] = NULL;
276
277	re_init_done = true;
278	}
279
280	static bool
281	re_match_token (unsigned char c, selection * sel)
282	{
283	switch (sel->type)
284	{
285	case sel_any:
286	return true;
287	case sel_end:
288	return (c == 0);
289	case sel_single:
290	return (tolower (c) == tolower (sel->u.single));
291	case sel_range:
292	return (c >= sel->u.range.low && c <= sel->u.range.high);
293	case sel_array:
294	return (sel->u.array[c]);
295	case sel_not_single:
296	return (tolower (c) != tolower (sel->u.single));
297	case sel_not_range:
298	return (c < sel->u.range.low && c > sel->u.range.high);
299	}
300	return false;
301	}
302
303	/* re_get_token - get regular expression token
304	* Returns the first token found in <regexp> in <sel>
305	* Return values: NULL syntax error
306	* pointer to first character past token.
307	*/
308	static const char *
309	re_get_token (selection * sel, const char *regexp)
310	{
311
312	#ifdef SAFE_CHECKS
313	# define exit_if_null if (*regexp == 0) return NULL
314	#else
315	# define exit_if_null
316	#endif
317
318	bool quoted = false;
319	unsigned char looking_at;
320
321	#ifdef SAFE_CHECKS
322	if (sel == NULL \|\| regexp == NULL \|\| *regexp == 0)
323	return NULL;
324	#endif
325
326	do
327	{
328	looking_at = *regexp++;
329	switch (looking_at)
330	{
331	case '$':
332	if (quoted)
333	{
334	quoted = false;
335	sel->type = sel_single;
336	sel->u.single = looking_at;
337	}
338	else
339	{
340	sel->type = sel_end;
341	}
342	break;
343	case '.':
344	if (quoted)
345	{
346	quoted = false;
347	sel->type = sel_single;
348	sel->u.single = looking_at;
349	}
350	else
351	{
352	sel->type = sel_any;
353	}
354	break;
355	case '[':
356	/* The fun stuff... perhaps a little obfuscated since I
357	* don't trust the compiler to analyse liveness.
358	*/
359	if (quoted)
360	{
361	quoted = false;
362	sel->type = sel_single;
363	sel->u.single = looking_at;
364	}
365	else
366	{
367	bool neg = false;
368	unsigned char first, last = 0;
369
370	exit_if_null;
371	looking_at = *regexp++;
372
373	if (looking_at == '^')
374	{
375	neg = true;
376	exit_if_null;
377	looking_at = *regexp++;
378	}
379	first = looking_at;
380	exit_if_null;
381	looking_at = *regexp++;
382	if (looking_at == ']')
383	{
384	/* On the form [q] or [^q] */
385	sel->type = neg ? sel_not_single : sel_single;
386	sel->u.single = first;
387	break;
388	}
389	else if (looking_at == '-')
390	{
391	exit_if_null;
392	last = *regexp++;
393	if (last == ']')
394	{
395	/* On the form [A-] or [^A-]. Checking for
396	* [,-] and making it a range is probably not
397	* worth it :-)
398	*/
399	sel->type = sel_array;
400	memset (sel->u.array, neg, sizeof (sel->u.array));
401	sel->u.array[first] = sel->u.array['-'] = !neg;
402	break;
403	}
404	else
405	{
406	exit_if_null;
407	looking_at = *regexp++;
408	if (looking_at == ']')
409	{
410	/* On the form [A-G] or [^A-G]. Note that [G-A]
411	* is a syntax error. Fair enough, I think.
412	*/
413	#ifdef SAFE_CHECK
414	if (first > last)
415	return NULL;
416	#endif
417	sel->type = neg ? sel_not_range : sel_range;
418	sel->u.range.low = first;
419	sel->u.range.high = last;
420	break;
421	}
422	}
423	}
424	{
425	/* The datastructure can only represent a RE this
426	* complex with an array.
427	*/
428	int i;
429	unsigned char previous;
430
431	sel->type = sel_array;
432	memset (sel->u.array, neg, sizeof (sel->u.array));
433	if (last)
434	{
435	/* It starts with a range */
436	#ifdef SAFE_CHECK
437	if (first > last)
438	return NULL;
439	#endif
440	for (i = first; i <= last; i++)
441	{
442	sel->u.array[i] = !neg;
443	}
444	}
445	else
446	{
447	/* It begins with a "random" character */
448	sel->u.array[first] = !neg;
449	}
450	sel->u.array[looking_at] = !neg;
451
452	exit_if_null;
453	previous = looking_at;
454	looking_at = *regexp++;
455
456	/* Add more characters to the array until we reach
457	* ]. Quoting doesn't and shouldn't work in here.
458	* ("]" should be put first, and "-" last if they
459	* are needed inside this construct.)
460	* Look for ranges as we go along.
461	*/
462	while (looking_at != ']')
463	{
464	if (looking_at == '-')
465	{
466	exit_if_null;
467	looking_at = *regexp++;
468	if (looking_at != ']')
469	{
470	#ifdef SAFE_CHECK
471	if (previous > looking_at)
472	return NULL;
473	#endif
474	for (i = previous + 1; i < looking_at; i++)
475	{
476	/* previous has already been set and
477	* looking_at is set below.
478	*/
479	sel->u.array[i] = !neg;
480	}
481	exit_if_null;
482	}
483	else
484	{
485	sel->u.array['-'] = !neg;
486	break;
487	}
488	}
489	sel->u.array[looking_at] = !neg;
490	previous = looking_at;
491	exit_if_null;
492	looking_at = *regexp++;
493	}
494	}
495	}
496	break;
497	case '\\':
498	if (quoted)
499	{
500	quoted = false;
501	sel->type = sel_single;
502	sel->u.single = looking_at;
503	}
504	else
505	{
506	quoted = true;
507	}
508	break;
509	default:
510	quoted = false;
511	sel->type = sel_single;
512	sel->u.single = looking_at;
513	break;
514	}
515	}
516	while (quoted);
517
518	if (regexp == '')
519	{
520	sel->repeat = rep_null_or_more;
521	++regexp;
522	}
523	else if (*regexp == '?')
524	{
525	sel->repeat = rep_null_or_once;
526	++regexp;
527	}
528	else if (*regexp == '+')
529	{
530	sel->repeat = rep_once_or_more;
531	++regexp;
532	}
533	else
534	{
535	sel->repeat = rep_once;
536	}
537
538	return regexp;
539	}
540
541	/* D e b u g c o d e
542	*/
543	#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
544	static void
545	re_dump_sel (selection * sel)
546	{
547	switch (sel->type)
548	{
549	case sel_any:
550	printf (".");
551	break;
552	case sel_end:
553	printf ("$");
554	break;
555	case sel_single:
556	printf ("<%c>", sel->u.single);
557	break;
558	case sel_range:
559	printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
560	break;
561	case sel_array:
562	{
563	int i;
564
565	printf ("[");
566	for (i = 0; i < uchar_MAX; i++)
567	{
568	if (sel->u.array[i])
569	{
570	printf ("%c", i);
571	}
572	}
573	printf ("]");
574	}
575	break;
576	case sel_not_single:
577	printf ("[^%c]", sel->u.single);
578	break;
579	case sel_not_range:
580	printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
581	break;
582	default:
583	printf ("<UNKNOWN TOKEN!>");
584	break;
585	}
586	switch (sel->repeat)
587	{
588	case rep_once:
589	break;
590	case rep_null_or_once:
591	printf ("?");
592	break;
593	case rep_null_or_more:
594	printf ("*");
595	break;
596	case rep_once_or_more:
597	printf ("+");
598	break;
599	default:
600	printf ("<UNKNOWN REP-TOKEN!>");
601	break;
602	}
603	}
604
605	int
606	main (int argc, char *argv[])
607	{
608	char re, m;
609	selection sel;
610
611	re = re_get_token (&sel, argv[1]);
612
613	printf ("'%s' -> '%s'\n", argv[1], re);
614	re_dump_sel (&sel);
615	printf ("\n");
616	m = re_cmp (argv[2], argv[1]);
617	if (m)
618	printf ("MATCH! -> '%s'\n", m);
619	return 0;
620	}
621	#endif