server/common/re-cmp.C


/*
 * static char *rcsid_player_c =
 *   "$Id: re-cmp.C,v 1.3 2006-09-08 04:51:08 pippijn Exp $";
 */


/* re-cmp.c
 * Pattern match a string, parsing some of the common RE-metacharacters.
 *
 * This code is public domain, but I would appreciate to hear of
 * improvements or even the fact that you use it in your program.
 *
 * Deliberate BUGS:
 *    - These tokens are not supported: | ( )
 *    - You will get the longest expansion of the _first_ string which
 *      matches the RE, not the longest string which would be the proper
 *      behaviour for a RE-matcher.
 *
 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <limits.h>
#include <re-cmp.h>
#include <ctype.h>

/* Get prototype functions to prevent warnings. */
#if defined (__sun__) && defined(StupidSunHeaders)
# include <sys/types.h>
# include <sys/time.h>
# include "sunos.h"             /* Prototypes for standard libraries, sunos lack those */
#endif


/*   P r o t o t y p e s
 */
/* Public entry point: find the first match of a pattern in a string. */
const char *re_cmp (const char *, const char *);
/* Recursive matcher used for everything after the first token. */
static bool re_cmp_step (const char *, const char *, unsigned, int);
/* Prepares the token table (slot 0 is allocated here). */
static void re_init (void);
/* Tests a single character against a single token's selector. */
static bool re_match_token (unsigned char, selection *);
/* Parses one token (selector plus repeat suffix) from the pattern text. */
static const char *re_get_token (selection *, const char *);

#ifdef DEBUG2
/* Prints a token in pattern-like notation; debug builds only. */
static void re_dump_sel (selection *);
#endif

/*   G l o b a l   v a r i a b l e s
 */
/* Raised by re_init (); re_cmp () calls re_init () while this is false. */
static bool re_init_done = false;
/* Parsed tokens, indexed by position in the pattern.  Slot 0 is
 * allocated by re_init (), later slots on first use in re_cmp_step ().
 */
static selection *re_token[RE_TOKEN_MAX];
/* re_substr[n] points at the pattern text that follows token n. */
static const char *re_substr[RE_TOKEN_MAX];
/* Highest slot tokenised so far for the pattern currently being
 * matched; reset to 0 by re_cmp ().
 */
static unsigned int re_token_depth;

/*   E x t e r n a l   f u n c t i o n
 */

/* re_cmp - get regular expression match.
 *
 * str     - NUL-terminated string to search.
 * regexp  - NUL-terminated pattern.  A leading '^' selects the
 *           "once" code path below; an empty pattern (or just "^")
 *           matches any string.
 *
 * Return values: NULL - no match, error in regexp, or the token
 *                       table could not be allocated.
 *                pointer to beginning of matching string
 *
 * The parsed pattern is kept in the file-level tables re_token,
 * re_substr and re_token_depth, which every call overwrites.
 *
 * NOTE(review): the scan loop that looks for the first character
 * matching token 0 also runs on the '^' path, so the anchor does not
 * pin the match to the first character of <str>.  That behaviour is
 * preserved here; confirm whether callers rely on it.
 */
const char *
re_cmp (const char *str, const char *regexp)
{
  const char *next_regexp;
  bool once = false;
  bool matched;

  if (re_init_done == false)
    re_init ();

#ifdef SAFE_CHECKS
  if (regexp == NULL || str == NULL)
    return NULL;
#endif

  /* Slot 0 is where the first token is parsed into; without it
   * there is nothing we can do.
   */
  if (re_token[0] == NULL)
    return NULL;

  if (*regexp == '^')
    {
      once = true;
      ++regexp;
    }
  if (*regexp == 0)
    {
      /* // or /^/ matches any string */
      return str;
    }

  next_regexp = re_get_token (re_token[0], regexp);
  re_token_depth = 0;
  re_substr[0] = next_regexp;

  if (next_regexp == NULL)
    {
      /* Syntax error in the first token.  Every use of next_regexp
       * below dereferences it, so bail out here.
       */
      return NULL;
    }

  /* Skip forward to the first character accepted by token 0. */
  matched = false;
  while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
    str++;

  /* A one-token pattern is done as soon as that token matched. */
  if (matched && *next_regexp == 0)
    return str;

  /* Apologies for the nearly duplicated code below, hopefully it
   * speeds things up.
   */
  if (once)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
              if (matched == false)
                return NULL;
              break;
            case rep_once_or_more:
              if (matched == false)
                return NULL;

              /* Try to let token 0 swallow more characters first. */
              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
            case rep_null_or_once:
              /* Token 0 may match nothing: continue with token 1
               * without consuming a character.
               */
              if (matched == false)
                return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
              break;
            case rep_null_or_more:
              if (matched)
                {
                  if (re_cmp_step (str + 1, regexp, 0, 1))
                    return str;
                }
              else
                {
                  return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
                }
              break;
        }
      /* Token 0 consumed exactly one character; match the rest. */
      return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
    }

  if (matched)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
            case rep_null_or_once:
              break;
            case rep_once_or_more:
            case rep_null_or_more:
              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
        }
      /* re_match_token only tells us that one character matched
       * the first token.  For a pattern like 'eureca' and a string
       * like 'where is eureca', the scan above stops at the first
       * 'e' of 'where', and the re_cmp_step call below fails because
       * the following character does not match 'u'.  In that case
       * we restart the whole search one character further along,
       * so that a later occurrence can still be found.
       */
      if (re_cmp_step (str + 1, next_regexp, 1, 0))
        return str;
      else if (*(str + 1) != 0)
        return re_cmp (str + 1, regexp);
    }
  return NULL;
}

/*   A u x i l i a r y   f u n c t i o n s
 */

static bool
re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
{
  /* str      - string to match
   * regexp   - pattern
   * slot     - number of the token which under consideration
   * matches  - how many times the token has matched
   */
  const char *next_regexp;
  bool matched;

#ifdef DEBUG

/*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
#endif

  if (*regexp == 0)
    {
      /* When we reach the end of the regexp, the match is a success
       */
      return true;
    }

  /* This chunk of code makes sure that the regexp-tokenising happens
   * only once. We only tokenise as much as we need.
   */
  if (slot > re_token_depth)
    {
      re_token_depth = slot;
      if (re_token[slot] == NULL)
        re_token[slot] = (selection *) malloc (sizeof (selection));
      next_regexp = re_get_token (re_token[slot], regexp);
      if (next_regexp == NULL)
        {
          /* Syntax error, what else can we do? */
          return false;
        }
      re_substr[slot] = next_regexp;
    }
  else
    {
      next_regexp = re_substr[slot];
    }

  matched = re_match_token (*str, re_token[slot]);
  if (matched)
    ++matches;

  if (*str == 0)
    return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;

  switch (re_token[slot]->repeat)
    {
        case rep_once:
          if (matches == 1)
            {                   /* (matches == 1) => (matched == true) */
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_once_or_more:
          if (matched)
            {                   /* (matched == true) => (matches >= 1) */
              /* First check if the current token repeats more */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_null_or_once:
          /* We must go on to the next token, but should we advance str? */
          if (matches == 0)
            {
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          else if (matches == 1)
            {
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;         /* Not reached */
        case rep_null_or_more:
          if (matched)
            {
              /* Look for further repeats, advance str */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          return re_cmp_step (str, next_regexp, slot + 1, 0);
    }
  return false;
}

/* re_init - prepare the token table.
 * Clears slots 1..RE_TOKEN_MAX-1 (they are allocated on demand by
 * re_cmp_step) and allocates slot 0, which re_cmp always needs.
 * re_init_done is only raised when slot 0 could be allocated, so a
 * failed attempt is retried the next time re_cmp runs.
 */
static void
re_init (void)
{
  int i;

  for (i = 1; i < RE_TOKEN_MAX; i++)
    re_token[i] = NULL;

  if (re_token[0] == NULL)
    re_token[0] = (selection *) malloc (sizeof (selection));

  re_init_done = (re_token[0] != NULL);
}

/* re_match_token - does character <c> satisfy the selector of <sel>?
 *
 * c    - character to test (may be the terminating NUL, which is how
 *        sel_end is recognised).
 * sel  - token to test against; only its type and u members are read,
 *        the repeat count is the caller's business.
 *
 * Single-character selectors compare case-insensitively via tolower;
 * ranges and arrays compare the raw character value.
 */
static bool
re_match_token (unsigned char c, selection * sel)
{
  switch (sel->type)
    {
        case sel_any:
          return true;
        case sel_end:
          return (c == 0);
        case sel_single:
          return (tolower (c) == tolower (sel->u.single));
        case sel_range:
          return (c >= sel->u.range.low && c <= sel->u.range.high);
        case sel_array:
          return (sel->u.array[c]);
        case sel_not_single:
          return (tolower (c) != tolower (sel->u.single));
        case sel_not_range:
          /* Outside the range means below the low bound OR above the
           * high bound; requiring both can never be true for
           * low <= high.
           */
          return (c < sel->u.range.low || c > sel->u.range.high);
    }
  return false;
}

/* re_get_token - get regular expression token
 * Parses the first token found in <regexp> into <sel>: a selector
 * (literal character, '.', '$' or a [...] class) followed by an
 * optional repeat suffix ('*', '?' or '+').
 *
 * sel     - receives the token; its type, u and repeat members are
 *           written.
 * regexp  - pattern text positioned at the start of a token.
 *
 * Return values: NULL  syntax error
 *                pointer to first character past token.
 *
 * NULL is only returned from code guarded by SAFE_CHECKS or
 * SAFE_CHECK.  Without SAFE_CHECKS, exit_if_null expands to nothing
 * and an unterminated '[' class is not detected.
 *
 * A backslash quotes the following character.  A backslash that is
 * the last character of the pattern is taken as a literal backslash,
 * and the returned pointer never moves past the terminating NUL.
 */
static const char *
re_get_token (selection * sel, const char *regexp)
{

#ifdef SAFE_CHECKS
# define exit_if_null   if (*regexp == 0) return NULL
#else
# define exit_if_null
#endif

  bool quoted = false;
  unsigned char looking_at;

#ifdef SAFE_CHECKS
  if (sel == NULL || regexp == NULL || *regexp == 0)
    return NULL;
#endif

  do
    {
      looking_at = *regexp++;
      if (looking_at == 0)
        {
          /* We just consumed the terminator (a trailing backslash
           * gets us here).  Step back so that neither the repeat
           * check below nor the caller reads beyond the pattern,
           * and let a dangling backslash stand for itself.
           */
          --regexp;
          if (quoted)
            looking_at = '\\';
        }
      switch (looking_at)
        {
            case '$':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_end;
                }
              break;
            case '.':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_any;
                }
              break;
            case '[':
              /* The fun stuff... perhaps a little obfuscated since I
               * don't trust the compiler to analyse liveness.
               */
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  bool neg = false;
                  unsigned char first, last = 0;

                  exit_if_null;
                  looking_at = *regexp++;

                  if (looking_at == '^')
                    {
                      neg = true;
                      exit_if_null;
                      looking_at = *regexp++;
                    }
                  first = looking_at;
                  exit_if_null;
                  looking_at = *regexp++;
                  if (looking_at == ']')
                    {
                      /* On the form [q] or [^q] */
                      sel->type = neg ? sel_not_single : sel_single;
                      sel->u.single = first;
                      break;
                    }
                  else if (looking_at == '-')
                    {
                      exit_if_null;
                      last = *regexp++;
                      if (last == ']')
                        {
                          /* On the form [A-] or [^A-]. Checking for
                           * [,-] and making it a range is probably not
                           * worth it :-)
                           */
                          sel->type = sel_array;
                          memset (sel->u.array, neg, sizeof (sel->u.array));
                          sel->u.array[first] = sel->u.array['-'] = !neg;
                          break;
                        }
                      else
                        {
                          exit_if_null;
                          looking_at = *regexp++;
                          if (looking_at == ']')
                            {
                              /* On the form [A-G] or [^A-G]. Note that [G-A]
                               * is a syntax error. Fair enough, I think.
                               *
                               * NOTE(review): this guard (and the two
                               * like it further down) tests SAFE_CHECK
                               * while the rest of the file tests
                               * SAFE_CHECKS - confirm which one is
                               * intended.  Callers must cope with a
                               * NULL return before it is enabled.
                               */
#ifdef SAFE_CHECK
                              if (first > last)
                                return NULL;
#endif
                              sel->type = neg ? sel_not_range : sel_range;
                              sel->u.range.low = first;
                              sel->u.range.high = last;
                              break;
                            }
                        }
                    }
                  {
                    /* The datastructure can only represent a RE this
                     * complex with an array.
                     */
                    int i;
                    unsigned char previous;

                    sel->type = sel_array;
                    /* Start from "everything" for a negated class and
                     * from "nothing" otherwise, then flip the listed
                     * characters.
                     */
                    memset (sel->u.array, neg, sizeof (sel->u.array));
                    if (last)
                      {
                        /* It starts with a range */
#ifdef SAFE_CHECK
                        if (first > last)
                          return NULL;
#endif
                        for (i = first; i <= last; i++)
                          {
                            sel->u.array[i] = !neg;
                          }
                      }
                    else
                      {
                        /* It begins with a "random" character */
                        sel->u.array[first] = !neg;
                      }
                    sel->u.array[looking_at] = !neg;

                    exit_if_null;
                    previous = looking_at;
                    looking_at = *regexp++;

                    /* Add more characters to the array until we reach
                     * ]. Quoting doesn't and shouldn't work in here.
                     * ("]" should be put first, and "-" last if they
                     * are needed inside this construct.)
                     * Look for ranges as we go along.
                     */
                    while (looking_at != ']')
                      {
                        if (looking_at == '-')
                          {
                            exit_if_null;
                            looking_at = *regexp++;
                            if (looking_at != ']')
                              {
#ifdef SAFE_CHECK
                                if (previous > looking_at)
                                  return NULL;
#endif
                                for (i = previous + 1; i < looking_at; i++)
                                  {
                                    /* previous has already been set and
                                     * looking_at is set below.
                                     */
                                    sel->u.array[i] = !neg;
                                  }
                                exit_if_null;
                              }
                            else
                              {
                                /* A '-' right before ']' is a literal. */
                                sel->u.array['-'] = !neg;
                                break;
                              }
                          }
                        sel->u.array[looking_at] = !neg;
                        previous = looking_at;
                        exit_if_null;
                        looking_at = *regexp++;
                      }
                  }
                }
              break;
            case '\\':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  /* Go round the loop once more for the quoted char. */
                  quoted = true;
                }
              break;
            default:
              quoted = false;
              sel->type = sel_single;
              sel->u.single = looking_at;
              break;
        }
    }
  while (quoted);

  /* Optional repeat suffix directly after the selector. */
  if (*regexp == '*')
    {
      sel->repeat = rep_null_or_more;
      ++regexp;
    }
  else if (*regexp == '?')
    {
      sel->repeat = rep_null_or_once;
      ++regexp;
    }
  else if (*regexp == '+')
    {
      sel->repeat = rep_once_or_more;
      ++regexp;
    }
  else
    {
      sel->repeat = rep_once;
    }

  return regexp;
}

/*   D e b u g   c o d e
 */
#ifdef DEBUG2                   /* compile all with DEBUG also ? hevi@lut.fi */
/* re_dump_sel - print one token to stdout in pattern-like notation:
 * first the selector, then the repeat suffix (nothing for rep_once).
 */
static void
re_dump_sel (selection * sel)
{
  /* Selector part. */
  if (sel->type == sel_any)
    {
      printf (".");
    }
  else if (sel->type == sel_end)
    {
      printf ("$");
    }
  else if (sel->type == sel_single)
    {
      printf ("<%c>", sel->u.single);
    }
  else if (sel->type == sel_range)
    {
      printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
    }
  else if (sel->type == sel_array)
    {
      int ch = 0;

      /* List every character whose array entry is set. */
      printf ("[");
      while (ch < uchar_MAX)
        {
          if (sel->u.array[ch])
            printf ("%c", ch);
          ch++;
        }
      printf ("]");
    }
  else if (sel->type == sel_not_single)
    {
      printf ("[^%c]", sel->u.single);
    }
  else if (sel->type == sel_not_range)
    {
      printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
    }
  else
    {
      printf ("<UNKNOWN TOKEN!>");
    }

  /* Repeat suffix. */
  if (sel->repeat == rep_null_or_once)
    printf ("?");
  else if (sel->repeat == rep_null_or_more)
    printf ("*");
  else if (sel->repeat == rep_once_or_more)
    printf ("+");
  else if (sel->repeat != rep_once)
    printf ("<UNKNOWN REP-TOKEN!>");
}

/* main - debug driver, only built with DEBUG2.
 * usage: <program> <regexp> <string>
 * Dumps the first token of <regexp>, then reports where (if anywhere)
 * <regexp> matches in <string>.  Returns 1 on a usage error, else 0.
 */
int
main (int argc, char *argv[])
{
  /* Both functions hand back pointers into const text. */
  const char *re, *m;
  selection sel;

  if (argc < 3)
    {
      fprintf (stderr, "usage: %s <regexp> <string>\n",
               argc > 0 ? argv[0] : "re-cmp");
      return 1;
    }

  re = re_get_token (&sel, argv[1]);

  /* re is NULL on a syntax error; sel is not meaningful then. */
  printf ("'%s' -> '%s'\n", argv[1], re != NULL ? re : "(syntax error)");
  if (re != NULL)
    re_dump_sel (&sel);
  printf ("\n");
  m = re_cmp (argv[2], argv[1]);
  if (m)
    printf ("MATCH! -> '%s'\n", m);
  return 0;
}
#endif
Revision:	1.4
Committed:	Sun Sep 10 16:00:23 2006 UTC (17 years, 8 months ago) by root
Content type:	text/plain
Branch:	MAIN
Changes since 1.3:	+445 -368 lines
Log Message:	indent
#	User	Rev	Content
1	root	1.4
2	elmex	1.1	/*
3			* static char *rcsid_player_c =
4	root	1.4	* "$Id: re-cmp.C,v 1.3 2006-09-08 04:51:08 pippijn Exp $";
5	elmex	1.1	*/
6
7
8			/* re-cmp.c
9			* Pattern match a string, parsing some of the common RE-metacharacters.
10			*
11			* This code is public domain, but I would appreciate to hear of
12			* improvements or even the fact that you use it in your program.
13			*
14			* Deliberate BUGS:
15			* - These tokens are not supported: \| ( )
16			* - You will get the longest expansion of the _first_ string which
17			* matches the RE, not the longest string which would be the proper
18			* behaviour for a RE-matcher.
19			*
20			* Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
21			*/
22
23			#include <stdio.h>
24			#include <stdlib.h>
25			#include <memory.h>
26			#include <limits.h>
27			#include <re-cmp.h>
28			#include <ctype.h>
29
30			/* Get prototype functions to prevent warnings. */
31			#if defined (__sun__) && defined(StupidSunHeaders)
32	root	1.4	# include <sys/types.h>
33			# include <sys/time.h>
34			# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
35	elmex	1.1	#endif
36
37
38			/* P r o t o t y p e s
39			*/
40	root	1.4	const char re_cmp (const char , const char *);
41			static bool re_cmp_step (const char , const char , unsigned, int);
42			static void re_init (void);
43			static bool re_match_token (unsigned char, selection *);
44			static const char re_get_token (selection , const char *);
45
46	elmex	1.1	#ifdef DEBUG2
47	root	1.4	static void re_dump_sel (selection *);
48	elmex	1.1	#endif
49
50			/* G l o b a l v a r i a b l e s
51			*/
52	root	1.4	static bool re_init_done = false;
53			static selection *re_token[RE_TOKEN_MAX];
54			static const char *re_substr[RE_TOKEN_MAX];
55			static unsigned int re_token_depth;
56	elmex	1.1
57			/* E x t e r n a l f u n c t i o n
58			*/
59
60			/* re-cmp - get regular expression match.
61			* Return values: NULL - no match or error in regexp.
62			* pointer to beginning of matching string
63			*/
64			const char *
65	root	1.4	re_cmp (const char str, const char regexp)
66			{
67			const char *next_regexp;
68			bool once = false;
69			bool matched;
70	elmex	1.1
71	root	1.4	if (re_init_done == false)
72			re_init ();
73	elmex	1.1
74			#ifdef SAFE_CHECKS
75	root	1.4	if (regexp == NULL \|\| str == NULL)
76			return NULL;
77	elmex	1.1	#endif
78	root	1.4	if (*regexp == '^')
79			{
80			once = true;
81			++regexp;
82			}
83			if (*regexp == 0)
84			{
85			/* // or /^/ matches any string */
86			return str;
87			}
88
89			next_regexp = re_get_token (re_token[0], regexp);
90			re_token_depth = 0;
91			re_substr[0] = next_regexp;
92
93			matched = false;
94			while (str != '\0' && !(matched = re_match_token (str, re_token[0])))
95			str++;
96
97			if (matched && *next_regexp == 0)
98			return str;
99
100			/* Apologies for the nearly duplicated code below, hopefully it
101			* speeds things up.
102			*/
103			if (once)
104			{
105			switch (re_token[0]->repeat)
106			{
107	root	1.2	case rep_once:
108	root	1.4	if (matched == false)
109			return NULL;
110			break;
111	root	1.2	case rep_once_or_more:
112	root	1.4	if (matched == false)
113			return NULL;
114	root	1.2
115	root	1.4	if (re_cmp_step (str + 1, regexp, 0, 1))
116			return str;
117			break;
118	root	1.2	case rep_null_or_once:
119	root	1.4	if (matched == false)
120			return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
121			break;
122	root	1.2	case rep_null_or_more:
123	root	1.4	if (matched)
124			{
125			if (re_cmp_step (str + 1, regexp, 0, 1))
126			return str;
127			}
128			else
129			{
130			return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
131	root	1.2	}
132	root	1.4	break;
133	root	1.2	}
134	root	1.4	return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
135	elmex	1.1	}
136
137	root	1.4	if (matched)
138			{
139			switch (re_token[0]->repeat)
140			{
141	root	1.2	case rep_once:
142			case rep_null_or_once:
143	root	1.4	break;
144	root	1.2	case rep_once_or_more:
145			case rep_null_or_more:
146	root	1.4	if (re_cmp_step (str + 1, regexp, 0, 1))
147			return str;
148			break;
149	root	1.2	}
150	root	1.4	/* The logic here is that re_match_token only sees
151			* if the one letter matches. Thus, if the
152			* regex is like '@match eureca', and the
153			* the user enters anything with an e, re_match_token
154			* returns true, but they really need to match the
155			* entire regexp, which re_cmp_step will do.
156			* However, what happens is that there can be a case
157			* where the string being match is something like
158			* 'where is eureca'. In this case, the re_match_token
159			* matches that first e, but the re_cmp_step below,
160			* fails because the next character (r) doesn't match
161			* the u. So we call re_cmp with the string
162			* after the first r, so that it should hopefully match
163			* up properly.
164			*/
165			if (re_cmp_step (str + 1, next_regexp, 1, 0))
166			return str;
167			else if (*(str + 1) != 0)
168			return re_cmp (str + 1, regexp);
169	elmex	1.1	}
170	root	1.4	return NULL;
171	elmex	1.1	}
172
173			/* A u x i l l i a r y f u n c t i o n s
174			*/
175
176	pippijn	1.3	static bool
177	root	1.4	re_cmp_step (const char str, const char regexp, unsigned slot, int matches)
178			{
179			/* str - string to match
180			* regexp - pattern
181			* slot - number of the token which under consideration
182			* matches - how many times the token has matched
183			*/
184			const char *next_regexp;
185			bool matched;
186	elmex	1.1
187			#ifdef DEBUG
188	root	1.4
189	elmex	1.1	/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
190			#endif
191
192	root	1.4	if (*regexp == 0)
193			{
194			/* When we reach the end of the regexp, the match is a success
195			*/
196			return true;
197			}
198
199			/* This chunk of code makes sure that the regexp-tokenising happens
200			* only once. We only tokenise as much as we need.
201			*/
202			if (slot > re_token_depth)
203			{
204			re_token_depth = slot;
205			if (re_token[slot] == NULL)
206			re_token[slot] = (selection *) malloc (sizeof (selection));
207			next_regexp = re_get_token (re_token[slot], regexp);
208			if (next_regexp == NULL)
209			{
210			/* Syntax error, what else can we do? */
211			return false;
212	root	1.2	}
213	root	1.4	re_substr[slot] = next_regexp;
214			}
215			else
216			{
217			next_regexp = re_substr[slot];
218	elmex	1.1	}
219
220	root	1.4	matched = re_match_token (*str, re_token[slot]);
221			if (matched)
222			++matches;
223	elmex	1.1
224	root	1.4	if (*str == 0)
225			return (*next_regexp == 0 \|\| re_token[slot]->type == sel_end) && matched;
226	elmex	1.1
227	root	1.4	switch (re_token[slot]->repeat)
228			{
229	root	1.2	case rep_once:
230	root	1.4	if (matches == 1)
231			{ /* (matches == 1) => (matched == true) */
232			return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
233	root	1.2	}
234	root	1.4	return false;
235	root	1.2	case rep_once_or_more:
236	root	1.4	if (matched)
237			{ /* (matched == true) => (matches >= 1) */
238			/* First check if the current token repeats more */
239			if (re_cmp_step (str + 1, regexp, slot, matches))
240			return true;
241			return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
242	root	1.2	}
243	root	1.4	return false;
244	root	1.2	case rep_null_or_once:
245	root	1.4	/* We must go on to the next token, but should we advance str? */
246			if (matches == 0)
247			{
248			return re_cmp_step (str, next_regexp, slot + 1, 0);
249			}
250			else if (matches == 1)
251			{
252			return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
253	root	1.2	}
254	root	1.4	return false; /* Not reached */
255	root	1.2	case rep_null_or_more:
256	root	1.4	if (matched)
257			{
258			/* Look for further repeats, advance str */
259			if (re_cmp_step (str + 1, regexp, slot, matches))
260			return true;
261			return re_cmp_step (str, next_regexp, slot + 1, 0);
262	root	1.2	}
263	root	1.4	return re_cmp_step (str, next_regexp, slot + 1, 0);
264	elmex	1.1	}
265	root	1.4	return false;
266	elmex	1.1	}
267
268			static void
269	root	1.4	re_init (void)
270			{
271			int i;
272
273			re_token[0] = (selection *) malloc (sizeof (selection));
274			for (i = 1; i < RE_TOKEN_MAX; i++)
275			re_token[i] = NULL;
276	elmex	1.1
277	root	1.4	re_init_done = true;
278	elmex	1.1	}
279
280	pippijn	1.3	static bool
281	root	1.4	re_match_token (unsigned char c, selection * sel)
282			{
283			switch (sel->type)
284			{
285	root	1.2	case sel_any:
286	root	1.4	return true;
287	root	1.2	case sel_end:
288	root	1.4	return (c == 0);
289	root	1.2	case sel_single:
290	root	1.4	return (tolower (c) == tolower (sel->u.single));
291	root	1.2	case sel_range:
292	root	1.4	return (c >= sel->u.range.low && c <= sel->u.range.high);
293	root	1.2	case sel_array:
294	root	1.4	return (sel->u.array[c]);
295	root	1.2	case sel_not_single:
296	root	1.4	return (tolower (c) != tolower (sel->u.single));
297	root	1.2	case sel_not_range:
298	root	1.4	return (c < sel->u.range.low && c > sel->u.range.high);
299	elmex	1.1	}
300	root	1.4	return false;
301	elmex	1.1	}
302
303			/* re_get_token - get regular expression token
304			* Returns the first token found in <regexp> in <sel>
305			* Return values: NULL syntax error
306			* pointer to first character past token.
307			*/
308			static const char *
309	root	1.4	re_get_token (selection * sel, const char *regexp)
310			{
311	elmex	1.1
312			#ifdef SAFE_CHECKS
313	root	1.4	# define exit_if_null if (*regexp == 0) return NULL
314	elmex	1.1	#else
315	root	1.4	# define exit_if_null
316	elmex	1.1	#endif
317
318	root	1.4	bool quoted = false;
319			unsigned char looking_at;
320	elmex	1.1
321			#ifdef SAFE_CHECKS
322	root	1.4	if (sel == NULL \|\| regexp == NULL \|\| *regexp == 0)
323			return NULL;
324	elmex	1.1	#endif
325
326	root	1.4	do
327			{
328			looking_at = *regexp++;
329			switch (looking_at)
330			{
331	root	1.2	case '$':
332	root	1.4	if (quoted)
333			{
334			quoted = false;
335			sel->type = sel_single;
336			sel->u.single = looking_at;
337			}
338			else
339			{
340			sel->type = sel_end;
341	root	1.2	}
342	root	1.4	break;
343	root	1.2	case '.':
344	root	1.4	if (quoted)
345			{
346			quoted = false;
347			sel->type = sel_single;
348			sel->u.single = looking_at;
349	root	1.2	}
350	root	1.4	else
351			{
352			sel->type = sel_any;
353			}
354			break;
355	root	1.2	case '[':
356	root	1.4	/* The fun stuff... perhaps a little obfuscated since I
357			* don't trust the compiler to analyse liveness.
358			*/
359			if (quoted)
360			{
361			quoted = false;
362			sel->type = sel_single;
363			sel->u.single = looking_at;
364			}
365			else
366			{
367			bool neg = false;
368			unsigned char first, last = 0;
369	root	1.2
370	root	1.4	exit_if_null;
371			looking_at = *regexp++;
372	root	1.2
373	root	1.4	if (looking_at == '^')
374			{
375			neg = true;
376			exit_if_null;
377			looking_at = *regexp++;
378			}
379			first = looking_at;
380			exit_if_null;
381			looking_at = *regexp++;
382			if (looking_at == ']')
383			{
384			/* On the form [q] or [^q] */
385			sel->type = neg ? sel_not_single : sel_single;
386			sel->u.single = first;
387			break;
388	root	1.2	}
389	root	1.4	else if (looking_at == '-')
390			{
391			exit_if_null;
392			last = *regexp++;
393			if (last == ']')
394			{
395			/* On the form [A-] or [^A-]. Checking for
396			* [,-] and making it a range is probably not
397			* worth it :-)
398			*/
399			sel->type = sel_array;
400			memset (sel->u.array, neg, sizeof (sel->u.array));
401			sel->u.array[first] = sel->u.array['-'] = !neg;
402			break;
403			}
404			else
405			{
406			exit_if_null;
407			looking_at = *regexp++;
408			if (looking_at == ']')
409			{
410			/* On the form [A-G] or [^A-G]. Note that [G-A]
411			* is a syntax error. Fair enough, I think.
412			*/
413	elmex	1.1	#ifdef SAFE_CHECK
414	root	1.4	if (first > last)
415			return NULL;
416	elmex	1.1	#endif
417	root	1.4	sel->type = neg ? sel_not_range : sel_range;
418			sel->u.range.low = first;
419			sel->u.range.high = last;
420			break;
421	root	1.2	}
422			}
423			}
424	root	1.4	{
425			/* The datastructure can only represent a RE this
426			* complex with an array.
427			*/
428			int i;
429			unsigned char previous;
430
431			sel->type = sel_array;
432			memset (sel->u.array, neg, sizeof (sel->u.array));
433			if (last)
434			{
435			/* It starts with a range */
436	elmex	1.1	#ifdef SAFE_CHECK
437	root	1.4	if (first > last)
438			return NULL;
439	elmex	1.1	#endif
440	root	1.4	for (i = first; i <= last; i++)
441			{
442			sel->u.array[i] = !neg;
443			}
444			}
445			else
446			{
447			/* It begins with a "random" character */
448			sel->u.array[first] = !neg;
449			}
450			sel->u.array[looking_at] = !neg;
451	root	1.2
452	root	1.4	exit_if_null;
453			previous = looking_at;
454			looking_at = *regexp++;
455	root	1.2
456	root	1.4	/* Add more characters to the array until we reach
457			* ]. Quoting doesn't and shouldn't work in here.
458			* ("]" should be put first, and "-" last if they
459			* are needed inside this construct.)
460			* Look for ranges as we go along.
461			*/
462			while (looking_at != ']')
463			{
464			if (looking_at == '-')
465			{
466			exit_if_null;
467			looking_at = *regexp++;
468			if (looking_at != ']')
469			{
470	elmex	1.1	#ifdef SAFE_CHECK
471	root	1.4	if (previous > looking_at)
472			return NULL;
473	elmex	1.1	#endif
474	root	1.4	for (i = previous + 1; i < looking_at; i++)
475			{
476			/* previous has already been set and
477			* looking_at is set below.
478			*/
479			sel->u.array[i] = !neg;
480			}
481			exit_if_null;
482			}
483			else
484			{
485			sel->u.array['-'] = !neg;
486			break;
487			}
488			}
489			sel->u.array[looking_at] = !neg;
490			previous = looking_at;
491			exit_if_null;
492			looking_at = *regexp++;
493			}
494			}
495	root	1.2	}
496	root	1.4	break;
497	root	1.2	case '\\':
498	root	1.4	if (quoted)
499			{
500			quoted = false;
501			sel->type = sel_single;
502			sel->u.single = looking_at;
503			}
504			else
505			{
506			quoted = true;
507	root	1.2	}
508	root	1.4	break;
509	root	1.2	default:
510	root	1.4	quoted = false;
511			sel->type = sel_single;
512			sel->u.single = looking_at;
513			break;
514	root	1.2	}
515	root	1.4	}
516			while (quoted);
517	elmex	1.1
518	root	1.4	if (regexp == '')
519			{
520			sel->repeat = rep_null_or_more;
521			++regexp;
522			}
523			else if (*regexp == '?')
524			{
525			sel->repeat = rep_null_or_once;
526			++regexp;
527			}
528			else if (*regexp == '+')
529			{
530			sel->repeat = rep_once_or_more;
531			++regexp;
532			}
533			else
534			{
535			sel->repeat = rep_once;
536	elmex	1.1	}
537
538	root	1.4	return regexp;
539	elmex	1.1	}
540
541			/* D e b u g c o d e
542			*/
543	root	1.4	#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
544	elmex	1.1	static void
545	root	1.4	re_dump_sel (selection * sel)
546			{
547			switch (sel->type)
548			{
549	root	1.2	case sel_any:
550	root	1.4	printf (".");
551			break;
552	root	1.2	case sel_end:
553	root	1.4	printf ("$");
554			break;
555	root	1.2	case sel_single:
556	root	1.4	printf ("<%c>", sel->u.single);
557			break;
558	root	1.2	case sel_range:
559	root	1.4	printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
560			break;
561	root	1.2	case sel_array:
562	root	1.4	{
563			int i;
564
565			printf ("[");
566			for (i = 0; i < uchar_MAX; i++)
567			{
568			if (sel->u.array[i])
569			{
570			printf ("%c", i);
571			}
572			}
573			printf ("]");
574			}
575			break;
576	root	1.2	case sel_not_single:
577	root	1.4	printf ("[^%c]", sel->u.single);
578			break;
579	root	1.2	case sel_not_range:
580	root	1.4	printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
581			break;
582	root	1.2	default:
583	root	1.4	printf ("<UNKNOWN TOKEN!>");
584			break;
585	elmex	1.1	}
586	root	1.4	switch (sel->repeat)
587			{
588	root	1.2	case rep_once:
589	root	1.4	break;
590	root	1.2	case rep_null_or_once:
591	root	1.4	printf ("?");
592			break;
593	root	1.2	case rep_null_or_more:
594	root	1.4	printf ("*");
595			break;
596	root	1.2	case rep_once_or_more:
597	root	1.4	printf ("+");
598			break;
599	root	1.2	default:
600	root	1.4	printf ("<UNKNOWN REP-TOKEN!>");
601			break;
602	elmex	1.1	}
603			}
604
605			int
606	root	1.4	main (int argc, char *argv[])
607			{
608			char re, m;
609			selection sel;
610
611			re = re_get_token (&sel, argv[1]);
612
613			printf ("'%s' -> '%s'\n", argv[1], re);
614			re_dump_sel (&sel);
615			printf ("\n");
616			m = re_cmp (argv[2], argv[1]);
617			if (m)
618			printf ("MATCH! -> '%s'\n", m);
619			return 0;
620	elmex	1.1	}
621			#endif