server/common/re-cmp.C

/* re-cmp.c
 * Pattern match a string, parsing some of the common RE-metacharacters.
 *
 * This code is public domain, but I would appreciate to hear of
 * improvements or even the fact that you use it in your program.
 *
 * Deliberate BUGS:
 *    - These tokens are not supported: | ( )
 *    - You will get the longest expansion of the _first_ string which
 *      matches the RE, not the longest string which would be the proper
 *      behaviour for a RE-matcher.
 *
 * Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <limits.h>
#include <re-cmp.h>
#include <ctype.h>

/* Get prototype functions to prevent warnings. */
#if defined (__sun__) && defined(StupidSunHeaders)
# include <sys/types.h>
# include <sys/time.h>
# include "sunos.h"             /* Prototypes for standard libraries, sunos lack those */
#endif


/*   P r o t o t y p e s
 */
const char *re_cmp (const char *, const char *);
static bool re_cmp_step (const char *, const char *, unsigned, int);
static void re_init (void);
static bool re_match_token (unsigned char, selection *);
static const char *re_get_token (selection *, const char *);

#ifdef DEBUG2
static void re_dump_sel (selection *);
#endif

/*   G l o b a l   v a r i a b l e s
 */
static bool re_init_done = false;
static selection *re_token[RE_TOKEN_MAX];
static const char *re_substr[RE_TOKEN_MAX];
static unsigned int re_token_depth;

/*   E x t e r n a l   f u n c t i o n
 */

/* re-cmp - get regular expression match.
 * Return values: NULL - no match or error in regexp.
 *                pointer to beginning of matching string
 */
const char *
re_cmp (const char *str, const char *regexp)
{
  const char *next_regexp;
  bool once = false;
  bool matched;

  if (re_init_done == false)
    re_init ();

#ifdef SAFE_CHECKS
  if (regexp == NULL || str == NULL)
    return NULL;
#endif
  if (*regexp == '^')
    {
      once = true;
      ++regexp;
    }
  if (*regexp == 0)
    {
      /* // or /^/ matches any string */
      return str;
    }

  next_regexp = re_get_token (re_token[0], regexp);
  re_token_depth = 0;
  re_substr[0] = next_regexp;

  matched = false;
  while (*str != '\0' && !(matched = re_match_token (*str, re_token[0])))
    str++;

  if (matched && *next_regexp == 0)
    return str;

  /* Apologies for the nearly duplicated code below, hopefully it
   * speeds things up.
   */
  if (once)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
              if (matched == false)
                return NULL;
              break;
            case rep_once_or_more:
              if (matched == false)
                return NULL;

              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
            case rep_null_or_once:
              if (matched == false)
                return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
              break;
            case rep_null_or_more:
              if (matched)
                {
                  if (re_cmp_step (str + 1, regexp, 0, 1))
                    return str;
                }
              else
                {
                  return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
                }
              break;
        }
      return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
    }

  if (matched)
    {
      switch (re_token[0]->repeat)
        {
            case rep_once:
            case rep_null_or_once:
              break;
            case rep_once_or_more:
            case rep_null_or_more:
              if (re_cmp_step (str + 1, regexp, 0, 1))
                return str;
              break;
        }
      /* The logic here is that re_match_token only sees
       * if the one letter matches.  Thus, if the
       * regex is like '@match eureca', and the
       * the user enters anything with an e, re_match_token
       * returns true, but they really need to match the
       * entire regexp, which re_cmp_step will do.
       * However, what happens is that there can be a case
       * where the string being match is something like
       * 'where is eureca'.  In this case, the re_match_token
       * matches that first e, but the re_cmp_step below,
       * fails because the next character (r) doesn't match
       * the u.  So we call re_cmp with the string
       * after the first r, so that it should hopefully match
       * up properly.
       */
      if (re_cmp_step (str + 1, next_regexp, 1, 0))
        return str;
      else if (*(str + 1) != 0)
        return re_cmp (str + 1, regexp);
    }
  return NULL;
}

/*   A u x i l l i a r y   f u n c t i o n s
 */

static bool
re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches)
{
  /* str      - string to match
   * regexp   - pattern
   * slot     - number of the token which under consideration
   * matches  - how many times the token has matched
   */
  const char *next_regexp;
  bool matched;

#ifdef DEBUG

/*    fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
#endif

  if (*regexp == 0)
    {
      /* When we reach the end of the regexp, the match is a success
       */
      return true;
    }

  /* This chunk of code makes sure that the regexp-tokenising happens
   * only once. We only tokenise as much as we need.
   */
  if (slot > re_token_depth)
    {
      re_token_depth = slot;
      if (re_token[slot] == NULL)
        re_token[slot] = (selection *) malloc (sizeof (selection));
      next_regexp = re_get_token (re_token[slot], regexp);
      if (next_regexp == NULL)
        {
          /* Syntax error, what else can we do? */
          return false;
        }
      re_substr[slot] = next_regexp;
    }
  else
    {
      next_regexp = re_substr[slot];
    }

  matched = re_match_token (*str, re_token[slot]);
  if (matched)
    ++matches;

  if (*str == 0)
    return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched;

  switch (re_token[slot]->repeat)
    {
        case rep_once:
          if (matches == 1)
            {                   /* (matches == 1) => (matched == true) */
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_once_or_more:
          if (matched)
            {                   /* (matched == true) => (matches >= 1) */
              /* First check if the current token repeats more */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;
        case rep_null_or_once:
          /* We must go on to the next token, but should we advance str? */
          if (matches == 0)
            {
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          else if (matches == 1)
            {
              return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
            }
          return false;         /* Not reached */
        case rep_null_or_more:
          if (matched)
            {
              /* Look for further repeats, advance str */
              if (re_cmp_step (str + 1, regexp, slot, matches))
                return true;
              return re_cmp_step (str, next_regexp, slot + 1, 0);
            }
          return re_cmp_step (str, next_regexp, slot + 1, 0);
    }
  return false;
}

static void
re_init (void)
{
  int i;

  re_token[0] = (selection *) malloc (sizeof (selection));
  for (i = 1; i < RE_TOKEN_MAX; i++)
    re_token[i] = NULL;

  re_init_done = true;
}

static bool
re_match_token (unsigned char c, selection * sel)
{
  switch (sel->type)
    {
        case sel_any:
          return true;
        case sel_end:
          return (c == 0);
        case sel_single:
          return (tolower (c) == tolower (sel->u.single));
        case sel_range:
          return (c >= sel->u.range.low && c <= sel->u.range.high);
        case sel_array:
          return (sel->u.array[c]);
        case sel_not_single:
          return (tolower (c) != tolower (sel->u.single));
        case sel_not_range:
          return (c < sel->u.range.low && c > sel->u.range.high);
    }
  return false;
}

/* re_get_token - get regular expression token
 * Returns the first token found in <regexp> in <sel>
 * Return values: NULL  syntax error
 *                pointer to first character past token.
 */
static const char *
re_get_token (selection * sel, const char *regexp)
{

#ifdef SAFE_CHECKS
# define exit_if_null   if (*regexp == 0) return NULL
#else
# define exit_if_null
#endif

  bool quoted = false;
  unsigned char looking_at;

#ifdef SAFE_CHECKS
  if (sel == NULL || regexp == NULL || *regexp == 0)
    return NULL;
#endif

  do
    {
      looking_at = *regexp++;
      switch (looking_at)
        {
            case '$':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_end;
                }
              break;
            case '.':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  sel->type = sel_any;
                }
              break;
            case '[':
              /* The fun stuff... perhaps a little obfuscated since I
               * don't trust the compiler to analyse liveness.
               */
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  bool neg = false;
                  unsigned char first, last = 0;

                  exit_if_null;
                  looking_at = *regexp++;

                  if (looking_at == '^')
                    {
                      neg = true;
                      exit_if_null;
                      looking_at = *regexp++;
                    }
                  first = looking_at;
                  exit_if_null;
                  looking_at = *regexp++;
                  if (looking_at == ']')
                    {
                      /* On the form [q] or [^q] */
                      sel->type = neg ? sel_not_single : sel_single;
                      sel->u.single = first;
                      break;
                    }
                  else if (looking_at == '-')
                    {
                      exit_if_null;
                      last = *regexp++;
                      if (last == ']')
                        {
                          /* On the form [A-] or [^A-]. Checking for
                           * [,-] and making it a range is probably not
                           * worth it :-)
                           */
                          sel->type = sel_array;
                          memset (sel->u.array, neg, sizeof (sel->u.array));
                          sel->u.array[first] = sel->u.array['-'] = !neg;
                          break;
                        }
                      else
                        {
                          exit_if_null;
                          looking_at = *regexp++;
                          if (looking_at == ']')
                            {
                              /* On the form [A-G] or [^A-G]. Note that [G-A]
                               * is a syntax error. Fair enough, I think.
                               */
#ifdef SAFE_CHECK
                              if (first > last)
                                return NULL;
#endif
                              sel->type = neg ? sel_not_range : sel_range;
                              sel->u.range.low = first;
                              sel->u.range.high = last;
                              break;
                            }
                        }
                    }
                  {
                    /* The datastructure can only represent a RE this
                     * complex with an array.
                     */
                    int i;
                    unsigned char previous;

                    sel->type = sel_array;
                    memset (sel->u.array, neg, sizeof (sel->u.array));
                    if (last)
                      {
                        /* It starts with a range */
#ifdef SAFE_CHECK
                        if (first > last)
                          return NULL;
#endif
                        for (i = first; i <= last; i++)
                          {
                            sel->u.array[i] = !neg;
                          }
                      }
                    else
                      {
                        /* It begins with a "random" character */
                        sel->u.array[first] = !neg;
                      }
                    sel->u.array[looking_at] = !neg;

                    exit_if_null;
                    previous = looking_at;
                    looking_at = *regexp++;

                    /* Add more characters to the array until we reach
                     * ]. Quoting doesn't and shouldn't work in here.
                     * ("]" should be put first, and "-" last if they
                     * are needed inside this construct.)
                     * Look for ranges as we go along.
                     */
                    while (looking_at != ']')
                      {
                        if (looking_at == '-')
                          {
                            exit_if_null;
                            looking_at = *regexp++;
                            if (looking_at != ']')
                              {
#ifdef SAFE_CHECK
                                if (previous > looking_at)
                                  return NULL;
#endif
                                for (i = previous + 1; i < looking_at; i++)
                                  {
                                    /* previous has already been set and
                                     * looking_at is set below.
                                     */
                                    sel->u.array[i] = !neg;
                                  }
                                exit_if_null;
                              }
                            else
                              {
                                sel->u.array['-'] = !neg;
                                break;
                              }
                          }
                        sel->u.array[looking_at] = !neg;
                        previous = looking_at;
                        exit_if_null;
                        looking_at = *regexp++;
                      }
                  }
                }
              break;
            case '\\':
              if (quoted)
                {
                  quoted = false;
                  sel->type = sel_single;
                  sel->u.single = looking_at;
                }
              else
                {
                  quoted = true;
                }
              break;
            default:
              quoted = false;
              sel->type = sel_single;
              sel->u.single = looking_at;
              break;
        }
    }
  while (quoted);

  if (*regexp == '*')
    {
      sel->repeat = rep_null_or_more;
      ++regexp;
    }
  else if (*regexp == '?')
    {
      sel->repeat = rep_null_or_once;
      ++regexp;
    }
  else if (*regexp == '+')
    {
      sel->repeat = rep_once_or_more;
      ++regexp;
    }
  else
    {
      sel->repeat = rep_once;
    }

  return regexp;
}

/*   D e b u g   c o d e
 */
#ifdef DEBUG2                   /* compile all with DEBUG also ? hevi@lut.fi */
static void
re_dump_sel (selection * sel)
{
  switch (sel->type)
    {
        case sel_any:
          printf (".");
          break;
        case sel_end:
          printf ("$");
          break;
        case sel_single:
          printf ("<%c>", sel->u.single);
          break;
        case sel_range:
          printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
          break;
        case sel_array:
          {
            int i;

            printf ("[");
            for (i = 0; i < uchar_MAX; i++)
              {
                if (sel->u.array[i])
                  {
                    printf ("%c", i);
                  }
              }
            printf ("]");
          }
          break;
        case sel_not_single:
          printf ("[^%c]", sel->u.single);
          break;
        case sel_not_range:
          printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
          break;
        default:
          printf ("<UNKNOWN TOKEN!>");
          break;
    }
  switch (sel->repeat)
    {
        case rep_once:
          break;
        case rep_null_or_once:
          printf ("?");
          break;
        case rep_null_or_more:
          printf ("*");
          break;
        case rep_once_or_more:
          printf ("+");
          break;
        default:
          printf ("<UNKNOWN REP-TOKEN!>");
          break;
    }
}

int
main (int argc, char *argv[])
{
  char *re, *m;
  selection sel;

  re = re_get_token (&sel, argv[1]);

  printf ("'%s' -> '%s'\n", argv[1], re);
  re_dump_sel (&sel);
  printf ("\n");
  m = re_cmp (argv[2], argv[1]);
  if (m)
    printf ("MATCH! -> '%s'\n", m);
  return 0;
}
#endif
Revision:	1.6
Committed:	Sat Dec 9 17:28:37 2006 UTC (17 years, 5 months ago) by pippijn
Content type:	text/plain
Branch:	MAIN
CVS Tags:	HEAD
Changes since 1.5:	+0 -0 lines
State:	*FILE REMOVED*
Log Message:	removed regex comparison. this is now done with perl
#	User	Rev	Content
1	elmex	1.1	/* re-cmp.c
2			* Pattern match a string, parsing some of the common RE-metacharacters.
3			*
4			* This code is public domain, but I would appreciate to hear of
5			* improvements or even the fact that you use it in your program.
6			*
7			* Deliberate BUGS:
8			* - These tokens are not supported: \| ( )
9			* - You will get the longest expansion of the _first_ string which
10			* matches the RE, not the longest string which would be the proper
11			* behaviour for a RE-matcher.
12			*
13			* Author: Kjetil T. Homme <kjetilho@ifi.uio.no> May 1993
14			*/
15
16			#include <stdio.h>
17			#include <stdlib.h>
18			#include <memory.h>
19			#include <limits.h>
20			#include <re-cmp.h>
21			#include <ctype.h>
22
23			/* Get prototype functions to prevent warnings. */
24			#if defined (__sun__) && defined(StupidSunHeaders)
25	root	1.4	# include <sys/types.h>
26			# include <sys/time.h>
27			# include "sunos.h" /* Prototypes for standard libraries, sunos lack those */
28	elmex	1.1	#endif
29
30
31			/* P r o t o t y p e s
32			*/
33	root	1.4	const char re_cmp (const char , const char *);
34			static bool re_cmp_step (const char , const char , unsigned, int);
35			static void re_init (void);
36			static bool re_match_token (unsigned char, selection *);
37			static const char re_get_token (selection , const char *);
38
39	elmex	1.1	#ifdef DEBUG2
40	root	1.4	static void re_dump_sel (selection *);
41	elmex	1.1	#endif
42
43			/* G l o b a l v a r i a b l e s
44			*/
45	root	1.4	static bool re_init_done = false;
46			static selection *re_token[RE_TOKEN_MAX];
47			static const char *re_substr[RE_TOKEN_MAX];
48			static unsigned int re_token_depth;
49	elmex	1.1
50			/* E x t e r n a l f u n c t i o n
51			*/
52
53			/* re-cmp - get regular expression match.
54			* Return values: NULL - no match or error in regexp.
55			* pointer to beginning of matching string
56			*/
57			const char *
58	root	1.4	re_cmp (const char str, const char regexp)
59			{
60			const char *next_regexp;
61			bool once = false;
62			bool matched;
63	elmex	1.1
64	root	1.4	if (re_init_done == false)
65			re_init ();
66	elmex	1.1
67			#ifdef SAFE_CHECKS
68	root	1.4	if (regexp == NULL \|\| str == NULL)
69			return NULL;
70	elmex	1.1	#endif
71	root	1.4	if (*regexp == '^')
72			{
73			once = true;
74			++regexp;
75			}
76			if (*regexp == 0)
77			{
78			/* // or /^/ matches any string */
79			return str;
80			}
81
82			next_regexp = re_get_token (re_token[0], regexp);
83			re_token_depth = 0;
84			re_substr[0] = next_regexp;
85
86			matched = false;
87			while (str != '\0' && !(matched = re_match_token (str, re_token[0])))
88			str++;
89
90			if (matched && *next_regexp == 0)
91			return str;
92
93			/* Apologies for the nearly duplicated code below, hopefully it
94			* speeds things up.
95			*/
96			if (once)
97			{
98			switch (re_token[0]->repeat)
99			{
100	root	1.2	case rep_once:
101	root	1.4	if (matched == false)
102			return NULL;
103			break;
104	root	1.2	case rep_once_or_more:
105	root	1.4	if (matched == false)
106			return NULL;
107	root	1.2
108	root	1.4	if (re_cmp_step (str + 1, regexp, 0, 1))
109			return str;
110			break;
111	root	1.2	case rep_null_or_once:
112	root	1.4	if (matched == false)
113			return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
114			break;
115	root	1.2	case rep_null_or_more:
116	root	1.4	if (matched)
117			{
118			if (re_cmp_step (str + 1, regexp, 0, 1))
119			return str;
120			}
121			else
122			{
123			return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL;
124	root	1.2	}
125	root	1.4	break;
126	root	1.2	}
127	root	1.4	return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL;
128	elmex	1.1	}
129
130	root	1.4	if (matched)
131			{
132			switch (re_token[0]->repeat)
133			{
134	root	1.2	case rep_once:
135			case rep_null_or_once:
136	root	1.4	break;
137	root	1.2	case rep_once_or_more:
138			case rep_null_or_more:
139	root	1.4	if (re_cmp_step (str + 1, regexp, 0, 1))
140			return str;
141			break;
142	root	1.2	}
143	root	1.4	/* The logic here is that re_match_token only sees
144			* if the one letter matches. Thus, if the
145			* regex is like '@match eureca', and the
146			* the user enters anything with an e, re_match_token
147			* returns true, but they really need to match the
148			* entire regexp, which re_cmp_step will do.
149			* However, what happens is that there can be a case
150			* where the string being match is something like
151			* 'where is eureca'. In this case, the re_match_token
152			* matches that first e, but the re_cmp_step below,
153			* fails because the next character (r) doesn't match
154			* the u. So we call re_cmp with the string
155			* after the first r, so that it should hopefully match
156			* up properly.
157			*/
158			if (re_cmp_step (str + 1, next_regexp, 1, 0))
159			return str;
160			else if (*(str + 1) != 0)
161			return re_cmp (str + 1, regexp);
162	elmex	1.1	}
163	root	1.4	return NULL;
164	elmex	1.1	}
165
166			/* A u x i l l i a r y f u n c t i o n s
167			*/
168
169	pippijn	1.3	static bool
170	root	1.4	re_cmp_step (const char str, const char regexp, unsigned slot, int matches)
171			{
172			/* str - string to match
173			* regexp - pattern
174			* slot - number of the token which under consideration
175			* matches - how many times the token has matched
176			*/
177			const char *next_regexp;
178			bool matched;
179	elmex	1.1
180			#ifdef DEBUG
181	root	1.4
182	elmex	1.1	/* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/
183			#endif
184
185	root	1.4	if (*regexp == 0)
186			{
187			/* When we reach the end of the regexp, the match is a success
188			*/
189			return true;
190			}
191
192			/* This chunk of code makes sure that the regexp-tokenising happens
193			* only once. We only tokenise as much as we need.
194			*/
195			if (slot > re_token_depth)
196			{
197			re_token_depth = slot;
198			if (re_token[slot] == NULL)
199			re_token[slot] = (selection *) malloc (sizeof (selection));
200			next_regexp = re_get_token (re_token[slot], regexp);
201			if (next_regexp == NULL)
202			{
203			/* Syntax error, what else can we do? */
204			return false;
205	root	1.2	}
206	root	1.4	re_substr[slot] = next_regexp;
207			}
208			else
209			{
210			next_regexp = re_substr[slot];
211	elmex	1.1	}
212
213	root	1.4	matched = re_match_token (*str, re_token[slot]);
214			if (matched)
215			++matches;
216	elmex	1.1
217	root	1.4	if (*str == 0)
218			return (*next_regexp == 0 \|\| re_token[slot]->type == sel_end) && matched;
219	elmex	1.1
220	root	1.4	switch (re_token[slot]->repeat)
221			{
222	root	1.2	case rep_once:
223	root	1.4	if (matches == 1)
224			{ /* (matches == 1) => (matched == true) */
225			return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
226	root	1.2	}
227	root	1.4	return false;
228	root	1.2	case rep_once_or_more:
229	root	1.4	if (matched)
230			{ /* (matched == true) => (matches >= 1) */
231			/* First check if the current token repeats more */
232			if (re_cmp_step (str + 1, regexp, slot, matches))
233			return true;
234			return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
235	root	1.2	}
236	root	1.4	return false;
237	root	1.2	case rep_null_or_once:
238	root	1.4	/* We must go on to the next token, but should we advance str? */
239			if (matches == 0)
240			{
241			return re_cmp_step (str, next_regexp, slot + 1, 0);
242			}
243			else if (matches == 1)
244			{
245			return re_cmp_step (str + 1, next_regexp, slot + 1, 0);
246	root	1.2	}
247	root	1.4	return false; /* Not reached */
248	root	1.2	case rep_null_or_more:
249	root	1.4	if (matched)
250			{
251			/* Look for further repeats, advance str */
252			if (re_cmp_step (str + 1, regexp, slot, matches))
253			return true;
254			return re_cmp_step (str, next_regexp, slot + 1, 0);
255	root	1.2	}
256	root	1.4	return re_cmp_step (str, next_regexp, slot + 1, 0);
257	elmex	1.1	}
258	root	1.4	return false;
259	elmex	1.1	}
260
261			static void
262	root	1.4	re_init (void)
263			{
264			int i;
265
266			re_token[0] = (selection *) malloc (sizeof (selection));
267			for (i = 1; i < RE_TOKEN_MAX; i++)
268			re_token[i] = NULL;
269	elmex	1.1
270	root	1.4	re_init_done = true;
271	elmex	1.1	}
272
273	pippijn	1.3	static bool
274	root	1.4	re_match_token (unsigned char c, selection * sel)
275			{
276			switch (sel->type)
277			{
278	root	1.2	case sel_any:
279	root	1.4	return true;
280	root	1.2	case sel_end:
281	root	1.4	return (c == 0);
282	root	1.2	case sel_single:
283	root	1.4	return (tolower (c) == tolower (sel->u.single));
284	root	1.2	case sel_range:
285	root	1.4	return (c >= sel->u.range.low && c <= sel->u.range.high);
286	root	1.2	case sel_array:
287	root	1.4	return (sel->u.array[c]);
288	root	1.2	case sel_not_single:
289	root	1.4	return (tolower (c) != tolower (sel->u.single));
290	root	1.2	case sel_not_range:
291	root	1.4	return (c < sel->u.range.low && c > sel->u.range.high);
292	elmex	1.1	}
293	root	1.4	return false;
294	elmex	1.1	}
295
296			/* re_get_token - get regular expression token
297			* Returns the first token found in <regexp> in <sel>
298			* Return values: NULL syntax error
299			* pointer to first character past token.
300			*/
301			static const char *
302	root	1.4	re_get_token (selection * sel, const char *regexp)
303			{
304	elmex	1.1
305			#ifdef SAFE_CHECKS
306	root	1.4	# define exit_if_null if (*regexp == 0) return NULL
307	elmex	1.1	#else
308	root	1.4	# define exit_if_null
309	elmex	1.1	#endif
310
311	root	1.4	bool quoted = false;
312			unsigned char looking_at;
313	elmex	1.1
314			#ifdef SAFE_CHECKS
315	root	1.4	if (sel == NULL \|\| regexp == NULL \|\| *regexp == 0)
316			return NULL;
317	elmex	1.1	#endif
318
319	root	1.4	do
320			{
321			looking_at = *regexp++;
322			switch (looking_at)
323			{
324	root	1.2	case '$':
325	root	1.4	if (quoted)
326			{
327			quoted = false;
328			sel->type = sel_single;
329			sel->u.single = looking_at;
330			}
331			else
332			{
333			sel->type = sel_end;
334	root	1.2	}
335	root	1.4	break;
336	root	1.2	case '.':
337	root	1.4	if (quoted)
338			{
339			quoted = false;
340			sel->type = sel_single;
341			sel->u.single = looking_at;
342	root	1.2	}
343	root	1.4	else
344			{
345			sel->type = sel_any;
346			}
347			break;
348	root	1.2	case '[':
349	root	1.4	/* The fun stuff... perhaps a little obfuscated since I
350			* don't trust the compiler to analyse liveness.
351			*/
352			if (quoted)
353			{
354			quoted = false;
355			sel->type = sel_single;
356			sel->u.single = looking_at;
357			}
358			else
359			{
360			bool neg = false;
361			unsigned char first, last = 0;
362	root	1.2
363	root	1.4	exit_if_null;
364			looking_at = *regexp++;
365	root	1.2
366	root	1.4	if (looking_at == '^')
367			{
368			neg = true;
369			exit_if_null;
370			looking_at = *regexp++;
371			}
372			first = looking_at;
373			exit_if_null;
374			looking_at = *regexp++;
375			if (looking_at == ']')
376			{
377			/* On the form [q] or [^q] */
378			sel->type = neg ? sel_not_single : sel_single;
379			sel->u.single = first;
380			break;
381	root	1.2	}
382	root	1.4	else if (looking_at == '-')
383			{
384			exit_if_null;
385			last = *regexp++;
386			if (last == ']')
387			{
388			/* On the form [A-] or [^A-]. Checking for
389			* [,-] and making it a range is probably not
390			* worth it :-)
391			*/
392			sel->type = sel_array;
393			memset (sel->u.array, neg, sizeof (sel->u.array));
394			sel->u.array[first] = sel->u.array['-'] = !neg;
395			break;
396			}
397			else
398			{
399			exit_if_null;
400			looking_at = *regexp++;
401			if (looking_at == ']')
402			{
403			/* On the form [A-G] or [^A-G]. Note that [G-A]
404			* is a syntax error. Fair enough, I think.
405			*/
406	elmex	1.1	#ifdef SAFE_CHECK
407	root	1.4	if (first > last)
408			return NULL;
409	elmex	1.1	#endif
410	root	1.4	sel->type = neg ? sel_not_range : sel_range;
411			sel->u.range.low = first;
412			sel->u.range.high = last;
413			break;
414	root	1.2	}
415			}
416			}
417	root	1.4	{
418			/* The datastructure can only represent a RE this
419			* complex with an array.
420			*/
421			int i;
422			unsigned char previous;
423
424			sel->type = sel_array;
425			memset (sel->u.array, neg, sizeof (sel->u.array));
426			if (last)
427			{
428			/* It starts with a range */
429	elmex	1.1	#ifdef SAFE_CHECK
430	root	1.4	if (first > last)
431			return NULL;
432	elmex	1.1	#endif
433	root	1.4	for (i = first; i <= last; i++)
434			{
435			sel->u.array[i] = !neg;
436			}
437			}
438			else
439			{
440			/* It begins with a "random" character */
441			sel->u.array[first] = !neg;
442			}
443			sel->u.array[looking_at] = !neg;
444	root	1.2
445	root	1.4	exit_if_null;
446			previous = looking_at;
447			looking_at = *regexp++;
448	root	1.2
449	root	1.4	/* Add more characters to the array until we reach
450			* ]. Quoting doesn't and shouldn't work in here.
451			* ("]" should be put first, and "-" last if they
452			* are needed inside this construct.)
453			* Look for ranges as we go along.
454			*/
455			while (looking_at != ']')
456			{
457			if (looking_at == '-')
458			{
459			exit_if_null;
460			looking_at = *regexp++;
461			if (looking_at != ']')
462			{
463	elmex	1.1	#ifdef SAFE_CHECK
464	root	1.4	if (previous > looking_at)
465			return NULL;
466	elmex	1.1	#endif
467	root	1.4	for (i = previous + 1; i < looking_at; i++)
468			{
469			/* previous has already been set and
470			* looking_at is set below.
471			*/
472			sel->u.array[i] = !neg;
473			}
474			exit_if_null;
475			}
476			else
477			{
478			sel->u.array['-'] = !neg;
479			break;
480			}
481			}
482			sel->u.array[looking_at] = !neg;
483			previous = looking_at;
484			exit_if_null;
485			looking_at = *regexp++;
486			}
487			}
488	root	1.2	}
489	root	1.4	break;
490	root	1.2	case '\\':
491	root	1.4	if (quoted)
492			{
493			quoted = false;
494			sel->type = sel_single;
495			sel->u.single = looking_at;
496			}
497			else
498			{
499			quoted = true;
500	root	1.2	}
501	root	1.4	break;
502	root	1.2	default:
503	root	1.4	quoted = false;
504			sel->type = sel_single;
505			sel->u.single = looking_at;
506			break;
507	root	1.2	}
508	root	1.4	}
509			while (quoted);
510	elmex	1.1
511	root	1.4	if (regexp == '')
512			{
513			sel->repeat = rep_null_or_more;
514			++regexp;
515			}
516			else if (*regexp == '?')
517			{
518			sel->repeat = rep_null_or_once;
519			++regexp;
520			}
521			else if (*regexp == '+')
522			{
523			sel->repeat = rep_once_or_more;
524			++regexp;
525			}
526			else
527			{
528			sel->repeat = rep_once;
529	elmex	1.1	}
530
531	root	1.4	return regexp;
532	elmex	1.1	}
533
534			/* D e b u g c o d e
535			*/
536	root	1.4	#ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */
537	elmex	1.1	static void
538	root	1.4	re_dump_sel (selection * sel)
539			{
540			switch (sel->type)
541			{
542	root	1.2	case sel_any:
543	root	1.4	printf (".");
544			break;
545	root	1.2	case sel_end:
546	root	1.4	printf ("$");
547			break;
548	root	1.2	case sel_single:
549	root	1.4	printf ("<%c>", sel->u.single);
550			break;
551	root	1.2	case sel_range:
552	root	1.4	printf ("[%c-%c]", sel->u.range.low, sel->u.range.high);
553			break;
554	root	1.2	case sel_array:
555	root	1.4	{
556			int i;
557
558			printf ("[");
559			for (i = 0; i < uchar_MAX; i++)
560			{
561			if (sel->u.array[i])
562			{
563			printf ("%c", i);
564			}
565			}
566			printf ("]");
567			}
568			break;
569	root	1.2	case sel_not_single:
570	root	1.4	printf ("[^%c]", sel->u.single);
571			break;
572	root	1.2	case sel_not_range:
573	root	1.4	printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high);
574			break;
575	root	1.2	default:
576	root	1.4	printf ("<UNKNOWN TOKEN!>");
577			break;
578	elmex	1.1	}
579	root	1.4	switch (sel->repeat)
580			{
581	root	1.2	case rep_once:
582	root	1.4	break;
583	root	1.2	case rep_null_or_once:
584	root	1.4	printf ("?");
585			break;
586	root	1.2	case rep_null_or_more:
587	root	1.4	printf ("*");
588			break;
589	root	1.2	case rep_once_or_more:
590	root	1.4	printf ("+");
591			break;
592	root	1.2	default:
593	root	1.4	printf ("<UNKNOWN REP-TOKEN!>");
594			break;
595	elmex	1.1	}
596			}
597
598			int
599	root	1.4	main (int argc, char *argv[])
600			{
601			char re, m;
602			selection sel;
603
604			re = re_get_token (&sel, argv[1]);
605
606			printf ("'%s' -> '%s'\n", argv[1], re);
607			re_dump_sel (&sel);
608			printf ("\n");
609			m = re_cmp (argv[2], argv[1]);
610			if (m)
611			printf ("MATCH! -> '%s'\n", m);
612			return 0;
613	elmex	1.1	}
614			#endif