/*
 * static char *rcsid_player_c =
 *   "$Id: re-cmp.C,v 1.3 2006/09/08 04:51:08 pippijn Exp $";
 */


/* re-cmp.c
 * Pattern match a string, parsing some of the common RE-metacharacters.
 *
 * This code is public domain, but I would appreciate hearing of
 * improvements, or even just the fact that you use it in your program.
… | |
… | |
26 | #include <re-cmp.h> |
20 | #include <re-cmp.h> |
27 | #include <ctype.h> |
21 | #include <ctype.h> |
28 | |
22 | |
29 | /* Get prototype functions to prevent warnings. */ |
23 | /* Get prototype functions to prevent warnings. */ |
30 | #if defined (__sun__) && defined(StupidSunHeaders) |
24 | #if defined (__sun__) && defined(StupidSunHeaders) |
31 | # include <sys/types.h> |
25 | # include <sys/types.h> |
32 | # include <sys/time.h> |
26 | # include <sys/time.h> |
33 | # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */ |
27 | # include "sunos.h" /* Prototypes for standard libraries, sunos lack those */ |
34 | #endif |
28 | #endif |
35 | |
29 | |
36 | |
30 | |
/*   P r o t o t y p e s
 */
39 | const char *re_cmp(const char *, const char *); |
33 | const char *re_cmp (const char *, const char *); |
40 | static bool re_cmp_step(const char *, const char *, unsigned, int); |
34 | static bool re_cmp_step (const char *, const char *, unsigned, int); |
41 | static void re_init(void); |
35 | static void re_init (void); |
42 | static bool re_match_token(unsigned char, selection *); |
36 | static bool re_match_token (unsigned char, selection *); |
43 | static const char *re_get_token(selection *, const char *); |
37 | static const char *re_get_token (selection *, const char *); |
|
|
38 | |
44 | #ifdef DEBUG2 |
39 | #ifdef DEBUG2 |
45 | static void re_dump_sel(selection *); |
40 | static void re_dump_sel (selection *); |
46 | #endif |
41 | #endif |
47 | |
42 | |
/*   G l o b a l   v a r i a b l e s
 */
50 | static bool re_init_done = false; |
45 | static bool re_init_done = false; |
51 | static selection *re_token[RE_TOKEN_MAX]; |
46 | static selection *re_token[RE_TOKEN_MAX]; |
52 | static const char *re_substr[RE_TOKEN_MAX]; |
47 | static const char *re_substr[RE_TOKEN_MAX]; |
53 | static unsigned int re_token_depth; |
48 | static unsigned int re_token_depth; |
54 | |
49 | |
/*   E x t e r n a l   f u n c t i o n
 */
57 | |
52 | |
58 | /* re-cmp - get regular expression match. |
53 | /* re-cmp - get regular expression match. |
59 | * Return values: NULL - no match or error in regexp. |
54 | * Return values: NULL - no match or error in regexp. |
60 | * pointer to beginning of matching string |
55 | * pointer to beginning of matching string |
61 | */ |
56 | */ |
62 | const char * |
57 | const char * |
63 | re_cmp(const char *str, const char *regexp) { |
58 | re_cmp (const char *str, const char *regexp) |
|
|
59 | { |
64 | const char *next_regexp; |
60 | const char *next_regexp; |
65 | bool once = false; |
61 | bool once = false; |
66 | bool matched; |
62 | bool matched; |
67 | |
63 | |
68 | if (re_init_done == false) |
64 | if (re_init_done == false) |
69 | re_init(); |
65 | re_init (); |
70 | |
66 | |
71 | #ifdef SAFE_CHECKS |
67 | #ifdef SAFE_CHECKS |
72 | if (regexp == NULL || str == NULL) |
68 | if (regexp == NULL || str == NULL) |
73 | return NULL; |
69 | return NULL; |
74 | #endif |
70 | #endif |
75 | if (*regexp == '^') { |
71 | if (*regexp == '^') |
|
|
72 | { |
76 | once = true; |
73 | once = true; |
77 | ++regexp; |
74 | ++regexp; |
78 | } |
75 | } |
79 | if (*regexp == 0) { |
76 | if (*regexp == 0) |
|
|
77 | { |
80 | /* // or /^/ matches any string */ |
78 | /* // or /^/ matches any string */ |
81 | return str; |
79 | return str; |
82 | } |
80 | } |
83 | |
81 | |
84 | next_regexp = re_get_token(re_token[0], regexp); |
82 | next_regexp = re_get_token (re_token[0], regexp); |
85 | re_token_depth = 0; |
83 | re_token_depth = 0; |
86 | re_substr[0] = next_regexp; |
84 | re_substr[0] = next_regexp; |
87 | |
85 | |
88 | matched = false; |
86 | matched = false; |
89 | while (*str != '\0' && !(matched = re_match_token(*str, re_token[0]))) |
87 | while (*str != '\0' && !(matched = re_match_token (*str, re_token[0]))) |
90 | str++; |
88 | str++; |
91 | |
89 | |
92 | if (matched && *next_regexp == 0) |
90 | if (matched && *next_regexp == 0) |
93 | return str; |
91 | return str; |
94 | |
92 | |
95 | /* Apologies for the nearly duplicated code below, hopefully it |
93 | /* Apologies for the nearly duplicated code below, hopefully it |
96 | * speeds things up. |
94 | * speeds things up. |
97 | */ |
95 | */ |
98 | if (once) { |
96 | if (once) |
|
|
97 | { |
99 | switch (re_token[0]->repeat) { |
98 | switch (re_token[0]->repeat) |
|
|
99 | { |
100 | case rep_once: |
100 | case rep_once: |
101 | if (matched == false) |
101 | if (matched == false) |
102 | return NULL; |
102 | return NULL; |
103 | break; |
103 | break; |
104 | case rep_once_or_more: |
104 | case rep_once_or_more: |
105 | if (matched == false) |
105 | if (matched == false) |
106 | return NULL; |
106 | return NULL; |
107 | |
107 | |
|
|
108 | if (re_cmp_step (str + 1, regexp, 0, 1)) |
|
|
109 | return str; |
|
|
110 | break; |
|
|
111 | case rep_null_or_once: |
|
|
112 | if (matched == false) |
|
|
113 | return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL; |
|
|
114 | break; |
|
|
115 | case rep_null_or_more: |
|
|
116 | if (matched) |
|
|
117 | { |
108 | if (re_cmp_step(str+1, regexp, 0, 1)) |
118 | if (re_cmp_step (str + 1, regexp, 0, 1)) |
109 | return str; |
119 | return str; |
110 | break; |
120 | } |
111 | case rep_null_or_once: |
121 | else |
112 | if (matched == false) |
122 | { |
113 | return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; |
123 | return re_cmp_step (str, next_regexp, 1, 0) ? str : NULL; |
|
|
124 | } |
114 | break; |
125 | break; |
115 | case rep_null_or_more: |
|
|
116 | if (matched) { |
|
|
117 | if (re_cmp_step(str+1, regexp, 0, 1)) |
|
|
118 | return str; |
|
|
119 | } else { |
|
|
120 | return re_cmp_step(str, next_regexp, 1, 0) ? str : NULL; |
|
|
121 | } |
|
|
122 | break; |
|
|
123 | } |
126 | } |
124 | return re_cmp_step(str+1, next_regexp, 1, 0) ? str : NULL; |
127 | return re_cmp_step (str + 1, next_regexp, 1, 0) ? str : NULL; |
125 | } |
128 | } |
126 | |
129 | |
127 | if (matched) { |
130 | if (matched) |
|
|
131 | { |
128 | switch (re_token[0]->repeat) { |
132 | switch (re_token[0]->repeat) |
|
|
133 | { |
129 | case rep_once: |
134 | case rep_once: |
130 | case rep_null_or_once: |
135 | case rep_null_or_once: |
131 | break; |
136 | break; |
132 | case rep_once_or_more: |
137 | case rep_once_or_more: |
133 | case rep_null_or_more: |
138 | case rep_null_or_more: |
134 | if (re_cmp_step(str+1, regexp, 0, 1)) |
139 | if (re_cmp_step (str + 1, regexp, 0, 1)) |
135 | return str; |
140 | return str; |
136 | break; |
141 | break; |
137 | } |
142 | } |
138 | /* The logic here is that re_match_token only sees |
143 | /* The logic here is that re_match_token only sees |
139 | * if the one letter matches. Thus, if the |
144 | * if the one letter matches. Thus, if the |
140 | * regex is like '@match eureca', and the |
145 | * regex is like '@match eureca', and the |
141 | * the user enters anything with an e, re_match_token |
146 | * the user enters anything with an e, re_match_token |
142 | * returns true, but they really need to match the |
147 | * returns true, but they really need to match the |
143 | * entire regexp, which re_cmp_step will do. |
148 | * entire regexp, which re_cmp_step will do. |
144 | * However, what happens is that there can be a case |
149 | * However, what happens is that there can be a case |
145 | * where the string being match is something like |
150 | * where the string being match is something like |
146 | * 'where is eureca'. In this case, the re_match_token |
151 | * 'where is eureca'. In this case, the re_match_token |
147 | * matches that first e, but the re_cmp_step below, |
152 | * matches that first e, but the re_cmp_step below, |
148 | * fails because the next character (r) doesn't match |
153 | * fails because the next character (r) doesn't match |
149 | * the u. So we call re_cmp with the string |
154 | * the u. So we call re_cmp with the string |
150 | * after the first r, so that it should hopefully match |
155 | * after the first r, so that it should hopefully match |
151 | * up properly. |
156 | * up properly. |
152 | */ |
157 | */ |
153 | if (re_cmp_step(str+1, next_regexp, 1, 0)) |
158 | if (re_cmp_step (str + 1, next_regexp, 1, 0)) |
154 | return str; |
159 | return str; |
155 | else if (*(str+1) != 0) |
160 | else if (*(str + 1) != 0) |
156 | return re_cmp(str+1, regexp); |
161 | return re_cmp (str + 1, regexp); |
157 | } |
162 | } |
158 | return NULL; |
163 | return NULL; |
159 | } |
164 | } |
160 | |
165 | |
/*   A u x i l i a r y   f u n c t i o n s
 */
163 | |
168 | |
164 | static bool |
169 | static bool |
165 | re_cmp_step(const char *str, const char *regexp, unsigned slot, int matches) { |
170 | re_cmp_step (const char *str, const char *regexp, unsigned slot, int matches) |
|
|
171 | { |
166 | /* str - string to match |
172 | /* str - string to match |
167 | * regexp - pattern |
173 | * regexp - pattern |
168 | * slot - number of the token which under consideration |
174 | * slot - number of the token which under consideration |
169 | * matches - how many times the token has matched |
175 | * matches - how many times the token has matched |
170 | */ |
176 | */ |
171 | const char *next_regexp; |
177 | const char *next_regexp; |
172 | bool matched; |
178 | bool matched; |
173 | |
179 | |
174 | #ifdef DEBUG |
180 | #ifdef DEBUG |
|
|
181 | |
175 | /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/ |
182 | /* fprintf(stderr, "['%s', '%s', %u, %d]\n", str, regexp, slot, matches);*/ |
176 | #endif |
183 | #endif |
177 | |
184 | |
178 | if (*regexp == 0) { |
185 | if (*regexp == 0) |
|
|
186 | { |
179 | /* When we reach the end of the regexp, the match is a success |
187 | /* When we reach the end of the regexp, the match is a success |
180 | */ |
188 | */ |
181 | return true; |
189 | return true; |
182 | } |
190 | } |
183 | |
191 | |
184 | /* This chunk of code makes sure that the regexp-tokenising happens |
192 | /* This chunk of code makes sure that the regexp-tokenising happens |
185 | * only once. We only tokenise as much as we need. |
193 | * only once. We only tokenise as much as we need. |
186 | */ |
194 | */ |
187 | if (slot > re_token_depth) { |
195 | if (slot > re_token_depth) |
|
|
196 | { |
188 | re_token_depth = slot; |
197 | re_token_depth = slot; |
189 | if (re_token[slot] == NULL) |
198 | if (re_token[slot] == NULL) |
190 | re_token[slot] = (selection *) malloc(sizeof(selection)); |
199 | re_token[slot] = (selection *) malloc (sizeof (selection)); |
191 | next_regexp = re_get_token(re_token[slot], regexp); |
200 | next_regexp = re_get_token (re_token[slot], regexp); |
192 | if (next_regexp == NULL) { |
201 | if (next_regexp == NULL) |
|
|
202 | { |
193 | /* Syntax error, what else can we do? */ |
203 | /* Syntax error, what else can we do? */ |
194 | return false; |
204 | return false; |
195 | } |
205 | } |
196 | re_substr[slot] = next_regexp; |
206 | re_substr[slot] = next_regexp; |
197 | } else { |
207 | } |
|
|
208 | else |
|
|
209 | { |
198 | next_regexp = re_substr[slot]; |
210 | next_regexp = re_substr[slot]; |
199 | } |
211 | } |
200 | |
212 | |
201 | matched = re_match_token(*str, re_token[slot]); |
213 | matched = re_match_token (*str, re_token[slot]); |
202 | if (matched) |
214 | if (matched) |
203 | ++matches; |
215 | ++matches; |
204 | |
216 | |
205 | if (*str == 0) |
217 | if (*str == 0) |
206 | return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched; |
218 | return (*next_regexp == 0 || re_token[slot]->type == sel_end) && matched; |
207 | |
219 | |
208 | switch (re_token[slot]->repeat) { |
220 | switch (re_token[slot]->repeat) |
|
|
221 | { |
209 | case rep_once: |
222 | case rep_once: |
210 | if (matches == 1) { /* (matches == 1) => (matched == true) */ |
223 | if (matches == 1) |
|
|
224 | { /* (matches == 1) => (matched == true) */ |
211 | return re_cmp_step(str+1, next_regexp, slot+1, 0); |
225 | return re_cmp_step (str + 1, next_regexp, slot + 1, 0); |
212 | } |
226 | } |
213 | return false; |
227 | return false; |
214 | case rep_once_or_more: |
228 | case rep_once_or_more: |
215 | if (matched) { /* (matched == true) => (matches >= 1) */ |
229 | if (matched) |
|
|
230 | { /* (matched == true) => (matches >= 1) */ |
216 | /* First check if the current token repeats more */ |
231 | /* First check if the current token repeats more */ |
217 | if (re_cmp_step(str+1, regexp, slot, matches)) |
232 | if (re_cmp_step (str + 1, regexp, slot, matches)) |
218 | return true; |
233 | return true; |
219 | return re_cmp_step(str+1, next_regexp, slot+1, 0); |
234 | return re_cmp_step (str + 1, next_regexp, slot + 1, 0); |
220 | } |
235 | } |
221 | return false; |
236 | return false; |
222 | case rep_null_or_once: |
237 | case rep_null_or_once: |
223 | /* We must go on to the next token, but should we advance str? */ |
238 | /* We must go on to the next token, but should we advance str? */ |
224 | if (matches == 0) { |
239 | if (matches == 0) |
|
|
240 | { |
225 | return re_cmp_step(str, next_regexp, slot+1, 0); |
241 | return re_cmp_step (str, next_regexp, slot + 1, 0); |
226 | } else if (matches == 1) { |
|
|
227 | return re_cmp_step(str+1, next_regexp, slot+1, 0); |
|
|
228 | } |
242 | } |
|
|
243 | else if (matches == 1) |
|
|
244 | { |
|
|
245 | return re_cmp_step (str + 1, next_regexp, slot + 1, 0); |
|
|
246 | } |
229 | return false; /* Not reached */ |
247 | return false; /* Not reached */ |
230 | case rep_null_or_more: |
248 | case rep_null_or_more: |
231 | if (matched) { |
249 | if (matched) |
|
|
250 | { |
232 | /* Look for further repeats, advance str */ |
251 | /* Look for further repeats, advance str */ |
233 | if (re_cmp_step(str+1, regexp, slot, matches)) |
252 | if (re_cmp_step (str + 1, regexp, slot, matches)) |
234 | return true; |
253 | return true; |
235 | return re_cmp_step(str, next_regexp, slot+1, 0); |
254 | return re_cmp_step (str, next_regexp, slot + 1, 0); |
236 | } |
255 | } |
237 | return re_cmp_step(str, next_regexp, slot+1, 0); |
256 | return re_cmp_step (str, next_regexp, slot + 1, 0); |
238 | } |
257 | } |
239 | return false; |
258 | return false; |
240 | } |
259 | } |
241 | |
260 | |
242 | static void |
261 | static void |
243 | re_init(void) { |
262 | re_init (void) |
|
|
263 | { |
244 | int i; |
264 | int i; |
245 | |
265 | |
246 | re_token[0] = (selection *) malloc(sizeof(selection)); |
266 | re_token[0] = (selection *) malloc (sizeof (selection)); |
247 | for (i = 1; i < RE_TOKEN_MAX; i++) |
267 | for (i = 1; i < RE_TOKEN_MAX; i++) |
248 | re_token[i] = NULL; |
268 | re_token[i] = NULL; |
249 | |
269 | |
250 | re_init_done = true; |
270 | re_init_done = true; |
251 | } |
271 | } |
252 | |
272 | |
253 | static bool |
273 | static bool |
254 | re_match_token(unsigned char c, selection *sel) { |
274 | re_match_token (unsigned char c, selection * sel) |
|
|
275 | { |
255 | switch (sel->type) { |
276 | switch (sel->type) |
|
|
277 | { |
256 | case sel_any: |
278 | case sel_any: |
257 | return true; |
279 | return true; |
258 | case sel_end: |
280 | case sel_end: |
259 | return (c == 0); |
281 | return (c == 0); |
260 | case sel_single: |
282 | case sel_single: |
261 | return (tolower(c) == tolower(sel->u.single)); |
283 | return (tolower (c) == tolower (sel->u.single)); |
262 | case sel_range: |
284 | case sel_range: |
263 | return (c >= sel->u.range.low && c <= sel->u.range.high); |
285 | return (c >= sel->u.range.low && c <= sel->u.range.high); |
264 | case sel_array: |
286 | case sel_array: |
265 | return (sel->u.array[c]); |
287 | return (sel->u.array[c]); |
266 | case sel_not_single: |
288 | case sel_not_single: |
267 | return (tolower(c) != tolower(sel->u.single)); |
289 | return (tolower (c) != tolower (sel->u.single)); |
268 | case sel_not_range: |
290 | case sel_not_range: |
269 | return (c < sel->u.range.low && c > sel->u.range.high); |
291 | return (c < sel->u.range.low && c > sel->u.range.high); |
270 | } |
292 | } |
271 | return false; |
293 | return false; |
272 | } |
294 | } |
273 | |
295 | |
274 | /* re_get_token - get regular expression token |
296 | /* re_get_token - get regular expression token |
275 | * Returns the first token found in <regexp> in <sel> |
297 | * Returns the first token found in <regexp> in <sel> |
276 | * Return values: NULL syntax error |
298 | * Return values: NULL syntax error |
277 | * pointer to first character past token. |
299 | * pointer to first character past token. |
278 | */ |
300 | */ |
279 | static const char * |
301 | static const char * |
280 | re_get_token(selection *sel, const char *regexp) { |
302 | re_get_token (selection * sel, const char *regexp) |
|
|
303 | { |
281 | |
304 | |
282 | #ifdef SAFE_CHECKS |
305 | #ifdef SAFE_CHECKS |
283 | # define exit_if_null if (*regexp == 0) return NULL |
306 | # define exit_if_null if (*regexp == 0) return NULL |
284 | #else |
307 | #else |
285 | # define exit_if_null |
308 | # define exit_if_null |
286 | #endif |
309 | #endif |
287 | |
310 | |
288 | bool quoted = false; |
311 | bool quoted = false; |
289 | unsigned char looking_at; |
312 | unsigned char looking_at; |
290 | |
313 | |
291 | #ifdef SAFE_CHECKS |
314 | #ifdef SAFE_CHECKS |
292 | if (sel == NULL || regexp == NULL || *regexp == 0) |
315 | if (sel == NULL || regexp == NULL || *regexp == 0) |
293 | return NULL; |
316 | return NULL; |
294 | #endif |
317 | #endif |
295 | |
318 | |
|
|
319 | do |
296 | do { |
320 | { |
297 | looking_at = *regexp++; |
321 | looking_at = *regexp++; |
298 | switch (looking_at) { |
322 | switch (looking_at) |
|
|
323 | { |
299 | case '$': |
324 | case '$': |
300 | if (quoted) { |
325 | if (quoted) |
|
|
326 | { |
301 | quoted = false; |
327 | quoted = false; |
302 | sel->type = sel_single; |
328 | sel->type = sel_single; |
303 | sel->u.single = looking_at; |
329 | sel->u.single = looking_at; |
|
|
330 | } |
304 | } else { |
331 | else |
|
|
332 | { |
305 | sel->type = sel_end; |
333 | sel->type = sel_end; |
306 | } |
334 | } |
307 | break; |
335 | break; |
308 | case '.': |
336 | case '.': |
309 | if (quoted) { |
337 | if (quoted) |
|
|
338 | { |
310 | quoted = false; |
339 | quoted = false; |
311 | sel->type = sel_single; |
340 | sel->type = sel_single; |
312 | sel->u.single = looking_at; |
341 | sel->u.single = looking_at; |
|
|
342 | } |
313 | } else { |
343 | else |
|
|
344 | { |
314 | sel->type = sel_any; |
345 | sel->type = sel_any; |
315 | } |
346 | } |
316 | break; |
347 | break; |
317 | case '[': |
348 | case '[': |
318 | /* The fun stuff... perhaps a little obfuscated since I |
349 | /* The fun stuff... perhaps a little obfuscated since I |
319 | * don't trust the compiler to analyse liveness. |
350 | * don't trust the compiler to analyse liveness. |
320 | */ |
351 | */ |
321 | if (quoted) { |
352 | if (quoted) |
|
|
353 | { |
322 | quoted = false; |
354 | quoted = false; |
323 | sel->type = sel_single; |
355 | sel->type = sel_single; |
324 | sel->u.single = looking_at; |
356 | sel->u.single = looking_at; |
|
|
357 | } |
325 | } else { |
358 | else |
|
|
359 | { |
326 | bool neg = false; |
360 | bool neg = false; |
327 | unsigned char first, last = 0; |
361 | unsigned char first, last = 0; |
328 | |
362 | |
329 | exit_if_null; |
363 | exit_if_null; |
330 | looking_at = *regexp++; |
364 | looking_at = *regexp++; |
331 | |
365 | |
332 | if (looking_at == '^') { |
366 | if (looking_at == '^') |
|
|
367 | { |
333 | neg = true; |
368 | neg = true; |
334 | exit_if_null; |
369 | exit_if_null; |
335 | looking_at = *regexp++; |
370 | looking_at = *regexp++; |
336 | } |
371 | } |
337 | first = looking_at; |
372 | first = looking_at; |
338 | exit_if_null; |
373 | exit_if_null; |
339 | looking_at = *regexp++; |
374 | looking_at = *regexp++; |
340 | if (looking_at == ']') { |
375 | if (looking_at == ']') |
|
|
376 | { |
341 | /* On the form [q] or [^q] */ |
377 | /* On the form [q] or [^q] */ |
342 | sel->type = neg ? sel_not_single : sel_single; |
378 | sel->type = neg ? sel_not_single : sel_single; |
343 | sel->u.single = first; |
379 | sel->u.single = first; |
344 | break; |
380 | break; |
|
|
381 | } |
345 | } else if (looking_at == '-') { |
382 | else if (looking_at == '-') |
|
|
383 | { |
346 | exit_if_null; |
384 | exit_if_null; |
347 | last = *regexp++; |
385 | last = *regexp++; |
348 | if (last == ']') { |
386 | if (last == ']') |
|
|
387 | { |
349 | /* On the form [A-] or [^A-]. Checking for |
388 | /* On the form [A-] or [^A-]. Checking for |
350 | * [,-] and making it a range is probably not |
389 | * [,-] and making it a range is probably not |
351 | * worth it :-) |
390 | * worth it :-) |
352 | */ |
391 | */ |
353 | sel->type = sel_array; |
392 | sel->type = sel_array; |
354 | memset(sel->u.array, neg, sizeof(sel->u.array)); |
393 | memset (sel->u.array, neg, sizeof (sel->u.array)); |
355 | sel->u.array[first] = sel->u.array['-'] = !neg; |
394 | sel->u.array[first] = sel->u.array['-'] = !neg; |
356 | break; |
395 | break; |
|
|
396 | } |
357 | } else { |
397 | else |
|
|
398 | { |
358 | exit_if_null; |
399 | exit_if_null; |
359 | looking_at = *regexp++; |
400 | looking_at = *regexp++; |
360 | if (looking_at == ']') { |
401 | if (looking_at == ']') |
|
|
402 | { |
361 | /* On the form [A-G] or [^A-G]. Note that [G-A] |
403 | /* On the form [A-G] or [^A-G]. Note that [G-A] |
362 | * is a syntax error. Fair enough, I think. |
404 | * is a syntax error. Fair enough, I think. |
363 | */ |
405 | */ |
364 | #ifdef SAFE_CHECK |
406 | #ifdef SAFE_CHECK |
365 | if (first > last) |
407 | if (first > last) |
366 | return NULL; |
408 | return NULL; |
367 | #endif |
409 | #endif |
368 | sel->type = neg ? sel_not_range : sel_range; |
410 | sel->type = neg ? sel_not_range : sel_range; |
369 | sel->u.range.low = first; |
411 | sel->u.range.low = first; |
370 | sel->u.range.high = last; |
412 | sel->u.range.high = last; |
371 | break; |
413 | break; |
372 | } |
414 | } |
373 | } |
415 | } |
374 | } |
416 | } |
375 | { |
417 | { |
376 | /* The datastructure can only represent a RE this |
418 | /* The datastructure can only represent a RE this |
377 | * complex with an array. |
419 | * complex with an array. |
378 | */ |
420 | */ |
379 | int i; |
421 | int i; |
380 | unsigned char previous; |
422 | unsigned char previous; |
381 | |
423 | |
382 | sel->type = sel_array; |
424 | sel->type = sel_array; |
383 | memset(sel->u.array, neg, sizeof(sel->u.array)); |
425 | memset (sel->u.array, neg, sizeof (sel->u.array)); |
384 | if (last) { |
426 | if (last) |
|
|
427 | { |
385 | /* It starts with a range */ |
428 | /* It starts with a range */ |
386 | #ifdef SAFE_CHECK |
429 | #ifdef SAFE_CHECK |
387 | if (first > last) |
430 | if (first > last) |
388 | return NULL; |
431 | return NULL; |
389 | #endif |
432 | #endif |
390 | for (i = first; i <= last; i++) { |
433 | for (i = first; i <= last; i++) |
|
|
434 | { |
391 | sel->u.array[i] = !neg; |
435 | sel->u.array[i] = !neg; |
392 | } |
436 | } |
|
|
437 | } |
393 | } else { |
438 | else |
|
|
439 | { |
394 | /* It begins with a "random" character */ |
440 | /* It begins with a "random" character */ |
395 | sel->u.array[first] = !neg; |
441 | sel->u.array[first] = !neg; |
396 | } |
442 | } |
397 | sel->u.array[looking_at] = !neg; |
443 | sel->u.array[looking_at] = !neg; |
398 | |
444 | |
399 | exit_if_null; |
445 | exit_if_null; |
400 | previous = looking_at; |
446 | previous = looking_at; |
401 | looking_at = *regexp++; |
447 | looking_at = *regexp++; |
402 | |
448 | |
403 | /* Add more characters to the array until we reach |
449 | /* Add more characters to the array until we reach |
404 | * ]. Quoting doesn't and shouldn't work in here. |
450 | * ]. Quoting doesn't and shouldn't work in here. |
405 | * ("]" should be put first, and "-" last if they |
451 | * ("]" should be put first, and "-" last if they |
406 | * are needed inside this construct.) |
452 | * are needed inside this construct.) |
407 | * Look for ranges as we go along. |
453 | * Look for ranges as we go along. |
408 | */ |
454 | */ |
409 | while (looking_at != ']') { |
455 | while (looking_at != ']') |
|
|
456 | { |
410 | if (looking_at == '-') { |
457 | if (looking_at == '-') |
411 | exit_if_null; |
|
|
412 | looking_at = *regexp++; |
|
|
413 | if (looking_at != ']') { |
|
|
414 | #ifdef SAFE_CHECK |
|
|
415 | if (previous > looking_at) |
|
|
416 | return NULL; |
|
|
417 | #endif |
|
|
418 | for (i = previous+1; i < looking_at; i++) { |
|
|
419 | /* previous has already been set and |
|
|
420 | * looking_at is set below. |
|
|
421 | */ |
|
|
422 | sel->u.array[i] = !neg; |
|
|
423 | } |
|
|
424 | exit_if_null; |
|
|
425 | } else { |
|
|
426 | sel->u.array['-'] = !neg; |
|
|
427 | break; |
|
|
428 | } |
|
|
429 | } |
458 | { |
430 | sel->u.array[looking_at] = !neg; |
|
|
431 | previous = looking_at; |
|
|
432 | exit_if_null; |
459 | exit_if_null; |
433 | looking_at = *regexp++; |
460 | looking_at = *regexp++; |
|
|
461 | if (looking_at != ']') |
|
|
462 | { |
|
|
463 | #ifdef SAFE_CHECK |
|
|
464 | if (previous > looking_at) |
|
|
465 | return NULL; |
|
|
466 | #endif |
|
|
467 | for (i = previous + 1; i < looking_at; i++) |
|
|
468 | { |
|
|
469 | /* previous has already been set and |
|
|
470 | * looking_at is set below. |
|
|
471 | */ |
|
|
472 | sel->u.array[i] = !neg; |
|
|
473 | } |
|
|
474 | exit_if_null; |
|
|
475 | } |
|
|
476 | else |
|
|
477 | { |
|
|
478 | sel->u.array['-'] = !neg; |
|
|
479 | break; |
|
|
480 | } |
434 | } |
481 | } |
|
|
482 | sel->u.array[looking_at] = !neg; |
|
|
483 | previous = looking_at; |
|
|
484 | exit_if_null; |
|
|
485 | looking_at = *regexp++; |
435 | } |
486 | } |
436 | } |
487 | } |
|
|
488 | } |
437 | break; |
489 | break; |
438 | case '\\': |
490 | case '\\': |
439 | if (quoted) { |
491 | if (quoted) |
|
|
492 | { |
440 | quoted = false; |
493 | quoted = false; |
441 | sel->type = sel_single; |
494 | sel->type = sel_single; |
442 | sel->u.single = looking_at; |
495 | sel->u.single = looking_at; |
|
|
496 | } |
443 | } else { |
497 | else |
|
|
498 | { |
444 | quoted = true; |
499 | quoted = true; |
445 | } |
500 | } |
446 | break; |
501 | break; |
447 | default: |
502 | default: |
448 | quoted = false; |
503 | quoted = false; |
449 | sel->type = sel_single; |
504 | sel->type = sel_single; |
450 | sel->u.single = looking_at; |
505 | sel->u.single = looking_at; |
451 | break; |
506 | break; |
452 | } |
507 | } |
|
|
508 | } |
453 | } while (quoted); |
509 | while (quoted); |
454 | |
510 | |
455 | if (*regexp == '*') { |
511 | if (*regexp == '*') |
|
|
512 | { |
456 | sel->repeat = rep_null_or_more; |
513 | sel->repeat = rep_null_or_more; |
457 | ++regexp; |
514 | ++regexp; |
|
|
515 | } |
458 | } else if (*regexp == '?') { |
516 | else if (*regexp == '?') |
|
|
517 | { |
459 | sel->repeat = rep_null_or_once; |
518 | sel->repeat = rep_null_or_once; |
460 | ++regexp; |
519 | ++regexp; |
|
|
520 | } |
461 | } else if (*regexp == '+') { |
521 | else if (*regexp == '+') |
|
|
522 | { |
462 | sel->repeat = rep_once_or_more; |
523 | sel->repeat = rep_once_or_more; |
463 | ++regexp; |
524 | ++regexp; |
464 | } else { |
525 | } |
|
|
526 | else |
|
|
527 | { |
465 | sel->repeat = rep_once; |
528 | sel->repeat = rep_once; |
466 | } |
529 | } |
467 | |
530 | |
468 | return regexp; |
531 | return regexp; |
469 | } |
532 | } |
470 | |
533 | |
/*   D e b u g   c o d e
 */
473 | #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */ |
536 | #ifdef DEBUG2 /* compile all with DEBUG also ? hevi@lut.fi */ |
474 | static void |
537 | static void |
475 | re_dump_sel(selection *sel) { |
538 | re_dump_sel (selection * sel) |
|
|
539 | { |
476 | switch(sel->type) { |
540 | switch (sel->type) |
|
|
541 | { |
477 | case sel_any: |
542 | case sel_any: |
478 | printf("."); |
543 | printf ("."); |
479 | break; |
544 | break; |
480 | case sel_end: |
545 | case sel_end: |
481 | printf("$"); |
546 | printf ("$"); |
482 | break; |
547 | break; |
483 | case sel_single: |
548 | case sel_single: |
484 | printf("<%c>", sel->u.single); |
549 | printf ("<%c>", sel->u.single); |
485 | break; |
550 | break; |
486 | case sel_range: |
551 | case sel_range: |
487 | printf("[%c-%c]", sel->u.range.low, sel->u.range.high); |
552 | printf ("[%c-%c]", sel->u.range.low, sel->u.range.high); |
488 | break; |
553 | break; |
489 | case sel_array: |
554 | case sel_array: |
490 | { |
555 | { |
491 | int i; |
556 | int i; |
|
|
557 | |
492 | printf("["); |
558 | printf ("["); |
493 | for (i = 0; i < uchar_MAX; i++) { |
559 | for (i = 0; i < uchar_MAX; i++) |
|
|
560 | { |
494 | if (sel->u.array[i]) { |
561 | if (sel->u.array[i]) |
|
|
562 | { |
495 | printf("%c", i); |
563 | printf ("%c", i); |
496 | } |
564 | } |
497 | } |
565 | } |
498 | printf("]"); |
566 | printf ("]"); |
499 | } |
567 | } |
500 | break; |
568 | break; |
501 | case sel_not_single: |
569 | case sel_not_single: |
502 | printf("[^%c]", sel->u.single); |
570 | printf ("[^%c]", sel->u.single); |
503 | break; |
571 | break; |
504 | case sel_not_range: |
572 | case sel_not_range: |
505 | printf("[^%c-%c]", sel->u.range.low, sel->u.range.high); |
573 | printf ("[^%c-%c]", sel->u.range.low, sel->u.range.high); |
506 | break; |
574 | break; |
507 | default: |
575 | default: |
508 | printf("<UNKNOWN TOKEN!>"); |
576 | printf ("<UNKNOWN TOKEN!>"); |
509 | break; |
577 | break; |
510 | } |
578 | } |
511 | switch(sel->repeat) { |
579 | switch (sel->repeat) |
|
|
580 | { |
512 | case rep_once: |
581 | case rep_once: |
513 | break; |
582 | break; |
514 | case rep_null_or_once: |
583 | case rep_null_or_once: |
515 | printf("?"); |
584 | printf ("?"); |
516 | break; |
585 | break; |
517 | case rep_null_or_more: |
586 | case rep_null_or_more: |
518 | printf("*"); |
587 | printf ("*"); |
519 | break; |
588 | break; |
520 | case rep_once_or_more: |
589 | case rep_once_or_more: |
521 | printf("+"); |
590 | printf ("+"); |
522 | break; |
591 | break; |
523 | default: |
592 | default: |
524 | printf("<UNKNOWN REP-TOKEN!>"); |
593 | printf ("<UNKNOWN REP-TOKEN!>"); |
525 | break; |
594 | break; |
526 | } |
595 | } |
527 | } |
596 | } |
528 | |
597 | |
529 | int |
598 | int |
530 | main(int argc, char *argv[]) { |
599 | main (int argc, char *argv[]) |
|
|
600 | { |
531 | char *re, *m; |
601 | char *re, *m; |
532 | selection sel; |
602 | selection sel; |
533 | |
603 | |
534 | re = re_get_token(&sel, argv[1]); |
604 | re = re_get_token (&sel, argv[1]); |
535 | |
605 | |
536 | printf("'%s' -> '%s'\n", argv[1], re); |
606 | printf ("'%s' -> '%s'\n", argv[1], re); |
537 | re_dump_sel(&sel); |
607 | re_dump_sel (&sel); |
538 | printf("\n"); |
608 | printf ("\n"); |
539 | m = re_cmp(argv[2], argv[1]); |
609 | m = re_cmp (argv[2], argv[1]); |
540 | if (m) |
610 | if (m) |
541 | printf("MATCH! -> '%s'\n", m); |
611 | printf ("MATCH! -> '%s'\n", m); |
542 | return 0; |
612 | return 0; |
543 | } |
613 | } |
544 | #endif |
614 | #endif |