]> git.zerfleddert.de Git - micropolis/blob - src/tclx/src/tclxrexp.c
Import Micropolis from http://www.donhopkins.com/home/micropolis/
[micropolis] / src / tclx / src / tclxrexp.c
1 /*
2 * tclXregexp.c --
3 *
4 * Tcl regular expression pattern matching utilities.
5 *-----------------------------------------------------------------------------
6 * Copyright 1992 Karl Lehenbauer and Mark Diekhans.
7 *
8 * Permission to use, copy, modify, and distribute this software and its
9 * documentation for any purpose and without fee is hereby granted, provided
10 * that the above copyright notice appear in all copies. Karl Lehenbauer and
11 * Mark Diekhans make no representations about the suitability of this
12 * software for any purpose. It is provided "as is" without express or
13 * implied warranty.
14 *-----------------------------------------------------------------------------
15 * Boyer-Moore code from:
16 * torek-boyer-moore/27-Aug-90 by
17 * chris@mimsy.umd.edu (Chris Torek)
18 *-----------------------------------------------------------------------------
19 * $Id: tclXregexp.c,v 2.0 1992/10/16 04:51:08 markd Rel $
20 *-----------------------------------------------------------------------------
21 */
22
23 #include "tclxint.h"
24 #include "regexp.h"
25
26 /*
27 * This is declared in tclUtil.c. Must be set to NULL before compiling
28 * a regular expressions.
29 */
30 extern char *tclRegexpError;
31
32 /*
33 * Meta-characters for regular expression
34 */
35 #define REXP_META "^$.[()|?+*\\"
36 #define REXP_META_NO_BRACKET_NO_OR "^$.()?+*\\"
37
38 #ifndef CHAR_MAX
39 # define CHAR_MAX 255
40 #endif
41
42 /*
43 * Prototypes of internal functions.
44 */
45
46 static char *
47 BoyerMooreCompile _ANSI_ARGS_((char *pat,
48 int patlen));
49
50 static char *
51 BoyerMooreExecute _ANSI_ARGS_((char *text,
52 unsigned textlen,
53 char *compPtr,
54 unsigned *patLenP));
55
56 static int
57 FindNonRegExpSubStr _ANSI_ARGS_((char *expression,
58 char **subStrPtrPtr));
59
60 \f
61 /*
62 * Boyer-Moore search: input is `text' (a string) and its length,
63 * and a `pattern' (another string) and its length.
64 *
65 * The linear setup cost of this function is approximately 256 + patlen.
66 * Afterwards, however, the average cost is O(textlen/patlen), and the
67 * worst case is O(textlen+patlen).
68 *
69 * The Boyer-Moore algorithm works by observing that, for each position
70 * in the text, if the character there does *not* occur somewhere in the
71 * search pattern, no comparisons including that character will match.
72 * That is, given the text "hello world..." and the pattern "goodbye", the
73 * `w' in `world' means that none of `hello w', `ello wo', `llo wor',
74 * `lo worl', `o world', ` world.', and `world..' can match. In fact,
75 * exactly patlen strings are certain not to match. We can discover this
76 * simply by looking at the patlen'th character. Furthermore, even if
77 * the text character does occur, it may be that it rules out some number
78 * of other matches. Again, we can discover this by doing the match
79 * `backwards'.
80 *
81 * We set up a table of deltas for each possible character, with
82 * delta[character] being patlen for characters not in the pattern,
83 * less for characters in the pattern, growing progressively smaller
84 * as we near the end of the pattern. Matching then works as follows:
85 *
86 * 0 1 2 3
87 * 01234567890123456789012345678901234567
88 * "Here is the string being searched into" (text)
89 * ------ (pos = [0..5])
90 * "string" (pat)
91 * 654321- (deltas)
92 *
93 * (the delta for `-' will be derived below).
94 *
95 * Positions 0..5 end with `i', which is not the `g' we want. `i' does
96 * appear in `string', but two characters before the end. We skip
97 * forward so as to make the `i's match up:
98 *
99 * "Here is the string being searched into" (text)
100 * "string" (pos = [2..7])
101 *
102 * Next we find that ` ' and `g' do not match. Since ` ' does not appear
103 * in the pattern at all, we can skip forward 6:
104 *
105 * "Here is the string being searched into" (text)
106 * "string" (pos = [8..13])
107 *
108 * Comparing `t' vs `g', we again find no match, and so we obtain the
109 * delta for `t', which is 4. We skip to position 17:
110 *
111 * "Here is the string being searched into" (text)
112 * "string" (pos = [12..17])
113 *
114 * It thus takes only four steps to move the search point forward to the
115 * match, in this case.
116 *
117 * If the pattern has a recurring character, we must set the delta for
118 * that character to the distance of the one closest to the end:
119 *
120 * "befuddle the cat" (text)
121 * "fuddle" (pos = [0..5])
122 * 654321- (delta)
123 *
124 * We want the next search to line the `d's up like this:
125 *
126 * "befuddle the cat" (text)
127 * "fuddle" (pos = [2..7])
128 *
129 * and not like this:
130 *
131 * "befuddle the cat" (text)
132 * "fuddle" (pos = [3..8])
133 *
134 * so we take the smaller delta for d, i.e., 2.
135 *
136 * The last task is computing the delta we have noted above as `-':
137 *
138 * "candlesticks" (text)
139 * "hand" (pos = [0..3])
140 * 4321- (delta)
141 *
142 * Here the `d' in `hand' matches the `d' in `candlesticks', but the
143 * strings differ. Since there are no other `d's in `hand', we know
144 * that none of (cand,andl,ndle,dles) can match, and thus we want this
145 * delta to be 4 (the length of the pattern). But if we had, e.g.:
146 *
147 * "candlesticks" (text)
148 * "deed" (pos = [0..3])
149 * 4321- (delta)
150 *
151 * then we should advance to line up the other `d':
152 *
153 * "candlesticks" (text)
154 * "deed" (pos = [3..6])
155 *
156 * As this suggests, the delta should be that for the `d' nearest the
157 * end, but not including the end. This is easily managed by setting up
158 * a delta table as follows:
159 *
160 * for int:c in [0..255] { delta[c] = patlen; };
161 * for int:x in [0..patlen-1) { delta[pat[x]] = patlen - (x + 1); };
162 *
163 * delta[pat[patlen-1]] is never written, so the last letter inherits the
164 * delta from an earlier iteration or from the previous loop.
165 *
166 * NB: the nonsense with `deltaspace' below exists merely because gcc
167 * does a horrible job of common subexpression elimination (it does not
168 * notice that the array is at a constant stack address).
169 */
170
171 struct compiled_search_struct {
172 unsigned patlen;
173 unsigned deltaspace[CHAR_MAX + 1];
174 };
175
176
177 static char *
178 BoyerMooreCompile (pat, patlen)
179 char *pat;
180 int patlen;
181 {
182 register unsigned char *p, *t;
183 register unsigned i, p1, j, *delta;
184 struct compiled_search_struct *cp;
185 int alloc_len;
186
187 /*
188 * Algorithm fails if pattern is empty.
189 */
190 if ((p1 = patlen) == 0)
191 return (NULL);
192
193 alloc_len = sizeof(struct compiled_search_struct) + patlen + 1;
194 cp = (struct compiled_search_struct *) ckalloc (alloc_len);
195
196 strncpy((char *)cp+sizeof(struct compiled_search_struct), pat, patlen);
197 *((char *)cp+alloc_len-1) = '\0';
198
199 /* set up deltas */
200 delta = cp->deltaspace;
201
202 for (i = 0; i <= CHAR_MAX; i++)
203 delta[i] = p1;
204
205 for (p = (unsigned char *)pat, i = p1; --i > 0;)
206 delta[*p++] = i;
207
208 cp->patlen = patlen;
209 return((char*) cp);
210 }
211
212 static char *
213 BoyerMooreExecute (text, textlen, compPtr, patLenP)
214 char *text;
215 unsigned textlen;
216 char *compPtr;
217 unsigned *patLenP;
218 {
219 register unsigned char *p, *t;
220 struct compiled_search_struct *csp =
221 (struct compiled_search_struct*) compPtr;
222 register unsigned i, p1, j, *delta = csp->deltaspace;
223 char *pat;
224 unsigned patlen;
225
226 *patLenP = p1 = patlen = csp->patlen;
227 /* code below fails (whenever i is unsigned) if pattern too long */
228 if (p1 > textlen)
229 return (NULL);
230
231 pat = (char *)csp + sizeof(struct compiled_search_struct);
232 /*
233 * From now on, we want patlen - 1.
234 * In the loop below, p points to the end of the pattern,
235 * t points to the end of the text to be tested against the
236 * pattern, and i counts the amount of text remaining, not
237 * including the part to be tested.
238 */
239 p1--;
240 p = (unsigned char *)pat + p1;
241 t = (unsigned char *)text + p1;
242 i = textlen - patlen;
243 for (;;) {
244 if (*p == *t &&
245 memcmp((p - p1), (t - p1), p1) == 0)
246 return ((char *)t - p1);
247 j = delta[*t];
248 if (i < j)
249 break;
250 i -= j;
251 t += j;
252 }
253 return (NULL);
254 }
255
256 \f
257 /*
258 *-----------------------------------------------------------------------------
259 *
260 * Tcl_RegExpClean --
261 * Free all resources associated with a regular expression info
262 * structure..
263 *
264 *-----------------------------------------------------------------------------
265 */
266 void
267 Tcl_RegExpClean (regExpPtr)
268 regexp_pt regExpPtr;
269 {
270 if (regExpPtr->progPtr != NULL)
271 ckfree ((char *) regExpPtr->progPtr);
272 if (regExpPtr->boyerMoorePtr != NULL)
273 ckfree ((char *) regExpPtr->boyerMoorePtr);
274 }
275 \f
276 /*
277 *-----------------------------------------------------------------------------
278 *
279 * FindNonRegExpSubStr
280 * Find the largest substring that does not have any regular
281 * expression meta-characters and is not located within `[...]'.
282 * If the regexp contains an or (|), zero is returned, as the
283 * Boyer-Moore optimization does not work, since there are actually
284 * multiple patterns. The real solution is to build the Boyer-Moore
285 * into the regular expression code.
286 *-----------------------------------------------------------------------------
287 */
288 static int
289 FindNonRegExpSubStr (expression, subStrPtrPtr)
290 char *expression;
291 char **subStrPtrPtr;
292 {
293 register char *subStrPtr = NULL;
294 register char subStrLen = 0;
295 register char *scanPtr = expression;
296 register int len;
297
298 while (*scanPtr != '\0') {
299 len = strcspn (scanPtr, REXP_META);
300 /*
301 * If we are at a meta-character, by-pass till non-meta. If we hit
302 * a `[' then by-pass the entire `[...]' range, but be careful, could
303 * have omitted `]'. In a `|' is encountered (except in brackets),'
304 * we are through.
305 */
306 if (len == 0) {
307 scanPtr += strspn (scanPtr, REXP_META_NO_BRACKET_NO_OR);
308 if (*scanPtr == '|')
309 return 0;
310 if (*scanPtr == '[') {
311 scanPtr += strcspn (scanPtr, "]");
312 if (*scanPtr == ']')
313 scanPtr++;
314 }
315 } else {
316 if (len > subStrLen) {
317 subStrPtr = scanPtr;
318 subStrLen = len;
319 }
320 scanPtr += len;
321 }
322 }
323 *subStrPtrPtr = subStrPtr;
324 return subStrLen;
325 }
326 \f
327 /*
328 *-----------------------------------------------------------------------------
329 *
330 * Tcl_RegExpCompile --
331 * Compile a regular expression.
332 *
333 * Parameters:
334 * o regExpPtr - Used to hold info on this regular expression. If the
335 * structure is being reused, it Tcl_RegExpClean should be called first.
336 * o expression - Regular expression to compile.
337 * o flags - The following flags are recognized:
338 * o REXP_NO_CASE - Comparison will be regardless of case.
339 * o REXP_BOTH_ALGORITHMS - If specified, a Boyer-Moore expression is
340 * compiled for the largest substring of the expression that does
341 * not contain any meta-characters. This is slows compiling, but
342 * speeds up large searches.
343 *
344 * Results:
345 * Standard TCL results.
346 *-----------------------------------------------------------------------------
347 */
348 int
349 Tcl_RegExpCompile (interp, regExpPtr, expression, flags)
350 Tcl_Interp *interp;
351 regexp_pt regExpPtr;
352 char *expression;
353 int flags;
354 {
355 char *expBuf;
356 int anyMeta;
357
358 if (*expression == '\0') {
359 Tcl_AppendResult (interp, "Null regular expression", (char *) NULL);
360 return TCL_ERROR;
361 }
362
363 regExpPtr->progPtr = NULL;
364 regExpPtr->boyerMoorePtr = NULL;
365 regExpPtr->noCase = flags & REXP_NO_CASE;
366
367 if (flags & REXP_NO_CASE) {
368 expBuf = ckalloc (strlen (expression) + 1);
369 Tcl_DownShift (expBuf, expression);
370 } else
371 expBuf = expression;
372
373 anyMeta = strpbrk (expBuf, REXP_META) != NULL;
374
375 /*
376 * If no meta-characters, use Boyer-Moore string matching only.
377 */
378 if (!anyMeta) {
379 regExpPtr->boyerMoorePtr = BoyerMooreCompile (expBuf, strlen (expBuf));
380 goto okExitPoint;
381 }
382
383 /*
384 * Build a Boyer-Moore on the largest non-meta substring, if requested,
385 * and the reg-exp does not contain a `|' (or). If less that three
386 * characters in the string, don't use B-M, as it seems not optimal at
387 * this point.
388 */
389 if (flags & REXP_BOTH_ALGORITHMS) {
390 char *subStrPtr;
391 int subStrLen;
392
393 subStrLen = FindNonRegExpSubStr (expBuf, &subStrPtr);
394 if (subStrLen > 2)
395 regExpPtr->boyerMoorePtr =
396 BoyerMooreCompile (subStrPtr, subStrLen);
397 }
398
399 /*
400 * Compile meta-character containing regular expression.
401 */
402 tclRegexpError = NULL;
403 regExpPtr->progPtr = regcomp (expBuf);
404 if (tclRegexpError != NULL) {
405 if (flags & REXP_NO_CASE)
406 ckfree (expBuf);
407 Tcl_AppendResult (interp, "error in regular expression: ",
408 tclRegexpError, (char *) NULL);
409 if (flags & REXP_NO_CASE)
410 ckfree (expBuf);
411 Tcl_RegExpClean (regExpPtr);
412 }
413
414 okExitPoint:
415 if (flags & REXP_NO_CASE)
416 ckfree (expBuf);
417 return TCL_OK;
418
419 }
420 \f
421 /*
422 *-----------------------------------------------------------------------------
423 *
424 * Tcl_RegExpExecute --
425 * Execute a regular expression compiled with Boyer-Moore and/or
426 * regexp.
427 *
428 * Parameters:
429 * o regExpPtr - Used to hold info on this regular expression.
430 * o matchStrIn - String to match against the regular expression.
431 * o matchStrLower - Optional lower case version of the string. If
432 * multiple no case matches are being done, time can be saved by
433 * down shifting the string in advance. NULL if not a no-case
434 * match or this procedure is to do the down shifting.
435 *
436 * Results:
437 * TRUE if a match, FALSE if it does not match.
438 *
439 *-----------------------------------------------------------------------------
440 */
441 int
442 Tcl_RegExpExecute (interp, regExpPtr, matchStrIn, matchStrLower)
443 Tcl_Interp *interp;
444 regexp_pt regExpPtr;
445 char *matchStrIn;
446 char *matchStrLower;
447 {
448 char *matchStr;
449 int result;
450
451 if (regExpPtr->noCase) {
452 if (matchStrLower == NULL) {
453 matchStr = ckalloc (strlen (matchStrIn) + 1);
454 Tcl_DownShift (matchStr, matchStrIn);
455 } else
456 matchStr = matchStrLower;
457 } else
458 matchStr = matchStrIn;
459
460 /*
461 * If a Boyer-Moore pattern has been compiled, use that algorithm to test
462 * against the text. If that passes, then test with the regexp if we have
463 * it.
464 */
465 if (regExpPtr->boyerMoorePtr != NULL) {
466 char *startPtr;
467 unsigned matchLen;
468
469 startPtr = BoyerMooreExecute (matchStr, strlen (matchStr),
470 regExpPtr->boyerMoorePtr, &matchLen);
471 if (startPtr == NULL) {
472 result = FALSE;
473 goto exitPoint;
474 }
475 if (regExpPtr->progPtr == NULL) {
476 result = TRUE; /* No regexp, its a match! */
477 goto exitPoint;
478 }
479 }
480
481 /*
482 * Give it a go with full regular expressions
483 */
484 result = regexec (regExpPtr->progPtr, matchStr);
485
486 /*
487 * Clean up and return status here.
488 */
489 exitPoint:
490 if ((regExpPtr->noCase) && (matchStrLower == NULL))
491 ckfree (matchStr);
492 return result;
493 }
Impressum, Datenschutz