4 * Tcl regular expression pattern matching utilities.
5 *-----------------------------------------------------------------------------
6 * Copyright 1992 Karl Lehenbauer and Mark Diekhans.
8 * Permission to use, copy, modify, and distribute this software and its
9 * documentation for any purpose and without fee is hereby granted, provided
10 * that the above copyright notice appear in all copies. Karl Lehenbauer and
11 * Mark Diekhans make no representations about the suitability of this
12 * software for any purpose. It is provided "as is" without express or
14 *-----------------------------------------------------------------------------
15 * Boyer-Moore code from:
16 * torek-boyer-moore/27-Aug-90 by
17 * chris@mimsy.umd.edu (Chris Torek)
18 *-----------------------------------------------------------------------------
19 * $Id: tclXregexp.c,v 2.0 1992/10/16 04:51:08 markd Rel $
20 *-----------------------------------------------------------------------------
27 * This is declared in tclUtil.c. Must be set to NULL before compiling
28 * a regular expressions.
30 extern char *tclRegexpError
;
33 * Meta-characters for regular expression
35 #define REXP_META "^$.[()|?+*\\"
36 #define REXP_META_NO_BRACKET_NO_OR "^$.()?+*\\"
43 * Prototypes of internal functions.
47 BoyerMooreCompile
_ANSI_ARGS_((char *pat
,
51 BoyerMooreExecute
_ANSI_ARGS_((char *text
,
57 FindNonRegExpSubStr
_ANSI_ARGS_((char *expression
,
58 char **subStrPtrPtr
));
62 * Boyer-Moore search: input is `text' (a string) and its length,
63 * and a `pattern' (another string) and its length.
65 * The linear setup cost of this function is approximately 256 + patlen.
66 * Afterwards, however, the average cost is O(textlen/patlen), and the
67 * worst case is O(textlen+patlen).
69 * The Boyer-Moore algorithm works by observing that, for each position
70 * in the text, if the character there does *not* occur somewhere in the
71 * search pattern, no comparisons including that character will match.
72 * That is, given the text "hello world..." and the pattern "goodbye", the
73 * `w' in `world' means that none of `hello w', `ello wo', `llo wor',
74 * `lo worl', `o world', ` world.', and `world..' can match. In fact,
75 * exactly patlen strings are certain not to match. We can discover this
76 * simply by looking at the patlen'th character. Furthermore, even if
77 * the text character does occur, it may be that it rules out some number
78 * of other matches. Again, we can discover this by doing the match
81 * We set up a table of deltas for each possible character, with
82 * delta[character] being patlen for characters not in the pattern,
83 * less for characters in the pattern, growing progressively smaller
84 * as we near the end of the pattern. Matching then works as follows:
87 * 01234567890123456789012345678901234567
88 * "Here is the string being searched into" (text)
89 * ------ (pos = [0..5])
93 * (the delta for `-' will be derived below).
95 * Positions 0..5 end with `i', which is not the `g' we want. `i' does
96 * appear in `string', but two characters before the end. We skip
97 * forward so as to make the `i's match up:
99 * "Here is the string being searched into" (text)
100 * "string" (pos = [2..7])
102 * Next we find that ` ' and `g' do not match. Since ` ' does not appear
103 * in the pattern at all, we can skip forward 6:
105 * "Here is the string being searched into" (text)
106 * "string" (pos = [8..13])
108 * Comparing `t' vs `g', we again find no match, and so we obtain the
109 * delta for `t', which is 4. We skip to position 17:
111 * "Here is the string being searched into" (text)
112 * "string" (pos = [12..17])
114 * It thus takes only four steps to move the search point forward to the
115 * match, in this case.
117 * If the pattern has a recurring character, we must set the delta for
118 * that character to the distance of the one closest to the end:
120 * "befuddle the cat" (text)
121 * "fuddle" (pos = [0..5])
124 * We want the next search to line the `d's up like this:
126 * "befuddle the cat" (text)
127 * "fuddle" (pos = [2..7])
131 * "befuddle the cat" (text)
132 * "fuddle" (pos = [3..8])
134 * so we take the smaller delta for d, i.e., 2.
136 * The last task is computing the delta we have noted above as `-':
138 * "candlesticks" (text)
139 * "hand" (pos = [0..3])
142 * Here the `d' in `hand' matches the `d' in `candlesticks', but the
143 * strings differ. Since there are no other `d's in `hand', we know
144 * that none of (cand,andl,ndle,dles) can match, and thus we want this
145 * delta to be 4 (the length of the pattern). But if we had, e.g.:
147 * "candlesticks" (text)
148 * "deed" (pos = [0..3])
151 * then we should advance to line up the other `d':
153 * "candlesticks" (text)
154 * "deed" (pos = [3..6])
156 * As this suggests, the delta should be that for the `d' nearest the
157 * end, but not including the end. This is easily managed by setting up
158 * a delta table as follows:
160 * for int:c in [0..255] { delta[c] = patlen; };
161 * for int:x in [0..patlen-1) { delta[pat[x]] = patlen - (x + 1); };
163 * delta[pat[patlen-1]] is never written, so the last letter inherits the
164 * delta from an earlier iteration or from the previous loop.
166 * NB: the nonsense with `deltaspace' below exists merely because gcc
167 * does a horrible job of common subexpression elimination (it does not
168 * notice that the array is at a constant stack address).
171 struct compiled_search_struct
{
173 unsigned deltaspace
[CHAR_MAX
+ 1];
178 BoyerMooreCompile (pat
, patlen
)
182 register unsigned char *p
, *t
;
183 register unsigned i
, p1
, j
, *delta
;
184 struct compiled_search_struct
*cp
;
188 * Algorithm fails if pattern is empty.
190 if ((p1
= patlen
) == 0)
193 alloc_len
= sizeof(struct compiled_search_struct
) + patlen
+ 1;
194 cp
= (struct compiled_search_struct
*) ckalloc (alloc_len
);
196 strncpy((char *)cp
+sizeof(struct compiled_search_struct
), pat
, patlen
);
197 *((char *)cp
+alloc_len
-1) = '\0';
200 delta
= cp
->deltaspace
;
202 for (i
= 0; i
<= CHAR_MAX
; i
++)
205 for (p
= (unsigned char *)pat
, i
= p1
; --i
> 0;)
213 BoyerMooreExecute (text
, textlen
, compPtr
, patLenP
)
219 register unsigned char *p
, *t
;
220 struct compiled_search_struct
*csp
=
221 (struct compiled_search_struct
*) compPtr
;
222 register unsigned i
, p1
, j
, *delta
= csp
->deltaspace
;
226 *patLenP
= p1
= patlen
= csp
->patlen
;
227 /* code below fails (whenever i is unsigned) if pattern too long */
231 pat
= (char *)csp
+ sizeof(struct compiled_search_struct
);
233 * From now on, we want patlen - 1.
234 * In the loop below, p points to the end of the pattern,
235 * t points to the end of the text to be tested against the
236 * pattern, and i counts the amount of text remaining, not
237 * including the part to be tested.
240 p
= (unsigned char *)pat
+ p1
;
241 t
= (unsigned char *)text
+ p1
;
242 i
= textlen
- patlen
;
245 memcmp((p
- p1
), (t
- p1
), p1
) == 0)
246 return ((char *)t
- p1
);
258 *-----------------------------------------------------------------------------
261 * Free all resources associated with a regular expression info
264 *-----------------------------------------------------------------------------
267 Tcl_RegExpClean (regExpPtr
)
270 if (regExpPtr
->progPtr
!= NULL
)
271 ckfree ((char *) regExpPtr
->progPtr
);
272 if (regExpPtr
->boyerMoorePtr
!= NULL
)
273 ckfree ((char *) regExpPtr
->boyerMoorePtr
);
277 *-----------------------------------------------------------------------------
279 * FindNonRegExpSubStr
280 * Find the largest substring that does not have any regular
281 * expression meta-characters and is not located within `[...]'.
282 * If the regexp contains an or (|), zero is returned, as the
283 * Boyer-Moore optimization does not work, since there are actually
284 * multiple patterns. The real solution is to build the Boyer-Moore
285 * into the regular expression code.
286 *-----------------------------------------------------------------------------
289 FindNonRegExpSubStr (expression
, subStrPtrPtr
)
293 register char *subStrPtr
= NULL
;
294 register char subStrLen
= 0;
295 register char *scanPtr
= expression
;
298 while (*scanPtr
!= '\0') {
299 len
= strcspn (scanPtr
, REXP_META
);
301 * If we are at a meta-character, by-pass till non-meta. If we hit
302 * a `[' then by-pass the entire `[...]' range, but be careful, could
303 * have omitted `]'. In a `|' is encountered (except in brackets),'
307 scanPtr
+= strspn (scanPtr
, REXP_META_NO_BRACKET_NO_OR
);
310 if (*scanPtr
== '[') {
311 scanPtr
+= strcspn (scanPtr
, "]");
316 if (len
> subStrLen
) {
323 *subStrPtrPtr
= subStrPtr
;
328 *-----------------------------------------------------------------------------
330 * Tcl_RegExpCompile --
331 * Compile a regular expression.
334 * o regExpPtr - Used to hold info on this regular expression. If the
335 * structure is being reused, it Tcl_RegExpClean should be called first.
336 * o expression - Regular expression to compile.
337 * o flags - The following flags are recognized:
338 * o REXP_NO_CASE - Comparison will be regardless of case.
339 * o REXP_BOTH_ALGORITHMS - If specified, a Boyer-Moore expression is
340 * compiled for the largest substring of the expression that does
341 * not contain any meta-characters. This is slows compiling, but
342 * speeds up large searches.
345 * Standard TCL results.
346 *-----------------------------------------------------------------------------
349 Tcl_RegExpCompile (interp
, regExpPtr
, expression
, flags
)
358 if (*expression
== '\0') {
359 Tcl_AppendResult (interp
, "Null regular expression", (char *) NULL
);
363 regExpPtr
->progPtr
= NULL
;
364 regExpPtr
->boyerMoorePtr
= NULL
;
365 regExpPtr
->noCase
= flags
& REXP_NO_CASE
;
367 if (flags
& REXP_NO_CASE
) {
368 expBuf
= ckalloc (strlen (expression
) + 1);
369 Tcl_DownShift (expBuf
, expression
);
373 anyMeta
= strpbrk (expBuf
, REXP_META
) != NULL
;
376 * If no meta-characters, use Boyer-Moore string matching only.
379 regExpPtr
->boyerMoorePtr
= BoyerMooreCompile (expBuf
, strlen (expBuf
));
384 * Build a Boyer-Moore on the largest non-meta substring, if requested,
385 * and the reg-exp does not contain a `|' (or). If less that three
386 * characters in the string, don't use B-M, as it seems not optimal at
389 if (flags
& REXP_BOTH_ALGORITHMS
) {
393 subStrLen
= FindNonRegExpSubStr (expBuf
, &subStrPtr
);
395 regExpPtr
->boyerMoorePtr
=
396 BoyerMooreCompile (subStrPtr
, subStrLen
);
400 * Compile meta-character containing regular expression.
402 tclRegexpError
= NULL
;
403 regExpPtr
->progPtr
= regcomp (expBuf
);
404 if (tclRegexpError
!= NULL
) {
405 if (flags
& REXP_NO_CASE
)
407 Tcl_AppendResult (interp
, "error in regular expression: ",
408 tclRegexpError
, (char *) NULL
);
409 if (flags
& REXP_NO_CASE
)
411 Tcl_RegExpClean (regExpPtr
);
415 if (flags
& REXP_NO_CASE
)
422 *-----------------------------------------------------------------------------
424 * Tcl_RegExpExecute --
425 * Execute a regular expression compiled with Boyer-Moore and/or
429 * o regExpPtr - Used to hold info on this regular expression.
430 * o matchStrIn - String to match against the regular expression.
431 * o matchStrLower - Optional lower case version of the string. If
432 * multiple no case matches are being done, time can be saved by
433 * down shifting the string in advance. NULL if not a no-case
434 * match or this procedure is to do the down shifting.
437 * TRUE if a match, FALSE if it does not match.
439 *-----------------------------------------------------------------------------
442 Tcl_RegExpExecute (interp
, regExpPtr
, matchStrIn
, matchStrLower
)
451 if (regExpPtr
->noCase
) {
452 if (matchStrLower
== NULL
) {
453 matchStr
= ckalloc (strlen (matchStrIn
) + 1);
454 Tcl_DownShift (matchStr
, matchStrIn
);
456 matchStr
= matchStrLower
;
458 matchStr
= matchStrIn
;
461 * If a Boyer-Moore pattern has been compiled, use that algorithm to test
462 * against the text. If that passes, then test with the regexp if we have
465 if (regExpPtr
->boyerMoorePtr
!= NULL
) {
469 startPtr
= BoyerMooreExecute (matchStr
, strlen (matchStr
),
470 regExpPtr
->boyerMoorePtr
, &matchLen
);
471 if (startPtr
== NULL
) {
475 if (regExpPtr
->progPtr
== NULL
) {
476 result
= TRUE
; /* No regexp, its a match! */
482 * Give it a go with full regular expressions
484 result
= regexec (regExpPtr
->progPtr
, matchStr
);
487 * Clean up and return status here.
490 if ((regExpPtr
->noCase
) && (matchStrLower
== NULL
))