]>
Commit | Line | Data |
---|---|---|
6a5fa4e0 MG |
1 | /* |
2 | * tclXregexp.c -- | |
3 | * | |
4 | * Tcl regular expression pattern matching utilities. | |
5 | *----------------------------------------------------------------------------- | |
6 | * Copyright 1992 Karl Lehenbauer and Mark Diekhans. | |
7 | * | |
8 | * Permission to use, copy, modify, and distribute this software and its | |
9 | * documentation for any purpose and without fee is hereby granted, provided | |
10 | * that the above copyright notice appear in all copies. Karl Lehenbauer and | |
11 | * Mark Diekhans make no representations about the suitability of this | |
12 | * software for any purpose. It is provided "as is" without express or | |
13 | * implied warranty. | |
14 | *----------------------------------------------------------------------------- | |
15 | * Boyer-Moore code from: | |
16 | * torek-boyer-moore/27-Aug-90 by | |
17 | * chris@mimsy.umd.edu (Chris Torek) | |
18 | *----------------------------------------------------------------------------- | |
19 | * $Id: tclXregexp.c,v 2.0 1992/10/16 04:51:08 markd Rel $ | |
20 | *----------------------------------------------------------------------------- | |
21 | */ | |
22 | ||
23 | #include "tclxint.h" | |
24 | #include "regexp.h" | |
25 | ||
26 | /* | |
27 | * This is declared in tclUtil.c. Must be set to NULL before compiling | |
28 | * a regular expressions. | |
29 | */ | |
30 | extern char *tclRegexpError; | |
31 | ||
32 | /* | |
33 | * Meta-characters for regular expression | |
34 | */ | |
35 | #define REXP_META "^$.[()|?+*\\" | |
36 | #define REXP_META_NO_BRACKET_NO_OR "^$.()?+*\\" | |
37 | ||
38 | #ifndef CHAR_MAX | |
39 | # define CHAR_MAX 255 | |
40 | #endif | |
41 | ||
42 | /* | |
43 | * Prototypes of internal functions. | |
44 | */ | |
45 | ||
46 | static char * | |
47 | BoyerMooreCompile _ANSI_ARGS_((char *pat, | |
48 | int patlen)); | |
49 | ||
50 | static char * | |
51 | BoyerMooreExecute _ANSI_ARGS_((char *text, | |
52 | unsigned textlen, | |
53 | char *compPtr, | |
54 | unsigned *patLenP)); | |
55 | ||
56 | static int | |
57 | FindNonRegExpSubStr _ANSI_ARGS_((char *expression, | |
58 | char **subStrPtrPtr)); | |
59 | ||
60 | \f | |
61 | /* | |
62 | * Boyer-Moore search: input is `text' (a string) and its length, | |
63 | * and a `pattern' (another string) and its length. | |
64 | * | |
65 | * The linear setup cost of this function is approximately 256 + patlen. | |
66 | * Afterwards, however, the average cost is O(textlen/patlen), and the | |
67 | * worst case is O(textlen+patlen). | |
68 | * | |
69 | * The Boyer-Moore algorithm works by observing that, for each position | |
70 | * in the text, if the character there does *not* occur somewhere in the | |
71 | * search pattern, no comparisons including that character will match. | |
72 | * That is, given the text "hello world..." and the pattern "goodbye", the | |
73 | * `w' in `world' means that none of `hello w', `ello wo', `llo wor', | |
74 | * `lo worl', `o world', ` world.', and `world..' can match. In fact, | |
75 | * exactly patlen strings are certain not to match. We can discover this | |
76 | * simply by looking at the patlen'th character. Furthermore, even if | |
77 | * the text character does occur, it may be that it rules out some number | |
78 | * of other matches. Again, we can discover this by doing the match | |
79 | * `backwards'. | |
80 | * | |
81 | * We set up a table of deltas for each possible character, with | |
82 | * delta[character] being patlen for characters not in the pattern, | |
83 | * less for characters in the pattern, growing progressively smaller | |
84 | * as we near the end of the pattern. Matching then works as follows: | |
85 | * | |
86 | * 0 1 2 3 | |
87 | * 01234567890123456789012345678901234567 | |
88 | * "Here is the string being searched into" (text) | |
89 | * ------ (pos = [0..5]) | |
90 | * "string" (pat) | |
91 | * 654321- (deltas) | |
92 | * | |
93 | * (the delta for `-' will be derived below). | |
94 | * | |
95 | * Positions 0..5 end with `i', which is not the `g' we want. `i' does | |
96 | * appear in `string', but two characters before the end. We skip | |
97 | * forward so as to make the `i's match up: | |
98 | * | |
99 | * "Here is the string being searched into" (text) | |
100 | * "string" (pos = [2..7]) | |
101 | * | |
102 | * Next we find that ` ' and `g' do not match. Since ` ' does not appear | |
103 | * in the pattern at all, we can skip forward 6: | |
104 | * | |
105 | * "Here is the string being searched into" (text) | |
106 | * "string" (pos = [8..13]) | |
107 | * | |
108 | * Comparing `t' vs `g', we again find no match, and so we obtain the | |
109 | * delta for `t', which is 4. We skip to position 17: | |
110 | * | |
111 | * "Here is the string being searched into" (text) | |
112 | * "string" (pos = [12..17]) | |
113 | * | |
114 | * It thus takes only four steps to move the search point forward to the | |
115 | * match, in this case. | |
116 | * | |
117 | * If the pattern has a recurring character, we must set the delta for | |
118 | * that character to the distance of the one closest to the end: | |
119 | * | |
120 | * "befuddle the cat" (text) | |
121 | * "fuddle" (pos = [0..5]) | |
122 | * 654321- (delta) | |
123 | * | |
124 | * We want the next search to line the `d's up like this: | |
125 | * | |
126 | * "befuddle the cat" (text) | |
127 | * "fuddle" (pos = [2..7]) | |
128 | * | |
129 | * and not like this: | |
130 | * | |
131 | * "befuddle the cat" (text) | |
132 | * "fuddle" (pos = [3..8]) | |
133 | * | |
134 | * so we take the smaller delta for d, i.e., 2. | |
135 | * | |
136 | * The last task is computing the delta we have noted above as `-': | |
137 | * | |
138 | * "candlesticks" (text) | |
139 | * "hand" (pos = [0..3]) | |
140 | * 4321- (delta) | |
141 | * | |
142 | * Here the `d' in `hand' matches the `d' in `candlesticks', but the | |
143 | * strings differ. Since there are no other `d's in `hand', we know | |
144 | * that none of (cand,andl,ndle,dles) can match, and thus we want this | |
145 | * delta to be 4 (the length of the pattern). But if we had, e.g.: | |
146 | * | |
147 | * "candlesticks" (text) | |
148 | * "deed" (pos = [0..3]) | |
149 | * 4321- (delta) | |
150 | * | |
151 | * then we should advance to line up the other `d': | |
152 | * | |
153 | * "candlesticks" (text) | |
154 | * "deed" (pos = [3..6]) | |
155 | * | |
156 | * As this suggests, the delta should be that for the `d' nearest the | |
157 | * end, but not including the end. This is easily managed by setting up | |
158 | * a delta table as follows: | |
159 | * | |
160 | * for int:c in [0..255] { delta[c] = patlen; }; | |
161 | * for int:x in [0..patlen-1) { delta[pat[x]] = patlen - (x + 1); }; | |
162 | * | |
163 | * delta[pat[patlen-1]] is never written, so the last letter inherits the | |
164 | * delta from an earlier iteration or from the previous loop. | |
165 | * | |
166 | * NB: the nonsense with `deltaspace' below exists merely because gcc | |
167 | * does a horrible job of common subexpression elimination (it does not | |
168 | * notice that the array is at a constant stack address). | |
169 | */ | |
170 | ||
171 | struct compiled_search_struct { | |
172 | unsigned patlen; | |
173 | unsigned deltaspace[CHAR_MAX + 1]; | |
174 | }; | |
175 | ||
176 | ||
177 | static char * | |
178 | BoyerMooreCompile (pat, patlen) | |
179 | char *pat; | |
180 | int patlen; | |
181 | { | |
182 | register unsigned char *p, *t; | |
183 | register unsigned i, p1, j, *delta; | |
184 | struct compiled_search_struct *cp; | |
185 | int alloc_len; | |
186 | ||
187 | /* | |
188 | * Algorithm fails if pattern is empty. | |
189 | */ | |
190 | if ((p1 = patlen) == 0) | |
191 | return (NULL); | |
192 | ||
193 | alloc_len = sizeof(struct compiled_search_struct) + patlen + 1; | |
194 | cp = (struct compiled_search_struct *) ckalloc (alloc_len); | |
195 | ||
196 | strncpy((char *)cp+sizeof(struct compiled_search_struct), pat, patlen); | |
197 | *((char *)cp+alloc_len-1) = '\0'; | |
198 | ||
199 | /* set up deltas */ | |
200 | delta = cp->deltaspace; | |
201 | ||
202 | for (i = 0; i <= CHAR_MAX; i++) | |
203 | delta[i] = p1; | |
204 | ||
205 | for (p = (unsigned char *)pat, i = p1; --i > 0;) | |
206 | delta[*p++] = i; | |
207 | ||
208 | cp->patlen = patlen; | |
209 | return((char*) cp); | |
210 | } | |
211 | ||
212 | static char * | |
213 | BoyerMooreExecute (text, textlen, compPtr, patLenP) | |
214 | char *text; | |
215 | unsigned textlen; | |
216 | char *compPtr; | |
217 | unsigned *patLenP; | |
218 | { | |
219 | register unsigned char *p, *t; | |
220 | struct compiled_search_struct *csp = | |
221 | (struct compiled_search_struct*) compPtr; | |
222 | register unsigned i, p1, j, *delta = csp->deltaspace; | |
223 | char *pat; | |
224 | unsigned patlen; | |
225 | ||
226 | *patLenP = p1 = patlen = csp->patlen; | |
227 | /* code below fails (whenever i is unsigned) if pattern too long */ | |
228 | if (p1 > textlen) | |
229 | return (NULL); | |
230 | ||
231 | pat = (char *)csp + sizeof(struct compiled_search_struct); | |
232 | /* | |
233 | * From now on, we want patlen - 1. | |
234 | * In the loop below, p points to the end of the pattern, | |
235 | * t points to the end of the text to be tested against the | |
236 | * pattern, and i counts the amount of text remaining, not | |
237 | * including the part to be tested. | |
238 | */ | |
239 | p1--; | |
240 | p = (unsigned char *)pat + p1; | |
241 | t = (unsigned char *)text + p1; | |
242 | i = textlen - patlen; | |
243 | for (;;) { | |
244 | if (*p == *t && | |
245 | memcmp((p - p1), (t - p1), p1) == 0) | |
246 | return ((char *)t - p1); | |
247 | j = delta[*t]; | |
248 | if (i < j) | |
249 | break; | |
250 | i -= j; | |
251 | t += j; | |
252 | } | |
253 | return (NULL); | |
254 | } | |
255 | ||
256 | \f | |
257 | /* | |
258 | *----------------------------------------------------------------------------- | |
259 | * | |
260 | * Tcl_RegExpClean -- | |
261 | * Free all resources associated with a regular expression info | |
262 | * structure.. | |
263 | * | |
264 | *----------------------------------------------------------------------------- | |
265 | */ | |
266 | void | |
267 | Tcl_RegExpClean (regExpPtr) | |
268 | regexp_pt regExpPtr; | |
269 | { | |
270 | if (regExpPtr->progPtr != NULL) | |
271 | ckfree ((char *) regExpPtr->progPtr); | |
272 | if (regExpPtr->boyerMoorePtr != NULL) | |
273 | ckfree ((char *) regExpPtr->boyerMoorePtr); | |
274 | } | |
275 | \f | |
276 | /* | |
277 | *----------------------------------------------------------------------------- | |
278 | * | |
279 | * FindNonRegExpSubStr | |
280 | * Find the largest substring that does not have any regular | |
281 | * expression meta-characters and is not located within `[...]'. | |
282 | * If the regexp contains an or (|), zero is returned, as the | |
283 | * Boyer-Moore optimization does not work, since there are actually | |
284 | * multiple patterns. The real solution is to build the Boyer-Moore | |
285 | * into the regular expression code. | |
286 | *----------------------------------------------------------------------------- | |
287 | */ | |
288 | static int | |
289 | FindNonRegExpSubStr (expression, subStrPtrPtr) | |
290 | char *expression; | |
291 | char **subStrPtrPtr; | |
292 | { | |
293 | register char *subStrPtr = NULL; | |
294 | register char subStrLen = 0; | |
295 | register char *scanPtr = expression; | |
296 | register int len; | |
297 | ||
298 | while (*scanPtr != '\0') { | |
299 | len = strcspn (scanPtr, REXP_META); | |
300 | /* | |
301 | * If we are at a meta-character, by-pass till non-meta. If we hit | |
302 | * a `[' then by-pass the entire `[...]' range, but be careful, could | |
303 | * have omitted `]'. In a `|' is encountered (except in brackets),' | |
304 | * we are through. | |
305 | */ | |
306 | if (len == 0) { | |
307 | scanPtr += strspn (scanPtr, REXP_META_NO_BRACKET_NO_OR); | |
308 | if (*scanPtr == '|') | |
309 | return 0; | |
310 | if (*scanPtr == '[') { | |
311 | scanPtr += strcspn (scanPtr, "]"); | |
312 | if (*scanPtr == ']') | |
313 | scanPtr++; | |
314 | } | |
315 | } else { | |
316 | if (len > subStrLen) { | |
317 | subStrPtr = scanPtr; | |
318 | subStrLen = len; | |
319 | } | |
320 | scanPtr += len; | |
321 | } | |
322 | } | |
323 | *subStrPtrPtr = subStrPtr; | |
324 | return subStrLen; | |
325 | } | |
326 | \f | |
327 | /* | |
328 | *----------------------------------------------------------------------------- | |
329 | * | |
330 | * Tcl_RegExpCompile -- | |
331 | * Compile a regular expression. | |
332 | * | |
333 | * Parameters: | |
334 | * o regExpPtr - Used to hold info on this regular expression. If the | |
335 | * structure is being reused, it Tcl_RegExpClean should be called first. | |
336 | * o expression - Regular expression to compile. | |
337 | * o flags - The following flags are recognized: | |
338 | * o REXP_NO_CASE - Comparison will be regardless of case. | |
339 | * o REXP_BOTH_ALGORITHMS - If specified, a Boyer-Moore expression is | |
340 | * compiled for the largest substring of the expression that does | |
341 | * not contain any meta-characters. This is slows compiling, but | |
342 | * speeds up large searches. | |
343 | * | |
344 | * Results: | |
345 | * Standard TCL results. | |
346 | *----------------------------------------------------------------------------- | |
347 | */ | |
348 | int | |
349 | Tcl_RegExpCompile (interp, regExpPtr, expression, flags) | |
350 | Tcl_Interp *interp; | |
351 | regexp_pt regExpPtr; | |
352 | char *expression; | |
353 | int flags; | |
354 | { | |
355 | char *expBuf; | |
356 | int anyMeta; | |
357 | ||
358 | if (*expression == '\0') { | |
359 | Tcl_AppendResult (interp, "Null regular expression", (char *) NULL); | |
360 | return TCL_ERROR; | |
361 | } | |
362 | ||
363 | regExpPtr->progPtr = NULL; | |
364 | regExpPtr->boyerMoorePtr = NULL; | |
365 | regExpPtr->noCase = flags & REXP_NO_CASE; | |
366 | ||
367 | if (flags & REXP_NO_CASE) { | |
368 | expBuf = ckalloc (strlen (expression) + 1); | |
369 | Tcl_DownShift (expBuf, expression); | |
370 | } else | |
371 | expBuf = expression; | |
372 | ||
373 | anyMeta = strpbrk (expBuf, REXP_META) != NULL; | |
374 | ||
375 | /* | |
376 | * If no meta-characters, use Boyer-Moore string matching only. | |
377 | */ | |
378 | if (!anyMeta) { | |
379 | regExpPtr->boyerMoorePtr = BoyerMooreCompile (expBuf, strlen (expBuf)); | |
380 | goto okExitPoint; | |
381 | } | |
382 | ||
383 | /* | |
384 | * Build a Boyer-Moore on the largest non-meta substring, if requested, | |
385 | * and the reg-exp does not contain a `|' (or). If less that three | |
386 | * characters in the string, don't use B-M, as it seems not optimal at | |
387 | * this point. | |
388 | */ | |
389 | if (flags & REXP_BOTH_ALGORITHMS) { | |
390 | char *subStrPtr; | |
391 | int subStrLen; | |
392 | ||
393 | subStrLen = FindNonRegExpSubStr (expBuf, &subStrPtr); | |
394 | if (subStrLen > 2) | |
395 | regExpPtr->boyerMoorePtr = | |
396 | BoyerMooreCompile (subStrPtr, subStrLen); | |
397 | } | |
398 | ||
399 | /* | |
400 | * Compile meta-character containing regular expression. | |
401 | */ | |
402 | tclRegexpError = NULL; | |
403 | regExpPtr->progPtr = regcomp (expBuf); | |
404 | if (tclRegexpError != NULL) { | |
405 | if (flags & REXP_NO_CASE) | |
406 | ckfree (expBuf); | |
407 | Tcl_AppendResult (interp, "error in regular expression: ", | |
408 | tclRegexpError, (char *) NULL); | |
409 | if (flags & REXP_NO_CASE) | |
410 | ckfree (expBuf); | |
411 | Tcl_RegExpClean (regExpPtr); | |
412 | } | |
413 | ||
414 | okExitPoint: | |
415 | if (flags & REXP_NO_CASE) | |
416 | ckfree (expBuf); | |
417 | return TCL_OK; | |
418 | ||
419 | } | |
420 | \f | |
421 | /* | |
422 | *----------------------------------------------------------------------------- | |
423 | * | |
424 | * Tcl_RegExpExecute -- | |
425 | * Execute a regular expression compiled with Boyer-Moore and/or | |
426 | * regexp. | |
427 | * | |
428 | * Parameters: | |
429 | * o regExpPtr - Used to hold info on this regular expression. | |
430 | * o matchStrIn - String to match against the regular expression. | |
431 | * o matchStrLower - Optional lower case version of the string. If | |
432 | * multiple no case matches are being done, time can be saved by | |
433 | * down shifting the string in advance. NULL if not a no-case | |
434 | * match or this procedure is to do the down shifting. | |
435 | * | |
436 | * Results: | |
437 | * TRUE if a match, FALSE if it does not match. | |
438 | * | |
439 | *----------------------------------------------------------------------------- | |
440 | */ | |
441 | int | |
442 | Tcl_RegExpExecute (interp, regExpPtr, matchStrIn, matchStrLower) | |
443 | Tcl_Interp *interp; | |
444 | regexp_pt regExpPtr; | |
445 | char *matchStrIn; | |
446 | char *matchStrLower; | |
447 | { | |
448 | char *matchStr; | |
449 | int result; | |
450 | ||
451 | if (regExpPtr->noCase) { | |
452 | if (matchStrLower == NULL) { | |
453 | matchStr = ckalloc (strlen (matchStrIn) + 1); | |
454 | Tcl_DownShift (matchStr, matchStrIn); | |
455 | } else | |
456 | matchStr = matchStrLower; | |
457 | } else | |
458 | matchStr = matchStrIn; | |
459 | ||
460 | /* | |
461 | * If a Boyer-Moore pattern has been compiled, use that algorithm to test | |
462 | * against the text. If that passes, then test with the regexp if we have | |
463 | * it. | |
464 | */ | |
465 | if (regExpPtr->boyerMoorePtr != NULL) { | |
466 | char *startPtr; | |
467 | unsigned matchLen; | |
468 | ||
469 | startPtr = BoyerMooreExecute (matchStr, strlen (matchStr), | |
470 | regExpPtr->boyerMoorePtr, &matchLen); | |
471 | if (startPtr == NULL) { | |
472 | result = FALSE; | |
473 | goto exitPoint; | |
474 | } | |
475 | if (regExpPtr->progPtr == NULL) { | |
476 | result = TRUE; /* No regexp, its a match! */ | |
477 | goto exitPoint; | |
478 | } | |
479 | } | |
480 | ||
481 | /* | |
482 | * Give it a go with full regular expressions | |
483 | */ | |
484 | result = regexec (regExpPtr->progPtr, matchStr); | |
485 | ||
486 | /* | |
487 | * Clean up and return status here. | |
488 | */ | |
489 | exitPoint: | |
490 | if ((regExpPtr->noCase) && (matchStrLower == NULL)) | |
491 | ckfree (matchStr); | |
492 | return result; | |
493 | } |