git.zerfleddert.de Git - micropolis/blob - src/tclx/src/tclxrexp.c

   1 /*
   2  * tclXregexp.c --
   3  *
   4  * Tcl regular expression pattern matching utilities.
   5  *-----------------------------------------------------------------------------
   6  * Copyright 1992 Karl Lehenbauer and Mark Diekhans.
   7  *
   8  * Permission to use, copy, modify, and distribute this software and its
   9  * documentation for any purpose and without fee is hereby granted, provided
  10  * that the above copyright notice appear in all copies.  Karl Lehenbauer and
  11  * Mark Diekhans make no representations about the suitability of this
  12  * software for any purpose.  It is provided "as is" without express or
  13  * implied warranty.
  14  *-----------------------------------------------------------------------------
  15  * Boyer-Moore code from:
  16  *     torek-boyer-moore/27-Aug-90 by
  17  *     chris@mimsy.umd.edu (Chris Torek)
  18  *-----------------------------------------------------------------------------
  19  * $Id: tclXregexp.c,v 2.0 1992/10/16 04:51:08 markd Rel $
  20  *-----------------------------------------------------------------------------
  21  */
  22
  23 #include "tclxint.h"
  24 #include "regexp.h"
  25
  26 /*
  27  * This is declared in tclUtil.c.  Must be set to NULL before compiling
  28  * a regular expressions.
  29  */
  30 extern char *tclRegexpError;
  31
  32 /*
  33  * Meta-characters for regular expression
  34  */
  35 #define REXP_META                   "^$.[()|?+*\\"
  36 #define REXP_META_NO_BRACKET_NO_OR  "^$.()?+*\\"
  37
  38 #ifndef CHAR_MAX
  39 #    define CHAR_MAX 255
  40 #endif
  41
  42 /*
  43  * Prototypes of internal functions.
  44  */
  45
  46 static char *
  47 BoyerMooreCompile _ANSI_ARGS_((char *pat,
  48                                   int patlen));
  49
  50 static char *
  51 BoyerMooreExecute _ANSI_ARGS_((char     *text,
  52                                unsigned  textlen,
  53                                char     *compPtr,
  54                                unsigned *patLenP));
  55
  56 static int
  57 FindNonRegExpSubStr _ANSI_ARGS_((char  *expression,
  58                                  char **subStrPtrPtr));
  59
  60 \f
  61 /*
  62  * Boyer-Moore search: input is `text' (a string) and its length,
  63  * and a `pattern' (another string) and its length.
  64  *
  65  * The linear setup cost of this function is approximately 256 + patlen.
  66  * Afterwards, however, the average cost is O(textlen/patlen), and the
  67  * worst case is O(textlen+patlen).
  68  *
  69  * The Boyer-Moore algorithm works by observing that, for each position
  70  * in the text, if the character there does *not* occur somewhere in the
  71  * search pattern, no comparisons including that character will match.
  72  * That is, given the text "hello world..." and the pattern "goodbye", the
  73  * `w' in `world' means that none of `hello w', `ello wo', `llo wor',
  74  * `lo worl', `o world', ` world.', and `world..' can match.  In fact,
  75  * exactly patlen strings are certain not to match.  We can discover this
  76  * simply by looking at the patlen'th character.  Furthermore, even if
  77  * the text character does occur, it may be that it rules out some number
  78  * of other matches.  Again, we can discover this by doing the match
  79  * `backwards'.
  80  *
  81  * We set up a table of deltas for each possible character, with
  82  * delta[character] being patlen for characters not in the pattern,
  83  * less for characters in the pattern, growing progressively smaller
  84  * as we near the end of the pattern.  Matching then works as follows:
  85  *
  86  *       0         1         2         3
  87  *       01234567890123456789012345678901234567
  88  *      "Here is the string being searched into"        (text)
  89  *       ------                                         (pos = [0..5])
  90  *      "string"                                        (pat)
  91  *      654321-                                         (deltas)
  92  *
  93  * (the delta for `-' will be derived below).
  94  *
  95  * Positions 0..5 end with `i', which is not the `g' we want.  `i' does
  96  * appear in `string', but two characters before the end.  We skip
  97  * forward so as to make the `i's match up:
  98  *
  99  *      "Here is the string being searched into"        (text)
 100  *        "string"                                      (pos = [2..7])
 101  *
 102  * Next we find that ` ' and `g' do not match.  Since ` ' does not appear
 103  * in the pattern at all, we can skip forward 6:
 104  *
 105  *      "Here is the string being searched into"        (text)
 106  *              "string"                                (pos = [8..13])
 107  *
 108  * Comparing `t' vs `g', we again find no match, and so we obtain the
 109  * delta for `t', which is 4.  We skip to position 17:
 110  *
 111  *      "Here is the string being searched into"        (text)
 112  *                  "string"                            (pos = [12..17])
 113  *
 114  * It thus takes only four steps to move the search point forward to the
 115  * match, in this case.
 116  *
 117  * If the pattern has a recurring character, we must set the delta for
 118  * that character to the distance of the one closest to the end:
 119  *
 120  *      "befuddle the cat"      (text)
 121  *      "fuddle"                (pos = [0..5])
 122  *      654321-                 (delta)
 123  *
 124  * We want the next search to line the `d's up like this:
 125  *
 126  *      "befuddle the cat"      (text)
 127  *        "fuddle"              (pos = [2..7])
 128  *
 129  * and not like this:
 130  *
 131  *      "befuddle the cat"      (text)
 132  *         "fuddle"             (pos = [3..8])
 133  *
 134  * so we take the smaller delta for d, i.e., 2.
 135  *
 136  * The last task is computing the delta we have noted above as `-':
 137  *
 138  *      "candlesticks"          (text)
 139  *      "hand"                  (pos = [0..3])
 140  *      4321-                   (delta)
 141  *
 142  * Here the `d' in `hand' matches the `d' in `candlesticks', but the
 143  * strings differ.  Since there are no other `d's in `hand', we know
 144  * that none of (cand,andl,ndle,dles) can match, and thus we want this
 145  * delta to be 4 (the length of the pattern).  But if we had, e.g.:
 146  *
 147  *      "candlesticks"          (text)
 148  *      "deed"                  (pos = [0..3])
 149  *      4321-                   (delta)
 150  *
 151  * then we should advance to line up the other `d':
 152  *
 153  *      "candlesticks"          (text)
 154  *         "deed"               (pos = [3..6])
 155  *
 156  * As this suggests, the delta should be that for the `d' nearest the
 157  * end, but not including the end.  This is easily managed by setting up
 158  * a delta table as follows:
 159  *
 160  *      for int:c in [0..255] { delta[c] = patlen; };
 161  *      for int:x in [0..patlen-1) { delta[pat[x]] = patlen - (x + 1); };
 162  *
 163  * delta[pat[patlen-1]] is never written, so the last letter inherits the
 164  * delta from an earlier iteration or from the previous loop.
 165  *
 166  * NB: the nonsense with `deltaspace' below exists merely because gcc
 167  * does a horrible job of common subexpression elimination (it does not
 168  * notice that the array is at a constant stack address).
 169  */
 170
 171 struct compiled_search_struct {
 172         unsigned patlen;
 173         unsigned deltaspace[CHAR_MAX + 1];
 174 };
 175
 176
 177 static char *
 178 BoyerMooreCompile (pat, patlen)
 179     char *pat;
 180     int   patlen;
 181 {
 182         register unsigned char *p, *t;
 183         register unsigned i, p1, j, *delta;
 184         struct compiled_search_struct *cp;
 185         int alloc_len;
 186
 187         /*
 188          * Algorithm fails if pattern is empty.
 189          */
 190         if ((p1 = patlen) == 0)
 191                 return (NULL);
 192
 193         alloc_len = sizeof(struct compiled_search_struct) + patlen + 1;
 194         cp = (struct compiled_search_struct *) ckalloc (alloc_len);
 195
 196         strncpy((char *)cp+sizeof(struct compiled_search_struct), pat, patlen);
 197         *((char *)cp+alloc_len-1) = '\0';
 198
 199         /* set up deltas */
 200         delta = cp->deltaspace;
 201
 202         for (i = 0; i <= CHAR_MAX; i++)
 203                 delta[i] = p1;
 204
 205         for (p = (unsigned char *)pat, i = p1; --i > 0;)
 206                 delta[*p++] = i;
 207
 208         cp->patlen = patlen;
 209         return((char*) cp);
 210 }
 211
 212 static char *
 213 BoyerMooreExecute (text, textlen, compPtr, patLenP)
 214         char     *text;
 215         unsigned  textlen;
 216         char     *compPtr;
 217         unsigned *patLenP;
 218 {
 219         register unsigned char *p, *t;
 220         struct compiled_search_struct *csp =
 221                 (struct compiled_search_struct*) compPtr;
 222         register unsigned i, p1, j, *delta = csp->deltaspace;
 223         char *pat;
 224         unsigned patlen;
 225
 226         *patLenP = p1 = patlen = csp->patlen;
 227         /* code below fails (whenever i is unsigned) if pattern too long */
 228         if (p1 > textlen)
 229                 return (NULL);
 230
 231         pat = (char *)csp + sizeof(struct compiled_search_struct);
 232         /*
 233          * From now on, we want patlen - 1.
 234          * In the loop below, p points to the end of the pattern,
 235          * t points to the end of the text to be tested against the
 236          * pattern, and i counts the amount of text remaining, not
 237          * including the part to be tested.
 238          */
 239         p1--;
 240         p = (unsigned char *)pat + p1;
 241         t = (unsigned char *)text + p1;
 242         i = textlen - patlen;
 243         for (;;) {
 244                 if (*p == *t &&
 245                     memcmp((p - p1), (t - p1), p1) == 0)
 246                         return ((char *)t - p1);
 247                 j = delta[*t];
 248                 if (i < j)
 249                         break;
 250                 i -= j;
 251                 t += j;
 252         }
 253         return (NULL);
 254 }
 255
 256 \f
 257 /*
 258  *-----------------------------------------------------------------------------
 259  *
 260  * Tcl_RegExpClean --
 261  *     Free all resources associated with a regular expression info
 262  *     structure..
 263  *
 264  *-----------------------------------------------------------------------------
 265  */
 266 void
 267 Tcl_RegExpClean (regExpPtr)
 268     regexp_pt regExpPtr;
 269 {
 270     if (regExpPtr->progPtr != NULL)
 271         ckfree ((char *) regExpPtr->progPtr);
 272     if (regExpPtr->boyerMoorePtr != NULL)
 273         ckfree ((char *) regExpPtr->boyerMoorePtr);
 274 }
 275 \f
 276 /*
 277  *-----------------------------------------------------------------------------
 278  *
 279  * FindNonRegExpSubStr
 280  *     Find the largest substring that does not have any regular
 281  *     expression meta-characters and is not located within `[...]'.
 282  *     If the regexp contains an or (|), zero is returned, as the
 283  *     Boyer-Moore optimization does not work, since there are actually
 284  *     multiple patterns.  The real solution is to build the Boyer-Moore
 285  *     into the regular expression code.
 286  *-----------------------------------------------------------------------------
 287  */
 288 static int
 289 FindNonRegExpSubStr (expression, subStrPtrPtr)
 290     char  *expression;
 291     char **subStrPtrPtr;
 292 {
 293     register char *subStrPtr = NULL;
 294     register char  subStrLen = 0;
 295     register char *scanPtr   = expression;
 296     register int   len;
 297
 298     while (*scanPtr != '\0') {
 299         len = strcspn (scanPtr, REXP_META);
 300         /*
 301          * If we are at a meta-character, by-pass till non-meta.  If we hit
 302          * a `[' then by-pass the entire `[...]' range, but be careful, could
 303          * have omitted `]'.  In a `|' is encountered (except in brackets),'
 304          * we are through.
 305          */
 306         if (len == 0) {
 307             scanPtr += strspn (scanPtr, REXP_META_NO_BRACKET_NO_OR);
 308             if (*scanPtr == '|')
 309                 return 0;
 310             if (*scanPtr == '[') {
 311                 scanPtr += strcspn (scanPtr, "]");
 312                 if (*scanPtr == ']')
 313                     scanPtr++;
 314             }
 315         } else {
 316             if (len > subStrLen) {
 317                 subStrPtr = scanPtr;
 318                 subStrLen = len;
 319             }
 320             scanPtr += len;
 321         }
 322     }
 323     *subStrPtrPtr = subStrPtr;
 324     return subStrLen;
 325 }
 326 \f
 327 /*
 328  *-----------------------------------------------------------------------------
 329  *
 330  * Tcl_RegExpCompile --
 331  *     Compile a regular expression.
 332  *
 333  * Parameters:
 334  *     o regExpPtr - Used to hold info on this regular expression.  If the
 335  *       structure is being reused, it Tcl_RegExpClean should be called first.
 336  *     o expression - Regular expression to compile.
 337  *     o flags - The following flags are recognized:
 338  *         o REXP_NO_CASE - Comparison will be regardless of case.
 339  *         o REXP_BOTH_ALGORITHMS - If specified, a Boyer-Moore expression is
 340  *           compiled for the largest substring of the expression that does
 341  *           not contain any meta-characters.  This is slows compiling, but
 342  *           speeds up large searches.
 343  *
 344  * Results:
 345  *     Standard TCL results.
 346  *-----------------------------------------------------------------------------
 347  */
 348 int
 349 Tcl_RegExpCompile (interp, regExpPtr, expression, flags)
 350     Tcl_Interp  *interp;
 351     regexp_pt    regExpPtr;
 352     char        *expression;
 353     int          flags;
 354 {
 355     char *expBuf;
 356     int   anyMeta;
 357
 358     if (*expression == '\0') {
 359         Tcl_AppendResult (interp, "Null regular expression", (char *) NULL);
 360         return TCL_ERROR;
 361     }
 362
 363     regExpPtr->progPtr = NULL;
 364     regExpPtr->boyerMoorePtr = NULL;
 365     regExpPtr->noCase = flags & REXP_NO_CASE;
 366
 367     if (flags & REXP_NO_CASE) {
 368         expBuf = ckalloc (strlen (expression) + 1);
 369         Tcl_DownShift (expBuf, expression);
 370     } else
 371         expBuf = expression;
 372
 373     anyMeta = strpbrk (expBuf, REXP_META) != NULL;
 374
 375     /*
 376      * If no meta-characters, use Boyer-Moore string matching only.
 377      */
 378     if (!anyMeta) {
 379         regExpPtr->boyerMoorePtr = BoyerMooreCompile (expBuf, strlen (expBuf));
 380         goto okExitPoint;
 381     }
 382
 383     /*
 384      * Build a Boyer-Moore on the largest non-meta substring, if requested,
 385      * and the reg-exp does not contain a `|' (or).  If less that three
 386      * characters in the string, don't use B-M, as it seems not optimal at
 387      * this point.
 388      */
 389     if (flags & REXP_BOTH_ALGORITHMS) {
 390         char *subStrPtr;
 391         int   subStrLen;
 392
 393         subStrLen = FindNonRegExpSubStr (expBuf, &subStrPtr);
 394         if (subStrLen > 2)
 395             regExpPtr->boyerMoorePtr =
 396                 BoyerMooreCompile (subStrPtr, subStrLen);
 397     }
 398
 399     /*
 400      * Compile meta-character containing regular expression.
 401      */
 402     tclRegexpError = NULL;
 403     regExpPtr->progPtr = regcomp (expBuf);
 404     if (tclRegexpError != NULL) {
 405         if (flags & REXP_NO_CASE)
 406             ckfree (expBuf);
 407         Tcl_AppendResult (interp, "error in regular expression: ",
 408                           tclRegexpError, (char *) NULL);
 409         if (flags & REXP_NO_CASE)
 410             ckfree (expBuf);
 411         Tcl_RegExpClean (regExpPtr);
 412     }
 413
 414 okExitPoint:
 415     if (flags & REXP_NO_CASE)
 416         ckfree (expBuf);
 417     return TCL_OK;
 418
 419 }
 420 \f
 421 /*
 422  *-----------------------------------------------------------------------------
 423  *
 424  * Tcl_RegExpExecute --
 425  *     Execute a regular expression compiled with Boyer-Moore and/or
 426  *     regexp.
 427  *
 428  * Parameters:
 429  *     o regExpPtr - Used to hold info on this regular expression.
 430  *     o matchStrIn - String to match against the regular expression.
 431  *     o matchStrLower - Optional lower case version of the string.  If
 432  *       multiple no case matches are being done, time can be saved by
 433  *       down shifting the string in advance.  NULL if not a no-case
 434  *       match or this procedure is to do the down shifting.
 435  *
 436  * Results:
 437  *     TRUE if a match, FALSE if it does not match.
 438  *
 439  *-----------------------------------------------------------------------------
 440  */
 441 int
 442 Tcl_RegExpExecute (interp, regExpPtr, matchStrIn, matchStrLower)
 443     Tcl_Interp  *interp;
 444     regexp_pt    regExpPtr;
 445     char        *matchStrIn;
 446     char        *matchStrLower;
 447 {
 448     char *matchStr;
 449     int   result;
 450
 451     if (regExpPtr->noCase) {
 452         if (matchStrLower == NULL) {
 453             matchStr = ckalloc (strlen (matchStrIn) + 1);
 454             Tcl_DownShift (matchStr, matchStrIn);
 455         } else
 456             matchStr = matchStrLower;
 457     } else
 458         matchStr = matchStrIn;
 459
 460     /*
 461      * If a Boyer-Moore pattern has been compiled, use that algorithm to test
 462      * against the text.  If that passes, then test with the regexp if we have
 463      * it.
 464      */
 465     if (regExpPtr->boyerMoorePtr != NULL) {
 466         char     *startPtr;
 467         unsigned  matchLen;
 468
 469         startPtr = BoyerMooreExecute (matchStr, strlen (matchStr),
 470                                       regExpPtr->boyerMoorePtr, &matchLen);
 471         if (startPtr == NULL) {
 472             result = FALSE;
 473             goto exitPoint;
 474         }
 475         if (regExpPtr->progPtr == NULL) {
 476             result = TRUE;  /* No regexp, its a match! */
 477             goto exitPoint;
 478         }
 479     }
 480
 481     /*
 482      * Give it a go with full regular expressions
 483      */
 484     result = regexec (regExpPtr->progPtr, matchStr);
 485
 486     /*
 487      * Clean up and return status here.
 488      */
 489 exitPoint:
 490     if ((regExpPtr->noCase) && (matchStrLower == NULL))
 491         ckfree (matchStr);
 492     return result;
 493 }