[micropolis] / src / tclx / src / tclxrexp.c

/*
 * tclXregexp.c --
 *
 * Tcl regular expression pattern matching utilities.
 *-----------------------------------------------------------------------------
 * Copyright 1992 Karl Lehenbauer and Mark Diekhans.
 *
 * Permission to use, copy, modify, and distribute this software and its
 * documentation for any purpose and without fee is hereby granted, provided
 * that the above copyright notice appear in all copies.  Karl Lehenbauer and
 * Mark Diekhans make no representations about the suitability of this
 * software for any purpose.  It is provided "as is" without express or
 * implied warranty.
 *-----------------------------------------------------------------------------
 * Boyer-Moore code from: 
 *     torek-boyer-moore/27-Aug-90 by
 *     chris@mimsy.umd.edu (Chris Torek)
 *-----------------------------------------------------------------------------
 * $Id: tclXregexp.c,v 2.0 1992/10/16 04:51:08 markd Rel $
 *-----------------------------------------------------------------------------
 */

#include "tclxint.h"
#include "regexp.h"

/*
 * This is declared in tclUtil.c.  It must be set to NULL before compiling
 * a regular expression.
 */
extern char *tclRegexpError;

/*
 * Meta-characters for regular expression
 */
#define REXP_META                   "^$.[()|?+*\\"
#define REXP_META_NO_BRACKET_NO_OR  "^$.()?+*\\"

#ifndef CHAR_MAX
#    define CHAR_MAX 255
#endif

/*
 * Prototypes of internal functions.
 */

/*
 * Compile a literal pattern of patlen characters into an allocated
 * Boyer-Moore search structure.  Returns NULL if the pattern is empty.
 */
static char *
BoyerMooreCompile _ANSI_ARGS_((char *pat,
                                  int patlen));

/*
 * Search text (textlen characters) for the pattern compiled into compPtr.
 * Returns a pointer to the first occurrence or NULL; the pattern length is
 * stored in *patLenP.
 */
static char *
BoyerMooreExecute _ANSI_ARGS_((char     *text,
                               unsigned  textlen,
                               char     *compPtr,
                               unsigned *patLenP));

/*
 * Find the longest run of characters in a regular expression that contains
 * no meta-characters.  Returns the length of the run and stores its start
 * in *subStrPtrPtr; returns 0 if a `|' is found outside of brackets.
 */
static int
FindNonRegExpSubStr _ANSI_ARGS_((char  *expression,
                                 char **subStrPtrPtr));

\f
/*
 * Boyer-Moore search: input is `text' (a string) and its length,
 * and a `pattern' (another string) and its length.
 *
 * The linear setup cost of this function is approximately 256 + patlen.
 * Afterwards, however, the average cost is O(textlen/patlen), and the
 * worst case is O(textlen+patlen).
 *
 * The Boyer-Moore algorithm works by observing that, for each position
 * in the text, if the character there does *not* occur somewhere in the
 * search pattern, no comparisons including that character will match.
 * That is, given the text "hello world..." and the pattern "goodbye", the
 * `w' in `world' means that none of `hello w', `ello wo', `llo wor',
 * `lo worl', `o world', ` world.', and `world..' can match.  In fact,
 * exactly patlen strings are certain not to match.  We can discover this
 * simply by looking at the patlen'th character.  Furthermore, even if
 * the text character does occur, it may be that it rules out some number
 * of other matches.  Again, we can discover this by doing the match
 * `backwards'.
 *
 * We set up a table of deltas for each possible character, with
 * delta[character] being patlen for characters not in the pattern,
 * less for characters in the pattern, growing progressively smaller
 * as we near the end of the pattern.  Matching then works as follows:
 *
 *       0         1         2         3
 *       01234567890123456789012345678901234567
 *      "Here is the string being searched into"        (text)
 *       ------                                         (pos = [0..5])
 *      "string"                                        (pat)
 *      654321-                                         (deltas)
 *
 * (the delta for `-' will be derived below).
 *
 * Positions 0..5 end with `i', which is not the `g' we want.  `i' does
 * appear in `string', but two characters before the end.  We skip
 * forward so as to make the `i's match up:
 *
 *      "Here is the string being searched into"        (text)
 *        "string"                                      (pos = [2..7])
 *
 * Next we find that ` ' and `g' do not match.  Since ` ' does not appear
 * in the pattern at all, we can skip forward 6:
 *
 *      "Here is the string being searched into"        (text)
 *              "string"                                (pos = [8..13])
 *
 * Comparing `t' vs `g', we again find no match, and so we obtain the
 * delta for `t', which is 4.  We skip to position 17:
 *
 *      "Here is the string being searched into"        (text)
 *                  "string"                            (pos = [12..17])
 *
 * It thus takes only four steps to move the search point forward to the
 * match, in this case.
 *
 * If the pattern has a recurring character, we must set the delta for
 * that character to the distance of the one closest to the end:
 *
 *      "befuddle the cat"      (text)
 *      "fuddle"                (pos = [0..5])
 *      654321-                 (delta)
 *
 * We want the next search to line the `d's up like this:
 *
 *      "befuddle the cat"      (text)
 *        "fuddle"              (pos = [2..7])
 *
 * and not like this:
 *
 *      "befuddle the cat"      (text)
 *         "fuddle"             (pos = [3..8])
 *
 * so we take the smaller delta for d, i.e., 2.
 *
 * The last task is computing the delta we have noted above as `-':
 *
 *      "candlesticks"          (text)
 *      "hand"                  (pos = [0..3])
 *      4321-                   (delta)
 *
 * Here the `d' in `hand' matches the `d' in `candlesticks', but the
 * strings differ.  Since there are no other `d's in `hand', we know
 * that none of (cand,andl,ndle,dles) can match, and thus we want this
 * delta to be 4 (the length of the pattern).  But if we had, e.g.:
 *
 *      "candlesticks"          (text)
 *      "deed"                  (pos = [0..3])
 *      4321-                   (delta)
 *
 * then we should advance to line up the other `d':
 *
 *      "candlesticks"          (text)
 *         "deed"               (pos = [3..6])
 *
 * As this suggests, the delta should be that for the `d' nearest the
 * end, but not including the end.  This is easily managed by setting up
 * a delta table as follows:
 *
 *      for int:c in [0..255] { delta[c] = patlen; };
 *      for int:x in [0..patlen-1) { delta[pat[x]] = patlen - (x + 1); };
 *
 * delta[pat[patlen-1]] is never written, so the last letter inherits the
 * delta from an earlier iteration or from the previous loop.
 *
 * NB: the nonsense with `deltaspace' below exists merely because gcc
 * does a horrible job of common subexpression elimination (it does not
 * notice that the array is at a constant stack address).
 */

/*
 * Number of entries in the delta table: one for every value an unsigned
 * char can hold.  The table is indexed with unsigned char values, so it must
 * not be sized from CHAR_MAX, which is only 127 on platforms where plain
 * char is signed.  The value is derived from the type itself so that no
 * additional header is required.
 */
#define BM_DELTA_ENTRIES ((unsigned) ((unsigned char) ~0) + 1)

/*
 * Compiled form of a Boyer-Moore search pattern.  A NUL-terminated copy of
 * the pattern text is stored in the same allocation, immediately after this
 * structure.
 */
struct compiled_search_struct {
        unsigned patlen;                        /* Length of the pattern.    */
        unsigned deltaspace[BM_DELTA_ENTRIES];  /* Skip distance per byte.   */
};


/*
 * BoyerMooreCompile --
 *     Build the Boyer-Moore delta table for a literal pattern.
 *
 * Parameters:
 *     o pat - Pattern text; only the first patlen characters are used.
 *     o patlen - Number of characters in the pattern.
 *
 * Results:
 *     A pointer to an allocated struct compiled_search_struct (cast to
 *     char *) that the caller releases with ckfree, or NULL if patlen is
 *     not positive.
 */
static char *
BoyerMooreCompile (pat, patlen)
    char *pat;
    int   patlen;
{
        register unsigned char *p;
        register unsigned i, p1, *delta;
        struct compiled_search_struct *cp;
        char *patCopy;
        int alloc_len;

        /*
         * Algorithm fails if pattern is empty; a negative length is
         * rejected as well, since it would wrap when stored as unsigned.
         */
        if (patlen <= 0)
                return (NULL);
        p1 = patlen;

        alloc_len = sizeof(struct compiled_search_struct) + patlen + 1;
        cp = (struct compiled_search_struct *) ckalloc (alloc_len);

        /*
         * Keep a private, NUL-terminated copy of the pattern right behind
         * the structure.  The terminator is written explicitly because
         * strncpy does not add one when it copies exactly patlen characters.
         */
        patCopy = (char *)cp + sizeof(struct compiled_search_struct);
        strncpy(patCopy, pat, patlen);
        patCopy[patlen] = '\0';

        /* set up deltas: default skip is the whole pattern length */
        delta = cp->deltaspace;

        for (i = 0; i < BM_DELTA_ENTRIES; i++)
                delta[i] = p1;

        /*
         * Every character except the last gets its distance from the end
         * of the pattern; later occurrences overwrite earlier ones, so the
         * occurrence closest to the end wins.
         */
        for (p = (unsigned char *)pat, i = p1; --i > 0;)
                delta[*p++] = i;

        cp->patlen = patlen;
        return((char*) cp);
}

/*
 * BoyerMooreExecute --
 *     Search a text for a pattern previously compiled by BoyerMooreCompile.
 *
 * Parameters:
 *     o text - Text to search.
 *     o textlen - Number of characters in text.
 *     o compPtr - Compiled pattern returned by BoyerMooreCompile.
 *     o patLenP - The length of the pattern is returned here, whether or
 *       not a match is found.
 *
 * Results:
 *     A pointer to the first occurrence of the pattern within text, or NULL
 *     if the pattern does not occur (or is longer than the text).
 */
static char *
BoyerMooreExecute (text, textlen, compPtr, patLenP)
        char     *text;
        unsigned  textlen;
        char     *compPtr;
        unsigned *patLenP;
{
        struct compiled_search_struct *compiled =
                (struct compiled_search_struct *) compPtr;
        unsigned      *skipTable = compiled->deltaspace;
        unsigned       patLen    = compiled->patlen;
        unsigned       lastIdx;      /* Index of last pattern character.   */
        unsigned       remaining;    /* Text left beyond current window.   */
        unsigned       skip;
        unsigned char *patStart;
        unsigned char *windowEnd;    /* Text char aligned with pattern end. */

        *patLenP = patLen;

        /* The unsigned arithmetic below requires the pattern to fit. */
        if (textlen < patLen)
                return (NULL);

        /* The pattern text is stored directly behind the structure. */
        patStart  = (unsigned char *) compPtr +
                sizeof(struct compiled_search_struct);
        lastIdx   = patLen - 1;
        windowEnd = (unsigned char *) text + lastIdx;

        for (remaining = textlen - patLen; ;
             remaining -= skip, windowEnd += skip) {
                /*
                 * Check the last character first; only when it agrees is
                 * the rest of the window compared.
                 */
                if (patStart[lastIdx] == *windowEnd) {
                        unsigned char *windowStart = windowEnd - lastIdx;

                        if (memcmp(patStart, windowStart, lastIdx) == 0)
                                return ((char *) windowStart);
                }

                /* Slide forward by the skip for the current text char. */
                skip = skipTable[*windowEnd];
                if (remaining < skip)
                        break;
        }
        return (NULL);
}

\f
/*
 *-----------------------------------------------------------------------------
 *
 * Tcl_RegExpClean --
 *     Free all resources associated with a regular expression info
 *     structure.  Each pointer is reset to NULL once it has been freed, so
 *     cleaning the same structure more than once is harmless.
 *
 * Parameters:
 *     o regExpPtr - Regular expression info structure to clean.
 *
 *-----------------------------------------------------------------------------
 */
void
Tcl_RegExpClean (regExpPtr)
    regexp_pt regExpPtr;
{
    if (regExpPtr->progPtr != NULL) {
        ckfree ((char *) regExpPtr->progPtr);
        regExpPtr->progPtr = NULL;
    }
    if (regExpPtr->boyerMoorePtr != NULL) {
        ckfree ((char *) regExpPtr->boyerMoorePtr);
        regExpPtr->boyerMoorePtr = NULL;
    }
}
\f
/*
 *-----------------------------------------------------------------------------
 *
 * FindNonRegExpSubStr
 *     Find the largest substring that does not have any regular 
 *     expression meta-characters and is not located within `[...]'.
 *     If the regexp contains an or (|), zero is returned, as the 
 *     Boyer-Moore optimization does not work, since there are actually
 *     multiple patterns.  The real solution is to build the Boyer-Moore
 *     into the regular expression code.
 *-----------------------------------------------------------------------------
 */
static int
FindNonRegExpSubStr (expression, subStrPtrPtr)
    char  *expression;
    char **subStrPtrPtr;
{
    register char *subStrPtr = NULL;
    register char  subStrLen = 0;
    register char *scanPtr   = expression;
    register int   len;

    while (*scanPtr != '\0') {
        len = strcspn (scanPtr, REXP_META);
        /*
         * If we are at a meta-character, by-pass till non-meta.  If we hit
         * a `[' then by-pass the entire `[...]' range, but be careful, could
         * have omitted `]'.  In a `|' is encountered (except in brackets),'
         * we are through.
         */
        if (len == 0) {
            scanPtr += strspn (scanPtr, REXP_META_NO_BRACKET_NO_OR);
            if (*scanPtr == '|')
                return 0;
            if (*scanPtr == '[') {
                scanPtr += strcspn (scanPtr, "]");
                if (*scanPtr == ']')
                    scanPtr++;
            }          
        } else {
            if (len > subStrLen) {
                subStrPtr = scanPtr;
                subStrLen = len;
            }
            scanPtr += len;
        }
    }
    *subStrPtrPtr = subStrPtr;
    return subStrLen;
}
\f
/*
 *-----------------------------------------------------------------------------
 *
 * Tcl_RegExpCompile --
 *     Compile a regular expression.
 *
 * Parameters:
 *     o interp - Interpreter in whose result an error message is left if the
 *       expression cannot be compiled.
 *     o regExpPtr - Used to hold info on this regular expression.  If the
 *       structure is being reused, Tcl_RegExpClean should be called first.
 *     o expression - Regular expression to compile.
 *     o flags - The following flags are recognized:
 *         o REXP_NO_CASE - Comparison will be regardless of case.
 *         o REXP_BOTH_ALGORITHMS - If specified, a Boyer-Moore expression is
 *           compiled for the largest substring of the expression that does
 *           not contain any meta-characters.  This slows compiling, but
 *           speeds up large searches.
 *
 * Results:
 *     TCL_OK if the expression was compiled.  TCL_ERROR if the expression is
 *     empty or is not a valid regular expression; in that case an error
 *     message is appended to the interpreter result and regExpPtr holds no
 *     allocated resources.
 *-----------------------------------------------------------------------------
 */
int
Tcl_RegExpCompile (interp, regExpPtr, expression, flags)
    Tcl_Interp  *interp;
    regexp_pt    regExpPtr;
    char        *expression;
    int          flags;
{
    char *expBuf;
    int   anyMeta;
    int   result = TCL_OK;

    /*
     * Initialize the structure before anything can fail, so that it is
     * always safe to hand to Tcl_RegExpClean afterwards.
     */
    regExpPtr->progPtr = NULL;
    regExpPtr->boyerMoorePtr = NULL;
    regExpPtr->noCase = flags & REXP_NO_CASE;

    if (*expression == '\0') {
        Tcl_AppendResult (interp, "Null regular expression", (char *) NULL);
        return TCL_ERROR;
    }

    /*
     * For case-insensitive matching, work on a down-shifted private copy of
     * the expression.  It is released at exitPoint.
     */
    if (flags & REXP_NO_CASE) {
        expBuf = ckalloc (strlen (expression) + 1);
        Tcl_DownShift (expBuf, expression);
    } else
        expBuf = expression;

    anyMeta = strpbrk (expBuf, REXP_META) != NULL;

    /*
     * If no meta-characters, use Boyer-Moore string matching only.
     */
    if (!anyMeta) {
        regExpPtr->boyerMoorePtr = BoyerMooreCompile (expBuf, strlen (expBuf));
        goto exitPoint;
    }

    /*
     * Build a Boyer-Moore on the largest non-meta substring, if requested,
     * and the reg-exp does not contain a `|' (or).  If less than three
     * characters in the string, don't use B-M, as it seems not optimal at
     * this point.
     */
    if (flags & REXP_BOTH_ALGORITHMS) {
        char *subStrPtr = NULL;
        int   subStrLen;

        subStrLen = FindNonRegExpSubStr (expBuf, &subStrPtr);
        if (subStrLen > 2)
            regExpPtr->boyerMoorePtr =
                BoyerMooreCompile (subStrPtr, subStrLen);
    }

    /*
     * Compile meta-character containing regular expression.
     */
    tclRegexpError = NULL;
    regExpPtr->progPtr = regcomp (expBuf);
    if (tclRegexpError != NULL) {
        Tcl_AppendResult (interp, "error in regular expression: ",
                          tclRegexpError, (char *) NULL);
        /*
         * Release anything built so far and make sure no stale pointers
         * remain in the structure.
         */
        Tcl_RegExpClean (regExpPtr);
        regExpPtr->progPtr = NULL;
        regExpPtr->boyerMoorePtr = NULL;
        result = TCL_ERROR;
    }

    /*
     * Single exit: the down-shifted copy is freed exactly once, on success
     * and on failure alike.
     */
exitPoint:
    if (flags & REXP_NO_CASE)
        ckfree (expBuf);
    return result;

}
\f
/*
 *-----------------------------------------------------------------------------
 *
 * Tcl_RegExpExecute --
 *     Match a string against an expression compiled with Boyer-Moore and/or
 *     regexp.
 *
 * Parameters:
 *     o interp - Not referenced by this procedure.
 *     o regExpPtr - Holds the compiled regular expression.
 *     o matchStrIn - String to match against the regular expression.
 *     o matchStrLower - Optional lower case version of the string.  When
 *       many no-case matches are done on the same string, the caller can
 *       save time by down shifting it once in advance.  Pass NULL if this
 *       is not a no-case match or if this procedure is to do the down
 *       shifting itself.
 *
 * Results:
 *     TRUE if the string matches, FALSE if it does not.
 *
 *-----------------------------------------------------------------------------
 */
int
Tcl_RegExpExecute (interp, regExpPtr, matchStrIn, matchStrLower)
    Tcl_Interp  *interp;
    regexp_pt    regExpPtr;
    char        *matchStrIn;
    char        *matchStrLower;
{
    char *subject     = matchStrIn;
    int   ownsSubject = FALSE;    /* TRUE if subject was allocated here. */
    int   matched;

    /*
     * A no-case match is run against a lower case string: either the one
     * supplied by the caller, or a temporary down-shifted copy.
     */
    if (regExpPtr->noCase) {
        if (matchStrLower != NULL) {
            subject = matchStrLower;
        } else {
            subject = ckalloc (strlen (matchStrIn) + 1);
            Tcl_DownShift (subject, matchStrIn);
            ownsSubject = TRUE;
        }
    }

    if (regExpPtr->boyerMoorePtr == NULL) {
        /*
         * No Boyer-Moore pattern; go straight to the regular expression.
         */
        matched = regexec (regExpPtr->progPtr, subject);
    } else {
        unsigned literalLen;

        /*
         * Try the Boyer-Moore pattern first.  If it is not found there is
         * no match; if it is found and there is no regexp, that is the
         * answer; otherwise the regexp makes the final decision.
         */
        if (BoyerMooreExecute (subject, strlen (subject),
                               regExpPtr->boyerMoorePtr,
                               &literalLen) == NULL)
            matched = FALSE;
        else if (regExpPtr->progPtr == NULL)
            matched = TRUE;
        else
            matched = regexec (regExpPtr->progPtr, subject);
    }

    /*
     * Release the temporary lower case copy, if one was made.
     */
    if (ownsSubject)
        ckfree (subject);
    return matched;
}
Commit	Line	Data
6a5fa4e0 MG	1	/*
	2	* tclXregexp.c --
	3	*
	4	* Tcl regular expression pattern matching utilities.
	5	*-----------------------------------------------------------------------------
	6	* Copyright 1992 Karl Lehenbauer and Mark Diekhans.
	7	*
	8	* Permission to use, copy, modify, and distribute this software and its
	9	* documentation for any purpose and without fee is hereby granted, provided
	10	* that the above copyright notice appear in all copies. Karl Lehenbauer and
	11	* Mark Diekhans make no representations about the suitability of this
	12	* software for any purpose. It is provided "as is" without express or
	13	* implied warranty.
	14	*-----------------------------------------------------------------------------
	15	* Boyer-Moore code from:
	16	* torek-boyer-moore/27-Aug-90 by
	17	* chris@mimsy.umd.edu (Chris Torek)
	18	*-----------------------------------------------------------------------------
	19	* $Id: tclXregexp.c,v 2.0 1992/10/16 04:51:08 markd Rel $
	20	*-----------------------------------------------------------------------------
	21	*/
	22
	23	#include "tclxint.h"
	24	#include "regexp.h"
	25
	26	/*
	27	* This is declared in tclUtil.c. Must be set to NULL before compiling
	28	* a regular expressions.
	29	*/
	30	extern char *tclRegexpError;
	31
	32	/*
	33	* Meta-characters for regular expression
	34	*/
	35	#define REXP_META "^$.[()\|?+*\\"
	36	#define REXP_META_NO_BRACKET_NO_OR "^$.()?+*\\"
	37
	38	#ifndef CHAR_MAX
	39	# define CHAR_MAX 255
	40	#endif
	41
	42	/*
	43	* Prototypes of internal functions.
	44	*/
	45
	46	static char *
	47	BoyerMooreCompile _ANSI_ARGS_((char *pat,
	48	int patlen));
	49
	50	static char *
	51	BoyerMooreExecute _ANSI_ARGS_((char *text,
	52	unsigned textlen,
	53	char *compPtr,
	54	unsigned *patLenP));
	55
	56	static int
	57	FindNonRegExpSubStr _ANSI_ARGS_((char *expression,
	58	char **subStrPtrPtr));
	59
	60	\f
	61	/*
	62	* Boyer-Moore search: input is `text' (a string) and its length,
	63	* and a `pattern' (another string) and its length.
	64	*
65	* The linear setup cost of this function is approximately 256 + patlen.
66	* Afterwards, however, the average cost is O(textlen/patlen), and the
67	* worst case is O(textlen+patlen).
68	*
69	* The Boyer-Moore algorithm works by observing that, for each position
70	* in the text, if the character there does not occur somewhere in the
71	* search pattern, no comparisons including that character will match.
72	* That is, given the text "hello world..." and the pattern "goodbye", the
73	* `w' in `world' means that none of `hello w', `ello wo', `llo wor',
74	* `lo worl', `o world', ` world.', and `world..' can match. In fact,
75	* exactly patlen strings are certain not to match. We can discover this
76	* simply by looking at the patlen'th character. Furthermore, even if
77	* the text character does occur, it may be that it rules out some number
78	* of other matches. Again, we can discover this by doing the match
79	* `backwards'.
80	*
81	* We set up a table of deltas for each possible character, with
82	* delta[character] being patlen for characters not in the pattern,
83	* less for characters in the pattern, growing progressively smaller
84	* as we near the end of the pattern. Matching then works as follows:
85	*
86	* 0 1 2 3
87	* 01234567890123456789012345678901234567
88	* "Here is the string being searched into" (text)
89	* ------ (pos = [0..5])
90	* "string" (pat)
91	* 654321- (deltas)
92	*
93	* (the delta for `-' will be derived below).
94	*
95	* Positions 0..5 end with `i', which is not the `g' we want. `i' does
96	* appear in `string', but two characters before the end. We skip
97	* forward so as to make the `i's match up:
98	*
99	* "Here is the string being searched into" (text)
100	* "string" (pos = [2..7])
101	*
102	* Next we find that ` ' and `g' do not match. Since ` ' does not appear
103	* in the pattern at all, we can skip forward 6:
104	*
105	* "Here is the string being searched into" (text)
106	* "string" (pos = [8..13])
107	*
108	* Comparing `t' vs `g', we again find no match, and so we obtain the
109	* delta for `t', which is 4. We skip to position 17:
110	*
111	* "Here is the string being searched into" (text)
112	* "string" (pos = [12..17])
113	*
114	* It thus takes only four steps to move the search point forward to the
115	* match, in this case.
116	*
117	* If the pattern has a recurring character, we must set the delta for
118	* that character to the distance of the one closest to the end:
119	*
120	* "befuddle the cat" (text)
121	* "fuddle" (pos = [0..5])
122	* 654321- (delta)
123	*
124	* We want the next search to line the `d's up like this:
125	*
126	* "befuddle the cat" (text)
127	* "fuddle" (pos = [2..7])
128	*
129	* and not like this:
130	*
131	* "befuddle the cat" (text)
132	* "fuddle" (pos = [3..8])
133	*
134	* so we take the smaller delta for d, i.e., 2.
135	*
136	* The last task is computing the delta we have noted above as `-':
137	*
138	* "candlesticks" (text)
139	* "hand" (pos = [0..3])
140	* 4321- (delta)
141	*
142	* Here the `d' in `hand' matches the `d' in `candlesticks', but the
143	* strings differ. Since there are no other `d's in `hand', we know
144	* that none of (cand,andl,ndle,dles) can match, and thus we want this
145	* delta to be 4 (the length of the pattern). But if we had, e.g.:
146	*
147	* "candlesticks" (text)
148	* "deed" (pos = [0..3])
149	* 4321- (delta)
150	*
151	* then we should advance to line up the other `d':
152	*
153	* "candlesticks" (text)
154	* "deed" (pos = [3..6])
155	*
156	* As this suggests, the delta should be that for the `d' nearest the
157	* end, but not including the end. This is easily managed by setting up
158	* a delta table as follows:
159	*
160	* for int:c in [0..255] { delta[c] = patlen; };
161	* for int:x in [0..patlen-1) { delta[pat[x]] = patlen - (x + 1); };
162	*
163	* delta[pat[patlen-1]] is never written, so the last letter inherits the
164	* delta from an earlier iteration or from the previous loop.
165	*
166	* NB: the nonsense with `deltaspace' below exists merely because gcc
167	* does a horrible job of common subexpression elimination (it does not
168	* notice that the array is at a constant stack address).
169	*/
170
171	struct compiled_search_struct {
172	unsigned patlen;
173	unsigned deltaspace[CHAR_MAX + 1];
174	};
175
176
177	static char *
178	BoyerMooreCompile (pat, patlen)
179	char *pat;
180	int patlen;
181	{
182	register unsigned char p, t;
183	register unsigned i, p1, j, *delta;
184	struct compiled_search_struct *cp;
185	int alloc_len;
186
187	/*
188	* Algorithm fails if pattern is empty.
189	*/
190	if ((p1 = patlen) == 0)
191	return (NULL);
192
193	alloc_len = sizeof(struct compiled_search_struct) + patlen + 1;
194	cp = (struct compiled_search_struct *) ckalloc (alloc_len);
195
196	strncpy((char *)cp+sizeof(struct compiled_search_struct), pat, patlen);
197	((char )cp+alloc_len-1) = '\0';
198
199	/* set up deltas */
200	delta = cp->deltaspace;
201
202	for (i = 0; i <= CHAR_MAX; i++)
203	delta[i] = p1;
204
205	for (p = (unsigned char *)pat, i = p1; --i > 0;)
206	delta[*p++] = i;
207
208	cp->patlen = patlen;
209	return((char*) cp);
210	}
211
212	static char *
213	BoyerMooreExecute (text, textlen, compPtr, patLenP)
214	char *text;
215	unsigned textlen;
216	char *compPtr;
217	unsigned *patLenP;
218	{
219	register unsigned char p, t;
220	struct compiled_search_struct *csp =
221	(struct compiled_search_struct*) compPtr;
222	register unsigned i, p1, j, *delta = csp->deltaspace;
223	char *pat;
224	unsigned patlen;
225
226	*patLenP = p1 = patlen = csp->patlen;
227	/* code below fails (whenever i is unsigned) if pattern too long */
228	if (p1 > textlen)
229	return (NULL);
230
231	pat = (char *)csp + sizeof(struct compiled_search_struct);
232	/*
233	* From now on, we want patlen - 1.
234	* In the loop below, p points to the end of the pattern,
235	* t points to the end of the text to be tested against the
236	* pattern, and i counts the amount of text remaining, not
237	* including the part to be tested.
238	*/
239	p1--;
240	p = (unsigned char *)pat + p1;
241	t = (unsigned char *)text + p1;
242	i = textlen - patlen;
243	for (;;) {
244	if (p == t &&
245	memcmp((p - p1), (t - p1), p1) == 0)
246	return ((char *)t - p1);
247	j = delta[*t];
248	if (i < j)
249	break;
250	i -= j;
251	t += j;
252	}
253	return (NULL);
254	}
255
256	\f
257	/*
258	*-----------------------------------------------------------------------------
259	*
260	* Tcl_RegExpClean --
261	* Free all resources associated with a regular expression info
262	* structure..
263	*
264	*-----------------------------------------------------------------------------
265	*/
266	void
267	Tcl_RegExpClean (regExpPtr)
268	regexp_pt regExpPtr;
269	{
270	if (regExpPtr->progPtr != NULL)
271	ckfree ((char *) regExpPtr->progPtr);
272	if (regExpPtr->boyerMoorePtr != NULL)
273	ckfree ((char *) regExpPtr->boyerMoorePtr);
274	}
275	\f
276	/*
277	*-----------------------------------------------------------------------------
278	*
279	* FindNonRegExpSubStr
280	* Find the largest substring that does not have any regular
281	* expression meta-characters and is not located within `[...]'.
282	* If the regexp contains an or (\|), zero is returned, as the
283	* Boyer-Moore optimization does not work, since there are actually
284	* multiple patterns. The real solution is to build the Boyer-Moore
285	* into the regular expression code.
286	*-----------------------------------------------------------------------------
287	*/
288	static int
289	FindNonRegExpSubStr (expression, subStrPtrPtr)
290	char *expression;
291	char **subStrPtrPtr;
292	{
293	register char *subStrPtr = NULL;
294	register char subStrLen = 0;
295	register char *scanPtr = expression;
296	register int len;
297
298	while (*scanPtr != '\0') {
299	len = strcspn (scanPtr, REXP_META);
300	/*
301	* If we are at a meta-character, by-pass till non-meta. If we hit
302	* a `[' then by-pass the entire `[...]' range, but be careful, could
303	* have omitted `]'. In a `\|' is encountered (except in brackets),'
304	* we are through.
305	*/
306	if (len == 0) {
307	scanPtr += strspn (scanPtr, REXP_META_NO_BRACKET_NO_OR);
308	if (*scanPtr == '\|')
309	return 0;
310	if (*scanPtr == '[') {
311	scanPtr += strcspn (scanPtr, "]");
312	if (*scanPtr == ']')
313	scanPtr++;
314	}
315	} else {
316	if (len > subStrLen) {
317	subStrPtr = scanPtr;
318	subStrLen = len;
319	}
320	scanPtr += len;
321	}
322	}
323	*subStrPtrPtr = subStrPtr;
324	return subStrLen;
325	}
326	\f
327	/*
328	*-----------------------------------------------------------------------------
329	*
330	* Tcl_RegExpCompile --
331	* Compile a regular expression.
332	*
333	* Parameters:
334	* o regExpPtr - Used to hold info on this regular expression. If the
335	* structure is being reused, it Tcl_RegExpClean should be called first.
336	* o expression - Regular expression to compile.
337	* o flags - The following flags are recognized:
338	* o REXP_NO_CASE - Comparison will be regardless of case.
339	* o REXP_BOTH_ALGORITHMS - If specified, a Boyer-Moore expression is
340	* compiled for the largest substring of the expression that does
341	* not contain any meta-characters. This is slows compiling, but
342	* speeds up large searches.
343	*
344	* Results:
345	* Standard TCL results.
346	*-----------------------------------------------------------------------------
347	*/
348	int
349	Tcl_RegExpCompile (interp, regExpPtr, expression, flags)
350	Tcl_Interp *interp;
351	regexp_pt regExpPtr;
352	char *expression;
353	int flags;
354	{
355	char *expBuf;
356	int anyMeta;
357
358	if (*expression == '\0') {
359	Tcl_AppendResult (interp, "Null regular expression", (char *) NULL);
360	return TCL_ERROR;
361	}
362
363	regExpPtr->progPtr = NULL;
364	regExpPtr->boyerMoorePtr = NULL;
365	regExpPtr->noCase = flags & REXP_NO_CASE;
366
367	if (flags & REXP_NO_CASE) {
368	expBuf = ckalloc (strlen (expression) + 1);
369	Tcl_DownShift (expBuf, expression);
370	} else
371	expBuf = expression;
372
373	anyMeta = strpbrk (expBuf, REXP_META) != NULL;
374
375	/*
376	* If no meta-characters, use Boyer-Moore string matching only.
377	*/
378	if (!anyMeta) {
379	regExpPtr->boyerMoorePtr = BoyerMooreCompile (expBuf, strlen (expBuf));
380	goto okExitPoint;
381	}
382
383	/*
384	* Build a Boyer-Moore on the largest non-meta substring, if requested,
385	* and the reg-exp does not contain a `\|' (or). If less that three
386	* characters in the string, don't use B-M, as it seems not optimal at
387	* this point.
388	*/
389	if (flags & REXP_BOTH_ALGORITHMS) {
390	char *subStrPtr;
391	int subStrLen;
392
393	subStrLen = FindNonRegExpSubStr (expBuf, &subStrPtr);
394	if (subStrLen > 2)
395	regExpPtr->boyerMoorePtr =
396	BoyerMooreCompile (subStrPtr, subStrLen);
397	}
398
399	/*
400	* Compile meta-character containing regular expression.
401	*/
402	tclRegexpError = NULL;
403	regExpPtr->progPtr = regcomp (expBuf);
404	if (tclRegexpError != NULL) {
405	if (flags & REXP_NO_CASE)
406	ckfree (expBuf);
407	Tcl_AppendResult (interp, "error in regular expression: ",
408	tclRegexpError, (char *) NULL);
409	if (flags & REXP_NO_CASE)
410	ckfree (expBuf);
411	Tcl_RegExpClean (regExpPtr);
412	}
413
414	okExitPoint:
415	if (flags & REXP_NO_CASE)
416	ckfree (expBuf);
417	return TCL_OK;
418
419	}
420	\f
421	/*
422	*-----------------------------------------------------------------------------
423	*
424	* Tcl_RegExpExecute --
425	* Execute a regular expression compiled with Boyer-Moore and/or
426	* regexp.
427	*
428	* Parameters:
429	* o regExpPtr - Used to hold info on this regular expression.
430	* o matchStrIn - String to match against the regular expression.
431	* o matchStrLower - Optional lower case version of the string. If
432	* multiple no case matches are being done, time can be saved by
433	* down shifting the string in advance. NULL if not a no-case
434	* match or this procedure is to do the down shifting.
435	*
436	* Results:
437	* TRUE if a match, FALSE if it does not match.
438	*
439	*-----------------------------------------------------------------------------
440	*/
441	int
442	Tcl_RegExpExecute (interp, regExpPtr, matchStrIn, matchStrLower)
443	Tcl_Interp *interp;
444	regexp_pt regExpPtr;
445	char *matchStrIn;
446	char *matchStrLower;
447	{
448	char *matchStr;
449	int result;
450
451	if (regExpPtr->noCase) {
452	if (matchStrLower == NULL) {
453	matchStr = ckalloc (strlen (matchStrIn) + 1);
454	Tcl_DownShift (matchStr, matchStrIn);
455	} else
456	matchStr = matchStrLower;
457	} else
458	matchStr = matchStrIn;
459
460	/*
461	* If a Boyer-Moore pattern has been compiled, use that algorithm to test
462	* against the text. If that passes, then test with the regexp if we have
463	* it.
464	*/
465	if (regExpPtr->boyerMoorePtr != NULL) {
466	char *startPtr;
467	unsigned matchLen;
468
469	startPtr = BoyerMooreExecute (matchStr, strlen (matchStr),
470	regExpPtr->boyerMoorePtr, &matchLen);
471	if (startPtr == NULL) {
472	result = FALSE;
473	goto exitPoint;
474	}
475	if (regExpPtr->progPtr == NULL) {
476	result = TRUE; /* No regexp, its a match! */
477	goto exitPoint;
478	}
479	}
480
481	/*
482	* Give it a go with full regular expressions
483	*/
484	result = regexec (regExpPtr->progPtr, matchStr);
485
486	/*
487	* Clean up and return status here.
488	*/
489	exitPoint:
490	if ((regExpPtr->noCase) && (matchStrLower == NULL))
491	ckfree (matchStr);
492	return result;
493	}