The Machine Perception Toolbox

[Introduction]- [News]- [Download]- [Screenshots]- [Manual (pdf)]- [Forums]- [API Reference]- [Repository ]

 

Main Page | Namespace List | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Namespace Members | Class Members | File Members | Related Pages

regexp.c

Go to the documentation of this file.
00001 /*
00002  * regcomp and regexec -- regsub and regerror are elsewhere
00003  *
00004  *      Copyright (c) 1986 by University of Toronto.
00005  *      Written by Henry Spencer.  Not derived from licensed software.
00006  *
00007  *      Permission is granted to anyone to use this software for any
00008  *      purpose on any computer system, and to redistribute it freely,
00009  *      subject to the following restrictions:
00010  *
00011  *      1. The author is not responsible for the consequences of use of
00012  *              this software, no matter how awful, even if they arise
00013  *              from defects in it.
00014  *
00015  *      2. The origin of this software must not be misrepresented, either
00016  *              by explicit claim or by omission.
00017  *
00018  *      3. Altered versions must be plainly marked as such, and must not
00019  *              be misrepresented as being the original software.
00020  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
00021  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
00022  *** to assist in implementing egrep.
00023  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
00024  *** hoptoad!gnu, on 27 Dec 1986, to add < and > for word-matching
00025  *** as in BSD grep and ex.
00026  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
00027  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
00028  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
00029  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
00030  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
00031  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
00032  *** was moved into regexp.h, and the include of regexp.h now uses "'s
00033  *** to avoid conflicting with the system regexp.h.  Const, bless its
00034  *** soul, was removed so it can compile everywhere.  The declaration
00035  *** of strchr() was in conflict on AIX, so it was removed (as it is
00036  *** happily defined in string.h).
00037  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
00038  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
00039  *
00040  * Beware that some of this code is subtly aware of the way operator
00041  * precedence is structured in regular expressions.  Serious changes in
00042  * regular-expression syntax might require a total rethink.
00043  */
00044 #include "regexp.h"
00045 #include <stdio.h>
00046 #include <ctype.h>
00047 #ifndef ultrix
00048 #include <stdlib.h>
00049 #endif
00050 #include <string.h>
00051 
00052 /*
00053  * The "internal use only" fields in regexp.h are present to pass info from
00054  * compile to execute that permits the execute phase to run lots faster on
00055  * simple cases.  They are:
00056  *
00057  * regstart     char that must begin a match; '\0' if none obvious
00058  * reganch      is the match anchored (at beginning-of-line only)?
00059  * regmust      string (pointer into program) that match must include, or NULL
00060  * regmlen      length of regmust string
00061  *
00062  * Regstart and reganch permit very fast decisions on suitable starting points
00063  * for a match, cutting down the work a lot.  Regmust permits fast rejection
00064  * of lines that cannot possibly match.  The regmust tests are costly enough
00065  * that regcomp() supplies a regmust only if the r.e. contains something
00066  * potentially expensive (at present, the only such thing detected is * or +
00067  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
00068  * supplied because the test in regexec() needs it and regcomp() is computing
00069  * it anyway.
00070  */
00071 
00072 /*
00073  * Structure for regexp "program".  This is essentially a linear encoding
00074  * of a nondeterministic finite-state machine (aka syntax charts or
00075  * "railroad normal form" in parsing technology).  Each node is an opcode
00076  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
00077  * all nodes except BRANCH implement concatenation; a "next" pointer with
00078  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
00079  * have one of the subtle syntax dependencies:  an individual BRANCH (as
00080  * opposed to a collection of them) is never concatenated with anything
00081  * because of operator precedence.)  The operand of some types of node is
00082  * a literal string; for others, it is a node leading into a sub-FSM.  In
00083  * particular, the operand of a BRANCH node is the first node of the branch.
00084  * (NB this is *not* a tree structure:  the tail of the branch connects
00085  * to the thing following the set of BRANCHes.)  The opcodes are:
00086  */
00087 
00088 /* definition   number  opnd?   meaning */
00089 #define END     0       /* no   End of program. */
00090 #define BOL     1       /* no   Match "" at beginning of line. */
00091 #define EOL     2       /* no   Match "" at end of line. */
00092 #define ANY     3       /* no   Match any one character. */
00093 #define ANYOF   4       /* str  Match any character in this string. */
00094 #define ANYBUT  5       /* str  Match any character not in this string. */
00095 #define BRANCH  6       /* node Match this alternative, or the next... */
00096 #define BACK    7       /* no   Match "", "next" ptr points backward. */
00097 #define EXACTLY 8       /* str  Match this string. */
00098 #define NOTHING 9       /* no   Match empty string. */
00099 #define STAR    10      /* node Match this (simple) thing 0 or more times. */
00100 #define PLUS    11      /* node Match this (simple) thing 1 or more times. */
00101 #define WORDA   12      /* no   Match "" at wordchar, where prev is nonword */
00102 #define WORDZ   13      /* no   Match "" at nonwordchar, where prev is word */
00103 #define OPEN    20      /* no   Mark this point in input as start of #n. */
00104                         /*      OPEN+1 is number 1, etc. */
00105 #define CLOSE   30      /* no   Analogous to OPEN. */
00106 
00107 /*
00108  * Opcode notes:
00109  *
00110  * BRANCH       The set of branches constituting a single choice are hooked
00111  *              together with their "next" pointers, since precedence prevents
00112  *              anything being concatenated to any individual branch.  The
00113  *              "next" pointer of the last BRANCH in a choice points to the
00114  *              thing following the whole choice.  This is also where the
00115  *              final "next" pointer of each individual branch points; each
00116  *              branch starts with the operand node of a BRANCH node.
00117  *
00118  * BACK         Normal "next" pointers all implicitly point forward; BACK
00119  *              exists to make loop structures possible.
00120  *
00121  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
00122  *              BRANCH structures using BACK.  Simple cases (one character
00123  *              per match) are implemented with STAR and PLUS for speed
00124  *              and to minimize recursive plunges.
00125  *
00126  * OPEN,CLOSE   ...are numbered at compile time.
00127  */
00128 
00129 /*
00130  * A node is one char of opcode followed by two chars of "next" pointer.
00131  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
00132  * value is a positive offset from the opcode of the node containing it.
00133  * An operand, if any, simply follows the node.  (Note that much of the
00134  * code generation knows about this implicit relationship.)
00135  *
00136  * Using two bytes for the "next" pointer is vast overkill for most things,
00137  * but allows patterns to get big without disasters.
00138  */
00139 #define OP(p)   (*(p))
00140 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
00141 #define OPERAND(p)      ((p) + 3)
00142 
00143 /*
00144  * See regmagic.h for one further detail of program structure.
00145  */
00146 
00147 
00148 /*
00149  * Utility definitions.
00150  */
00151 #ifndef CHARBITS
00152 #define UCHARAT(p)      ((int)*(unsigned char *)(p))
00153 #else
00154 #define UCHARAT(p)      ((int)*(p)&CHARBITS)
00155 #endif
00156 
00157 #define FAIL(m) { regerror(m); return(NULL); }
00158 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
00159 
00160 /*
00161  * Flags to be passed up and down.
00162  */
00163 #define HASWIDTH        01      /* Known never to match null string. */
00164 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
00165 #define SPSTART         04      /* Starts with * or +. */
00166 #define WORST           0       /* Worst case. */
00167 
00168 /*
00169  * Global work variables for regcomp().
00170  */
00171 static char *regparse;          /* Input-scan pointer. */
00172 static int regnpar;             /* () count. */
00173 static char regdummy;
00174 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
00175 static long regsize;            /* Code size. */
00176 
00177 /*
00178  * Forward declarations for regcomp()'s friends.
00179  */
00180 #ifndef STATIC
00181 #define STATIC  static
00182 #endif
00183 STATIC char *reg( int paren, int *flagp );
00184 STATIC char *regbranch( int *flagp );
00185 STATIC char *regpiece( int *flagp );
00186 STATIC char *regatom( int *flagp );
00187 STATIC char *regnode( int op );
00188 STATIC char *regnext( register char *p );
00189 STATIC void regc( int b );
00190 STATIC void reginsert( char op, char *opnd );
00191 STATIC void regtail( char *p, char *val );
00192 STATIC void regoptail( char *p, char *val );
00193 #ifdef STRCSPN
00194 STATIC int strcspn();
00195 #endif
00196 
00197 /*
00198  - regcomp - compile a regular expression into internal code
00199  *
00200  * We can't allocate space until we know how big the compiled form will be,
00201  * but we can't compile it (and thus know how big it is) until we've got a
00202  * place to put the code.  So we cheat:  we compile it twice, once with code
00203  * generation turned off and size counting turned on, and once "for real".
00204  * This also means that we don't allocate space until we are sure that the
00205  * thing really will compile successfully, and we never have to move the
00206  * code and thus invalidate pointers into it.  (Note that it has to be in
00207  * one piece because free() must be able to free it all.)
00208  *
00209  * Beware that the optimization-preparation code in here knows about some
00210  * of the structure of the compiled regexp.
00211  */
00212 regexp *
00213 regcomp( char *exp )
00214 {
00215         register regexp *r;
00216         register char *scan;
00217         register char *longest;
00218         register unsigned len;
00219         int flags;
00220 
00221         if (exp == NULL)
00222                 FAIL("NULL argument");
00223 
00224         /* First pass: determine size, legality. */
00225 #ifdef notdef
00226         if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
00227 #endif
00228         regparse = (char *)exp;
00229         regnpar = 1;
00230         regsize = 0L;
00231         regcode = &regdummy;
00232         regc(MAGIC);
00233         if (reg(0, &flags) == NULL)
00234                 return(NULL);
00235 
00236         /* Small enough for pointer-storage convention? */
00237         if (regsize >= 32767L)          /* Probably could be 65535L. */
00238                 FAIL("regexp too big");
00239 
00240         /* Allocate space. */
00241         r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
00242         if (r == NULL)
00243                 FAIL("out of space");
00244 
00245         /* Second pass: emit code. */
00246         regparse = (char *)exp;
00247         regnpar = 1;
00248         regcode = r->program;
00249         regc(MAGIC);
00250         if (reg(0, &flags) == NULL)
00251                 return(NULL);
00252 
00253         /* Dig out information for optimizations. */
00254         r->regstart = '\0';     /* Worst-case defaults. */
00255         r->reganch = 0;
00256         r->regmust = NULL;
00257         r->regmlen = 0;
00258         scan = r->program+1;                    /* First BRANCH. */
00259         if (OP(regnext(scan)) == END) {         /* Only one top-level choice. */
00260                 scan = OPERAND(scan);
00261 
00262                 /* Starting-point info. */
00263                 if (OP(scan) == EXACTLY)
00264                         r->regstart = *OPERAND(scan);
00265                 else if (OP(scan) == BOL)
00266                         r->reganch++;
00267 
00268                 /*
00269                  * If there's something expensive in the r.e., find the
00270                  * longest literal string that must appear and make it the
00271                  * regmust.  Resolve ties in favor of later strings, since
00272                  * the regstart check works with the beginning of the r.e.
00273                  * and avoiding duplication strengthens checking.  Not a
00274                  * strong reason, but sufficient in the absence of others.
00275                  */
00276                 if (flags&SPSTART) {
00277                         longest = NULL;
00278                         len = 0;
00279                         for (; scan != NULL; scan = regnext(scan))
00280                                 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
00281                                         longest = OPERAND(scan);
00282                                         len = strlen(OPERAND(scan));
00283                                 }
00284                         r->regmust = longest;
00285                         r->regmlen = len;
00286                 }
00287         }
00288 
00289         return(r);
00290 }
00291 
00292 /*
00293  - reg - regular expression, i.e. main body or parenthesized thing
00294  *
00295  * Caller must absorb opening parenthesis.
00296  *
00297  * Combining parenthesis handling with the base level of regular expression
00298  * is a trifle forced, but the need to tie the tails of the branches to what
00299  * follows makes it hard to avoid.
00300  */
00301 static char *
00302 reg(
00303         int paren,                      /* Parenthesized? */
00304         int *flagp )
00305 {
00306         register char *ret;
00307         register char *br;
00308         register char *ender;
00309         register int parno;
00310         int flags;
00311 
00312         *flagp = HASWIDTH;      /* Tentatively. */
00313 
00314         /* Make an OPEN node, if parenthesized. */
00315         if (paren) {
00316                 if (regnpar >= NSUBEXP)
00317                         FAIL("too many ()");
00318                 parno = regnpar;
00319                 regnpar++;
00320                 ret = regnode(OPEN+parno);
00321         } else
00322                 ret = NULL;
00323 
00324         /* Pick up the branches, linking them together. */
00325         br = regbranch(&flags);
00326         if (br == NULL)
00327                 return(NULL);
00328         if (ret != NULL)
00329                 regtail(ret, br);       /* OPEN -> first. */
00330         else
00331                 ret = br;
00332         if (!(flags&HASWIDTH))
00333                 *flagp &= ~HASWIDTH;
00334         *flagp |= flags&SPSTART;
00335         while (*regparse == '|' || *regparse == '\n') {
00336                 regparse++;
00337                 br = regbranch(&flags);
00338                 if (br == NULL)
00339                         return(NULL);
00340                 regtail(ret, br);       /* BRANCH -> BRANCH. */
00341                 if (!(flags&HASWIDTH))
00342                         *flagp &= ~HASWIDTH;
00343                 *flagp |= flags&SPSTART;
00344         }
00345 
00346         /* Make a closing node, and hook it on the end. */
00347         ender = regnode((paren) ? CLOSE+parno : END);   
00348         regtail(ret, ender);
00349 
00350         /* Hook the tails of the branches to the closing node. */
00351         for (br = ret; br != NULL; br = regnext(br))
00352                 regoptail(br, ender);
00353 
00354         /* Check for proper termination. */
00355         if (paren && *regparse++ != ')') {
00356                 FAIL("unmatched ()");
00357         } else if (!paren && *regparse != '\0') {
00358                 if (*regparse == ')') {
00359                         FAIL("unmatched ()");
00360                 } else
00361                         FAIL("junk on end");    /* "Can't happen". */
00362                 /* NOTREACHED */
00363         }
00364 
00365         return(ret);
00366 }
00367 
00368 /*
00369  - regbranch - one alternative of an | operator
00370  *
00371  * Implements the concatenation operator.
00372  */
00373 static char *
00374 regbranch( int *flagp )
00375 {
00376         register char *ret;
00377         register char *chain;
00378         register char *latest;
00379         int flags;
00380 
00381         *flagp = WORST;         /* Tentatively. */
00382 
00383         ret = regnode(BRANCH);
00384         chain = NULL;
00385         while (*regparse != '\0' && *regparse != ')' &&
00386                *regparse != '\n' && *regparse != '|') {
00387                 latest = regpiece(&flags);
00388                 if (latest == NULL)
00389                         return(NULL);
00390                 *flagp |= flags&HASWIDTH;
00391                 if (chain == NULL)      /* First piece. */
00392                         *flagp |= flags&SPSTART;
00393                 else
00394                         regtail(chain, latest);
00395                 chain = latest;
00396         }
00397         if (chain == NULL)      /* Loop ran zero times. */
00398                 (void) regnode(NOTHING);
00399 
00400         return(ret);
00401 }
00402 
00403 /*
00404  - regpiece - something followed by possible [*+?]
00405  *
00406  * Note that the branching code sequences used for ? and the general cases
00407  * of * and + are somewhat optimized:  they use the same NOTHING node as
00408  * both the endmarker for their branch list and the body of the last branch.
00409  * It might seem that this node could be dispensed with entirely, but the
00410  * endmarker role is not redundant.
00411  */
00412 static char *
00413 regpiece( int *flagp )
00414 {
00415         register char *ret;
00416         register char op;
00417         register char *next;
00418         int flags;
00419 
00420         ret = regatom(&flags);
00421         if (ret == NULL)
00422                 return(NULL);
00423 
00424         op = *regparse;
00425         if (!ISMULT(op)) {
00426                 *flagp = flags;
00427                 return(ret);
00428         }
00429 
00430         if (!(flags&HASWIDTH) && op != '?')
00431                 FAIL("*+ operand could be empty");
00432         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
00433 
00434         if (op == '*' && (flags&SIMPLE))
00435                 reginsert(STAR, ret);
00436         else if (op == '*') {
00437                 /* Emit x* as (x&|), where & means "self". */
00438                 reginsert(BRANCH, ret);                 /* Either x */
00439                 regoptail(ret, regnode(BACK));          /* and loop */
00440                 regoptail(ret, ret);                    /* back */
00441                 regtail(ret, regnode(BRANCH));          /* or */
00442                 regtail(ret, regnode(NOTHING));         /* null. */
00443         } else if (op == '+' && (flags&SIMPLE))
00444                 reginsert(PLUS, ret);
00445         else if (op == '+') {
00446                 /* Emit x+ as x(&|), where & means "self". */
00447                 next = regnode(BRANCH);                 /* Either */
00448                 regtail(ret, next);
00449                 regtail(regnode(BACK), ret);            /* loop back */
00450                 regtail(next, regnode(BRANCH));         /* or */
00451                 regtail(ret, regnode(NOTHING));         /* null. */
00452         } else if (op == '?') {
00453                 /* Emit x? as (x|) */
00454                 reginsert(BRANCH, ret);                 /* Either x */
00455                 regtail(ret, regnode(BRANCH));          /* or */
00456                 next = regnode(NOTHING);                /* null. */
00457                 regtail(ret, next);
00458                 regoptail(ret, next);
00459         }
00460         regparse++;
00461         if (ISMULT(*regparse))
00462                 FAIL("nested *?+");
00463 
00464         return(ret);
00465 }
00466 
00467 /*
00468  - regatom - the lowest level
00469  *
00470  * Optimization:  gobbles an entire sequence of ordinary characters so that
00471  * it can turn them into a single node, which is smaller to store and
00472  * faster to run.  Backslashed characters are exceptions, each becoming a
00473  * separate node; the code is simpler that way and it's not worth fixing.
00474  */
00475 static char *
00476 regatom( int *flagp )
00477 {
00478         register char *ret;
00479         int flags;
00480 
00481         *flagp = WORST;         /* Tentatively. */
00482 
00483         switch (*regparse++) {
00484         /* FIXME: these chars only have meaning at beg/end of pat? */
00485         case '^':
00486                 ret = regnode(BOL);
00487                 break;
00488         case '$':
00489                 ret = regnode(EOL);
00490                 break;
00491         case '.':
00492                 ret = regnode(ANY);
00493                 *flagp |= HASWIDTH|SIMPLE;
00494                 break;
00495         case '[': {
00496                         register int classr;
00497                         register int classend;
00498 
00499                         if (*regparse == '^') { /* Complement of range. */
00500                                 ret = regnode(ANYBUT);
00501                                 regparse++;
00502                         } else
00503                                 ret = regnode(ANYOF);
00504                         if (*regparse == ']' || *regparse == '-')
00505                                 regc(*regparse++);
00506                         while (*regparse != '\0' && *regparse != ']') {
00507                                 if (*regparse == '-') {
00508                                         regparse++;
00509                                         if (*regparse == ']' || *regparse == '\0')
00510                                                 regc('-');
00511                                         else {
00512                                                 classr = UCHARAT(regparse-2)+1;
00513                                                 classend = UCHARAT(regparse);
00514                                                 if (classr > classend+1)
00515                                                         FAIL("invalid [] range");
00516                                                 for (; classr <= classend; classr++)
00517                                                         regc(classr);
00518                                                 regparse++;
00519                                         }
00520                                 } else
00521                                         regc(*regparse++);
00522                         }
00523                         regc('\0');
00524                         if (*regparse != ']')
00525                                 FAIL("unmatched []");
00526                         regparse++;
00527                         *flagp |= HASWIDTH|SIMPLE;
00528                 }
00529                 break;
00530         case '(':
00531                 ret = reg(1, &flags);
00532                 if (ret == NULL)
00533                         return(NULL);
00534                 *flagp |= flags&(HASWIDTH|SPSTART);
00535                 break;
00536         case '\0':
00537         case '|':
00538         case '\n':
00539         case ')':
00540                 FAIL("internal urp");   /* Supposed to be caught earlier. */
00541                 break;
00542         case '?':
00543         case '+':
00544         case '*':
00545                 FAIL("?+* follows nothing");
00546                 break;
00547         case '\\':
00548                 switch (*regparse++) {
00549                 case '\0':
00550                         FAIL("trailing \\");
00551                         break;
00552                 case '<':
00553                         ret = regnode(WORDA);
00554                         break;
00555                 case '>':
00556                         ret = regnode(WORDZ);
00557                         break;
00558                 /* FIXME: Someday handle \1, \2, ... */
00559                 default:
00560                         /* Handle general quoted chars in exact-match routine */
00561                         goto de_fault;
00562                 }
00563                 break;
00564         de_fault:
00565         default:
00566                 /*
00567                  * Encode a string of characters to be matched exactly.
00568                  *
00569                  * This is a bit tricky due to quoted chars and due to
00570                  * '*', '+', and '?' taking the SINGLE char previous
00571                  * as their operand.
00572                  *
00573                  * On entry, the char at regparse[-1] is going to go
00574                  * into the string, no matter what it is.  (It could be
00575                  * following a \ if we are entered from the '\' case.)
00576                  * 
00577                  * Basic idea is to pick up a good char in  ch  and
00578                  * examine the next char.  If it's *+? then we twiddle.
00579                  * If it's \ then we frozzle.  If it's other magic char
00580                  * we push  ch  and terminate the string.  If none of the
00581                  * above, we push  ch  on the string and go around again.
00582                  *
00583                  *  regprev  is used to remember where "the current char"
00584                  * starts in the string, if due to a *+? we need to back
00585                  * up and put the current char in a separate, 1-char, string.
00586                  * When  regprev  is NULL,  ch  is the only char in the
00587                  * string; this is used in *+? handling, and in setting
00588                  * flags |= SIMPLE at the end.
00589                  */
00590                 {
00591                         char *regprev;
00592                         register char ch;
00593 
00594                         regparse--;                     /* Look at cur char */
00595                         ret = regnode(EXACTLY);
00596                         for ( regprev = 0 ; ; ) {
00597                                 ch = *regparse++;       /* Get current char */
00598                                 switch (*regparse) {    /* look at next one */
00599 
00600                                 default:
00601                                         regc(ch);       /* Add cur to string */
00602                                         break;
00603 
00604                                 case '.': case '[': case '(':
00605                                 case ')': case '|': case '\n':
00606                                 case '$': case '^':
00607                                 case '\0':
00608                                 /* FIXME, $ and ^ should not always be magic */
00609                                 magic:
00610                                         regc(ch);       /* dump cur char */
00611                                         goto done;      /* and we are done */
00612 
00613                                 case '?': case '+': case '*':
00614                                         if (!regprev)   /* If just ch in str, */
00615                                                 goto magic;     /* use it */
00616                                         /* End mult-char string one early */
00617                                         regparse = regprev; /* Back up parse */
00618                                         goto done;
00619 
00620                                 case '\\':
00621                                         regc(ch);       /* Cur char OK */
00622                                         switch (regparse[1]){ /* Look after \ */
00623                                         case '\0':
00624                                         case '<':
00625                                         case '>':
00626                                         /* FIXME: Someday handle \1, \2, ... */
00627                                                 goto done; /* Not quoted */
00628                                         default:
00629                                                 /* Backup point is \, scan                                                       * point is after it. */
00630                                                 regprev = regparse;
00631                                                 regparse++; 
00632                                                 continue;       /* NOT break; */
00633                                         }
00634                                 }
00635                                 regprev = regparse;     /* Set backup point */
00636                         }
00637                 done:
00638                         regc('\0');
00639                         *flagp |= HASWIDTH;
00640                         if (!regprev)           /* One char? */
00641                                 *flagp |= SIMPLE;
00642                 }
00643                 break;
00644         }
00645 
00646         return(ret);
00647 }
00648 
00649 /*
00650  - regnode - emit a node
00651  */
00652 static char *                   /* Location. */
00653 regnode( int op )
00654 {
00655         register char *ret;
00656         register char *ptr;
00657 
00658         ret = regcode;
00659         if (ret == &regdummy) {
00660                 regsize += 3;
00661                 return(ret);
00662         }
00663 
00664         ptr = ret;
00665         *ptr++ = op;
00666         *ptr++ = '\0';          /* Null "next" pointer. */
00667         *ptr++ = '\0';
00668         regcode = ptr;
00669 
00670         return(ret);
00671 }
00672 
00673 /*
00674  - regc - emit (if appropriate) a byte of code
00675  */
00676 static void
00677 regc( int b )
00678 {
00679         if (regcode != &regdummy)
00680                 *regcode++ = b;
00681         else
00682                 regsize++;
00683 }
00684 
00685 /*
00686  - reginsert - insert an operator in front of already-emitted operand
00687  *
00688  * Means relocating the operand.
00689  */
00690 static void
00691 reginsert(
00692         char op,
00693         char *opnd )
00694 {
00695         register char *src;
00696         register char *dst;
00697         register char *place;
00698 
00699         if (regcode == &regdummy) {
00700                 regsize += 3;
00701                 return;
00702         }
00703 
00704         src = regcode;
00705         regcode += 3;
00706         dst = regcode;
00707         while (src > opnd)
00708                 *--dst = *--src;
00709 
00710         place = opnd;           /* Op node, where operand used to be. */
00711         *place++ = op;
00712         *place++ = '\0';
00713         *place++ = '\0';
00714 }
00715 
00716 /*
00717  - regtail - set the next-pointer at the end of a node chain
00718  */
00719 static void
00720 regtail(
00721         char *p,
00722         char *val )
00723 {
00724         register char *scan;
00725         register char *temp;
00726         register int offset;
00727 
00728         if (p == &regdummy)
00729                 return;
00730 
00731         /* Find last node. */
00732         scan = p;
00733         for (;;) {
00734                 temp = regnext(scan);
00735                 if (temp == NULL)
00736                         break;
00737                 scan = temp;
00738         }
00739 
00740         if (OP(scan) == BACK)
00741                 offset = scan - val;
00742         else
00743                 offset = val - scan;
00744         *(scan+1) = (offset>>8)&0377;
00745         *(scan+2) = offset&0377;
00746 }
00747 
00748 /*
00749  - regoptail - regtail on operand of first argument; nop if operandless
00750  */
00751 
00752 static void
00753 regoptail(
00754         char *p,
00755         char *val )
00756 {
00757         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
00758         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
00759                 return;
00760         regtail(OPERAND(p), val);
00761 }
00762 
00763 /*
00764  * regexec and friends
00765  */
00766 
00767 /*
00768  * Global work variables for regexec().
00769  */
00770 static char *reginput;          /* String-input pointer. */
00771 static char *regbol;            /* Beginning of input, for ^ check. */
00772 static char **regstartp;        /* Pointer to startp array. */
00773 static char **regendp;          /* Ditto for endp. */
00774 
00775 /*
00776  * Forwards.
00777  */
00778 STATIC int regtry( regexp *prog, char *string );
00779 STATIC int regmatch( char *prog );
00780 STATIC int regrepeat( char *p );
00781 
00782 #ifdef DEBUG
00783 int regnarrate = 0;
00784 void regdump();
00785 STATIC char *regprop();
00786 #endif
00787 
00788 /*
00789  - regexec - match a regexp against a string
00790  */
00791 int
00792 regexec(
00793         register regexp *prog,
00794         register char *string )
00795 {
00796         register char *s;
00797 
00798         /* Be paranoid... */
00799         if (prog == NULL || string == NULL) {
00800                 regerror("NULL parameter");
00801                 return(0);
00802         }
00803 
00804         /* Check validity of program. */
00805         if (UCHARAT(prog->program) != MAGIC) {
00806                 regerror("corrupted program");
00807                 return(0);
00808         }
00809 
00810         /* If there is a "must appear" string, look for it. */
00811         if (prog->regmust != NULL) {
00812                 s = (char *)string;
00813                 while ((s = strchr(s, prog->regmust[0])) != NULL) {
00814                         if (strncmp(s, prog->regmust, prog->regmlen) == 0)
00815                                 break;  /* Found it. */
00816                         s++;
00817                 }
00818                 if (s == NULL)  /* Not present. */
00819                         return(0);
00820         }
00821 
00822         /* Mark beginning of line for ^ . */
00823         regbol = (char *)string;
00824 
00825         /* Simplest case:  anchored match need be tried only once. */
00826         if (prog->reganch)
00827                 return(regtry(prog, string));
00828 
00829         /* Messy cases:  unanchored match. */
00830         s = (char *)string;
00831         if (prog->regstart != '\0')
00832                 /* We know what char it must start with. */
00833                 while ((s = strchr(s, prog->regstart)) != NULL) {
00834                         if (regtry(prog, s))
00835                                 return(1);
00836                         s++;
00837                 }
00838         else
00839                 /* We don't -- general case. */
00840                 do {
00841                         if (regtry(prog, s))
00842                                 return(1);
00843                 } while (*s++ != '\0');
00844 
00845         /* Failure. */
00846         return(0);
00847 }
00848 
00849 /*
00850  - regtry - try match at specific point
00851  */
00852 static int                      /* 0 failure, 1 success */
00853 regtry(
00854         regexp *prog,
00855         char *string )
00856 {
00857         register int i;
00858         register char **sp;
00859         register char **ep;
00860 
00861         reginput = string;
00862         regstartp = prog->startp;
00863         regendp = prog->endp;
00864 
00865         sp = prog->startp;
00866         ep = prog->endp;
00867         for (i = NSUBEXP; i > 0; i--) {
00868                 *sp++ = NULL;
00869                 *ep++ = NULL;
00870         }
00871         if (regmatch(prog->program + 1)) {
00872                 prog->startp[0] = string;
00873                 prog->endp[0] = reginput;
00874                 return(1);
00875         } else
00876                 return(0);
00877 }
00878 
00879 /*
00880  - regmatch - main matching routine
00881  *
00882  * Conceptually the strategy is simple:  check to see whether the current
00883  * node matches, call self recursively to see whether the rest matches,
00884  * and then act accordingly.  In practice we make some effort to avoid
00885  * recursion, in particular by going through "ordinary" nodes (that don't
00886  * need to know whether the rest of the match failed) by a loop instead of
00887  * by recursion.
00888  */
00889 static int                      /* 0 failure, 1 success */
00890 regmatch( char *prog )
00891 {
00892         register char *scan;    /* Current node. */
00893         char *next;             /* Next node. */
00894 
00895         scan = prog;
00896 #ifdef DEBUG
00897         if (scan != NULL && regnarrate)
00898                 fprintf(stderr, "%s(\n", regprop(scan));
00899 #endif
00900         while (scan != NULL) {
00901 #ifdef DEBUG
00902                 if (regnarrate)
00903                         fprintf(stderr, "%s...\n", regprop(scan));
00904 #endif
00905                 next = regnext(scan);
00906 
00907                 switch (OP(scan)) {
00908                 case BOL:
00909                         if (reginput != regbol)
00910                                 return(0);
00911                         break;
00912                 case EOL:
00913                         if (*reginput != '\0')
00914                                 return(0);
00915                         break;
00916                 case WORDA:
00917                         /* Must be looking at a letter, digit, or _ */
00918                         if ((!isalnum(*reginput)) && *reginput != '_')
00919                                 return(0);
00920                         /* Prev must be BOL or nonword */
00921                         if (reginput > regbol &&
00922                             (isalnum(reginput[-1]) || reginput[-1] == '_'))
00923                                 return(0);
00924                         break;
00925                 case WORDZ:
00926                         /* Must be looking at non letter, digit, or _ */
00927                         if (isalnum(*reginput) || *reginput == '_')
00928                                 return(0);
00929                         /* We don't care what the previous char was */
00930                         break;
00931                 case ANY:
00932                         if (*reginput == '\0')
00933                                 return(0);
00934                         reginput++;
00935                         break;
00936                 case EXACTLY: {
00937                                 register int len;
00938                                 register char *opnd;
00939 
00940                                 opnd = OPERAND(scan);
00941                                 /* Inline the first character, for speed. */
00942                                 if (*opnd != *reginput)
00943                                         return(0);
00944                                 len = strlen(opnd);
00945                                 if (len > 1 && strncmp(opnd, reginput, len) != 0)
00946                                         return(0);
00947                                 reginput += len;
00948                         }
00949                         break;
00950                 case ANYOF:
00951                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
00952                                 return(0);
00953                         reginput++;
00954                         break;
00955                 case ANYBUT:
00956                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
00957                                 return(0);
00958                         reginput++;
00959                         break;
00960                 case NOTHING:
00961                         break;
00962                 case BACK:
00963                         break;
00964                 case OPEN+1:
00965                 case OPEN+2:
00966                 case OPEN+3:
00967                 case OPEN+4:
00968                 case OPEN+5:
00969                 case OPEN+6:
00970                 case OPEN+7:
00971                 case OPEN+8:
00972                 case OPEN+9: {
00973                                 register int no;
00974                                 register char *save;
00975 
00976                                 no = OP(scan) - OPEN;
00977                                 save = reginput;
00978 
00979                                 if (regmatch(next)) {
00980                                         /*
00981                                          * Don't set startp if some later
00982                                          * invocation of the same parentheses
00983                                          * already has.
00984                                          */
00985                                         if (regstartp[no] == NULL)
00986                                                 regstartp[no] = save;
00987                                         return(1);
00988                                 } else
00989                                         return(0);
00990                         }
00991                         break;
00992                 case CLOSE+1:
00993                 case CLOSE+2:
00994                 case CLOSE+3:
00995                 case CLOSE+4:
00996                 case CLOSE+5:
00997                 case CLOSE+6:
00998                 case CLOSE+7:
00999                 case CLOSE+8:
01000                 case CLOSE+9: {
01001                                 register int no;
01002                                 register char *save;
01003 
01004                                 no = OP(scan) - CLOSE;
01005                                 save = reginput;
01006 
01007                                 if (regmatch(next)) {
01008                                         /*
01009                                          * Don't set endp if some later
01010                                          * invocation of the same parentheses
01011                                          * already has.
01012                                          */
01013                                         if (regendp[no] == NULL)
01014                                                 regendp[no] = save;
01015                                         return(1);
01016                                 } else
01017                                         return(0);
01018                         }
01019                         break;
01020                 case BRANCH: {
01021                                 register char *save;
01022 
01023                                 if (OP(next) != BRANCH)         /* No choice. */
01024                                         next = OPERAND(scan);   /* Avoid recursion. */
01025                                 else {
01026                                         do {
01027                                                 save = reginput;
01028                                                 if (regmatch(OPERAND(scan)))
01029                                                         return(1);
01030                                                 reginput = save;
01031                                                 scan = regnext(scan);
01032                                         } while (scan != NULL && OP(scan) == BRANCH);
01033                                         return(0);
01034                                         /* NOTREACHED */
01035                                 }
01036                         }
01037                         break;
01038                 case STAR:
01039                 case PLUS: {
01040                                 register char nextch;
01041                                 register int no;
01042                                 register char *save;
01043                                 register int min;
01044 
01045                                 /*
01046                                  * Lookahead to avoid useless match attempts
01047                                  * when we know what character comes next.
01048                                  */
01049                                 nextch = '\0';
01050                                 if (OP(next) == EXACTLY)
01051                                         nextch = *OPERAND(next);
01052                                 min = (OP(scan) == STAR) ? 0 : 1;
01053                                 save = reginput;
01054                                 no = regrepeat(OPERAND(scan));
01055                                 while (no >= min) {
01056                                         /* If it could work, try it. */
01057                                         if (nextch == '\0' || *reginput == nextch)
01058                                                 if (regmatch(next))
01059                                                         return(1);
01060                                         /* Couldn't or didn't -- back up. */
01061                                         no--;
01062                                         reginput = save + no;
01063                                 }
01064                                 return(0);
01065                         }
01066                         break;
01067                 case END:
01068                         return(1);      /* Success! */
01069                         break;
01070                 default:
01071                         regerror("memory corruption");
01072                         return(0);
01073                         break;
01074                 }
01075 
01076                 scan = next;
01077         }
01078 
01079         /*
01080          * We get here only if there's trouble -- normally "case END" is
01081          * the terminating point.
01082          */
01083         regerror("corrupted pointers");
01084         return(0);
01085 }
01086 
01087 /*
01088  - regrepeat - repeatedly match something simple, report how many
01089  */
01090 static int
01091 regrepeat( char *p )
01092 {
01093         register int count = 0;
01094         register char *scan;
01095         register char *opnd;
01096 
01097         scan = reginput;
01098         opnd = OPERAND(p);
01099         switch (OP(p)) {
01100         case ANY:
01101                 count = strlen(scan);
01102                 scan += count;
01103                 break;
01104         case EXACTLY:
01105                 while (*opnd == *scan) {
01106                         count++;
01107                         scan++;
01108                 }
01109                 break;
01110         case ANYOF:
01111                 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
01112                         count++;
01113                         scan++;
01114                 }
01115                 break;
01116         case ANYBUT:
01117                 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
01118                         count++;
01119                         scan++;
01120                 }
01121                 break;
01122         default:                /* Oh dear.  Called inappropriately. */
01123                 regerror("internal foulup");
01124                 count = 0;      /* Best compromise. */
01125                 break;
01126         }
01127         reginput = scan;
01128 
01129         return(count);
01130 }
01131 
01132 /*
01133  - regnext - dig the "next" pointer out of a node
01134  */
01135 static char *
01136 regnext( register char *p )
01137 {
01138         register int offset;
01139 
01140         if (p == &regdummy)
01141                 return(NULL);
01142 
01143         offset = NEXT(p);
01144         if (offset == 0)
01145                 return(NULL);
01146 
01147         if (OP(p) == BACK)
01148                 return(p-offset);
01149         else
01150                 return(p+offset);
01151 }
01152 
01153 #ifdef DEBUG
01154 
01155 STATIC char *regprop();
01156 
01157 /*
01158  - regdump - dump a regexp onto stdout in vaguely comprehensible form
01159  */
01160 void
01161 regdump( regexp *r )
01162 {
01163         register char *s;
01164         register char op = EXACTLY;     /* Arbitrary non-END op. */
01165         register char *next;
01166 
01167 
01168         s = r->program + 1;
01169         while (op != END) {     /* While that wasn't END last time... */
01170                 op = OP(s);
01171                 printf("%2d%s", s-r->program, regprop(s));      /* Where, what. */
01172                 next = regnext(s);
01173                 if (next == NULL)               /* Next ptr. */
01174                         printf("(0)");
01175                 else 
01176                         printf("(%d)", (s-r->program)+(next-s));
01177                 s += 3;
01178                 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
01179                         /* Literal string, where present. */
01180                         while (*s != '\0') {
01181                                 putchar(*s);
01182                                 s++;
01183                         }
01184                         s++;
01185                 }
01186                 putchar('\n');
01187         }
01188 
01189         /* Header fields of interest. */
01190         if (r->regstart != '\0')
01191                 printf("start `%c' ", r->regstart);
01192         if (r->reganch)
01193                 printf("anchored ");
01194         if (r->regmust != NULL)
01195                 printf("must have \"%s\"", r->regmust);
01196         printf("\n");
01197 }
01198 
01199 /*
01200  - regprop - printable representation of opcode
01201  */
01202 static char *
01203 regprop( char *op )
01204 {
01205         register char *p;
01206         static char buf[50];
01207 
01208         (void) strcpy(buf, ":");
01209 
01210         switch (OP(op)) {
01211         case BOL:
01212                 p = "BOL";
01213                 break;
01214         case EOL:
01215                 p = "EOL";
01216                 break;
01217         case ANY:
01218                 p = "ANY";
01219                 break;
01220         case ANYOF:
01221                 p = "ANYOF";
01222                 break;
01223         case ANYBUT:
01224                 p = "ANYBUT";
01225                 break;
01226         case BRANCH:
01227                 p = "BRANCH";
01228                 break;
01229         case EXACTLY:
01230                 p = "EXACTLY";
01231                 break;
01232         case NOTHING:
01233                 p = "NOTHING";
01234                 break;
01235         case BACK:
01236                 p = "BACK";
01237                 break;
01238         case END:
01239                 p = "END";
01240                 break;
01241         case OPEN+1:
01242         case OPEN+2:
01243         case OPEN+3:
01244         case OPEN+4:
01245         case OPEN+5:
01246         case OPEN+6:
01247         case OPEN+7:
01248         case OPEN+8:
01249         case OPEN+9:
01250                 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
01251                 p = NULL;
01252                 break;
01253         case CLOSE+1:
01254         case CLOSE+2:
01255         case CLOSE+3:
01256         case CLOSE+4:
01257         case CLOSE+5:
01258         case CLOSE+6:
01259         case CLOSE+7:
01260         case CLOSE+8:
01261         case CLOSE+9:
01262                 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
01263                 p = NULL;
01264                 break;
01265         case STAR:
01266                 p = "STAR";
01267                 break;
01268         case PLUS:
01269                 p = "PLUS";
01270                 break;
01271         case WORDA:
01272                 p = "WORDA";
01273                 break;
01274         case WORDZ:
01275                 p = "WORDZ";
01276                 break;
01277         default:
01278                 regerror("corrupted opcode");
01279                 break;
01280         }
01281         if (p != NULL)
01282                 (void) strcat(buf, p);
01283         return(buf);
01284 }
01285 #endif
01286 
01287 /*
01288  * The following is provided for those people who do not have strcspn() in
01289  * their C libraries.  They should get off their butts and do something
01290  * about it; at least one public-domain implementation of those (highly
01291  * useful) string routines has been published on Usenet.
01292  */
01293 #ifdef STRCSPN
01294 /*
01295  * strcspn - find length of initial segment of s1 consisting entirely
01296  * of characters not from s2
01297  */
01298 
01299 static int
01300 strcspn(
01301         char *s1,
01302         char *s2 )
01303 {
01304         register char *scan1;
01305         register char *scan2;
01306         register int count;
01307 
01308         count = 0;
01309         for (scan1 = s1; *scan1 != '\0'; scan1++) {
01310                 for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
01311                         if (*scan1 == *scan2++)
01312                                 return(count);
01313                 count++;
01314         }
01315         return(count);
01316 }
01317 #endif

Generated on Mon Nov 8 17:07:47 2004 for MPT by  doxygen 1.3.9.1