core/vul/vul_reg_exp.cxx
Go to the documentation of this file.
00001 // This is core/vul/vul_reg_exp.cxx
00002 #include "vul_reg_exp.h"
00003 //:
00004 // \file
00005 //
00006 // Copyright (C) 1991 Texas Instruments Incorporated.
00007 //
00008 // Permission is granted to any individual or institution to use, copy, modify,
00009 // and distribute this software, provided that this complete copyright and
00010 // permission notice is maintained, intact, in all copies and supporting
00011 // documentation.
00012 //
00013 // Texas Instruments Incorporated provides this software "as is" without
00014 // express or implied warranty.
00015 //
00016 //
00017 // Created: MNF Jun 13, 1989 Initial Design and Implementation
00018 // Updated: LGO Aug 09, 1989 Inherit from Generic
00019 // Updated: MBN Sep 07, 1989 Added conditional exception handling
00020 // Updated: MBN Dec 15, 1989 Sprinkled "const" qualifiers all over the place!
00021 // Updated: DLS Mar 22, 1991 New lite version
00022 //
00023 // This  is the header file  for the regular  expression class.   An object of
00024 // this class contains a regular expression,  in  a special "compiled" format.
00025 // This  compiled format consists  of  several slots   all kept as the objects
00026 // private data.  The vul_reg_exp class  provides a convenient  way  to  represent
00027 // regular  expressions.  It makes it easy   to search  for  the  same regular
00028 // expression in many different strings without having to  compile a string to
00029 // regular expression format more than necessary.
00030 //
00031 // A regular  expression allows a programmer to  specify complex patterns that
00032 // can be searched for  and  matched against the  character string of a String
00033 // object.  In  its  simplest case, a   regular expression  is a  sequence  of
00034 // characters with which you can search for exact character matches.  However,
00035 // many times you may not know the exact sequence you want to find, or you may
00036 // only want to find a match at the beginning or end of  a String.  The vul_reg_exp
00037 // object  allows specification of  such patterns by  utilizing the  following
00038 // regular  expression  meta-characters   (note   that  more  one  of  these
00039 // meta-characters  can  be used in a single  regular  expression in  order to
00040 // create complex search patterns):
00041 //
00042 // -   ^  Match at beginning of line
00043 // -   $  Match at end of line
00044 // -   .  Match any single character
00045 // -   [ ]  Match any one character inside the brackets
00046 // -   [^ ] Match any character NOT inside the brackets
00047 // -   -  Match any character in range on either side of dash
00048 // -   *  Match preceding pattern zero or more times
00049 // -   +  Match preceding pattern one or more times
00050 // -   ?  Match preceding pattern zero or once only
00051 // -   ()   Save a matched expression and use it in a further match.
00052 //
00053 // There are three constructors for vul_reg_exp.  One  just creates an empty vul_reg_exp
00054 // object.  Another creates a vul_reg_exp object  and initializes it with a regular
00055 // expression  that is given  in  the form of a   char*.   The  third  takes a
00056 // reference  to  a vul_reg_exp  object  as an  argument  and creates an object
00057 // initialized with the information from the given vul_reg_exp object.
00058 //
00059 // The  find  member function  finds   the  first  occurrence   of  the regular
00060 // expression of that object in the string given to find as an argument.  Find
00061 // returns a boolean, and  if true,  mutates  the private  data appropriately.
00062 // Find sets pointers to the beginning and end of  the thing last  found, they
00063 // are pointers into the actual string  that was searched.   The start and end
00064 // member functions  return indices  into the searched string that  correspond
00065 // to the beginning   and  end pointers  respectively.   The  compile member
00066 // function takes a char* and puts the  compiled version of the char* argument
00067 // into the object's private data fields.  The == and  != operators only check
00068 // the  to see  if   the compiled  regular  expression   is the same, and  the
00069 // deep_equal functions also checks  to see if the  start and end pointers are
00070 // the same.  The is_valid  function returns false if  program is set to NULL,
00071 // (i.e. there is no valid compiled expression). The set_invalid function sets
00072 // the  program to NULL  (Warning: this deletes the compiled  expression). The
00073 // following examples may help clarify regular expression usage:
00074 //
00075 //   *  The regular expression  "^hello" matches  a "hello"  only at  the
00076 //    beginning of a  line.  It would match "hello  there" but not "hi,
00077 //    hello there".
00078 //
00079 //   *  The regular expression "long$" matches a  "long"  only at the end
00080 //    of a line. It would match "so long\0", but not "long ago".
00081 //
00082 //   *  The regular expression "t..t..g"  will match anything that  has a
00083 //    "t" then any two characters, another "t", any  two characters and
00084 //    then a "g".   It will match  "testing", or "test again" but would
00085 //    not match "toasting"
00086 //
00087 //   *  The regular  expression "[1-9ab]" matches any  number one through
00088 //    nine, and the characters  "a" and  "b".  It would match "hello 1"
00089 //    or "begin", but would not match "no-match".
00090 //
00091 //   *  The  regular expression "[^1-9ab]"  matches any character that is
00092 //    not a number one  through nine, or  an "a" or "b".   It would NOT
00093 //    match "hello 1" or "begin", but would match "no-match".
00094 //
00095 //   *  The regular expression "br* " matches  something that begins with
00096 //    a "b", is followed by zero or more "r"s, and ends in a space.  It
00097 //    would match "brrrrr ", and "b ", but would not match "brrh ".
00098 //
00099 //   *  The regular expression "br+ " matches something  that begins with
00100 //    a "b", is followed by one or more "r"s, and ends in  a space.  It
00101 //    would match "brrrrr ",  and  "br ", but would not  match "b  " or
00102 //    "brrh ".
00103 //
00104 //   *  The regular expression "br? " matches  something that begins with
00105 //    a "b", is followed by zero or one "r"s, and ends in  a space.  It
00106 //    would  match  "br ", and "b  ", but would not match  "brrrr "  or
00107 //    "brrh ".
00108 //
00109 //   *  The regular expression "(..p)b" matches  something ending with pb
00110 //    and beginning with whatever the two characters before the first p
00111 //    encountered in the line were. It would find  "repb" in "rep drepa
00112 //    qrepb".  The regular expression "(..p)a"  would find "repa qrepb"
00113 //    in "rep drepa qrepb"
00114 //
00115 //   *  The regular expression "d(..p)" matches something ending  with p,
00116 //    beginning with d, and having  two characters  in between that are
00117 //    the same as the two characters before  the first p encountered in
00118 //    the line.  It would match "drepa qrepb" in "rep drepa qrepb".
00119 
00120 #include <vcl_iostream.h>
00121 #include <vcl_cstring.h> // for strcspn()
00122 #include <vcl_cstddef.h> // for vcl_ptrdiff_t
00123 
00124 //: Copies the given regular expression.
00125 
00126 vul_reg_exp::vul_reg_exp (vul_reg_exp const& rxp)
00127 {
00128   int ind;
00129   this->progsize = rxp.progsize;      // Copy regular expression size
00130   this->program = new char[this->progsize]; // Allocate storage
00131   for (ind=this->progsize; ind-- != 0;)   // Copy regular expression
00132   this->program[ind] = rxp.program[ind];
00133   this->startp[0] = rxp.startp[0];      // Copy pointers into last
00134   this->endp[0] = rxp.endp[0];        // Successful "find" operation
00135   this->regmust = rxp.regmust;        // Copy field
00136   if (rxp.regmust != NULL) {
00137   char* dum = rxp.program;
00138   ind = 0;
00139   while (dum != rxp.regmust) {
00140     ++dum;
00141     ++ind;
00142   }
00143   this->regmust = this->program + ind;
00144   }
00145   this->regstart = rxp.regstart;      // Copy starting index
00146   this->reganch = rxp.reganch;        // Copy remaining private data
00147   this->regmlen = rxp.regmlen;        // Copy remaining private data
00148 }
00149 
00150 
00151 //: Returns true if two regular expressions have the same compiled program for pattern matching.
00152 
00153 bool vul_reg_exp::operator== (vul_reg_exp const& rxp) const
00154 {
00155   if (this != &rxp) {           // Same address?
00156   int ind = this->progsize;     // Get regular expression size
00157   if (ind != rxp.progsize)      // If different size regexp
00158     return false;               // Return failure
00159   while (ind-- != 0)            // Else while still characters
00160     if (this->program[ind] != rxp.program[ind])// If regexp are different
00161       return false;             // Return failure
00162   }
00163   return true;                  // Else same, return success
00164 }
00165 
00166 
00167 //: Returns true if have the same compiled regular expressions and the same start and end pointers.
00168 
00169 bool vul_reg_exp::deep_equal (vul_reg_exp const& rxp) const
00170 {
00171   int ind = this->progsize;     // Get regular expression size
00172   if (ind != rxp.progsize)      // If different size regexp
00173   return false;                 // Return failure
00174   while (ind-- != 0)            // Else while still characters
00175   if (this->program[ind] != rxp.program[ind]) // If regexp are different
00176     return false;               // Return failure
00177   return this->startp[0] == rxp.startp[0] &&  // Else if same start/end ptrs,
00178        this->endp[0] == rxp.endp[0];    // Return true
00179 }
00180 
00181 
00182 // The remaining code in this file is derived from the  regular expression code
00183 // whose  copyright statement appears  below.  It has been  changed to work
00184 // with the class concepts of C++ and COOL.
00185 
00186 //
00187 // compile and find
00188 //
00189 // Copyright (c) 1986 by University of Toronto.
00190 // Written by Henry Spencer.  Not derived from licensed software.
00191 //
00192 // Permission is granted to anyone to use this software for any
00193 // purpose on any computer system, and to redistribute it freely,
00194 // subject to the following restrictions:
00195 //
00196 // 1. The author is not responsible for the consequences of use of
00197 //  this software, no matter how awful, even if they arise
00198 //  from defects in it.
00199 //
00200 // 2. The origin of this software must not be misrepresented, either
00201 //  by explicit claim or by omission.
00202 //
00203 // 3. Altered versions must be plainly marked as such, and must not
00204 //  be misrepresented as being the original software.
00205 //
00206 // Beware that some of this code is subtly aware of the way operator
00207 // precedence is structured in regular expressions.  Serious changes in
00208 // regular-expression syntax might require a total rethink.
00209 //
00210 
00211 //
00212 // The "internal use only" fields in regexp.h are present to pass info from
00213 // compile to execute that permits the execute phase to run lots faster on
00214 // simple cases.  They are:
00215 //
00216 // regstart  char that must begin a match; '\0' if none obvious
00217 // reganch   is the match anchored (at beginning-of-line only)?
00218 // regmust   string (pointer into program) that match must include, or NULL
00219 // regmlen   length of regmust string
00220 //
00221 // Regstart and reganch permit very fast decisions on suitable starting points
00222 // for a match, cutting down the work a lot.  Regmust permits fast rejection
00223 // of lines that cannot possibly match.  The regmust tests are costly enough
00224 // that compile() supplies a regmust only if the r.e. contains something
00225 // potentially expensive (at present, the only such thing detected is * or +
00226 // at the start of the r.e., which can involve a lot of backup).  Regmlen is
00227 // supplied because the test in find() needs it and compile() is computing
00228 // it anyway.
00229 //
00230 
00231 //
00232 // Structure for regexp "program".  This is essentially a linear encoding
00233 // of a nondeterministic finite-state machine (aka syntax charts or
00234 // "railroad normal form" in parsing technology).  Each node is an opcode
00235 // plus a "next" pointer, possibly plus an operand.  "Next" pointers of
00236 // all nodes except BRANCH implement concatenation; a "next" pointer with
00237 // a BRANCH on both ends of it is connecting two alternatives.  (Here we
00238 // have one of the subtle syntax dependencies:  an individual BRANCH (as
00239 // opposed to a collection of them) is never concatenated with anything
00240 // because of operator precedence.)  The operand of some types of node is
00241 // a literal string; for others, it is a node leading into a sub-FSM.  In
00242 // particular, the operand of a BRANCH node is the first node of the branch.
00243 // (NB this is \e not a tree structure:  the tail of the branch connects
00244 // to the thing following the set of BRANCHes.)  The opcodes are:
00245 //
00246 
00247 // definition   number  opnd?   meaning
00248 #define END   0  // no   End of program.
00249 #define BOL   1  // no   Match "" at beginning of line.
00250 #define EOL   2  // no   Match "" at end of line.
00251 #define ANY   3  // no   Match any one character.
00252 #define ANYOF   4  // str  Match any character in this string.
00253 #define ANYBUT  5  // str  Match any character not in this string.
00254 #define BRANCH  6  // node Match this alternative, or the next...
00255 #define BACK  7  // no   Match "", "next" ptr points backward.
00256 #define EXACTLY 8  // str  Match this string.
00257 #define NOTHING 9  // no   Match empty string.
00258 #define STAR  10   // node Match this (simple) thing 0 or more times.
00259 #define PLUS  11   // node Match this (simple) thing 1 or more times.
00260 #define OPEN  20   // no   Mark this point in input as start of #n.
00261 // OPEN+1 is number 1, etc.
00262 #define CLOSE   30   // no   Analogous to OPEN.
00263 
00264 //
00265 // Opcode notes:
00266 //
00267 // BRANCH     The set of branches constituting a single choice are hooked
00268 //        together with their "next" pointers, since precedence prevents
00269 //        anything being concatenated to any individual branch.  The
00270 //        "next" pointer of the last BRANCH in a choice points to the
00271 //        thing following the whole choice.  This is also where the
00272 //        final "next" pointer of each individual branch points; each
00273 //        branch starts with the operand node of a BRANCH node.
00274 //
00275 // BACK     Normal "next" pointers all implicitly point forward; BACK
00276 //        exists to make loop structures possible.
00277 //
00278 // STAR,PLUS  '?', and complex '*' and '+', are implemented as circular
00279 //        BRANCH structures using BACK.  Simple cases (one character
00280 //        per match) are implemented with STAR and PLUS for speed
00281 //        and to minimize recursive plunges.
00282 //
00283 // OPEN,CLOSE   ...are numbered at compile time.
00284 //
00285 
00286 //
00287 // A node is one char of opcode followed by two chars of "next" pointer.
00288 // "Next" pointers are stored as two 8-bit pieces, high order first.  The
00289 // value is a positive offset from the opcode of the node containing it.
00290 // An operand, if any, simply follows the node.  (Note that much of the
00291 // code generation knows about this implicit relationship.)
00292 //
00293 // Using two bytes for the "next" pointer is vast overkill for most things,
00294 // but allows patterns to get big without disasters.
00295 //
00296 
00297 #define OP(p)       (*(p))
00298 #define NEXT(p)     (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
00299 #define OPERAND(p)    ((p) + 3)
00300 
00301 const unsigned char MAGIC = 0234;
00302 
00303 //
00304 // Utility definitions.
00305 //
00306 #define UCHARAT(p)    ((const unsigned char*)(p))[0]
00307 
00308 #define FAIL(m) { regerror(m); return NULL; }
00309 #define ISMULT(c)     ((c) == '*' || (c) == '+' || (c) == '?')
00310 #define META  "^$.[()|?+*\\"
00311 
00312 //
00313 // Flags to be passed up and down.
00314 //
00315 #define HASWIDTH    01    // Known never to match null string.
00316 #define SIMPLE      02    // Simple enough to be STAR/PLUS operand.
00317 #define SPSTART     04    // Starts with * or +.
00318 #define WORST       0     // Worst case.
00319 
00320 
00321 //: Return an expression that will match precisely c
00322 // The returned string is owned by the function, and
00323 // will be overwritten in subsequent calls.
00324 const char * vul_reg_exp::protect(char c)
00325 {
00326   //: This should be in thread local storage.
00327   static char pattern[3];
00328 
00329   if (vcl_strchr(META, c) != 0)
00330   {
00331     pattern[0] = '\\';
00332     pattern[1] = c;
00333     pattern[2] = 0;
00334   }
00335   else
00336   {
00337     pattern[0] = c;
00338     pattern[1] = 0;
00339   }
00340   return pattern;
00341 }
00342 
00343 
00344 /////////////////////////////////////////////////////////////////////////
00345 //
00346 //  COMPILE AND ASSOCIATED FUNCTIONS
00347 //
00348 /////////////////////////////////////////////////////////////////////////
00349 
00350 
00351 //
00352 // Global work variables for compile().
00353 //
00354 static const char* regparse; // Input-scan pointer.
00355 static       int   regnpar; // () count.
00356 static       char  regdummy;
00357 static       char* regcode; // Code-emit pointer; &regdummy = don't.
00358 static       long  regsize; // Code size.
00359 
00360 //
00361 // Forward declarations for compile()'s friends.
00362 //
00363 static       char* reg (int, int*);
00364 static       char* regbranch (int*);
00365 static       char* regpiece (int*);
00366 static       char* regatom (int*);
00367 static       char* regnode (char);
00368 static const char* regnext (register const char*);
00369 static       char* regnext (register char*);
00370 static void        regc (unsigned char);
00371 static void        reginsert (char, char*);
00372 static void        regtail (char*, const char*);
00373 static void        regoptail (char*, const char*);
00374 
00375 
00376 //
00377 // We can't allocate space until we know how big the compiled form will be,
00378 // but we can't compile it (and thus know how big it is) until we've got a
00379 // place to put the code.  So we cheat:  we compile it twice, once with code
00380 // generation turned off and size counting turned on, and once "for real".
00381 // This also means that we don't allocate space until we are sure that the
00382 // thing really will compile successfully, and we never have to move the
00383 // code and thus invalidate pointers into it.  (Note that it has to be in
00384 // one piece because free() must be able to free it all.)
00385 //
00386 // Beware that the optimization-preparation code in here knows about some
00387 // of the structure of the compiled regexp.
00388 //
00389 
00390 
00391 //: Compile a regular expression into internal code for later pattern matching.
00392 
00393 void vul_reg_exp::compile (char const* exp)
00394 {
00395   register const char*   scan;
00396   register const char*   longest;
00397   register unsigned long len;
00398            int           flags;
00399 
00400   if (exp == NULL) {
00401     //RAISE Error, SYM(vul_reg_exp), SYM(No_Expr),
00402     vcl_cout << "vul_reg_exp::compile(): No expression supplied.\n";
00403     return;
00404   }
00405 
00406   // First pass: determine size, legality.
00407   regparse = exp;
00408   regnpar = 1;
00409   regsize = 0L;
00410   regcode = &regdummy;
00411   regc(MAGIC);
00412   if (!reg(0, &flags))
00413   {
00414     vcl_cout << "vul_reg_exp::compile(): Error in compile.\n";
00415     return;
00416   }
00417   this->startp[0] = this->endp[0] = this->searchstring = NULL;
00418 
00419   // Small enough for pointer-storage convention?
00420   if (regsize >= 32767L) // Probably could be 65535L.
00421   {
00422     //RAISE Error, SYM(vul_reg_exp), SYM(Expr_Too_Big),
00423     vcl_cout << "vul_reg_exp::compile(): Expression too big.\n";
00424     return;
00425   }
00426 
00427   // Allocate space.
00428 //#ifndef VCL_WIN32
00429   if (this->program != NULL) delete [] this->program;
00430 //#endif
00431   this->program = new char[regsize];
00432   this->progsize = (int) regsize;
00433 
00434   if (this->program == NULL) {
00435     //RAISE Error, SYM(vul_reg_exp), SYM(Out_Of_Memory),
00436     vcl_cout << "vul_reg_exp::compile(): Out of memory.\n";
00437     return;
00438   }
00439 
00440   // Second pass: emit code.
00441   regparse = exp;
00442   regnpar = 1;
00443   regcode = this->program;
00444   regc(MAGIC);
00445   reg(0, &flags);
00446 
00447   // Dig out information for optimizations.
00448   this->regstart = '\0'; // Worst-case defaults.
00449   this->reganch = 0;
00450   this->regmust = NULL;
00451   this->regmlen = 0;
00452   scan = this->program + 1; // First BRANCH.
00453   if (OP(regnext(scan)) == END) // Only one top-level choice.
00454   {
00455     scan = OPERAND(scan);
00456 
00457     // Starting-point info.
00458     if (OP(scan) == EXACTLY)
00459       this->regstart = *OPERAND(scan);
00460     else if (OP(scan) == BOL)
00461       this->reganch++;
00462 
00463      //
00464      // If there's something expensive in the r.e., find the longest
00465      // literal string that must appear and make it the regmust.  Resolve
00466      // ties in favor of later strings, since the regstart check works
00467      // with the beginning of the r.e. and avoiding duplication
00468      // strengthens checking.  Not a strong reason, but sufficient in the
00469      // absence of others.
00470      //
00471     if (flags & SPSTART) {
00472       longest = NULL;
00473       len = 0L;
00474       for (; scan != NULL; scan = regnext(scan))
00475         if (OP(scan) == EXACTLY && vcl_strlen(OPERAND(scan)) >= len) {
00476           longest = OPERAND(scan);
00477           len = (unsigned long)vcl_strlen(OPERAND(scan));
00478         }
00479       this->regmust = longest;
00480       this->regmlen = (int)len;
00481     }
00482   }
00483 }
00484 
00485 
00486 // regular expression, i.e. main body or parenthesized thing
00487 //
00488 // Caller must absorb opening parenthesis.
00489 //
00490 // Combining parenthesis handling with the base level of regular expression
00491 // is a trifle forced, but the need to tie the tails of the branches to what
00492 // follows makes it hard to avoid.
00493 //
00494 static char* reg (int paren, int *flagp)
00495 {
00496   register char* ret;
00497   register char* br;
00498   register char* ender;
00499   register int   parno =0;
00500            int   flags;
00501 
00502   *flagp = HASWIDTH; // Tentatively.
00503 
00504   // Make an OPEN node, if parenthesized.
00505   if (paren) {
00506     if (regnpar >= vul_reg_exp_nsubexp) {
00507       //RAISE Error, SYM(vul_reg_exp), SYM(Too_Many_Parens),
00508       vcl_cout << "vul_reg_exp::compile(): Too many parentheses.\n";
00509       return 0;
00510     }
00511     parno = regnpar;
00512     regnpar++;
00513     ret = regnode(char(OPEN + parno));
00514   }
00515   else
00516     ret = NULL;
00517 
00518   // Pick up the branches, linking them together.
00519   br = regbranch(&flags);
00520   if (br == NULL)
00521     return NULL;
00522   if (ret != NULL)
00523     regtail(ret, br); // OPEN -> first.
00524   else
00525     ret = br;
00526   if (!(flags & HASWIDTH))
00527     *flagp &= ~HASWIDTH;
00528   *flagp |= flags & SPSTART;
00529   while (*regparse == '|')
00530   {
00531     regparse++;
00532     br = regbranch(&flags);
00533     if (br == NULL)
00534       return NULL;
00535     regtail(ret, br); // BRANCH -> BRANCH.
00536     if (!(flags & HASWIDTH))
00537       *flagp &= ~HASWIDTH;
00538     *flagp |= flags & SPSTART;
00539   }
00540 
00541   // Make a closing node, and hook it on the end.
00542   ender = regnode((paren) ? char(CLOSE + parno) : char(END));
00543   regtail(ret, ender);
00544 
00545   // Hook the tails of the branches to the closing node.
00546   for (br = ret; br != NULL; br = regnext(br))
00547     regoptail(br, ender);
00548 
00549   // Check for proper termination.
00550   if (paren && *regparse++ != ')') {
00551     //RAISE Error, SYM(vul_reg_exp), SYM(Unmatched_Parens),
00552     vcl_cout << "vul_reg_exp::compile(): Unmatched parentheses.\n";
00553     return 0;
00554   }
00555   else if (!paren && *regparse != '\0') {
00556     if (*regparse == ')') {
00557       //RAISE Error, SYM(vul_reg_exp), SYM(Unmatched_Parens),
00558       vcl_cout << "vul_reg_exp::compile(): Unmatched parentheses.\n";
00559       return 0;
00560     }
00561     else {
00562       //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
00563       vcl_cout << "vul_reg_exp::compile(): Internal error.\n";
00564       return 0;
00565     }
00566     // NOTREACHED
00567   }
00568   return ret;
00569 }
00570 
00571 
00572 // one alternative of an | operator
00573 //
00574 // Implements the concatenation operator.
00575 //
00576 static char* regbranch (int *flagp)
00577 {
00578   register char* ret;
00579   register char* chain;
00580   register char* latest;
00581   int            flags;
00582 
00583   *flagp = WORST; // Tentatively.
00584 
00585   ret = regnode(BRANCH);
00586   chain = NULL;
00587   while (*regparse != '\0' && *regparse != '|' && *regparse != ')')
00588   {
00589     latest = regpiece(&flags);
00590     if (latest == NULL)
00591       return NULL;
00592     *flagp |= flags & HASWIDTH;
00593     if (chain == NULL) // First piece.
00594       *flagp |= flags & SPSTART;
00595     else
00596       regtail(chain, latest);
00597     chain = latest;
00598   }
00599   if (chain == NULL) // Loop ran zero times.
00600     regnode(NOTHING);
00601 
00602   return ret;
00603 }
00604 
00605 
00606 //
00607 // something followed by possible [*+?]
00608 //
00609 // Note that the branching code sequences used for ? and the general cases
00610 // of * and + are somewhat optimized:  they use the same NOTHING node as
00611 // both the endmarker for their branch list and the body of the last branch.
00612 // It might seem that this node could be dispensed with entirely, but the
00613 // endmarker role is not redundant.
00614 //
00615 static char* regpiece (int *flagp)
00616 {
00617   register char* ret;
00618   register char  op;
00619   register char* next;
00620   int            flags;
00621 
00622   ret = regatom(&flags);
00623   if (ret == NULL)
00624     return NULL;
00625 
00626   op = *regparse;
00627   if (!ISMULT(op)) {
00628     *flagp = flags;
00629     return ret;
00630   }
00631 
00632   if (!(flags & HASWIDTH) && op != '?') {
00633     //RAISE Error, SYM(vul_reg_exp), SYM(Empty_Operand),
00634     vcl_cout << "vul_reg_exp::compile() : *+ operand could be empty.\n";
00635     return 0;
00636   }
00637   *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
00638 
00639   if (op == '*' && (flags & SIMPLE))
00640     reginsert(STAR, ret);
00641   else if (op == '*') {
00642     // Emit x* as (x&|), where & means "self".
00643     reginsert(BRANCH, ret); // Either x
00644     regoptail(ret, regnode(BACK)); // and loop
00645     regoptail(ret, ret); // back
00646     regtail(ret, regnode(BRANCH)); // or
00647     regtail(ret, regnode(NOTHING)); // null.
00648   }
00649   else if (op == '+' && (flags & SIMPLE))
00650     reginsert(PLUS, ret);
00651   else if (op == '+') {
00652     // Emit x+ as x(&|), where & means "self".
00653     next = regnode(BRANCH); // Either
00654     regtail(ret, next);
00655     regtail(regnode(BACK), ret); // loop back
00656     regtail(next, regnode(BRANCH)); // or
00657     regtail(ret, regnode(NOTHING)); // null.
00658   }
00659   else if (op == '?') {
00660     // Emit x? as (x|)
00661     reginsert(BRANCH, ret); // Either x
00662     regtail(ret, regnode(BRANCH)); // or
00663     next = regnode(NOTHING);// null.
00664     regtail(ret, next);
00665     regoptail(ret, next);
00666   }
00667   regparse++;
00668   if (ISMULT(*regparse)) {
00669     //RAISE Error, SYM(vul_reg_exp), SYM(Nested_Operand),
00670     vcl_cout << "vul_reg_exp::compile(): Nested *?+.\n";
00671     return 0;
00672   }
00673   return ret;
00674 }
00675 
00676 
00677 // the lowest level
00678 //
00679 // Optimization:  gobbles an entire sequence of ordinary characters so that
00680 // it can turn them into a single node, which is smaller to store and
00681 // faster to run.  Backslashed characters are exceptions, each becoming a
00682 // separate node; the code is simpler that way and it's not worth fixing.
00683 //
00684 static char* regatom (int *flagp)
00685 {
00686   register char* ret;
00687        int   flags;
00688 
00689   *flagp = WORST; // Tentatively.
00690 
00691   switch (*regparse++)
00692   {
00693    case '^':
00694     ret = regnode(BOL);
00695     break;
00696    case '$':
00697     ret = regnode(EOL);
00698     break;
00699    case '.':
00700     ret = regnode(ANY);
00701     *flagp |= HASWIDTH | SIMPLE;
00702     break;
00703    case '[':
00704    {
00705     register int  rxpclass;
00706     register int  rxpclassend;
00707 
00708     if (*regparse == '^') { // Complement of range.
00709       ret = regnode(ANYBUT);
00710       regparse++;
00711     }
00712     else
00713       ret = regnode(ANYOF);
00714     if (*regparse == ']' || *regparse == '-')
00715       regc(*regparse++);
00716     while (*regparse != '\0' && *regparse != ']')
00717     {
00718       if (*regparse == '-')
00719       {
00720         regparse++;
00721         if (*regparse == ']' || *regparse == '\0')
00722           regc('-');
00723         else {
00724           rxpclass = UCHARAT(regparse - 2) + 1;
00725           rxpclassend = UCHARAT(regparse);
00726           if (rxpclass > rxpclassend + 1) {
00727              //RAISE Error, SYM(vul_reg_exp), SYM(Invalid_Range),
00728              vcl_cout << "vul_reg_exp::compile(): Invalid range in [].\n";
00729              return 0;
00730           }
00731           for (; rxpclass <= rxpclassend; rxpclass++)
00732             regc(static_cast<unsigned char>(rxpclass));
00733           regparse++;
00734         }
00735       }
00736       else
00737         regc(*regparse++);
00738     }
00739     regc('\0');
00740     if (*regparse != ']') {
00741       //RAISE Error, SYM(vul_reg_exp), SYM(Unmatched_Bracket),
00742       vcl_cout << "vul_reg_exp::compile(): Unmatched [].\n";
00743       return 0;
00744     }
00745     regparse++;
00746     *flagp |= HASWIDTH | SIMPLE;
00747     break;
00748    }
00749    case '(':
00750     ret = reg(1, &flags);
00751     if (ret == NULL)
00752       return NULL;
00753     *flagp |= flags & (HASWIDTH | SPSTART);
00754     break;
00755    case '\0':
00756    case '|':
00757    case ')':
00758     //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
00759     vcl_cout << "vul_reg_exp::compile(): Internal error.\n"; // Never here
00760     return 0;
00761    case '?':
00762    case '+':
00763    case '*':
00764     //RAISE Error, SYM(vul_reg_exp), SYM(No_Operand),
00765     vcl_cout << "vul_reg_exp::compile(): ?+* follows nothing.\n";
00766     return 0;
00767    case '\\':
00768     if (*regparse == '\0') {
00769       //RAISE Error, SYM(vul_reg_exp), SYM(Trailing_Backslash),
00770       vcl_cout << "vul_reg_exp::compile(): Trailing backslash.\n";
00771       return 0;
00772     }
00773     ret = regnode(EXACTLY);
00774     regc(*regparse++);
00775     regc('\0');
00776     *flagp |= HASWIDTH | SIMPLE;
00777     break;
00778    default:
00779    {
00780     register int  len;
00781     register char   ender;
00782 
00783     regparse--;
00784     len = (int)vcl_strcspn(regparse, META);
00785     if (len <= 0) {
00786       //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
00787       vcl_cout << "vul_reg_exp::compile(): Internal error.\n";
00788       return 0;
00789     }
00790     ender = *(regparse + len);
00791     if (len > 1 && ISMULT(ender))
00792       len--; // Back off clear of ?+* operand.
00793     *flagp |= HASWIDTH;
00794     if (len == 1)
00795       *flagp |= SIMPLE;
00796     ret = regnode(EXACTLY);
00797     while (len > 0) {
00798       regc(*regparse++);
00799       len--;
00800     }
00801     regc('\0');
00802     break;
00803    }
00804   }
00805   return ret;
00806 }
00807 
00808 
00809 // emit a node
00810 // Location.
00811 //
00812 static char* regnode (char op)
00813 {
00814   register char* ret;
00815   register char* ptr;
00816 
00817   ret = regcode;
00818   if (ret == &regdummy) {
00819     regsize += 3;
00820     return ret;
00821   }
00822 
00823   ptr = ret;
00824   *ptr++ = op;
00825   *ptr++ = '\0'; // Null "next" pointer.
00826   *ptr++ = '\0';
00827   regcode = ptr;
00828 
00829   return ret;
00830 }
00831 
00832 
00833 // emit (if appropriate) a byte of code
00834 //
00835 static void regc (unsigned char b)
00836 {
00837   if (regcode != &regdummy)
00838     *regcode++ = b;
00839   else
00840     regsize++;
00841 }
00842 
00843 
00844 // insert an operator in front of already-emitted operand
00845 //
00846 // Means relocating the operand.
00847 //
00848 static void reginsert (char op, char* opnd)
00849 {
00850   register char* src;
00851   register char* dst;
00852   register char* place;
00853 
00854   if (regcode == &regdummy) {
00855     regsize += 3;
00856     return;
00857   }
00858 
00859   src = regcode;
00860   regcode += 3;
00861   dst = regcode;
00862   while (src > opnd)
00863     *--dst = *--src;
00864 
00865   place = opnd; // Op node, where operand used to be.
00866   *place++ = op;
00867   *place++ = '\0';
00868   *place   = '\0';
00869 }
00870 
00871 
00872 // set the next-pointer at the end of a node chain
00873 //
00874 static void regtail (char* p, const char* val)
00875 {
00876   register char* scan;
00877   register char* temp;
00878   register vcl_ptrdiff_t  offset;
00879 
00880   if (p == &regdummy)
00881     return;
00882 
00883   // Find last node.
00884   scan = p;
00885   for (;;) {
00886     temp = regnext(scan);
00887     if (temp == NULL)
00888       break;
00889     scan = temp;
00890   }
00891 
00892   if (OP(scan) == BACK)
00893     offset = (const char*)scan - val;
00894   else
00895     offset = val - scan;
00896   *(scan + 1) = (char)((offset >> 8) & 0377);
00897   *(scan + 2) = (char)(offset & 0377);
00898 }
00899 
00900 
00901 // regtail on operand of first argument; nop if operandless
00902 //
00903 static void regoptail (char* p, const char* val)
00904 {
00905   // "Operandless" and "op != BRANCH" are synonymous in practice.
00906   if (p == NULL || p == &regdummy || OP(p) != BRANCH)
00907     return;
00908   regtail(OPERAND(p), val);
00909 }
00910 
00911 
00912 ////////////////////////////////////////////////////////////////////////
00913 //
00914 //  find and friends
00915 //
00916 ////////////////////////////////////////////////////////////////////////
00917 
00918 
00919 //
00920 // Global work variables for find().
00921 //
00922 static const char*  reginput; // String-input pointer.
00923 static const char*  regbol; // Beginning of input, for ^ check.
00924 static const char* *regstartp; // Pointer to startp array.
00925 static const char* *regendp; // Ditto for endp.
00926 
00927 //
00928 // Forwards.
00929 //
00930 static int regtry (const char*, const char* *, const char* *, const char*);
00931 static int regmatch (const char*);
00932 static int regrepeat (const char*);
00933 
00934 #ifdef DEBUG
00935 int      regnarrate = 0;
00936 void     regdump ();
00937 static char* regprop ();
00938 #endif
00939 
00940 bool vul_reg_exp::find (vcl_string const& s)
00941 {
00942   return find(s.c_str());
00943 }
00944 
00945 
00946 //: Matches the regular expression to the given string.
00947 // Returns true if found, and sets start and end indexes accordingly.
00948 
00949 bool vul_reg_exp::find (char const* string)
00950 {
00951   register const char* s;
00952 
00953   this->searchstring = string;
00954 
00955    // Check validity of program.
00956   if (!this->program || UCHARAT(this->program) != MAGIC) {
00957     //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
00958     vcl_cout << "vul_reg_exp::find(): Compiled regular expression corrupted.\n";
00959     return 0;
00960   }
00961 
00962   // If there is a "must appear" string, look for it.
00963   if (this->regmust != NULL)
00964   {
00965     s = string;
00966     while ((s = vcl_strchr(s, this->regmust[0])) != NULL) {
00967       if (vcl_strncmp(s, this->regmust, this->regmlen) == 0)
00968         break; // Found it.
00969       s++;
00970     }
00971     if (s == NULL) // Not present.
00972       return 0;
00973   }
00974 
00975   // Mark beginning of line for ^ .
00976   regbol = string;
00977 
00978   // Simplest case:  anchored match need be tried only once.
00979   if (this->reganch)
00980     return regtry(string, this->startp, this->endp, this->program) != 0;
00981 
00982   // Messy cases:  unanchored match.
00983   s = string;
00984   if (this->regstart != '\0')
00985     // We know what char it must start with.
00986     while ((s = vcl_strchr(s, this->regstart)) != NULL) {
00987       if (regtry(s, this->startp, this->endp, this->program))
00988         return 1;
00989       s++;
00990     }
00991   else
00992     // We don't - general case.
00993     do {
00994       if (regtry(s, this->startp, this->endp, this->program))
00995         return 1;
00996     } while (*s++ != '\0');
00997 
00998   // Failure.
00999   return 0;
01000 }
01001 
01002 
01003 // try match at specific point
01004 // 0 failure, 1 success
01005 //
01006 static int regtry(const char* string, const char* *start,
01007                   const char* *end, const char* prog)
01008 {
01009   register       int   i;
01010   register const char* *sp1;
01011   register const char* *ep;
01012 
01013   reginput = string;
01014   regstartp = start;
01015   regendp = end;
01016 
01017   sp1 = start;
01018   ep = end;
01019   for (i = vul_reg_exp_nsubexp; i > 0; i--) {
01020     *sp1++ = NULL;
01021     *ep++ = NULL;
01022   }
01023   if (regmatch(prog + 1)) {
01024     start[0] = string;
01025     end[0] = reginput;
01026     return 1;
01027   }
01028   else
01029     return 0;
01030 }
01031 
01032 
01033 // main matching routine
01034 //
01035 // Conceptually the strategy is simple:  check to see whether the current
01036 // node matches, call self recursively to see whether the rest matches,
01037 // and then act accordingly.  In practice we make some effort to avoid
01038 // recursion, in particular by going through "ordinary" nodes (that don't
01039 // need to know whether the rest of the match failed) by a loop instead of
01040 // by recursion.
01041 // 0 failure, 1 success
01042 //
01043 static int regmatch(const char* prog)
01044 {
01045   register const char* scan; // Current node.
01046   const char* next; // Next node.
01047 
01048   scan = prog;
01049 
01050   while (scan != NULL)
01051   {
01052     next = regnext(scan);
01053 
01054     switch (OP(scan))
01055     {
01056      case BOL:
01057       if (reginput != regbol)
01058         return 0;
01059       break;
01060      case EOL:
01061       if (*reginput != '\0')
01062         return 0;
01063       break;
01064      case ANY:
01065       if (*reginput == '\0')
01066         return 0;
01067       reginput++;
01068       break;
01069      case EXACTLY:
01070      {
01071       register int     len;
01072       register const char* opnd;
01073 
01074       opnd = OPERAND(scan);
01075       // Inline the first character, for speed.
01076       if (*opnd != *reginput)
01077         return 0;
01078       len = (int)vcl_strlen(opnd);
01079       if (len > 1 && vcl_strncmp(opnd, reginput, len) != 0)
01080         return 0;
01081       reginput += len;
01082       break;
01083      }
01084      case ANYOF:
01085       if (*reginput == '\0' || vcl_strchr(OPERAND(scan), *reginput) == NULL)
01086         return 0;
01087       reginput++;
01088       break;
01089      case ANYBUT:
01090       if (*reginput == '\0' || vcl_strchr(OPERAND(scan), *reginput) != NULL)
01091         return 0;
01092       reginput++;
01093       break;
01094      case NOTHING:
01095       break;
01096      case BACK:
01097       break;
01098      case OPEN+1: case OPEN+2: case OPEN+3: case OPEN+4: case OPEN+5: case OPEN+6: case OPEN+7: case OPEN+8: case OPEN+9:
01099      {
01100       register int no;
01101       register const char* save;
01102 
01103       no = OP(scan) - OPEN;
01104       save = reginput;
01105 
01106       if (regmatch(next))
01107       {
01108         //
01109         // Don't set startp if some later invocation of the
01110         // same parentheses already has.
01111         //
01112         if (regstartp[no] == NULL)
01113           regstartp[no] = save;
01114         return 1;
01115       }
01116       else
01117         return 0;
01118      }
01119      case CLOSE+1: case CLOSE+2: case CLOSE+3: case CLOSE+4: case CLOSE+5: case CLOSE+6: case CLOSE+7: case CLOSE+8: case CLOSE+9:
01120      {
01121       register     int  no;
01122       register const char* save;
01123 
01124       no = OP(scan) - CLOSE;
01125       save = reginput;
01126 
01127       if (regmatch(next))
01128       {
01129         //
01130         // Don't set endp if some later invocation of the
01131         // same parentheses already has.
01132         //
01133         if (regendp[no] == NULL)
01134           regendp[no] = save;
01135         return 1;
01136       }
01137       else
01138         return 0;
01139      }
01140      case BRANCH:
01141      {
01142       register const char* save;
01143 
01144       if (OP(next) != BRANCH) // No choice.
01145         next = OPERAND(scan); // Avoid recursion.
01146       else {
01147         do {
01148           save = reginput;
01149           if (regmatch(OPERAND(scan)))
01150             return 1;
01151           reginput = save;
01152           scan = regnext(scan);
01153         } while (scan != NULL && OP(scan) == BRANCH);
01154         return 0;
01155         // NOTREACHED
01156       }
01157       break;
01158      }
01159      case STAR:
01160      case PLUS:
01161      {
01162       register char   nextch;
01163       register int    no;
01164       register const char* save;
01165       register int    min_no;
01166 
01167       //
01168       // Lookahead to avoid useless match attempts when we know
01169       // what character comes next.
01170       //
01171       nextch = '\0';
01172       if (OP(next) == EXACTLY)
01173         nextch = *OPERAND(next);
01174       min_no = (OP(scan) == STAR) ? 0 : 1;
01175       save = reginput;
01176       no = regrepeat(OPERAND(scan));
01177       while (no >= min_no) {
01178         // If it could work, try it.
01179         if (nextch == '\0' || *reginput == nextch)
01180           if (regmatch(next))
01181             return 1;
01182         // Couldn't or didn't - back up.
01183         no--;
01184         reginput = save + no;
01185       }
01186       return 0;
01187      }
01188      case END:
01189       return 1; // Success!
01190 
01191      default:
01192       //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
01193       vcl_cout << "vul_reg_exp::find(): Internal error -- memory corrupted.\n";
01194       return 0;
01195     }
01196     scan = next;
01197   }
01198 
01199   //
01200   //  We get here only if there's trouble - normally "case END" is the
01201   //  terminating point.
01202   //
01203   //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
01204   vcl_cout << "vul_reg_exp::find(): Internal error -- corrupted pointers.\n";
01205   return 0;
01206 }
01207 
01208 
01209 // repeatedly match something simple, report how many
01210 //
01211 static int regrepeat(const char* p)
01212 {
01213   register     int     count = 0;
01214   register const char* scan;
01215   register const char* opnd;
01216 
01217   scan = reginput;
01218   opnd = OPERAND(p);
01219   switch (OP(p))
01220   {
01221    case ANY:
01222     count = (int)vcl_strlen(scan);
01223     scan += count;
01224     break;
01225    case EXACTLY:
01226     while (*opnd == *scan) {
01227       count++;
01228       scan++;
01229     }
01230     break;
01231    case ANYOF:
01232     while (*scan != '\0' && vcl_strchr(opnd, *scan) != NULL) {
01233       count++;
01234       scan++;
01235     }
01236     break;
01237    case ANYBUT:
01238     while (*scan != '\0' && vcl_strchr(opnd, *scan) == NULL) {
01239       count++;
01240       scan++;
01241     }
01242     break;
01243    default: // Oh dear.  Called inappropriately.
01244     //RAISE Error, SYM(vul_reg_exp), SYM(Internal_Error),
01245     vcl_cout << "vul_reg_exp::find(): Internal error.\n";
01246     return 0;
01247   }
01248   reginput = scan;
01249   return count;
01250 }
01251 
01252 
01253 // dig the "next" pointer out of a node
01254 //
01255 static const char* regnext(register const char* p)
01256 {
01257   register int offset;
01258 
01259   if (p == &regdummy)
01260     return NULL;
01261 
01262   offset = NEXT(p);
01263   if (offset == 0)
01264     return NULL;
01265 
01266   if (OP(p) == BACK)
01267     return p - offset;
01268   else
01269     return p + offset;
01270 }
01271 
01272 
01273 static char* regnext(register char* p)
01274 {
01275   register int offset;
01276 
01277   if (p == &regdummy)
01278     return NULL;
01279 
01280   offset = NEXT(p);
01281   if (offset == 0)
01282     return NULL;
01283 
01284   if (OP(p) == BACK)
01285     return p - offset;
01286   else
01287     return p + offset;
01288 }