core/vul/vul_reg_exp.h
Go to the documentation of this file.
00001 // This is core/vul/vul_reg_exp.h
00002 #ifndef vul_reg_exph
00003 #define vul_reg_exph
00004 //:
00005 // \file
00006 // \brief contains class for pattern matching with regular expressions
00007 // \author Texas Instruments Incorporated.
00008 //
00009 // \verbatim
00010 // Modifications
00011 // PDA (Manchester) 21/03/2001: Tidied up the documentation
00012 // Peter Vanroose   27/05/2001: Corrected the documentation
00013 // Peter Vanroose   07/02/2002: brief doxygen comment placed on single line
00014 // Peter Vanroose   13/06/2002: bug fix: crash in match() when startp==endp==0
00015 // Ian Scott        08/06/2003: Add protect(char) function
00016 // \endverbatim
00017 //
00018 // Original Copyright notice:
00019 // Copyright (C) 1991 Texas Instruments Incorporated.
00020 //
00021 // Permission is granted to any individual or institution to use, copy, modify,
00022 // and distribute this software, provided that this complete copyright and
00023 // permission notice is maintained, intact, in all copies and supporting
00024 // documentation.
00025 //
00026 // Texas Instruments Incorporated provides this software "as is" without
00027 // express or implied warranty.
00028 
00029 #include <vcl_string.h>
00030 #include <vcl_cstddef.h> // for vcl_ptrdiff_t
00031 
//: Number of match slots kept by a vul_reg_exp.
//  This is the length of the startp[] and endp[] arrays; slot 0 describes
//  the whole match and the higher slots describe parenthesised submatches.
static const int vul_reg_exp_nsubexp = 10;
00033 
00034 //: Pattern matching with regular expressions.
00035 //  A regular expression allows a programmer to specify complex
00036 //  patterns that can be searched for and matched against the
00037 //  character string of a string object. In its simplest form, a
00038 //  regular expression is a sequence of characters used to search for
00039 //  exact character matches. However, many times the exact sequence to
00040 //  be found is not known, or only a match at the beginning or end of
00041 //  a string is desired. This regular expression class implements
00042 //  regular expression pattern matching as is found and implemented in
00043 //  many UNIX commands and utilities.
00044 //
00045 //  Example: The perl code
00046 // \code
00047 //     $filename =~ m"([a-z]+)\.cc";
00048 //     print $1;
00049 // \endcode
00050 //  is written as follows in C++
00051 // \code
00052 //     vul_reg_exp re("([a-z]+)\\.cc");
00053 //     re.find(filename);
00054 //     vcl_cout << re.match(1);
00055 // \endcode
00056 //
00057 //  The regular expression class provides a convenient mechanism for
00058 //  specifying and manipulating regular expressions. The regular
00059 //  expression object allows specification of such patterns by using
00060 //  the following regular expression metacharacters:
00061 //
00062 // -  ^        Matches at beginning of a line
00063 // -  $        Matches at end of a line
00064 // - .         Matches any single character
00065 // - [ ]       Matches any character(s) inside the brackets
00066 // - [^ ]      Matches any character(s) not inside the brackets
00067 // - [ - ]     Matches any character in range on either side of a dash
00068 // -  *        Matches preceding pattern zero or more times
00069 // -  +        Matches preceding pattern one or more times
00070 // -  ?        Matches preceding pattern at most once
00071 // - ()        Saves a matched expression and uses it in a later match
00072 //
00073 //  Note that more than one of these metacharacters can be used in a
00074 //  single regular expression in order to create complex search
//  patterns. For example, the pattern [^ab1-9] matches any single
//  character other than "a", "b", or one of the characters "1"
//  through "9".
00078 //
00079 class vul_reg_exp
00080 {
00081   //: anchor point of start position for n-th matching regular expression
00082   const char* startp[vul_reg_exp_nsubexp];
00083   //: anchor point of end position for n-th matching regular expression
00084   const char* endp[vul_reg_exp_nsubexp];
00085   //: Internal use only
00086   char  regstart;
00087   //: Internal use only
00088   char  reganch;
00089   //: Internal use only
00090   const char* regmust;
00091   //: Internal use only
00092   int   regmlen;
00093   char* program;
00094   int   progsize;
00095   const char* searchstring;
00096  public:
00097   //: Creates an empty regular expression.
00098   inline vul_reg_exp() : program(0) { clear_bufs(); }
00099   //: Creates a regular expression from string s, and compiles s.
00100   inline vul_reg_exp(char const* s) : program(0) { clear_bufs(); compile(s); }
00101   //: Copy constructor
00102   vul_reg_exp(vul_reg_exp const&);
00103   //: Frees space allocated for regular expression.
00104   inline ~vul_reg_exp() { delete[] this->program; }
00105   //: Compiles char* --> regexp
00106   void compile(char const*);
00107   //: true if regexp in char* arg
00108   bool find(char const*);
00109   //: true if regexp in char* arg
00110   bool find(vcl_string const&);
00111   //: Returns the start index of the last item found.
00112   inline vcl_ptrdiff_t start() const { return this->startp[0] - searchstring; }
00113   //: Returns the end index of the last item found.
00114   inline vcl_ptrdiff_t end()   const { return this->endp[0] - searchstring; }
00115   //: Equality operator
00116   bool operator==(vul_reg_exp const&) const;
00117   //: Inequality operator
00118   inline bool operator!=(vul_reg_exp const& r) const { return !operator==(r); }
00119   //: Same regexp and state?
00120   bool deep_equal(vul_reg_exp const&) const;
00121   //: Returns true if a valid RE is compiled and ready for pattern matching.
00122   inline bool is_valid() const { return this->program != 0; }
00123   //: Invalidates regular expression.
00124   inline void set_invalid() { delete[] this->program; this->program = 0; clear_bufs(); }
00125 
00126   //: Return start index of nth submatch.
00127   // start(0) is the start of the full match.
00128   inline vcl_ptrdiff_t start(long n) const { return this->startp[n] - searchstring; }
00129   //: Return end index of nth submatch.
00130   // end(0) is the end of the full match.
00131   inline vcl_ptrdiff_t end(long n)   const { return this->endp[n] - searchstring; }
00132   //: Return nth submatch as a string.
00133   vcl_string match(int n) const {
00134     return this->endp[n] == this->startp[n] ? vcl_string("") :
00135            vcl_string(this->startp[n], this->endp[n] - this->startp[n]);
00136   }
00137   //: Return an expression that will match precisely c
00138   // The returned string is owned by the function, and
00139   // will be overwritten in subsequent calls.
00140   static const char * protect(char c);
00141 
00142  private:
00143   //: private function to clear startp[] and endp[]
00144   void clear_bufs() { for (int n=0; n<vul_reg_exp_nsubexp; ++n) startp[n]=endp[n]=0; }
00145 };
00146 
00147 #endif // vul_reg_exph