// ICU 49.1.1
00001 /* 00002 ********************************************************************** 00003 * Copyright (C) 2002-2012, International Business Machines 00004 * Corporation and others. All Rights Reserved. 00005 ********************************************************************** 00006 * file name: regex.h 00007 * encoding: US-ASCII 00008 * indentation:4 00009 * 00010 * created on: 2002oct22 00011 * created by: Andy Heninger 00012 * 00013 * ICU Regular Expressions, API for C++ 00014 */ 00015 00016 #ifndef REGEX_H 00017 #define REGEX_H 00018 00019 //#define REGEX_DEBUG 00020 00045 #include "unicode/utypes.h" 00046 00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS 00048 00049 #include "unicode/uobject.h" 00050 #include "unicode/unistr.h" 00051 #include "unicode/utext.h" 00052 #include "unicode/parseerr.h" 00053 00054 #include "unicode/uregex.h" 00055 00056 // Forward Declarations 00057 00058 U_NAMESPACE_BEGIN 00059 00060 struct Regex8BitSet; 00061 class RegexCImpl; 00062 class RegexMatcher; 00063 class RegexPattern; 00064 struct REStackFrame; 00065 class RuleBasedBreakIterator; 00066 class UnicodeSet; 00067 class UVector; 00068 class UVector32; 00069 class UVector64; 00070 00075 #ifdef REGEX_DEBUG 00076 U_INTERNAL void U_EXPORT2 00077 RegexPatternDump(const RegexPattern *pat); 00078 #else 00079 #undef RegexPatternDump 00080 #define RegexPatternDump(pat) 00081 #endif 00082 00083 00084 00096 class U_I18N_API RegexPattern: public UObject { 00097 public: 00098 00106 RegexPattern(); 00107 00114 RegexPattern(const RegexPattern &source); 00115 00121 virtual ~RegexPattern(); 00122 00131 UBool operator==(const RegexPattern& that) const; 00132 00141 inline UBool operator!=(const RegexPattern& that) const {return ! 
operator ==(that);} 00142 00148 RegexPattern &operator =(const RegexPattern &source); 00149 00157 virtual RegexPattern *clone() const; 00158 00159 00184 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00185 UParseError &pe, 00186 UErrorCode &status); 00187 00214 static RegexPattern * U_EXPORT2 compile( UText *regex, 00215 UParseError &pe, 00216 UErrorCode &status); 00217 00242 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00243 uint32_t flags, 00244 UParseError &pe, 00245 UErrorCode &status); 00246 00273 static RegexPattern * U_EXPORT2 compile( UText *regex, 00274 uint32_t flags, 00275 UParseError &pe, 00276 UErrorCode &status); 00277 00300 static RegexPattern * U_EXPORT2 compile( const UnicodeString ®ex, 00301 uint32_t flags, 00302 UErrorCode &status); 00303 00328 static RegexPattern * U_EXPORT2 compile( UText *regex, 00329 uint32_t flags, 00330 UErrorCode &status); 00331 00337 virtual uint32_t flags() const; 00338 00356 virtual RegexMatcher *matcher(const UnicodeString &input, 00357 UErrorCode &status) const; 00358 00359 private: 00373 RegexMatcher *matcher(const UChar *input, 00374 UErrorCode &status) const; 00375 public: 00376 00377 00389 virtual RegexMatcher *matcher(UErrorCode &status) const; 00390 00391 00406 static UBool U_EXPORT2 matches(const UnicodeString ®ex, 00407 const UnicodeString &input, 00408 UParseError &pe, 00409 UErrorCode &status); 00410 00425 static UBool U_EXPORT2 matches(UText *regex, 00426 UText *input, 00427 UParseError &pe, 00428 UErrorCode &status); 00429 00438 virtual UnicodeString pattern() const; 00439 00440 00451 virtual UText *patternText(UErrorCode &status) const; 00452 00453 00492 virtual int32_t split(const UnicodeString &input, 00493 UnicodeString dest[], 00494 int32_t destCapacity, 00495 UErrorCode &status) const; 00496 00497 00536 virtual int32_t split(UText *input, 00537 UText *dest[], 00538 int32_t destCapacity, 00539 UErrorCode &status) const; 00540 00541 00547 virtual UClassID 
getDynamicClassID() const; 00548 00554 static UClassID U_EXPORT2 getStaticClassID(); 00555 00556 private: 00557 // 00558 // Implementation Data 00559 // 00560 UText *fPattern; // The original pattern string. 00561 UnicodeString *fPatternString; // The original pattern UncodeString if relevant 00562 uint32_t fFlags; // The flags used when compiling the pattern. 00563 // 00564 UVector64 *fCompiledPat; // The compiled pattern p-code. 00565 UnicodeString fLiteralText; // Any literal string data from the pattern, 00566 // after un-escaping, for use during the match. 00567 00568 UVector *fSets; // Any UnicodeSets referenced from the pattern. 00569 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.) 00570 00571 00572 UErrorCode fDeferredStatus; // status if some prior error has left this 00573 // RegexPattern in an unusable state. 00574 00575 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length 00576 // >= this value. For some patterns, this calculated 00577 // value may be less than the true shortest 00578 // possible match. 00579 00580 int32_t fFrameSize; // Size of a state stack frame in the 00581 // execution engine. 00582 00583 int32_t fDataSize; // The size of the data needed by the pattern that 00584 // does not go on the state stack, but has just 00585 // a single copy per matcher. 00586 00587 UVector32 *fGroupMap; // Map from capture group number to position of 00588 // the group's variables in the matcher stack frame. 00589 00590 int32_t fMaxCaptureDigits; 00591 00592 UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined 00593 // regex character classes, e.g. Word. 00594 00595 Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only 00596 // sets for predefined regex classes. 00597 00598 int32_t fStartType; // Info on how a match must start. 
00599 int32_t fInitialStringIdx; // 00600 int32_t fInitialStringLen; 00601 UnicodeSet *fInitialChars; 00602 UChar32 fInitialChar; 00603 Regex8BitSet *fInitialChars8; 00604 UBool fNeedsAltInput; 00605 00606 friend class RegexCompile; 00607 friend class RegexMatcher; 00608 friend class RegexCImpl; 00609 00610 // 00611 // Implementation Methods 00612 // 00613 void init(); // Common initialization, for use by constructors. 00614 void zap(); // Common cleanup 00615 #ifdef REGEX_DEBUG 00616 void dumpOp(int32_t index) const; 00617 friend void U_EXPORT2 RegexPatternDump(const RegexPattern *); 00618 #endif 00619 00620 }; 00621 00622 00623 00633 class U_I18N_API RegexMatcher: public UObject { 00634 public: 00635 00650 RegexMatcher(const UnicodeString ®exp, uint32_t flags, UErrorCode &status); 00651 00667 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status); 00668 00690 RegexMatcher(const UnicodeString ®exp, const UnicodeString &input, 00691 uint32_t flags, UErrorCode &status); 00692 00714 RegexMatcher(UText *regexp, UText *input, 00715 uint32_t flags, UErrorCode &status); 00716 00717 private: 00731 RegexMatcher(const UnicodeString ®exp, const UChar *input, 00732 uint32_t flags, UErrorCode &status); 00733 public: 00734 00735 00741 virtual ~RegexMatcher(); 00742 00743 00750 virtual UBool matches(UErrorCode &status); 00751 00752 00763 virtual UBool matches(int64_t startIndex, UErrorCode &status); 00764 00765 00779 virtual UBool lookingAt(UErrorCode &status); 00780 00781 00795 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status); 00796 00797 00810 virtual UBool find(); 00811 00812 00822 virtual UBool find(int64_t start, UErrorCode &status); 00823 00824 00834 virtual UnicodeString group(UErrorCode &status) const; 00835 00836 00849 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const; 00850 00851 00857 virtual int32_t groupCount() const; 00858 00859 00874 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 00875 
00891 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const; 00892 00908 virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const; 00909 00910 00918 virtual int32_t start(UErrorCode &status) const; 00919 00927 virtual int64_t start64(UErrorCode &status) const; 00928 00929 00943 virtual int32_t start(int32_t group, UErrorCode &status) const; 00944 00958 virtual int64_t start64(int32_t group, UErrorCode &status) const; 00959 00960 00974 virtual int32_t end(UErrorCode &status) const; 00975 00989 virtual int64_t end64(UErrorCode &status) const; 00990 00991 01009 virtual int32_t end(int32_t group, UErrorCode &status) const; 01010 01028 virtual int64_t end64(int32_t group, UErrorCode &status) const; 01029 01030 01039 virtual RegexMatcher &reset(); 01040 01041 01057 virtual RegexMatcher &reset(int64_t index, UErrorCode &status); 01058 01059 01077 virtual RegexMatcher &reset(const UnicodeString &input); 01078 01079 01093 virtual RegexMatcher &reset(UText *input); 01094 01095 01120 virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status); 01121 01122 private: 01136 RegexMatcher &reset(const UChar *input); 01137 public: 01138 01146 virtual const UnicodeString &input() const; 01147 01156 virtual UText *inputText() const; 01157 01168 virtual UText *getInput(UText *dest, UErrorCode &status) const; 01169 01170 01189 virtual RegexMatcher ®ion(int64_t start, int64_t limit, UErrorCode &status); 01190 01202 virtual RegexMatcher ®ion(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status); 01203 01212 virtual int32_t regionStart() const; 01213 01222 virtual int64_t regionStart64() const; 01223 01224 01233 virtual int32_t regionEnd() const; 01234 01243 virtual int64_t regionEnd64() const; 01244 01253 virtual UBool hasTransparentBounds() const; 01254 01273 virtual RegexMatcher &useTransparentBounds(UBool b); 01274 01275 01283 virtual UBool hasAnchoringBounds() const; 01284 01285 
01298 virtual RegexMatcher &useAnchoringBounds(UBool b); 01299 01300 01313 virtual UBool hitEnd() const; 01314 01324 virtual UBool requireEnd() const; 01325 01326 01332 virtual const RegexPattern &pattern() const; 01333 01334 01351 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status); 01352 01353 01374 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status); 01375 01376 01397 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status); 01398 01399 01424 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status); 01425 01426 01454 virtual RegexMatcher &appendReplacement(UnicodeString &dest, 01455 const UnicodeString &replacement, UErrorCode &status); 01456 01457 01485 virtual RegexMatcher &appendReplacement(UText *dest, 01486 UText *replacement, UErrorCode &status); 01487 01488 01499 virtual UnicodeString &appendTail(UnicodeString &dest); 01500 01501 01515 virtual UText *appendTail(UText *dest, UErrorCode &status); 01516 01517 01541 virtual int32_t split(const UnicodeString &input, 01542 UnicodeString dest[], 01543 int32_t destCapacity, 01544 UErrorCode &status); 01545 01546 01570 virtual int32_t split(UText *input, 01571 UText *dest[], 01572 int32_t destCapacity, 01573 UErrorCode &status); 01574 01596 virtual void setTimeLimit(int32_t limit, UErrorCode &status); 01597 01604 virtual int32_t getTimeLimit() const; 01605 01627 virtual void setStackLimit(int32_t limit, UErrorCode &status); 01628 01636 virtual int32_t getStackLimit() const; 01637 01638 01652 virtual void setMatchCallback(URegexMatchCallback *callback, 01653 const void *context, 01654 UErrorCode &status); 01655 01656 01667 virtual void getMatchCallback(URegexMatchCallback *&callback, 01668 const void *&context, 01669 UErrorCode &status); 01670 01671 01685 virtual void setFindProgressCallback(URegexFindProgressCallback *callback, 01686 const void *context, 01687 UErrorCode &status); 01688 01689 01700 
virtual void getFindProgressCallback(URegexFindProgressCallback *&callback, 01701 const void *&context, 01702 UErrorCode &status); 01703 01704 #ifndef U_HIDE_INTERNAL_API 01705 01710 void setTrace(UBool state); 01711 #endif /* U_HIDE_INTERNAL_API */ 01712 01718 static UClassID U_EXPORT2 getStaticClassID(); 01719 01725 virtual UClassID getDynamicClassID() const; 01726 01727 private: 01728 // Constructors and other object boilerplate are private. 01729 // Instances of RegexMatcher can not be assigned, copied, cloned, etc. 01730 RegexMatcher(); // default constructor not implemented 01731 RegexMatcher(const RegexPattern *pat); 01732 RegexMatcher(const RegexMatcher &other); 01733 RegexMatcher &operator =(const RegexMatcher &rhs); 01734 void init(UErrorCode &status); // Common initialization 01735 void init2(UText *t, UErrorCode &e); // Common initialization, part 2. 01736 01737 friend class RegexPattern; 01738 friend class RegexCImpl; 01739 public: 01740 #ifndef U_HIDE_INTERNAL_API 01741 01742 void resetPreserveRegion(); // Reset matcher state, but preserve any region. 01743 #endif /* U_HIDE_INTERNAL_API */ 01744 private: 01745 01746 // 01747 // MatchAt This is the internal interface to the match engine itself. 01748 // Match status comes back in matcher member variables. 
01749 // 01750 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status); 01751 inline void backTrack(int64_t &inputIdx, int32_t &patIdx); 01752 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test 01753 UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test 01754 REStackFrame *resetStack(); 01755 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status); 01756 void IncrementTime(UErrorCode &status); 01757 UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status); 01758 01759 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const; 01760 01761 UBool findUsingChunk(); 01762 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status); 01763 UBool isChunkWordBoundary(int32_t pos); 01764 01765 const RegexPattern *fPattern; 01766 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and 01767 // should delete it when through. 01768 01769 const UnicodeString *fInput; // The string being matched. Only used for input() 01770 UText *fInputText; // The text being matched. Is never NULL. 01771 UText *fAltInputText; // A shallow copy of the text being matched. 01772 // Only created if the pattern contains backreferences. 01773 int64_t fInputLength; // Full length of the input text. 01774 int32_t fFrameSize; // The size of a frame in the backtrack stack. 01775 01776 int64_t fRegionStart; // Start of the input region, default = 0. 01777 int64_t fRegionLimit; // End of input region, default to input.length. 01778 01779 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $). 01780 int64_t fAnchorLimit; // See useAnchoringBounds 01781 01782 int64_t fLookStart; // Region bounds for look-ahead/behind and 01783 int64_t fLookLimit; // and other boundary tests. See 01784 // useTransparentBounds 01785 01786 int64_t fActiveStart; // Currently active bounds for matching. 
01787 int64_t fActiveLimit; // Usually is the same as region, but 01788 // is changed to fLookStart/Limit when 01789 // entering look around regions. 01790 01791 UBool fTransparentBounds; // True if using transparent bounds. 01792 UBool fAnchoringBounds; // True if using anchoring bounds. 01793 01794 UBool fMatch; // True if the last attempted match was successful. 01795 int64_t fMatchStart; // Position of the start of the most recent match 01796 int64_t fMatchEnd; // First position after the end of the most recent match 01797 // Zero if no previous match, even when a region 01798 // is active. 01799 int64_t fLastMatchEnd; // First position after the end of the previous match, 01800 // or -1 if there was no previous match. 01801 int64_t fAppendPosition; // First position after the end of the previous 01802 // appendReplacement(). As described by the 01803 // JavaDoc for Java Matcher, where it is called 01804 // "append position" 01805 UBool fHitEnd; // True if the last match touched the end of input. 01806 UBool fRequireEnd; // True if the last match required end-of-input 01807 // (matched $ or Z) 01808 01809 UVector64 *fStack; 01810 REStackFrame *fFrame; // After finding a match, the last active stack frame, 01811 // which will contain the capture group results. 01812 // NOT valid while match engine is running. 01813 01814 int64_t *fData; // Data area for use by the compiled pattern. 01815 int64_t fSmallData[8]; // Use this for data if it's enough. 01816 01817 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the 01818 // match engine run. Zero for unlimited. 01819 01820 int32_t fTime; // Match time, accumulates while matching. 01821 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves. 01822 // Kept separately from fTime to keep as much 01823 // code as possible out of the inline 01824 // StateSave function. 01825 01826 int32_t fStackLimit; // Maximum memory size to use for the backtrack 01827 // stack, in bytes. 
Zero for unlimited. 01828 01829 URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct. 01830 // NULL if there is no callback. 01831 const void *fCallbackContext; // User Context ptr for callback function. 01832 01833 URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to match progress callback funct. 01834 // NULL if there is no callback. 01835 const void *fFindProgressCallbackContext; // User Context ptr for callback function. 01836 01837 01838 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility. 01839 01840 UBool fTraceDebug; // Set true for debug tracing of match engine. 01841 01842 UErrorCode fDeferredStatus; // Save error state that cannot be immediately 01843 // reported, or that permanently disables this matcher. 01844 01845 RuleBasedBreakIterator *fWordBreakItr; 01846 }; 01847 01848 U_NAMESPACE_END 01849 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS 01850 #endif