ICU 54.1  54.1
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2014, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
57 
59 
// Every type named below is referenced in this header only through pointers,
// references, return types or friend declarations, so a forward declaration
// is sufficient here. RegexPattern and RegexMatcher are defined in full
// further down in this same header.
60 struct Regex8BitSet;
61 class RegexCImpl;
62 class RegexMatcher;
63 class RegexPattern;
64 struct REStackFrame;
65 class RuleBasedBreakIterator;
66 class UnicodeSet;
67 class UVector;
68 class UVector32;
69 class UVector64;
70 
71 
// Class RegexPattern represents a compiled regular expression.
//
// The class is declared U_FINAL and derives from UObject. Objects can be
// obtained through the public constructors or through the static compile()
// factories, which return a RegexPattern*. The private section stores the
// original pattern text, the compile flags and the compiled p-code;
// RegexCompile, RegexMatcher and RegexCImpl are friends and may touch that
// state directly.
//
// NOTE(review): the number at the start of each line appears to be the line
// number of the original header as rendered by the documentation listing;
// gaps presumably correspond to API doc comments the listing leaves out.
83 class U_I18N_API RegexPattern U_FINAL : public UObject {
84 public:
85 
    // Default constructor.
93  RegexPattern();
94 
    // Copy constructor.
101  RegexPattern(const RegexPattern &source);
102 
    // Virtual destructor.
108  virtual ~RegexPattern();
109 
    // Equality comparison against another RegexPattern.
118  UBool operator==(const RegexPattern& that) const;
119 
    // Inequality: defined inline as the logical negation of operator==.
128  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
129 
    // Copy assignment; returns a reference to a RegexPattern.
135  RegexPattern &operator =(const RegexPattern &source);
136 
    // Returns a RegexPattern*.
    // NOTE(review): presumably a newly allocated copy owned by the caller -
    // confirm against the implementation.
144  virtual RegexPattern *clone() const;
145 
146 
    // compile() overloads: static factories returning a RegexPattern*.
    // Three parameter shapes are offered, each for a UnicodeString pattern
    // and for a UText pattern:
    //   (regex, pe, status), (regex, flags, pe, status), (regex, flags, status)
    // 'pe' is a UParseError out-parameter and 'status' a UErrorCode in/out
    // parameter.
    // NOTE(review): ownership of the returned pointer is not visible here -
    // presumably the caller must delete it; confirm.
171  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
172  UParseError &pe,
173  UErrorCode &status);
174 
201  static RegexPattern * U_EXPORT2 compile( UText *regex,
202  UParseError &pe,
203  UErrorCode &status);
204 
229  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
230  uint32_t flags,
231  UParseError &pe,
232  UErrorCode &status);
233 
260  static RegexPattern * U_EXPORT2 compile( UText *regex,
261  uint32_t flags,
262  UParseError &pe,
263  UErrorCode &status);
264 
287  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
288  uint32_t flags,
289  UErrorCode &status);
290 
315  static RegexPattern * U_EXPORT2 compile( UText *regex,
316  uint32_t flags,
317  UErrorCode &status);
318 
    // Returns a uint32_t flags value.
    // NOTE(review): presumably the flags given at compile time (see fFlags) -
    // confirm.
324  virtual uint32_t flags() const;
325 
    // Returns a RegexMatcher* for the given UnicodeString input.
    // NOTE(review): ownership of the returned matcher is not visible here.
343  virtual RegexMatcher *matcher(const UnicodeString &input,
344  UErrorCode &status) const;
345 
346 private:
    // Overload taking a raw UChar*; declared private, so client code cannot
    // call it.
    // NOTE(review): presumably present so that a UChar* argument does not
    // silently convert to a temporary UnicodeString and bind to the public
    // overload above - confirm.
359  RegexMatcher *matcher(const UChar *input,
360  UErrorCode &status) const;
361 public:
362 
363 
    // Returns a RegexMatcher* without taking any input text argument.
375  virtual RegexMatcher *matcher(UErrorCode &status) const;
376 
377 
    // Static helpers that take both a pattern and an input (UnicodeString or
    // UText flavours) and return a UBool.
    // NOTE(review): presumably compile the pattern and test it against the
    // whole input - confirm.
392  static UBool U_EXPORT2 matches(const UnicodeString &regex,
393  const UnicodeString &input,
394  UParseError &pe,
395  UErrorCode &status);
396 
411  static UBool U_EXPORT2 matches(UText *regex,
412  UText *input,
413  UParseError &pe,
414  UErrorCode &status);
415 
    // Returns a UnicodeString by value.
    // NOTE(review): presumably the source pattern text - confirm.
424  virtual UnicodeString pattern() const;
425 
426 
    // UText flavour of pattern(); returns a UText*.
437  virtual UText *patternText(UErrorCode &status) const;
438 
439 
    // split(): takes an input plus a caller-supplied destination array
    // 'dest' with 'destCapacity' entries and returns an int32_t.
    // NOTE(review): the return value is presumably the number of fields
    // written - confirm.
478  virtual int32_t split(const UnicodeString &input,
479  UnicodeString dest[],
480  int32_t destCapacity,
481  UErrorCode &status) const;
482 
483 
522  virtual int32_t split(UText *input,
523  UText *dest[],
524  int32_t destCapacity,
525  UErrorCode &status) const;
526 
527 
    // ICU4C "poor man's RTTI": returns a UClassID for the actual ICU class.
533  virtual UClassID getDynamicClassID() const;
534 
    // Static counterpart of getDynamicClassID(); returns a UClassID.
540  static UClassID U_EXPORT2 getStaticClassID();
541 
542 private:
543  //
544  // Implementation Data
545  //
546  UText *fPattern; // The original pattern string.
547  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
548  uint32_t fFlags; // The flags used when compiling the pattern.
549  //
550  UVector64 *fCompiledPat; // The compiled pattern p-code.
551  UnicodeString fLiteralText; // Any literal string data from the pattern,
552  // after un-escaping, for use during the match.
553 
554  UVector *fSets; // Any UnicodeSets referenced from the pattern.
555  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
556 
557 
558  UErrorCode fDeferredStatus; // status if some prior error has left this
559  // RegexPattern in an unusable state.
560 
561  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
562  // >= this value. For some patterns, this calculated
563  // value may be less than the true shortest
564  // possible match.
565 
566  int32_t fFrameSize; // Size of a state stack frame in the
567  // execution engine.
568 
569  int32_t fDataSize; // The size of the data needed by the pattern that
570  // does not go on the state stack, but has just
571  // a single copy per matcher.
572 
573  UVector32 *fGroupMap; // Map from capture group number to position of
574  // the group's variables in the matcher stack frame.
575 
576  int32_t fMaxCaptureDigits;
577 
578  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
579  // regex character classes, e.g. Word.
580 
581  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
582  // sets for predefined regex classes.
583 
584  int32_t fStartType; // Info on how a match must start.
585  int32_t fInitialStringIdx; //
586  int32_t fInitialStringLen;
587  UnicodeSet *fInitialChars;
588  UChar32 fInitialChar;
589  Regex8BitSet *fInitialChars8;
590  UBool fNeedsAltInput;
591 
    // These classes are granted direct access to the private state above.
592  friend class RegexCompile;
593  friend class RegexMatcher;
594  friend class RegexCImpl;
595 
596  //
597  // Implementation Methods
598  //
599  void init(); // Common initialization, for use by constructors.
600  void zap(); // Common cleanup
601 
602  void dumpOp(int32_t index) const;
603 
604  public:
    // Internal API: the declaration below is compiled out when
    // U_HIDE_INTERNAL_API is defined.
605 #ifndef U_HIDE_INTERNAL_API
606 
610  void dumpPattern() const;
611 #endif /* U_HIDE_INTERNAL_API */
612 };
613 
614 
615 
// class RegexMatcher bundles together a regular expression pattern and the
// input text to which the expression is applied.
//
// The class is declared U_FINAL and derives from UObject. The default
// constructor, copy constructor and assignment operator are declared in the
// private section, so client code cannot default-construct, copy or assign a
// matcher. The result of the most recent match attempt is kept in private
// members (fMatch, fMatchStart, fMatchEnd, ...). RegexPattern and RegexCImpl
// are friends.
625 class U_I18N_API RegexMatcher U_FINAL : public UObject {
626 public:
627 
    // Public constructors. Each takes the pattern (UnicodeString or UText),
    // compile flags and a UErrorCode; the two-text variants additionally take
    // the input to match against.
642  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
643 
659  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
660 
682  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
683  uint32_t flags, UErrorCode &status);
684 
706  RegexMatcher(UText *regexp, UText *input,
707  uint32_t flags, UErrorCode &status);
708 
709 private:
    // Constructor taking the input as a raw UChar*; declared private, so
    // client code cannot call it.
    // NOTE(review): presumably present so that a UChar* argument does not
    // silently convert to a temporary UnicodeString and bind to a public
    // constructor - confirm.
722  RegexMatcher(const UnicodeString &regexp, const UChar *input,
723  uint32_t flags, UErrorCode &status);
724 public:
725 
726 
    // Virtual destructor.
732  virtual ~RegexMatcher();
733 
734 
    // Match operations. Each returns a UBool; the overloads with a
    // startIndex / start parameter take an explicit int64_t position.
    // NOTE(review): the exact difference between matches(), lookingAt() and
    // find() is not visible in this listing - see the API docs.
741  virtual UBool matches(UErrorCode &status);
742 
743 
754  virtual UBool matches(int64_t startIndex, UErrorCode &status);
755 
756 
770  virtual UBool lookingAt(UErrorCode &status);
771 
772 
786  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
787 
788 
    // Note: this overload of find() takes no UErrorCode.
801  virtual UBool find();
802 
803 
817  virtual UBool find(UErrorCode &status);
818 
828  virtual UBool find(int64_t start, UErrorCode &status);
829 
830 
    // Capture group accessors. The UnicodeString overloads return by value;
    // the UText overloads take a caller-supplied 'dest' and return a UText*.
840  virtual UnicodeString group(UErrorCode &status) const;
841 
842 
855  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
856 
857 
863  virtual int32_t groupCount() const;
864 
865 
880  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
881 
897  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
898 
914  virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
915 
916 
    // Position accessors. start()/end() return int32_t, start64()/end64()
    // return int64_t; the overloads with a 'group' parameter select a
    // capture group by number.
924  virtual int32_t start(UErrorCode &status) const;
925 
933  virtual int64_t start64(UErrorCode &status) const;
934 
935 
949  virtual int32_t start(int32_t group, UErrorCode &status) const;
950 
964  virtual int64_t start64(int32_t group, UErrorCode &status) const;
965 
966 
980  virtual int32_t end(UErrorCode &status) const;
981 
995  virtual int64_t end64(UErrorCode &status) const;
996 
997 
1015  virtual int32_t end(int32_t group, UErrorCode &status) const;
1016 
1034  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1035 
1036 
    // reset() overloads; each returns a RegexMatcher reference. Variants
    // exist with no argument, with an int64_t index, and with new input
    // (UnicodeString or UText).
1045  virtual RegexMatcher &reset();
1046 
1047 
1063  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1064 
1065 
1083  virtual RegexMatcher &reset(const UnicodeString &input);
1084 
1085 
1099  virtual RegexMatcher &reset(UText *input);
1100 
1101 
1126  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1127 
1128 private:
    // reset() taking a raw UChar*; declared private, so client code cannot
    // call it.
    // NOTE(review): presumably the same implicit-conversion guard as the
    // private UChar* constructor - confirm.
1141  RegexMatcher &reset(const UChar *input);
1142 public:
1143 
    // Input accessors: by const UnicodeString reference, as a UText*, or
    // via a caller-supplied 'dest' UText.
1151  virtual const UnicodeString &input() const;
1152 
1161  virtual UText *inputText() const;
1162 
1173  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1174 
1175 
    // Region and bounds control. The setters return a RegexMatcher
    // reference; regionStart()/regionEnd() return int32_t and the *64
    // variants return int64_t. See the comments on fRegionStart,
    // fAnchorStart and fLookStart below for what the bounds govern.
1194  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1195 
1207  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1208 
1217  virtual int32_t regionStart() const;
1218 
1227  virtual int64_t regionStart64() const;
1228 
1229 
1238  virtual int32_t regionEnd() const;
1239 
1248  virtual int64_t regionEnd64() const;
1249 
1258  virtual UBool hasTransparentBounds() const;
1259 
1278  virtual RegexMatcher &useTransparentBounds(UBool b);
1279 
1280 
1288  virtual UBool hasAnchoringBounds() const;
1289 
1290 
1303  virtual RegexMatcher &useAnchoringBounds(UBool b);
1304 
1305 
    // NOTE(review): presumably report fHitEnd / fRequireEnd (see the member
    // comments below) - confirm.
1318  virtual UBool hitEnd() const;
1319 
1329  virtual UBool requireEnd() const;
1330 
1331 
    // Returns a const reference to a RegexPattern.
1337  virtual const RegexPattern &pattern() const;
1338 
1339 
    // Replacement operations, each in a UnicodeString flavour and a UText
    // flavour. The UText flavours take a caller-supplied 'dest'.
1356  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1357 
1358 
1379  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1380 
1381 
1402  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1403 
1404 
1429  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1430 
1431 
    // Incremental replacement; see the comment on fAppendPosition below.
1459  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1460  const UnicodeString &replacement, UErrorCode &status);
1461 
1462 
1490  virtual RegexMatcher &appendReplacement(UText *dest,
1491  UText *replacement, UErrorCode &status);
1492 
1493 
1504  virtual UnicodeString &appendTail(UnicodeString &dest);
1505 
1506 
1520  virtual UText *appendTail(UText *dest, UErrorCode &status);
1521 
1522 
    // split(): takes an input plus a caller-supplied destination array
    // 'dest' with 'destCapacity' entries and returns an int32_t. Unlike the
    // RegexPattern::split declarations, these are not const.
1546  virtual int32_t split(const UnicodeString &input,
1547  UnicodeString dest[],
1548  int32_t destCapacity,
1549  UErrorCode &status);
1550 
1551 
1575  virtual int32_t split(UText *input,
1576  UText *dest[],
1577  int32_t destCapacity,
1578  UErrorCode &status);
1579 
    // Resource limits; see the comments on fTimeLimit and fStackLimit below
    // for units and the meaning of zero.
1601  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1602 
1609  virtual int32_t getTimeLimit() const;
1610 
1632  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1633 
1641  virtual int32_t getStackLimit() const;
1642 
1643 
    // Callback registration. The setters take a function pointer plus an
    // opaque context pointer; the getters hand both back through reference
    // parameters.
1657  virtual void setMatchCallback(URegexMatchCallback *callback,
1658  const void *context,
1659  UErrorCode &status);
1660 
1661 
1672  virtual void getMatchCallback(URegexMatchCallback *&callback,
1673  const void *&context,
1674  UErrorCode &status);
1675 
1676 
1690  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1691  const void *context,
1692  UErrorCode &status);
1693 
1694 
1705  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1706  const void *&context,
1707  UErrorCode &status);
1708 
    // Internal API: compiled out when U_HIDE_INTERNAL_API is defined.
1709 #ifndef U_HIDE_INTERNAL_API
1710 
1715  void setTrace(UBool state);
1716 #endif /* U_HIDE_INTERNAL_API */
1717 
    // Static class ID; returns a UClassID.
1723  static UClassID U_EXPORT2 getStaticClassID();
1724 
    // ICU4C "poor man's RTTI": returns a UClassID for the actual ICU class.
1730  virtual UClassID getDynamicClassID() const;
1731 
1732 private:
1733  // Constructors and other object boilerplate are private.
1734  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1735  RegexMatcher(); // default constructor not implemented
1736  RegexMatcher(const RegexPattern *pat);
1737  RegexMatcher(const RegexMatcher &other);
1738  RegexMatcher &operator =(const RegexMatcher &rhs);
1739  void init(UErrorCode &status); // Common initialization
1740  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1741 
1742  friend class RegexPattern;
1743  friend class RegexCImpl;
1744 public:
    // Internal API: compiled out when U_HIDE_INTERNAL_API is defined.
1745 #ifndef U_HIDE_INTERNAL_API
1746 
1747  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1748 #endif /* U_HIDE_INTERNAL_API */
1749 private:
1750 
1751  //
1752  // MatchAt This is the internal interface to the match engine itself.
1753  // Match status comes back in matcher member variables.
1754  //
1755  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1756  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1757  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1758  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1759  REStackFrame *resetStack();
1760  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1761  void IncrementTime(UErrorCode &status);
1762 
1763  // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1764  inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1765 
1766  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1767 
    // "Chunk" counterparts of find / MatchAt / isWordBoundary; note that
    // they take int32_t positions where the non-chunk versions take int64_t.
1768  UBool findUsingChunk(UErrorCode &status);
1769  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1770  UBool isChunkWordBoundary(int32_t pos);
1771 
1772  const RegexPattern *fPattern;
1773  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1774  // should delete it when through.
1775 
1776  const UnicodeString *fInput; // The string being matched. Only used for input()
1777  UText *fInputText; // The text being matched. Is never NULL.
1778  UText *fAltInputText; // A shallow copy of the text being matched.
1779  // Only created if the pattern contains backreferences.
1780  int64_t fInputLength; // Full length of the input text.
1781  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1782 
1783  int64_t fRegionStart; // Start of the input region, default = 0.
1784  int64_t fRegionLimit; // End of input region, default to input.length.
1785 
1786  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1787  int64_t fAnchorLimit; // See useAnchoringBounds
1788 
1789  int64_t fLookStart; // Region bounds for look-ahead/behind and
1790  int64_t fLookLimit; // other boundary tests. See
1791  // useTransparentBounds
1792 
1793  int64_t fActiveStart; // Currently active bounds for matching.
1794  int64_t fActiveLimit; // Usually is the same as region, but
1795  // is changed to fLookStart/Limit when
1796  // entering look around regions.
1797 
1798  UBool fTransparentBounds; // True if using transparent bounds.
1799  UBool fAnchoringBounds; // True if using anchoring bounds.
1800 
1801  UBool fMatch; // True if the last attempted match was successful.
1802  int64_t fMatchStart; // Position of the start of the most recent match
1803  int64_t fMatchEnd; // First position after the end of the most recent match
1804  // Zero if no previous match, even when a region
1805  // is active.
1806  int64_t fLastMatchEnd; // First position after the end of the previous match,
1807  // or -1 if there was no previous match.
1808  int64_t fAppendPosition; // First position after the end of the previous
1809  // appendReplacement(). As described by the
1810  // JavaDoc for Java Matcher, where it is called
1811  // "append position"
1812  UBool fHitEnd; // True if the last match touched the end of input.
1813  UBool fRequireEnd; // True if the last match required end-of-input
1814  // (matched $ or Z)
1815 
1816  UVector64 *fStack;
1817  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1818  // which will contain the capture group results.
1819  // NOT valid while match engine is running.
1820 
1821  int64_t *fData; // Data area for use by the compiled pattern.
1822  int64_t fSmallData[8]; // Use this for data if it's enough.
1823 
1824  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1825  // match engine run. Zero for unlimited.
1826 
1827  int32_t fTime; // Match time, accumulates while matching.
1828  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1829  // Kept separately from fTime to keep as much
1830  // code as possible out of the inline
1831  // StateSave function.
1832 
1833  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1834  // stack, in bytes. Zero for unlimited.
1835 
1836  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1837  // NULL if there is no callback.
1838  const void *fCallbackContext; // User Context ptr for callback function.
1839 
1840  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
1841  // NULL if there is no callback.
1842  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1843 
1844 
1845  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1846 
1847  UBool fTraceDebug; // Set true for debug tracing of match engine.
1848 
1849  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1850  // reported, or that permanently disables this matcher.
1851 
1852  RuleBasedBreakIterator *fWordBreakItr;
1853 };
1854 
1856 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1857 #endif
virtual UClassID getDynamicClassID() const
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:91
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:83
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1550
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expressi...
Definition: regex.h:625
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:358
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:129
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:332
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:276
C++ API: Common ICU base class UObject.
uint16_t UChar
Define UChar to be UCHAR_TYPE, if that is #defined (for example, to char16_t), or wchar_t if that is ...
Definition: umachine.h:312
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:130
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1476
C API: Parse Error Information.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:476
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:128
UText struct.
Definition: utext.h:1343
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:65
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:56
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:245
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:221
int8_t UBool
The ICU boolean type.
Definition: umachine.h:234