// ICU 49.1.1
00001 /* 00002 *************************************************************************** 00003 * Copyright (C) 1999-2011, International Business Machines Corporation 00004 * and others. All Rights Reserved. 00005 *************************************************************************** 00006 * Date Name Description 00007 * 10/20/99 alan Creation. 00008 *************************************************************************** 00009 */ 00010 00011 #ifndef UNICODESET_H 00012 #define UNICODESET_H 00013 00014 #include "unicode/unifilt.h" 00015 #include "unicode/unistr.h" 00016 #include "unicode/uset.h" 00017 00023 U_NAMESPACE_BEGIN 00024 00025 class BMPSet; 00026 class ParsePosition; 00027 class RBBIRuleScanner; 00028 class SymbolTable; 00029 class UnicodeSetStringSpan; 00030 class UVector; 00031 class RuleCharacterIterator; 00032 00273 class U_COMMON_API UnicodeSet : public UnicodeFilter { 00274 00275 int32_t len; // length of list used; 0 <= len <= capacity 00276 int32_t capacity; // capacity of list 00277 UChar32* list; // MUST be terminated with HIGH 00278 BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL. 00279 UChar32* buffer; // internal buffer, may be NULL 00280 int32_t bufferCapacity; // capacity of buffer 00281 int32_t patLen; 00282 00292 UChar *pat; 00293 UVector* strings; // maintained in sorted order 00294 UnicodeSetStringSpan *stringSpan; 00295 00296 private: 00297 enum { // constants 00298 kIsBogus = 1 // This set is bogus (i.e. 
not valid) 00299 }; 00300 uint8_t fFlags; // Bit flag (see constants above) 00301 public: 00311 inline UBool isBogus(void) const; 00312 00329 void setToBogus(); 00330 00331 public: 00332 00333 enum { 00338 MIN_VALUE = 0, 00339 00344 MAX_VALUE = 0x10ffff 00345 }; 00346 00347 //---------------------------------------------------------------- 00348 // Constructors &c 00349 //---------------------------------------------------------------- 00350 00351 public: 00352 00357 UnicodeSet(); 00358 00367 UnicodeSet(UChar32 start, UChar32 end); 00368 00377 UnicodeSet(const UnicodeString& pattern, 00378 UErrorCode& status); 00379 00380 #ifndef U_HIDE_INTERNAL_API 00381 00393 UnicodeSet(const UnicodeString& pattern, 00394 uint32_t options, 00395 const SymbolTable* symbols, 00396 UErrorCode& status); 00397 #endif /* U_HIDE_INTERNAL_API */ 00398 00412 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos, 00413 uint32_t options, 00414 const SymbolTable* symbols, 00415 UErrorCode& status); 00416 00421 UnicodeSet(const UnicodeSet& o); 00422 00427 virtual ~UnicodeSet(); 00428 00434 UnicodeSet& operator=(const UnicodeSet& o); 00435 00447 virtual UBool operator==(const UnicodeSet& o) const; 00448 00454 UBool operator!=(const UnicodeSet& o) const; 00455 00465 virtual UnicodeFunctor* clone() const; 00466 00474 virtual int32_t hashCode(void) const; 00475 00484 inline static UnicodeSet *fromUSet(USet *uset); 00485 00494 inline static const UnicodeSet *fromUSet(const USet *uset); 00495 00503 inline USet *toUSet(); 00504 00505 00513 inline const USet * toUSet() const; 00514 00515 00516 //---------------------------------------------------------------- 00517 // Freezable API 00518 //---------------------------------------------------------------- 00519 00528 inline UBool isFrozen() const; 00529 00543 UnicodeFunctor *freeze(); 00544 00553 UnicodeFunctor *cloneAsThawed() const; 00554 00555 //---------------------------------------------------------------- 00556 // Public API 00557 
//---------------------------------------------------------------- 00558 00569 UnicodeSet& set(UChar32 start, UChar32 end); 00570 00576 static UBool resemblesPattern(const UnicodeString& pattern, 00577 int32_t pos); 00578 00591 UnicodeSet& applyPattern(const UnicodeString& pattern, 00592 UErrorCode& status); 00593 00594 #ifndef U_HIDE_INTERNAL_API 00595 00611 UnicodeSet& applyPattern(const UnicodeString& pattern, 00612 uint32_t options, 00613 const SymbolTable* symbols, 00614 UErrorCode& status); 00615 #endif /* U_HIDE_INTERNAL_API */ 00616 00648 UnicodeSet& applyPattern(const UnicodeString& pattern, 00649 ParsePosition& pos, 00650 uint32_t options, 00651 const SymbolTable* symbols, 00652 UErrorCode& status); 00653 00667 virtual UnicodeString& toPattern(UnicodeString& result, 00668 UBool escapeUnprintable = FALSE) const; 00669 00692 UnicodeSet& applyIntPropertyValue(UProperty prop, 00693 int32_t value, 00694 UErrorCode& ec); 00695 00725 UnicodeSet& applyPropertyAlias(const UnicodeString& prop, 00726 const UnicodeString& value, 00727 UErrorCode& ec); 00728 00737 virtual int32_t size(void) const; 00738 00745 virtual UBool isEmpty(void) const; 00746 00754 virtual UBool contains(UChar32 c) const; 00755 00764 virtual UBool contains(UChar32 start, UChar32 end) const; 00765 00773 UBool contains(const UnicodeString& s) const; 00774 00782 virtual UBool containsAll(const UnicodeSet& c) const; 00783 00791 UBool containsAll(const UnicodeString& s) const; 00792 00801 UBool containsNone(UChar32 start, UChar32 end) const; 00802 00810 UBool containsNone(const UnicodeSet& c) const; 00811 00819 UBool containsNone(const UnicodeString& s) const; 00820 00829 inline UBool containsSome(UChar32 start, UChar32 end) const; 00830 00838 inline UBool containsSome(const UnicodeSet& s) const; 00839 00847 inline UBool containsSome(const UnicodeString& s) const; 00848 00867 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; 00868 00881 inline int32_t span(const 
UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const; 00882 00900 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const; 00901 00915 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const; 00916 00935 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 00936 00954 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const; 00955 00960 virtual UMatchDegree matches(const Replaceable& text, 00961 int32_t& offset, 00962 int32_t limit, 00963 UBool incremental); 00964 00965 private: 00988 static int32_t matchRest(const Replaceable& text, 00989 int32_t start, int32_t limit, 00990 const UnicodeString& s); 00991 01001 int32_t findCodePoint(UChar32 c) const; 01002 01003 public: 01004 01012 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const; 01013 01022 int32_t indexOf(UChar32 c) const; 01023 01033 UChar32 charAt(int32_t index) const; 01034 01049 virtual UnicodeSet& add(UChar32 start, UChar32 end); 01050 01058 UnicodeSet& add(UChar32 c); 01059 01071 UnicodeSet& add(const UnicodeString& s); 01072 01073 private: 01079 static int32_t getSingleCP(const UnicodeString& s); 01080 01081 void _add(const UnicodeString& s); 01082 01083 public: 01092 UnicodeSet& addAll(const UnicodeString& s); 01093 01102 UnicodeSet& retainAll(const UnicodeString& s); 01103 01112 UnicodeSet& complementAll(const UnicodeString& s); 01113 01122 UnicodeSet& removeAll(const UnicodeString& s); 01123 01132 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s); 01133 01134 01142 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s); 01143 01157 virtual UnicodeSet& retain(UChar32 start, UChar32 end); 01158 01159 01165 UnicodeSet& retain(UChar32 c); 01166 01180 virtual UnicodeSet& remove(UChar32 start, UChar32 end); 01181 01189 UnicodeSet& remove(UChar32 c); 01190 01200 UnicodeSet& remove(const UnicodeString& s); 01201 
01209 virtual UnicodeSet& complement(void); 01210 01225 virtual UnicodeSet& complement(UChar32 start, UChar32 end); 01226 01234 UnicodeSet& complement(UChar32 c); 01235 01246 UnicodeSet& complement(const UnicodeString& s); 01247 01260 virtual UnicodeSet& addAll(const UnicodeSet& c); 01261 01273 virtual UnicodeSet& retainAll(const UnicodeSet& c); 01274 01286 virtual UnicodeSet& removeAll(const UnicodeSet& c); 01287 01298 virtual UnicodeSet& complementAll(const UnicodeSet& c); 01299 01306 virtual UnicodeSet& clear(void); 01307 01333 UnicodeSet& closeOver(int32_t attribute); 01334 01341 virtual UnicodeSet &removeAllStrings(); 01342 01350 virtual int32_t getRangeCount(void) const; 01351 01359 virtual UChar32 getRangeStart(int32_t index) const; 01360 01368 virtual UChar32 getRangeEnd(int32_t index) const; 01369 01418 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const; 01419 01426 virtual UnicodeSet& compact(); 01427 01439 static UClassID U_EXPORT2 getStaticClassID(void); 01440 01449 virtual UClassID getDynamicClassID(void) const; 01450 01451 private: 01452 01453 // Private API for the USet API 01454 01455 friend class USetAccess; 01456 01457 int32_t getStringCount() const; 01458 01459 const UnicodeString* getString(int32_t index) const; 01460 01461 //---------------------------------------------------------------- 01462 // RuleBasedTransliterator support 01463 //---------------------------------------------------------------- 01464 01465 private: 01466 01472 virtual UBool matchesIndexValue(uint8_t v) const; 01473 01474 private: 01475 friend class RBBIRuleScanner; 01476 01477 //---------------------------------------------------------------- 01478 // Implementation: Clone as thawed (see ICU4J Freezable) 01479 //---------------------------------------------------------------- 01480 01481 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */); 01482 01483 //---------------------------------------------------------------- 01484 // Implementation: 
Pattern parsing 01485 //---------------------------------------------------------------- 01486 01487 void applyPatternIgnoreSpace(const UnicodeString& pattern, 01488 ParsePosition& pos, 01489 const SymbolTable* symbols, 01490 UErrorCode& status); 01491 01492 void applyPattern(RuleCharacterIterator& chars, 01493 const SymbolTable* symbols, 01494 UnicodeString& rebuiltPat, 01495 uint32_t options, 01496 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), 01497 UErrorCode& ec); 01498 01499 //---------------------------------------------------------------- 01500 // Implementation: Utility methods 01501 //---------------------------------------------------------------- 01502 01503 void ensureCapacity(int32_t newLen, UErrorCode& ec); 01504 01505 void ensureBufferCapacity(int32_t newLen, UErrorCode& ec); 01506 01507 void swapBuffers(void); 01508 01509 UBool allocateStrings(UErrorCode &status); 01510 01511 UnicodeString& _toPattern(UnicodeString& result, 01512 UBool escapeUnprintable) const; 01513 01514 UnicodeString& _generatePattern(UnicodeString& result, 01515 UBool escapeUnprintable) const; 01516 01517 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable); 01518 01519 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable); 01520 01521 //---------------------------------------------------------------- 01522 // Implementation: Fundamental operators 01523 //---------------------------------------------------------------- 01524 01525 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity); 01526 01527 void add(const UChar32* other, int32_t otherLen, int8_t polarity); 01528 01529 void retain(const UChar32* other, int32_t otherLen, int8_t polarity); 01530 01536 static UBool resemblesPropertyPattern(const UnicodeString& pattern, 01537 int32_t pos); 01538 01539 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars, 01540 int32_t iterOpts); 01541 01581 UnicodeSet& 
applyPropertyPattern(const UnicodeString& pattern, 01582 ParsePosition& ppos, 01583 UErrorCode &ec); 01584 01585 void applyPropertyPattern(RuleCharacterIterator& chars, 01586 UnicodeString& rebuiltPat, 01587 UErrorCode& ec); 01588 01589 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status); 01590 01595 typedef UBool (*Filter)(UChar32 codePoint, void* context); 01596 01606 void applyFilter(Filter filter, 01607 void* context, 01608 int32_t src, 01609 UErrorCode &status); 01610 01614 void setPattern(const UnicodeString& newPat); 01618 void releasePattern(); 01619 01620 friend class UnicodeSetIterator; 01621 }; 01622 01623 01624 01625 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const { 01626 return !operator==(o); 01627 } 01628 01629 inline UBool UnicodeSet::isFrozen() const { 01630 return (UBool)(bmpSet!=NULL || stringSpan!=NULL); 01631 } 01632 01633 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const { 01634 return !containsNone(start, end); 01635 } 01636 01637 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const { 01638 return !containsNone(s); 01639 } 01640 01641 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const { 01642 return !containsNone(s); 01643 } 01644 01645 inline UBool UnicodeSet::isBogus() const { 01646 return (UBool)(fFlags & kIsBogus); 01647 } 01648 01649 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) { 01650 return reinterpret_cast<UnicodeSet *>(uset); 01651 } 01652 01653 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) { 01654 return reinterpret_cast<const UnicodeSet *>(uset); 01655 } 01656 01657 inline USet *UnicodeSet::toUSet() { 01658 return reinterpret_cast<USet *>(this); 01659 } 01660 01661 inline const USet *UnicodeSet::toUSet() const { 01662 return reinterpret_cast<const USet *>(this); 01663 } 01664 01665 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const { 01666 int32_t 
sLength=s.length(); 01667 if(start<0) { 01668 start=0; 01669 } else if(start>sLength) { 01670 start=sLength; 01671 } 01672 return start+span(s.getBuffer()+start, sLength-start, spanCondition); 01673 } 01674 01675 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const { 01676 int32_t sLength=s.length(); 01677 if(limit<0) { 01678 limit=0; 01679 } else if(limit>sLength) { 01680 limit=sLength; 01681 } 01682 return spanBack(s.getBuffer(), limit, spanCondition); 01683 } 01684 01685 U_NAMESPACE_END 01686 01687 #endif