FIFE 2008.0
core.h
00001 // Copyright 2006 Nemanja Trifunovic
00002 
00003 /*
00004 Permission is hereby granted, free of charge, to any person or organization
00005 obtaining a copy of the software and accompanying documentation covered by
00006 this license (the "Software") to use, reproduce, display, distribute,
00007 execute, and transmit the Software, and to prepare derivative works of the
00008 Software, and to permit third-parties to whom the Software is furnished to
00009 do so, all subject to the following:
00010 
00011 The copyright notices in the Software and this entire statement, including
00012 the above license grant, this restriction and the following disclaimer,
00013 must be included in all copies of the Software, in whole or in part, and
00014 all derivative works of the Software, unless such copies or derivative
00015 works are solely in the form of machine-executable object code generated by
00016 a source language processor.
00017 
00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
00024 DEALINGS IN THE SOFTWARE.
00025 */
00026 
00027 
00028 #ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00029 #define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
00030 
00031 #include <iterator>
00032 
00033 namespace utf8
00034 {
00035     // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
00036     // You may need to change them to match your system. 
00037     // These typedefs have the same names as ones from cstdint, or boost/cstdint
00038     typedef unsigned char   uint8_t;
00039     typedef unsigned short  uint16_t;
00040     typedef unsigned int    uint32_t;
00041 
// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    // Returns the low 8 bits of oc as an unsigned octet. This makes the
    // result independent of whether the octet type (e.g. char) is signed.
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    // Returns the low 16 bits of oc as an unsigned 16-bit value.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    // True if oc is a UTF-8 continuation (trail) octet, i.e. its two most
    // significant bits are 10.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((mask8(oc) >> 6) == 0x2);
    }

    // True if cp lies in the surrogate range [LEAD_SURROGATE_MIN, TRAIL_SURROGATE_MAX].
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    // True if cp is accepted as a code point by this library: not above
    // CODE_POINT_MAX, not a surrogate, and neither 0xfffe nor 0xffff.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !is_surrogate(cp) && cp != 0xfffe && cp != 0xffff);
    }

    // Returns the number of octets (1-4) announced by the lead octet that
    // lead_it points to, or 0 if that octet is not a valid lead octet.
    // lead_it must be dereferenceable; only the first octet is inspected.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    // Result codes of validate_next:
    //   OK                  - a sequence was decoded and the iterator advanced past it
    //   NOT_ENOUGH_ROOM     - [it, end) holds fewer octets than the lead octet announces
    //   INVALID_LEAD        - the first octet is not a valid lead octet
    //   INCOMPLETE_SEQUENCE - an expected trail octet is not a trail octet
    //   OVERLONG_SEQUENCE   - the code point was encoded with more octets than necessary
    //   INVALID_CODE_POINT  - the decoded value fails is_code_point_valid
    enum utf_error {OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    // Decodes and validates the UTF-8 sequence starting at it.
    //
    // it         - in/out: start of the sequence. On OK it is moved just past
    //              the sequence; on any error it is left unchanged (pointing
    //              at the lead octet).
    // end        - one past the last readable octet. No octet at or beyond
    //              end is read, so it == end is allowed and yields
    //              NOT_ENOUGH_ROOM.
    // code_point - optional output (may be null). Written only when OK is
    //              returned.
    //
    // Works with any multi-pass (forward or better) iterator: the sequence is
    // decoded through a local copy of the iterator, so nothing has to be rewound.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t* code_point)
    {
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;

        // An empty range has no lead octet to inspect - do not dereference it.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Check the lead octet
        const octet_difference_type length = sequence_length(it);
        if (length == 0)
            return INVALID_LEAD;

        uint32_t cp = mask8(*it);

        // "Shortcut" for ASCII characters
        if (length == 1) {
            if (code_point)
                *code_point = cp;
            ++it;
            return OK;
        }

        // Do we have enough octets? Walk at most `length` steps instead of
        // measuring the whole remaining range, which would be linear for
        // non-random-access iterators.
        octet_iterator probe = it;
        octet_difference_type available = 0;
        while (available < length && probe != end) {
            ++probe;
            ++available;
        }
        if (available < length)
            return NOT_ENOUGH_ROOM;

        // Keep only the payload bits of the lead octet:
        // 5 bits for a 2-octet, 4 for a 3-octet, 3 for a 4-octet sequence.
        cp &= (0x7fu >> length);

        // Check trail octets and accumulate 6 payload bits from each one
        octet_iterator cursor = it;
        for (octet_difference_type i = 1; i < length; ++i) {
            ++cursor;
            if (!is_trail(*cursor))
                return INCOMPLETE_SEQUENCE;
            cp = (cp << 6) | (mask8(*cursor) & 0x3fu);
        }

        // Is the code point valid?
        if (!is_code_point_valid(cp))
            return INVALID_CODE_POINT;

        // Was the shortest possible encoding used?
        const octet_difference_type shortest =
            (cp < 0x80) ? 1 : (cp < 0x800) ? 2 : (cp < 0x10000) ? 3 : 4;
        if (length != shortest)
            return OVERLONG_SEQUENCE;

        // Success: only now publish the result and advance the caller's iterator
        if (code_point)
            *code_point = cp;
        it = ++cursor;
        return OK;
    }

    // Same as the three-argument overload, but discards the decoded code point.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        return validate_next(it, end, 0);
    }

} // namespace internal 
00222 
00224 
00225     // Byte order mark
00226     const uint8_t bom[] = {0xef, 0xbb, 0xbf}; 
00227 
00228     template <typename octet_iterator>
00229     octet_iterator find_invalid(octet_iterator start, octet_iterator end)
00230     {
00231         octet_iterator result = start;
00232         while (result != end) {
00233             internal::utf_error err_code = internal::validate_next(result, end);
00234             if (err_code != internal::OK)
00235                 return result;
00236         }
00237         return result;
00238     }
00239 
00240     template <typename octet_iterator>
00241     inline bool is_valid(octet_iterator start, octet_iterator end)
00242     {
00243         return (find_invalid(start, end) == end);
00244     }
00245 
00246     template <typename octet_iterator>
00247     inline bool is_bom (octet_iterator it)
00248     {
00249         return (
00250             (internal::mask8(*it++)) == bom[0] &&
00251             (internal::mask8(*it++)) == bom[1] &&
00252             (internal::mask8(*it))   == bom[2]
00253            );
00254     }
00255 } // namespace utf8
00256 
00257 #endif // header guard
00258 
00259 
 All Classes Namespaces Functions Variables Enumerations Enumerator