FIFE 2008.0
|
00001 // Copyright 2006 Nemanja Trifunovic 00002 00003 /* 00004 Permission is hereby granted, free of charge, to any person or organization 00005 obtaining a copy of the software and accompanying documentation covered by 00006 this license (the "Software") to use, reproduce, display, distribute, 00007 execute, and transmit the Software, and to prepare derivative works of the 00008 Software, and to permit third-parties to whom the Software is furnished to 00009 do so, all subject to the following: 00010 00011 The copyright notices in the Software and this entire statement, including 00012 the above license grant, this restriction and the following disclaimer, 00013 must be included in all copies of the Software, in whole or in part, and 00014 all derivative works of the Software, unless such copies or derivative 00015 works are solely in the form of machine-executable object code generated by 00016 a source language processor. 00017 00018 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00019 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00020 FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT 00021 SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE 00022 FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, 00023 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 00024 DEALINGS IN THE SOFTWARE. 00025 */ 00026 00027 00028 #ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00029 #define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 00030 00031 #include "core.h" 00032 #include <stdexcept> 00033 00034 namespace utf8 00035 { 00036 // Exceptions that may be thrown from the library functions. 00037 class invalid_code_point : public std::exception { 00038 uint32_t cp; 00039 public: 00040 invalid_code_point(uint32_t cp) : cp(cp) {} 00041 virtual const char* what() const throw() { return "Invalid code point"; } 00042 uint32_t code_point() const {return cp;} 00043 }; 00044 00045 class invalid_utf8 : public std::exception { 00046 uint8_t u8; 00047 public: 00048 invalid_utf8 (uint8_t u) : u8(u) {} 00049 virtual const char* what() const throw() { return "Invalid UTF-8"; } 00050 uint8_t utf8_octet() const {return u8;} 00051 }; 00052 00053 class invalid_utf16 : public std::exception { 00054 uint16_t u16; 00055 public: 00056 invalid_utf16 (uint16_t u) : u16(u) {} 00057 virtual const char* what() const throw() { return "Invalid UTF-16"; } 00058 uint16_t utf16_word() const {return u16;} 00059 }; 00060 00061 class not_enough_room : public std::exception { 00062 public: 00063 virtual const char* what() const throw() { return "Not enough space"; } 00064 }; 00065 00067 00068 template <typename octet_iterator, typename output_iterator> 00069 output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) 00070 { 00071 while (start != end) { 00072 octet_iterator sequence_start = start; 00073 internal::utf_error err_code = internal::validate_next(start, end); 00074 switch (err_code) { 00075 case internal::OK : 00076 for (octet_iterator it = sequence_start; it != start; ++it) 00077 *out++ = *it; 00078 break; 00079 case internal::NOT_ENOUGH_ROOM: 00080 throw not_enough_room(); 00081 case internal::INVALID_LEAD: 00082 append (replacement, out); 00083 ++start; 00084 break; 00085 case internal::INCOMPLETE_SEQUENCE: 00086 case internal::OVERLONG_SEQUENCE: 00087 case internal::INVALID_CODE_POINT: 00088 append (replacement, out); 00089 ++start; 00090 // just one replacement mark for the sequence 00091 while (internal::is_trail(*start) && start != end) 00092 ++start; 00093 break; 00094 } 00095 } 00096 return out; 00097 } 00098 00099 template <typename octet_iterator, typename output_iterator> 00100 inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) 00101 { 00102 static const uint32_t replacement_marker = internal::mask16(0xfffd); 00103 return replace_invalid(start, end, out, replacement_marker); 00104 } 00105 00106 template <typename octet_iterator> 00107 octet_iterator append(uint32_t cp, octet_iterator result) 00108 { 00109 if (!internal::is_code_point_valid(cp)) 00110 throw invalid_code_point(cp); 00111 00112 if (cp < 0x80) // one octet 00113 *(result++) = static_cast<uint8_t>(cp); 00114 else if (cp < 0x800) { // two octets 00115 *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); 00116 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00117 } 00118 else if (cp < 0x10000) { // three octets 00119 *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); 00120 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); 00121 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00122 } 00123 else if (cp <= internal::CODE_POINT_MAX) { // four octets 00124 *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); 00125 *(result++) = static_cast<uint8_t>((cp >> 12)& 0x3f | 0x80); 00126 *(result++) = static_cast<uint8_t>((cp >> 6) & 0x3f | 0x80); 00127 *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); 00128 } 00129 else 00130 throw invalid_code_point(cp); 00131 00132 return result; 00133 } 00134 00135 template <typename octet_iterator> 00136 uint32_t next(octet_iterator& it, octet_iterator end) 00137 { 00138 uint32_t cp = 0; 00139 internal::utf_error err_code = internal::validate_next(it, end, &cp); 00140 switch (err_code) { 00141 case internal::OK : 00142 break; 00143 case internal::NOT_ENOUGH_ROOM : 00144 throw not_enough_room(); 00145 case internal::INVALID_LEAD : 00146 case internal::INCOMPLETE_SEQUENCE : 00147 case internal::OVERLONG_SEQUENCE : 00148 throw invalid_utf8(*it); 00149 case internal::INVALID_CODE_POINT : 00150 throw invalid_code_point(cp); 00151 } 00152 return cp; 00153 } 00154 00155 template <typename octet_iterator> 00156 uint32_t prior(octet_iterator& it, octet_iterator start) 00157 { 00158 octet_iterator end = it; 00159 while (internal::is_trail(*(--it))) 00160 if (it < start) 00161 throw invalid_utf8(*it); // error - no lead byte in the sequence 00162 octet_iterator temp = it; 00163 return next(temp, end); 00164 } 00165 00167 template <typename octet_iterator> 00168 uint32_t previous(octet_iterator& it, octet_iterator pass_start) 00169 { 00170 octet_iterator end = it; 00171 while (internal::is_trail(*(--it))) 00172 if (it == pass_start) 00173 throw invalid_utf8(*it); // error - no lead byte in the sequence 00174 octet_iterator temp = it; 00175 return next(temp, end); 00176 } 00177 00178 template <typename octet_iterator, typename distance_type> 00179 void advance (octet_iterator& it, distance_type n, octet_iterator end) 00180 { 00181 for (distance_type i = 0; i < n; ++i) 00182 next(it, end); 00183 } 00184 00185 template <typename octet_iterator> 00186 typename std::iterator_traits<octet_iterator>::difference_type 00187 distance (octet_iterator first, octet_iterator last) 00188 { 00189 typename std::iterator_traits<octet_iterator>::difference_type dist; 00190 for (dist = 0; first < last; ++dist) 00191 next(first, last); 00192 return dist; 00193 } 00194 00195 template <typename u16bit_iterator, typename octet_iterator> 00196 octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) 00197 { 00198 while (start != end) { 00199 uint32_t cp = internal::mask16(*start++); 00200 // Take care of surrogate pairs first 00201 if (internal::is_surrogate(cp)) { 00202 if (start != end) { 00203 uint32_t trail_surrogate = internal::mask16(*start++); 00204 if (trail_surrogate >= internal::TRAIL_SURROGATE_MIN && trail_surrogate <= internal::TRAIL_SURROGATE_MAX) 00205 cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; 00206 else 00207 throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); 00208 } 00209 else 00210 throw invalid_utf16(static_cast<uint16_t>(*start)); 00211 00212 } 00213 result = append(cp, result); 00214 } 00215 return result; 00216 } 00217 00218 template <typename u16bit_iterator, typename octet_iterator> 00219 u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) 00220 { 00221 while (start != end) { 00222 uint32_t cp = next(start, end); 00223 if (cp > 0xffff) { //make a surrogate pair 00224 *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); 00225 *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); 00226 } 00227 else 00228 *result++ = static_cast<uint16_t>(cp); 00229 } 00230 return result; 00231 } 00232 00233 template <typename octet_iterator, typename u32bit_iterator> 00234 octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) 00235 { 00236 while (start != end) 00237 result = append(*(start++), result); 00238 00239 return result; 00240 } 00241 00242 template <typename octet_iterator, typename u32bit_iterator> 00243 u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) 00244 { 00245 while (start < end) 00246 (*result++) = next(start, end); 00247 00248 return result; 00249 } 00250 00251 // The iterator class 00252 template <typename octet_iterator> 00253 class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { 00254 octet_iterator it; 00255 octet_iterator range_start; 00256 octet_iterator range_end; 00257 public: 00258 iterator () {}; 00259 explicit iterator (const octet_iterator& octet_it, 00260 const octet_iterator& range_start, 00261 const octet_iterator& range_end) : 00262 it(octet_it), range_start(range_start), range_end(range_end) 00263 { 00264 if (it < range_start || it > range_end) 00265 throw std::out_of_range("Invalid utf-8 iterator position"); 00266 } 00267 // the default "big three" are OK 00268 octet_iterator base () const { return it; } 00269 uint32_t operator * () const 00270 { 00271 octet_iterator temp = it; 00272 return next(temp, range_end); 00273 } 00274 bool operator == (const iterator& rhs) const 00275 { 00276 if (range_start != rhs.range_start && range_end != rhs.range_end) 00277 throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); 00278 return (it == rhs.it); 00279 } 00280 bool operator != (const iterator& rhs) const 00281 { 00282 return !(operator == (rhs)); 00283 } 00284 iterator& operator ++ () 00285 { 00286 next(it, range_end); 00287 return *this; 00288 } 00289 iterator operator ++ (int) 00290 { 00291 iterator temp = *this; 00292 next(it, range_end); 00293 return temp; 00294 } 00295 iterator& operator -- () 00296 { 00297 prior(it, range_start); 00298 return *this; 00299 } 00300 iterator operator -- (int) 00301 { 00302 iterator temp = *this; 00303 prior(it, range_start); 00304 return temp; 00305 } 00306 }; // class iterator 00307 00308 } // namespace utf8 00309 00310 #endif //header guard 00311 00312