stlab.adobe.com Adobe Systems Incorporated
unicode.hpp
Go to the documentation of this file.
1 /*
2  Copyright 2005-2007 Adobe Systems Incorporated
3  Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
4  or a copy at http://stlab.adobe.com/licenses.html)
5 */
6 
7 /*************************************************************************************************/
8 
9 #ifndef ADOBE_UNICODE_HPP
10 #define ADOBE_UNICODE_HPP
11 
12 /*************************************************************************************************/
13 
14 #include <adobe/config.hpp>
15 
17 
18 #include <boost/cstdint.hpp>
19 #include <boost/utility/enable_if.hpp>
20 
21 #include <vector>
22 #include <cassert>
23 #include <stdexcept>
24 
25 /*************************************************************************************************/
26 
27 namespace adobe {
28 
29 /*************************************************************************************************/
30 
31 #if !defined(ADOBE_NO_DOCUMENTATION)
32 
33 /*************************************************************************************************/
34 
/*
    Width-based classification of a code-unit type: value is nonzero exactly when
    sizeof(T) is 1. No other property of T is inspected.
*/
template <typename T>
struct is_utf8_type
{
    enum { value = (1 == sizeof(T)) };
};
38 
39 /*************************************************************************************************/
40 
/*
    Width-based classification of a code-unit type: value is nonzero exactly when
    sizeof(T) is 2. No other property of T is inspected.
*/
template <typename T>
struct is_utf16_type
{
    enum { value = (2 == sizeof(T)) };
};
44 
45 /*************************************************************************************************/
46 
/*
    Width-based classification of a code-unit type: value is nonzero exactly when
    sizeof(T) is 4. No other property of T is inspected.
*/
template <typename T>
struct is_utf32_type
{
    enum { value = (4 == sizeof(T)) };
};
50 
51 /*************************************************************************************************/
52 
53 template <typename I>
54 struct is_utf8_iterator_type
55 { enum { value = is_utf8_type<typename std::iterator_traits<I>::value_type>::value }; };
56 
57 /*************************************************************************************************/
58 
59 template <typename I>
60 struct is_utf16_iterator_type
61 { enum { value = is_utf16_type<typename std::iterator_traits<I>::value_type>::value }; };
62 
63 /*************************************************************************************************/
64 
65 template <typename I>
66 struct is_utf32_iterator_type
67 { enum { value = is_utf32_type<typename std::iterator_traits<I>::value_type>::value }; };
68 
69 /*************************************************************************************************/
70 
71 namespace implementation {
72 
73 /*************************************************************************************************/
74 
75 // REVISIT (fbrereto) : I don't need to INIT_ONCE these, do I?
76 
77 const unsigned char to_utf32_pivot_1_k(128);
78 const unsigned char to_utf32_pivot_2_k(192);
79 const unsigned char to_utf32_pivot_3_k(224);
80 const unsigned char to_utf32_pivot_4_k(240);
81 const unsigned char to_utf32_pivot_5_k(248);
82 const unsigned char to_utf32_pivot_6_k(252);
83 const unsigned char to_utf32_pivot_7_k(254);
84 
85 const boost::uint32_t to_utf8_pivot_1_k(1UL << 7);
86 const boost::uint32_t to_utf8_pivot_2_k(1UL << 11);
87 const boost::uint32_t to_utf8_pivot_3_k(1UL << 16);
88 const boost::uint32_t to_utf8_pivot_4_k(1UL << 21);
89 const boost::uint32_t to_utf8_pivot_5_k(1UL << 26);
90 
91 const boost::uint16_t to_utf16_surrogate_pivot_k(65535);
92 const boost::uint16_t utf16_high_surrogate_front_k(0xd800);
93 const boost::uint16_t utf16_high_surrogate_back_k(0xdbff);
94 const boost::uint16_t utf16_low_surrogate_front_k(0xdc00);
95 const boost::uint16_t utf16_low_surrogate_back_k(0xdfff);
96 
97 /*************************************************************************************************/
98 /*
99  NOTE (fbrereto) : The char(...) designations are required on windows, otherwise the MSVC
100  compiler complains in the utf8_add_mask routines with the following:
101 
102  "warning C4309: 'specialization' : truncation of constant value"
103 */
// Marker bits for a UTF-8 byte, selected by sequence length. The primary template is
// intentionally empty so that an unsupported length fails to compile when ::value is used.
template <std::size_t NumBytes>
struct utf8_header_t
{ };

// Index 0 is the marker for a nonheader byte.
template <>
struct utf8_header_t<0>
{
    static const char value = '\x80';
};

// There is deliberately no marker for a one-byte sequence:
//template <> struct utf8_header_t<1> { static const char value = '\x00'; }; // illegal

template <>
struct utf8_header_t<2>
{
    static const char value = '\xC0';
};

template <>
struct utf8_header_t<3>
{
    static const char value = '\xE0';
};

template <>
struct utf8_header_t<4>
{
    static const char value = '\xF0';
};

template <>
struct utf8_header_t<5>
{
    static const char value = '\xF8';
};

template <>
struct utf8_header_t<6>
{
    static const char value = '\xFC';
};
112 
113 /*************************************************************************************************/
114 
// Returns code with every bit of the compile-time Mask switched on, narrowed to char.
template <char Mask, typename BinaryInteger>
inline char add_mask(BinaryInteger code)
{
    return static_cast<char>(Mask | code);
}
118 
119 template <std::size_t NumBytes, bool Header, typename BinaryInteger>
120 inline char utf8_add_mask(BinaryInteger code)
121 { return add_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code); }
122 
123 
124 //MM concept gcc-4.1.1 workaround
125 inline char utf8_add_mask_0_false(boost::uint32_t code)
126 {
127  return utf8_add_mask<0,false>(code);
128 }
129 
130 /*************************************************************************************************/
131 
// Returns code with every bit of the compile-time Mask switched off, narrowed to char.
template <char Mask, typename BinaryInteger>
inline char strip_mask(BinaryInteger code)
{
    const int bits_to_keep(~Mask); // Mask promotes to int before being inverted

    return static_cast<char>(code & bits_to_keep);
}
135 
136 template <std::size_t NumBytes, bool Header, typename BinaryInteger>
137 inline char utf8_strip_mask(BinaryInteger code)
138 { return strip_mask<utf8_header_t<Header ? NumBytes : 0>::value>(code); }
139 
140 /*************************************************************************************************/
141 
142 template <std::size_t Position>
143 inline boost::uint32_t promote_fragment(char fragment)
144 { return boost::uint32_t(fragment << ((Position - 1) * 6)); }
145 
146 template <>
147 inline boost::uint32_t promote_fragment<1>(char fragment)
148 { return boost::uint32_t(fragment); }
149 
150 template <>
151 inline boost::uint32_t promote_fragment<0>(char); // unimplemented
152 
153 /*************************************************************************************************/
154 
155 template <std::size_t Position>
156 inline char demote_fragment(boost::uint32_t fragment)
157 { return char((fragment >> ((Position - 1) * 6)) & 0x0000003F); }
158 
159 template <>
160 inline char demote_fragment<1>(boost::uint32_t fragment)
161 { return char(fragment & 0x0000003F); }
162 
163 template <>
164 inline char demote_fragment<0>(boost::uint32_t); // unimplemented
165 
166 //MM concept gcc-4.1.1 workaround
167 inline char demote_fragment_1(boost::uint32_t fragment)
168 {
169  return demote_fragment<1>(fragment);
170 }
171 
172 
173 /*************************************************************************************************/
174 
175 template <std::size_t ByteCount, bool Header = true>
176 struct demotion_engine_t
177 {
178  template <typename OutputIterator>
179  inline OutputIterator operator () (boost::uint32_t code, OutputIterator i)
180  {
181  *i = utf8_add_mask<ByteCount, Header>(demote_fragment<ByteCount>(code));
182 
183  ++i;
184 
185  return demotion_engine_t<ByteCount - 1, false>()(code, i);
186  }
187 };
188 
189 
190 template <>
191 struct demotion_engine_t<1, false>
192 {
193  template <typename OutputIterator>
194  inline OutputIterator operator () (boost::uint32_t code, OutputIterator i)
195  {
196  *i = utf8_add_mask_0_false(demote_fragment_1(code));
197 
198  return ++i;
199  }
200 };
201 
202 /*************************************************************************************************/
203 
204 template <std::size_t ByteCount, bool Header = true>
205 struct promotion_engine_t
206 {
207  template <typename InputIterator>
208  inline boost::uint32_t operator () (InputIterator& first, InputIterator last)
209  {
210  /*
211  CodeWarrior 9.4 doesn't like this code composited into one line;
212  GCC doesn't seem to have a problem.
213  */
214 
215  char n(*first);
216  char stripped(utf8_strip_mask<ByteCount, Header>(n));
217  boost::uint32_t shifted(promote_fragment<ByteCount>(stripped));
218 
219  ++first;
220 
221  if (first == last)
222  throw std::runtime_error("unicode: utf32 conversion ran out of input");
223 
224  return shifted | promotion_engine_t<ByteCount - 1, false>()(first, last);
225  }
226 };
227 
228 template <>
229 struct promotion_engine_t<1, false>
230 {
231  template <typename InputIterator>
232  inline boost::uint32_t operator () (InputIterator& first, InputIterator)
233  {
234  boost::uint32_t result(promote_fragment<1>(utf8_strip_mask<0, false>(*first)));
235 
236  ++first;
237 
238  return result;
239  }
240 };
241 
242 /*************************************************************************************************/
243 
244 template <typename InputIterator, typename DestInteger>
245 typename boost::enable_if<is_utf16_iterator_type<InputIterator>, InputIterator>::type
246  to_utf32 (InputIterator first, InputIterator last, DestInteger& result)
247 {
248  if (first == last) return first;
249 
250  boost::uint16_t code(static_cast<boost::uint16_t>(*first));
251 
252  ++first;
253 
254  if (code >= implementation::utf16_high_surrogate_front_k &&
255  code <= implementation::utf16_high_surrogate_back_k)
256  {
257  result = 0;
258 
259  if (first == last)
260  throw std::runtime_error("unicode: utf16 high surrogate found without low surrogate");
261 
262  boost::uint16_t low(static_cast<boost::uint16_t>(*first));
263 
264  assert (low >= implementation::utf16_low_surrogate_front_k &&
265  low <= implementation::utf16_low_surrogate_back_k);
266 
267  ++first;
268 
269  result = (code - implementation::utf16_high_surrogate_front_k) * 0x400 +
270  (low - implementation::utf16_low_surrogate_front_k) + 0x10000;
271  }
272  else if (code >= implementation::utf16_low_surrogate_front_k &&
273  code <= implementation::utf16_low_surrogate_back_k)
274  { throw std::runtime_error("unicode: utf16 low surrogate found without high surrogate"); }
275  else
276  { result = static_cast<DestInteger>(code); }
277 
278  return first;
279 }
280 
281 /*************************************************************************************************/
282 
283 template <typename InputIterator, typename DestInteger>
284 typename boost::enable_if<is_utf8_iterator_type<InputIterator>, InputIterator>::type
285  to_utf32 (InputIterator first, InputIterator last, DestInteger& result)
286 {
287  if (first == last)
288  return first;
289 
290  unsigned char n(static_cast<unsigned char>(*first));
291 
292  if (n < implementation::to_utf32_pivot_1_k)
293  { result = static_cast<DestInteger>(n); ++first; }
294  else if (n < implementation::to_utf32_pivot_2_k)
295  { throw std::runtime_error("unicode: ill-defined utf8 (< 192)"); }
296  else if (n < implementation::to_utf32_pivot_3_k)
297  result = implementation::promotion_engine_t<2>()(first, last);
298  else if (n < implementation::to_utf32_pivot_4_k)
299  result = implementation::promotion_engine_t<3>()(first, last);
300  else if (n < implementation::to_utf32_pivot_5_k)
301  result = implementation::promotion_engine_t<4>()(first, last);
302  else if (n < implementation::to_utf32_pivot_6_k)
303  result = implementation::promotion_engine_t<5>()(first, last);
304  else if (n < implementation::to_utf32_pivot_7_k)
305  result = implementation::promotion_engine_t<6>()(first, last);
306  else
307  { throw std::runtime_error("unicode: ill-defined utf8 (>= 254)"); }
308 
309  return first;
310 }
311 
312 /*************************************************************************************************/
313 
314 template <typename InputIterator, typename DestInteger>
315 typename boost::enable_if<is_utf32_iterator_type<InputIterator>, InputIterator>::type
316  to_utf32 (InputIterator first, InputIterator last, DestInteger& result)
317 {
318  if (first == last)
319  return first;
320 
321  result = *first;
322 
323  return ++first;
324 }
325 
326 /*************************************************************************************************/
327 
328 } // namespace implementation
329 
330 /*************************************************************************************************/
331 
332 #endif
333 
334 /*************************************************************************************************/
335 /*
336  utf32 -> utf8
337  - 1 source value
338  - n output values
339 */
340 
341 template < typename T, // T models Integer; T must be a valid UTF32-encoded code point
342  typename O> // O models OutputIterator
343 typename boost::enable_if<is_utf32_type<T>, O>::type
344  value_to_utf8(T code, O output)
345 {
346  if (code < implementation::to_utf8_pivot_1_k) // UTF-8 is 1 byte long
347  { *output = static_cast<char>(code); ++output; }
348  else if (code < implementation::to_utf8_pivot_2_k) // UTF-8 is 2 bytes long
349  output = implementation::demotion_engine_t<2>()(code, output);
350  else if (code < implementation::to_utf8_pivot_3_k) // UTF-8 is 3 bytes long
351  output = implementation::demotion_engine_t<3>()(code, output);
352  else if (code < implementation::to_utf8_pivot_4_k) // UTF-8 is 4 bytes long
353  output = implementation::demotion_engine_t<4>()(code, output);
354  else if (code < implementation::to_utf8_pivot_5_k) // UTF-8 is 5 bytes long
355  output = implementation::demotion_engine_t<5>()(code, output);
356  else // UTF-8 is 6 bytes long
357  output = implementation::demotion_engine_t<6>()(code, output);
358 
359  return output;
360 }
361 
362 /*************************************************************************************************/
363 /*
364  utf16 -> utf8
365  - 1 source value
366  - n output values
367 */
368 
369 template < typename T, // T models Integer; T must be a valid UTF16-encoded code point
370  typename O> // O models OutputIterator
371 typename boost::enable_if<is_utf16_type<T>, O>::type
372  value_to_utf8(T code, O output)
373 {
374  return value_to_utf8(static_cast<boost::uint32_t>(code), output);
375 }
376 
377 /*************************************************************************************************/
378 /*
379  utf8 -> utf8
380  - 1 source value
381  - 1 output value
382 */
383 
384 template < typename T, // T models Integer; T must be a valid UTF8-encoded code point
385  typename O> // O models OutputIterator
386 typename boost::enable_if<is_utf8_type<T>, O>::type
387  value_to_utf8(T code, O output)
388 {
389  *output++ = code;
390 
391  return output;
392 }
393 
394 /*************************************************************************************************/
395 /*
396  utf16 -> utf8
397  - n source values
398  - m output values
399 */
400 
401 template < typename I, // I models InputIterator
402  typename O> // O models OutputIterator
403 typename boost::enable_if<is_utf16_iterator_type<I>, O>::type
404  to_utf8(I first, I last, O output)
405 {
406  while (first != last)
407  {
408  boost::uint32_t result;
409 
410  first = implementation::to_utf32(first, last, result);
411 
412  output = value_to_utf8(result, output);
413  }
414 
415  return output;
416 }
417 
418 /*************************************************************************************************/
419 /*
420  utf32 -> utf8
421  - n source values
422  - m output values
423 */
424 
425 template < typename I, // I models InputIterator
426  typename O> // O models OutputIterator
427 typename boost::enable_if<is_utf32_iterator_type<I>, O>::type
428  to_utf8(I first, I last, O output)
429 {
430  if (first == last) return output;
431 
432  typedef typename std::iterator_traits<I>::value_type value_type;
433 
434  adobe::for_each(first, last, boost::bind(&value_to_utf8<value_type, O>, _1, boost::ref(output)));
435 
436  return output;
437 }
438 
439 /*************************************************************************************************/
440 /*
441  utf8 -> utf8
442  - n source values
443  - m output values
444 */
445 
446 template < typename I, // I models InputIterator
447  typename O> // O models OutputIterator
448 typename boost::enable_if<is_utf8_iterator_type<I>, O>::type
449  to_utf8(I first, I last, O output)
450 {
451  return std::copy(first, last, output);
452 }
453 
454 /*************************************************************************************************/
455 /*
456  utf32 -> utf16
457  - 1 source value
458  - n output values
459 */
460 
461 template < typename T, // T models Integer; sizeof(T) must equal 4; code must be valid utf32
462  typename O> // O models OutputIterator
463 typename boost::enable_if<is_utf32_type<T>, O>::type
464  value_to_utf16(T code, O output)
465 {
466  if (code <= implementation::to_utf16_surrogate_pivot_k)
467  {
468  *output = static_cast<boost::uint16_t>(code);
469  }
470  else
471  {
472  *output = static_cast<boost::uint16_t>((code - 0x10000) / 0x400 + implementation::utf16_high_surrogate_front_k);
473 
474  ++output;
475 
476  *output = static_cast<boost::uint16_t>((code - 0x10000) % 0x400 + implementation::utf16_low_surrogate_front_k);
477  }
478 
479  return ++output;
480 }
481 
482 /*************************************************************************************************/
483 /*
484  utf8 -> utf16
485  - n source values
486  - m output values
487 */
488 template < typename I, // I models InputIterator
489  typename O> // O models OutputIterator
490 typename boost::enable_if<is_utf8_iterator_type<I>, O>::type
491  to_utf16(I first, I last, O output)
492 {
493  while (first != last)
494  {
495  boost::uint32_t result;
496 
497  first = implementation::to_utf32(first, last, result);
498 
499  output = value_to_utf16(result, output);
500  }
501 
502  return output;
503 }
504 
505 /*************************************************************************************************/
506 /*
507  utf16 -> utf16
508  - n source values
509  - n output values
510 */
511 template < typename I, // I models InputIterator
512  typename O> // O models OutputIterator
513 typename boost::enable_if<is_utf16_iterator_type<I>, O>::type
514  to_utf16(I first, I last, O output)
515 {
516  return std::copy(first, last, output);
517 }
518 
519 /*************************************************************************************************/
520 /*
521  Precondition: [ first, last ) must convert to exactly one UTF-16 character
522 */
523 
524 template <typename I>
525 inline typename boost::enable_if<is_utf8_iterator_type<I>, boost::uint16_t>::type
526  to_utf16(I first, I last)
527 {
528  boost::uint32_t result;
529 
530  implementation::to_utf32(first, last, result);
531 
532  return static_cast<boost::uint16_t>(result);
533 }
534 
535 /*************************************************************************************************/
536 /*
537  utf16 -> utf32
538  - n source values
539  - m output values
540 
541  utf8 -> utf32
542  - n source values
543  - m output values
544 */
545 
546 template < typename I, // I models InputIterator
547  typename O> // O models OutputIterator
548 O to_utf32(I first, I last, O output)
549 {
550  boost::uint32_t result;
551 
552  while (first != last)
553  {
554  first = implementation::to_utf32(first, last, result);
555 
556  *output = result;
557 
558  ++output;
559  }
560 
561  return output;
562 }
563 
564 /*************************************************************************************************/
565 /*
566  Precondition: [ first, last ) must convert to exactly one UTF-32 character
567 */
568 
569 template <typename I> // I models InputIterator
570 inline boost::uint32_t to_utf32(I first, I last)
571 {
572  boost::uint32_t result;
573 
574  implementation::to_utf32(first, last, result);
575 
576  return result;
577 }
578 
579 /*************************************************************************************************/
580 
581 } // namespace adobe
582 
583 /*************************************************************************************************/
584 
585 #endif
586 
587 /*************************************************************************************************/

Copyright © 2006-2007 Adobe Systems Incorporated.

Use of this website signifies your agreement to the Terms of Use and Online Privacy Policy.

Search powered by Google