Open Chinese Convert  1.0.2
A project for conversion between Traditional and Simplified Chinese
 All Classes Functions Typedefs Modules
UTF8Util.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2013 BYVoid <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #include "Common.hpp"
22 
23 namespace opencc {
28 class OPENCC_EXPORT UTF8Util {
29 public:
// Declaration only: the definition is not part of this header.
// NOTE(review): judging by the name, this presumably advances fp past a
// leading UTF-8 byte-order mark -- confirm against the implementation file.
33  static void SkipUtf8Bom(FILE* fp);
34 
/**
 * Returns the length in bytes of the UTF-8 sequence introduced by the byte at
 * str, or 0 when that byte cannot start a sequence. Never throws.
 *
 * Only the first byte is inspected; continuation bytes are not validated.
 * The 5- and 6-byte lead forms (0xF8-0xFB and 0xFC-0xFD) are accepted.
 */
static size_t NextCharLengthNoException(const char* str) {
  // Work on the unsigned value so the lead byte can be classified by range.
  const unsigned char lead = static_cast<unsigned char>(*str);
  if (lead < 0x80) {
    return 1;  // 0xxxxxxx: single-byte character
  }
  if (lead < 0xC0) {
    return 0;  // 10xxxxxx: continuation byte, not a valid start
  }
  if (lead < 0xE0) {
    return 2;  // 110xxxxx
  }
  if (lead < 0xF0) {
    return 3;  // 1110xxxx
  }
  if (lead < 0xF8) {
    return 4;  // 11110xxx
  }
  if (lead < 0xFC) {
    return 5;  // 111110xx
  }
  if (lead < 0xFE) {
    return 6;  // 1111110x
  }
  return 0;  // 0xFE and 0xFF never start a sequence
}
56 
60  static size_t NextCharLength(const char* str) {
61  size_t length = NextCharLengthNoException(str);
62  if (length == 0) {
63  throw InvalidUTF8(str);
64  }
65  return length;
66  }
67 
71  static size_t PrevCharLength(const char* str) {
72  for (size_t i = 1; i <= 6; i++) {
73  str--;
74  size_t length = NextCharLengthNoException(str);
75  if (length == i) {
76  return length;
77  }
78  }
79  throw InvalidUTF8(str);
80  }
81 
85  static const char* NextChar(const char* str) {
86  return str + NextCharLength(str);
87  }
88 
92  static const char* PrevChar(const char* str) {
93  return str - PrevCharLength(str);
94  }
95 
102  static const char* FindNextInline(const char* str, const char ch) {
103  while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
104  str = NextChar(str);
105  }
106  return str;
107  }
108 
/**
 * Returns true if ch is a line ending ('\n' or '\r') or the terminating
 * NUL character.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
115 
/**
 * Copies a substring with the given length into a new string.
 *
 * The result always has exactly `length` bytes. Copying stops at the first
 * NUL in str (strncpy semantics), so if str is shorter than `length` the
 * remainder of the result is NUL-filled and str is never read past its
 * terminator.
 *
 * @param str    source characters
 * @param length number of bytes in the result
 */
static string FromSubstr(const char* str, size_t length) {
  string newStr;
  if (length == 0) {
    return newStr;  // nothing to copy; also avoids touching str at all
  }
  newStr.resize(length);
  // Write through the mutable element reference: modifying the array
  // returned by c_str() is undefined behaviour.
  strncpy(&newStr[0], str, length);
  return newStr;
}
125 
/**
 * Returns true if the NUL-terminated string str has at least `length` bytes
 * before its terminator. At most `length` bytes are examined.
 */
static bool NotShorterThan(const char* str, size_t length) {
  for (size_t offset = 0; offset < length; ++offset) {
    if (str[offset] == '\0') {
      return false;  // terminator reached before `length` bytes were seen
    }
  }
  return true;
}
139 
144  static string TruncateUTF8(const char* str, size_t maxLength) {
145  string wordTrunc;
146  if (NotShorterThan(str, maxLength)) {
147  size_t len = 0;
148  const char* pStr = str;
149  while (len < maxLength) {
150  size_t nextLen = NextCharLength(pStr);
151  pStr += nextLen;
152  len += nextLen;
153  }
154  wordTrunc = FromSubstr(str, len);
155  } else {
156  wordTrunc = str;
157  }
158  return wordTrunc;
159  }
160 
/**
 * Replaces every non-overlapping occurrence of `from` in str with `to`,
 * in place, scanning left to right.
 *
 * Text inserted by a replacement is not rescanned, so `to` may contain
 * `from`. An empty `from` is a no-op: it would otherwise match at every
 * position and the loop would never terminate.
 *
 * @param str  string to modify
 * @param from NUL-terminated pattern to search for
 * @param to   NUL-terminated replacement text
 */
static void ReplaceAll(string& str, const char* from, const char* to) {
  const string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    return;  // empty pattern matches everywhere; nothing sensible to do
  }
  const string::size_type toLen = strlen(to);
  string::size_type pos = 0;
  // Pass the known lengths so find/replace do not re-measure the C strings.
  while ((pos = str.find(from, pos, fromLen)) != string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;  // continue after the inserted text
  }
}
173 
/**
 * Joins a string vector into a single string, inserting separator between
 * consecutive elements. An empty vector yields an empty string.
 */
static string Join(const vector<string>& strings, const string& separator) {
  std::ostringstream joined;
  auto it = strings.begin();
  const auto last = strings.end();
  if (it != last) {
    // The first element goes in bare; each later one is preceded by the
    // separator.
    joined << *it;
    for (++it; it != last; ++it) {
      joined << separator << *it;
    }
  }
  return joined.str();
}
189 
/**
 * Joins a string vector into a single string by concatenating the elements
 * in order, with nothing in between. An empty vector yields an empty string.
 */
static string Join(const vector<string>& strings) {
  string joined;
  for (auto it = strings.begin(); it != strings.end(); ++it) {
    joined += *it;
  }
  return joined;
}
200 };
201 }
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF8 character.
Definition: UTF8Util.hpp:71
static string Join(const vector< string > &strings)
Joins a string vector into a string.
Definition: UTF8Util.hpp:193
Definition: Exception.hpp:85
static string Join(const vector< string > &strings, const string &separator)
Joins a string vector into a string with a separator.
Definition: UTF8Util.hpp:177
static bool NotShorterThan(const char *str, size_t length)
Returns true if the given string is at least as long as the given length.
Definition: UTF8Util.hpp:129
static void ReplaceAll(string &str, const char *from, const char *to)
Replaces all patterns in a string in place.
Definition: UTF8Util.hpp:164
static string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new std::string.
Definition: UTF8Util.hpp:119
Definition: BinaryDict.hpp:24
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:92
static string TruncateUTF8(const char *str, size_t maxLength)
Truncates a string with a maximal length.
Definition: UTF8Util.hpp:144
static const char * NextChar(const char *str)
Returns the char* pointer over the next UTF8 character.
Definition: UTF8Util.hpp:85
UTF8 string utilities.
Definition: UTF8Util.hpp:28
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition: UTF8Util.hpp:102
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF8 character; throws InvalidUTF8 if the lead byte is invalid.
Definition: UTF8Util.hpp:60
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition: UTF8Util.hpp:112
static size_t NextCharLengthNoException(const char *str)
Returns the length in bytes of the next UTF8 character, or 0 if the lead byte is invalid.
Definition: UTF8Util.hpp:39