OpenTREP Logo  0.07.7
C++ Open Travel Request Parsing Library
OTransliterator.cpp
Go to the documentation of this file.
1 // //////////////////////////////////////////////////////////////////////
2 // Import section
3 // //////////////////////////////////////////////////////////////////////
4 // STL
5 #include <cassert>
6 #include <sstream>
7 // OpenTrep
13 
14 namespace OPENTREP {
15 
16  // //////////////////////////////////////////////////////////////////////
18  : _punctuationRemover (NULL), _quoteRemover (NULL), _accentRemover (NULL),
19  _tranlist (NULL) {
20  init();
21  }
22 
23  // //////////////////////////////////////////////////////////////////////
25  : _punctuationRemover (NULL), _quoteRemover (NULL), _accentRemover (NULL),
26  _tranlist (NULL) {
27  assert (iTransliterator._punctuationRemover != NULL);
28  _punctuationRemover = iTransliterator._punctuationRemover->clone();
29 
30  assert (iTransliterator._quoteRemover != NULL);
31  _quoteRemover = iTransliterator._quoteRemover->clone();
32 
33  assert (iTransliterator._accentRemover != NULL);
34  _accentRemover = iTransliterator._accentRemover->clone();
35 
36  assert (iTransliterator._tranlist != NULL);
37  _tranlist = iTransliterator._tranlist->clone();
38 
39  }
40 
41  // //////////////////////////////////////////////////////////////////////
43  finalise();
44  }
45 
46  // //////////////////////////////////////////////////////////////////////
47  void OTransliterator::initPunctuationRemover() {
48  // Create a remover of punctuation
49  UErrorCode lStatus = U_ZERO_ERROR;
50  _punctuationRemover =
51  icu::Transliterator::createInstance (K_ICU_PUNCTUATION_REMOVAL_RULE,
52  UTRANS_FORWARD, lStatus);
53 
54  if (_punctuationRemover == NULL || U_FAILURE (lStatus)) {
55  std::ostringstream oStr;
56  oStr << "Unicode error: no Transliterator can be created for the '"
57  << K_ICU_PUNCTUATION_REMOVAL_RULE << "' rule.";
58  OPENTREP_LOG_ERROR (oStr.str());
59  throw UnicodeTransliteratorCreationException (oStr.str());
60  }
61  assert (_punctuationRemover != NULL);
62 
63  // Register the Unicode Transliterator
64  icu::Transliterator::registerInstance (_punctuationRemover);
65  }
66 
67  // //////////////////////////////////////////////////////////////////////
68  void OTransliterator::initQuoteRemover() {
69  // Create a remover of quotation
70  UErrorCode lStatus = U_ZERO_ERROR;
71  UParseError pError;
72  icu::UnicodeString lUnquotedRules (K_ICU_QUOTATION_REMOVAL_RULE);
73  _quoteRemover =
74  icu::Transliterator::createFromRules ("RBTUnaccent", lUnquotedRules,
75  UTRANS_FORWARD, pError, lStatus);
76 
77  if (_quoteRemover == NULL || U_FAILURE (lStatus)) {
78  std::ostringstream oStr;
79  oStr << "Unicode error: no Transliterator can be created for the '"
80  << K_ICU_QUOTATION_REMOVAL_RULE << "' rule.";
81  OPENTREP_LOG_ERROR (oStr.str());
82  throw UnicodeTransliteratorCreationException (oStr.str());
83  }
84  assert (_quoteRemover != NULL);
85 
86  // Register the Unicode Transliterator
87  icu::Transliterator::registerInstance (_quoteRemover);
88  }
89 
90  // //////////////////////////////////////////////////////////////////////
91  void OTransliterator::initAccentRemover() {
92  // Create a remover of accents
93  UErrorCode lStatus = U_ZERO_ERROR;
94  _accentRemover =
95  icu::Transliterator::createInstance (K_ICU_ACCENT_REMOVAL_RULE, UTRANS_FORWARD,
96  lStatus);
97 
98  if (_accentRemover == NULL || U_FAILURE (lStatus)) {
99  std::ostringstream oStr;
100  oStr << "Unicode error: no Transliterator can be created for the '"
101  << K_ICU_ACCENT_REMOVAL_RULE << "' rule.";
102  OPENTREP_LOG_ERROR (oStr.str());
103  throw UnicodeTransliteratorCreationException (oStr.str());
104  }
105  assert (_accentRemover != NULL);
106 
107  // Register the Unicode Transliterator
108  icu::Transliterator::registerInstance (_accentRemover);
109  }
110 
111  // //////////////////////////////////////////////////////////////////////
112  void OTransliterator::initTranlisterator() {
113  // Create a generic transliterator
114  UErrorCode lStatus = U_ZERO_ERROR;
115  _tranlist =
116  icu::Transliterator::createInstance (K_ICU_GENERIC_TRANSLITERATOR_RULE,
117  UTRANS_FORWARD, lStatus);
118 
119  if (_tranlist == NULL || U_FAILURE (lStatus)) {
120  std::ostringstream oStr;
121  oStr << "Unicode error: no Transliterator can be created for the '"
122  << K_ICU_GENERIC_TRANSLITERATOR_RULE << "' rule.";
123  OPENTREP_LOG_ERROR (oStr.str());
124  throw UnicodeTransliteratorCreationException (oStr.str());
125  }
126  assert (_tranlist != NULL);
127 
128  // Register the Unicode Transliterator
129  icu::Transliterator::registerInstance (_tranlist);
130  }
131 
132  // //////////////////////////////////////////////////////////////////////
133  void OTransliterator::init() {
134  initPunctuationRemover();
135  initQuoteRemover();
136  initAccentRemover();
137  initTranlisterator();
138  }
139 
140  // //////////////////////////////////////////////////////////////////////
141  void OTransliterator::finalise() {
142  delete _punctuationRemover; _punctuationRemover = NULL;
143  delete _quoteRemover; _quoteRemover = NULL;
144  delete _accentRemover; _accentRemover = NULL;
145  delete _tranlist; _tranlist = NULL;
146  }
147 
148  // //////////////////////////////////////////////////////////////////////
149  void OTransliterator::unpunctuate (icu::UnicodeString& ioString) const {
150  // Apply the punctuation removal scheme
151  assert (_punctuationRemover != NULL);
152  _punctuationRemover->transliterate (ioString);
153  }
154 
155  // //////////////////////////////////////////////////////////////////////
156  std::string OTransliterator::unpunctuate (const std::string& iString) const {
157  // Build a UnicodeString from the STL string
158  icu::UnicodeString lString (iString.c_str());
159 
160  // Apply the punctuation removal scheme
161  unpunctuate (lString);
162 
163  // Convert back from UnicodeString to UTF8-encoded STL string
164  const std::string& lPunctuatedString = getUTF8 (lString);
165 
166  return lPunctuatedString;
167  }
168 
169  // //////////////////////////////////////////////////////////////////////
170  void OTransliterator::unquote (icu::UnicodeString& ioString) const {
171  // Apply the quotation removal scheme
172  assert (_quoteRemover != NULL);
173  _quoteRemover->transliterate (ioString);
174  }
175 
176  // //////////////////////////////////////////////////////////////////////
177  std::string OTransliterator::unquote (const std::string& iString) const {
178  // Build a UnicodeString from the STL string
179  icu::UnicodeString lString (iString.c_str());
180 
181  // Apply the quotation removal scheme
182  unquote (lString);
183 
184  // Convert back from UnicodeString to UTF8-encoded STL string
185  const std::string& lUnquotedString = getUTF8 (lString);
186 
187  return lUnquotedString;
188  }
189 
190  // //////////////////////////////////////////////////////////////////////
191  void OTransliterator::unaccent (icu::UnicodeString& ioString) const {
192  // Apply the accent removal scheme
193  assert (_accentRemover != NULL);
194  _accentRemover->transliterate (ioString);
195  }
196 
197  // //////////////////////////////////////////////////////////////////////
198  std::string OTransliterator::unaccent (const std::string& iString) const {
199  // Build a UnicodeString from the STL string
200  icu::UnicodeString lString (iString.c_str());
201 
202  // Apply the accent removal scheme
203  unaccent (lString);
204 
205  // Convert back from UnicodeString to UTF8-encoded STL string
206  const std::string& lUnaccentuatedString = getUTF8 (lString);
207 
208  return lUnaccentuatedString;
209  }
210 
211  // //////////////////////////////////////////////////////////////////////
212  void OTransliterator::transliterate (icu::UnicodeString& ioString) const {
213  // Apply the transliteration scheme
214  assert (_tranlist != NULL);
215  _tranlist->transliterate (ioString);
216  }
217 
218  // //////////////////////////////////////////////////////////////////////
219  std::string OTransliterator::transliterate (const std::string& iString) const {
220  // Build a UnicodeString from the STL string
221  icu::UnicodeString lString (iString.c_str());
222 
223  // Apply the transliteration scheme
224  transliterate (lString);
225 
226  // Convert back from UnicodeString to UTF8-encoded STL string
227  const std::string& lTransliteratedString = getUTF8 (lString);
228 
229  return lTransliteratedString;
230  }
231 
232  // //////////////////////////////////////////////////////////////////////
233  std::string OTransliterator::normalise (const std::string& iString) const {
234  // Build a UnicodeString from the STL string
235  icu::UnicodeString lString (iString.c_str());
236 
237  // Apply the whole sery of transformators
238  unaccent (lString);
239  unquote (lString);
240  unpunctuate (lString);
241  transliterate (lString);
242 
243  // Convert back from UnicodeString to UTF8-encoded STL string
244  const std::string& lNormalisedString = getUTF8 (lString);
245 
246  return lNormalisedString;
247  }
248 
249 }
#define OPENTREP_LOG_ERROR(iToBeLogged)
Definition: Logger.hpp:24
std::string normalise(const std::string &iString) const
const char * K_ICU_PUNCTUATION_REMOVAL_RULE
Definition: BasConst.cpp:123
const char * K_ICU_GENERIC_TRANSLITERATOR_RULE
Definition: BasConst.cpp:129
std::string unaccent(const std::string &iString) const
std::string unpunctuate(const std::string &iString) const
const char * K_ICU_QUOTATION_REMOVAL_RULE
Definition: BasConst.cpp:116
std::string unquote(const std::string &iString) const
const char * K_ICU_ACCENT_REMOVAL_RULE
Definition: BasConst.cpp:110
std::string getUTF8(const icu::UnicodeString &iString)
Definition: icu_util.cpp:65
std::string transliterate(const std::string &iString) const