Open Chinese Convert  1.1.1
A project for conversion between Traditional and Simplified Chinese
UTF8Util.hpp
1 /*
2  * Open Chinese Convert
3  *
4  * Copyright 2013 Carbo Kuo <byvoid@byvoid.com>
5  *
6  * Licensed under the Apache License, Version 2.0 (the "License");
7  * you may not use this file except in compliance with the License.
8  * You may obtain a copy of the License at
9  *
10  * http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18 
19 #pragma once
20 
21 #ifdef _MSC_VER
22 #define NOMINMAX
23 #include <Windows.h>
24 #undef NOMINMAX
25 #endif // _MSC_VER
26 
27 #include <cstring>
28 
29 #include "Common.hpp"
30 #include "Exception.hpp"
31 
32 namespace opencc {
37 class OPENCC_EXPORT UTF8Util {
38 public:
/**
 * Declaration only; the definition is not visible in this header.
 * NOTE(review): judging by the name, this presumably advances `fp` past a
 * leading UTF-8 byte-order mark when one is present -- confirm against the
 * implementation.
 */
static void SkipUtf8Bom(FILE* fp);
43 
/**
 * Returns the length in bytes of the UTF-8 sequence announced by the byte at
 * `str`, judged from that single lead byte only (following bytes are not
 * inspected).
 *
 * @param str Pointer to the byte to classify; exactly one byte is read.
 * @return 1 to 6 for a recognised lead byte, or 0 when the byte cannot start
 *         a sequence (a continuation byte 10xxxxxx, or 0xFE / 0xFF).
 *         A NUL byte counts as a one-byte character.
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  // The three-byte form is tested first, as in the original ordering
  // (presumably because it is the common case for CJK text).
  if ((lead >> 4) == 0x0E) {  // 1110xxxx
    return 3;
  }
  if ((lead >> 7) == 0x00) {  // 0xxxxxxx
    return 1;
  }
  if ((lead >> 5) == 0x06) {  // 110xxxxx
    return 2;
  }
  if ((lead >> 3) == 0x1E) {  // 11110xxx
    return 4;
  }
  if ((lead >> 2) == 0x3E) {  // 111110xx
    return 5;
  }
  if ((lead >> 1) == 0x7E) {  // 1111110x
    return 6;
  }
  return 0;
}
65 
69  static size_t NextCharLength(const char* str) {
70  size_t length = NextCharLengthNoException(str);
71  if (length == 0) {
72  throw InvalidUTF8(str);
73  }
74  return length;
75  }
76 
80  static size_t PrevCharLength(const char* str) {
81  {
82  const size_t length = NextCharLengthNoException(str - 3);
83  if (length == 3) {
84  return length;
85  }
86  }
87  {
88  const size_t length = NextCharLengthNoException(str - 1);
89  if (length == 1) {
90  return length;
91  }
92  }
93  {
94  const size_t length = NextCharLengthNoException(str - 2);
95  if (length == 2) {
96  return length;
97  }
98  }
99  for (size_t i = 4; i <= 6; i++) {
100  const size_t length = NextCharLengthNoException(str - i);
101  if (length == i) {
102  return length;
103  }
104  }
105  throw InvalidUTF8(str);
106  }
107 
111  static const char* NextChar(const char* str) {
112  return str + NextCharLength(str);
113  }
114 
118  static const char* PrevChar(const char* str) {
119  return str - PrevCharLength(str);
120  }
121 
125  static size_t Length(const char* str) {
126  size_t length = 0;
127  while (*str != '\0') {
128  str = NextChar(str);
129  length++;
130  }
131  return length;
132  }
133 
140  static const char* FindNextInline(const char* str, const char ch) {
141  while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
142  str = NextChar(str);
143  }
144  return str;
145  }
146 
/**
 * Tells whether a byte terminates a line or the whole text.
 *
 * @param ch Byte to test.
 * @return true for '\0', '\n' and '\r'; false for every other value.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
153 
/**
 * Copies up to `length` bytes from `str` into a new std::string.
 *
 * The copy is done with strncpy, so it stops at the first NUL in `str` and
 * the remainder of the result is filled with '\0'; the returned string always
 * has size `length`.
 *
 * @param str    Source bytes.
 * @param length Size in bytes of the returned string.
 * @return A string of exactly `length` bytes.
 */
static std::string FromSubstr(const char* str, size_t length) {
  std::string newStr(length, '\0');
  // Write through operator[] rather than const_cast<char*>(c_str()):
  // modifying the array returned by c_str() is undefined behaviour.
  strncpy(&newStr[0], str, length);
  return newStr;
}
163 
/**
 * Checks that a NUL-terminated string holds at least `byteLength` bytes
 * before its terminator, without scanning further than `byteLength` bytes.
 *
 * @param str        NUL-terminated string.
 * @param byteLength Minimum number of bytes required.
 * @return false if a '\0' occurs among the first `byteLength` bytes,
 *         true otherwise (always true for `byteLength == 0`).
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
178 
183  static std::string TruncateUTF8(const char* str, size_t maxByteLength) {
184  std::string wordTrunc;
185  if (NotShorterThan(str, maxByteLength)) {
186  size_t len = 0;
187  const char* pStr = str;
188  for (;;) {
189  const size_t charLength = NextCharLength(pStr);
190  if (len + charLength > maxByteLength) {
191  break;
192  }
193  pStr += charLength;
194  len += charLength;
195  }
196  wordTrunc = FromSubstr(str, len);
197  } else {
198  wordTrunc = str;
199  }
200  return wordTrunc;
201  }
202 
/**
 * Replaces every non-overlapping occurrence of `from` in `str` with `to`,
 * in place, scanning left to right.
 *
 * Text inserted by a replacement is never rescanned, so `to` may contain
 * `from`.
 *
 * @param str  String to modify.
 * @param from NUL-terminated pattern to search for. An empty pattern leaves
 *             `str` unchanged (it would otherwise match at every position and
 *             the loop would never terminate).
 * @param to   NUL-terminated replacement text.
 */
static void ReplaceAll(std::string& str, const char* from, const char* to) {
  const std::string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    return;
  }
  const std::string::size_type toLen = strlen(to);
  std::string::size_type pos = 0;
  // Pass the known lengths so find/replace do not recompute them each round.
  while ((pos = str.find(from, pos, fromLen)) != std::string::npos) {
    str.replace(pos, fromLen, to, toLen);
    pos += toLen;  // resume after the inserted text
  }
}
215 
/**
 * Concatenates the strings of a vector, inserting `separator` between
 * consecutive elements.
 *
 * The result is built directly in a std::string whose capacity is reserved
 * up front, instead of going through a std::ostringstream.
 *
 * @param strings   Elements to join, in order.
 * @param separator Text placed between each pair of neighbouring elements.
 * @return The joined string; empty when `strings` is empty.
 */
static std::string Join(const std::vector<std::string>& strings,
                        const std::string& separator) {
  std::string joined;
  if (strings.empty()) {
    return joined;
  }
  std::string::size_type total = separator.size() * (strings.size() - 1);
  for (const auto& str : strings) {
    total += str.size();
  }
  joined.reserve(total);
  bool first = true;
  for (const auto& str : strings) {
    if (!first) {
      joined += separator;
    }
    joined += str;
    first = false;
  }
  return joined;
}
232 
/**
 * Concatenates the strings of a vector with nothing in between.
 *
 * The result is built directly in a std::string whose capacity is reserved
 * up front, instead of going through a std::ostringstream.
 *
 * @param strings Elements to join, in order.
 * @return The concatenation; empty when `strings` is empty.
 */
static std::string Join(const std::vector<std::string>& strings) {
  std::string::size_type total = 0;
  for (const auto& str : strings) {
    total += str.size();
  }
  std::string joined;
  joined.reserve(total);
  for (const auto& str : strings) {
    joined += str;
  }
  return joined;
}
243 
244  static void GetByteMap(const char* str, const size_t utf8Length,
245  std::vector<size_t>* byteMap) {
246  if (byteMap->size() < utf8Length) {
247  byteMap->resize(utf8Length);
248  }
249  const char* pstr = str;
250  for (size_t i = 0; i < utf8Length; i++) {
251  (*byteMap)[i] = pstr - str;
252  pstr = NextChar(pstr);
253  }
254  }
255 
#ifdef _MSC_VER
/**
 * Converts a UTF-8 string to the string type used on this platform.
 * Under MSVC the result is a std::wstring produced by U8ToU16.
 */
static std::wstring GetPlatformString(const std::string& str) {
  std::wstring wide = U8ToU16(str);
  return wide;
}
#else
/**
 * Converts a UTF-8 string to the string type used on this platform.
 * Outside MSVC the input is returned unchanged (as a copy).
 */
static std::string GetPlatformString(const std::string& str) {
  return std::string(str);
}
#endif // _MSC_VER
263 
264 #ifdef _MSC_VER
265  static std::string U16ToU8(const std::wstring& wstr) {
266  std::string ret;
267  int length = static_cast<int>(wstr.length());
268  int convcnt = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, NULL, 0,
269  NULL, NULL);
270  if (convcnt > 0) {
271  ret.resize(convcnt);
272  WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, &ret[0], convcnt,
273  NULL, NULL);
274  }
275  return ret;
276  }
277 
278  static std::wstring U8ToU16(const std::string& str) {
279  std::wstring ret;
280  int length = static_cast<int>(str.length());
281  int convcnt = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, NULL, 0);
282  if (convcnt > 0) {
283  ret.resize(convcnt);
284  MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, &ret[0], convcnt);
285  }
286  return ret;
287  }
288 #endif // _MSC_VER
289 };
290 } // namespace opencc
opencc::UTF8Util::Length
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 std::string.
Definition: UTF8Util.hpp:125
opencc::UTF8Util::FindNextInline
static const char * FindNextInline(const char *str, const char ch)
Finds a character in the same line.
Definition: UTF8Util.hpp:140
opencc::UTF8Util::NextCharLengthNoException
static size_t NextCharLengthNoException(const char *str)
Returns the length in byte for the next UTF8 character.
Definition: UTF8Util.hpp:48
opencc::UTF8Util::NotShorterThan
static bool NotShorterThan(const char *str, size_t byteLength)
Returns true if the given std::string is longer or as long as the given length.
Definition: UTF8Util.hpp:168
opencc::UTF8Util::Join
static std::string Join(const std::vector< std::string > &strings)
Joins a std::string vector into a single std::string.
Definition: UTF8Util.hpp:236
opencc::UTF8Util
UTF8 std::string utilities.
Definition: UTF8Util.hpp:37
opencc::UTF8Util::Join
static std::string Join(const std::vector< std::string > &strings, const std::string &separator)
Joins a std::string vector into a single std::string with a separator.
Definition: UTF8Util.hpp:219
opencc::UTF8Util::NextCharLength
static size_t NextCharLength(const char *str)
Returns the length in byte for the next UTF8 character.
Definition: UTF8Util.hpp:69
opencc::InvalidUTF8
Definition: Exception.hpp:77
opencc::UTF8Util::IsLineEndingOrFileEnding
static bool IsLineEndingOrFileEnding(const char ch)
Returns true if the character is a line ending or end of file.
Definition: UTF8Util.hpp:150
opencc::UTF8Util::TruncateUTF8
static std::string TruncateUTF8(const char *str, size_t maxByteLength)
Truncates a std::string with a maximal length in byte.
Definition: UTF8Util.hpp:183
opencc::UTF8Util::PrevCharLength
static size_t PrevCharLength(const char *str)
Returns the length in byte for the previous UTF8 character.
Definition: UTF8Util.hpp:80
opencc::UTF8Util::ReplaceAll
static void ReplaceAll(std::string &str, const char *from, const char *to)
Replaces all patterns in a std::string in place.
Definition: UTF8Util.hpp:206
opencc::UTF8Util::FromSubstr
static std::string FromSubstr(const char *str, size_t length)
Copies a substring with given length to a new std::string.
Definition: UTF8Util.hpp:157
opencc::UTF8Util::PrevChar
static const char * PrevChar(const char *str)
Move the char* pointer before the previous UTF8 character.
Definition: UTF8Util.hpp:118
opencc::UTF8Util::NextChar
static const char * NextChar(const char *str)
Returns the char* pointer over the next UTF8 character.
Definition: UTF8Util.hpp:111