// Copyright (c) 2015 Ryan Prichard
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to
// deal in the Software without restriction, including without limitation the
// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
// sell copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
// IN THE SOFTWARE.
21 #ifndef UNICODE_ENCODING_H
22 #define UNICODE_ENCODING_H
// Encode the Unicode code point `code` as UTF-8 and store the bytes in `out`.
// The buffer must be at least 4 bytes in size; no NUL terminator is written.
//
// Returns the number of bytes written (1-4), or -1 if `code` has no UTF-8
// encoding (a UTF-16 surrogate, or a value above 0x10FFFF).  Nothing is
// written to `out` when -1 is returned.
static inline int encodeUtf8(char *out, uint32_t code) {
    if (code < 0x80) {
        // 7 bits: 0xxxxxxx
        out[0] = (char)code;
        return 1;
    } else if (code < 0x800) {
        // 11 bits: 110xxxxx 10xxxxxx
        out[0] = (char)(((code >> 6) & 0x1F) | 0xC0);
        out[1] = (char)(((code >> 0) & 0x3F) | 0x80);
        return 2;
    } else if (code < 0x10000) {
        if (code >= 0xD800 && code <= 0xDFFF) {
            // The code points 0xD800 to 0xDFFF are reserved for UTF-16
            // surrogate pairs and do not have an encoding in UTF-8.
            return -1;
        }
        // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
        out[0] = (char)(((code >> 12) & 0x0F) | 0xE0);
        out[1] = (char)(((code >>  6) & 0x3F) | 0x80);
        out[2] = (char)(((code >>  0) & 0x3F) | 0x80);
        return 3;
    } else if (code < 0x110000) {
        // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        out[0] = (char)(((code >> 18) & 0x07) | 0xF0);
        out[1] = (char)(((code >> 12) & 0x3F) | 0x80);
        out[2] = (char)(((code >>  6) & 0x3F) | 0x80);
        out[3] = (char)(((code >>  0) & 0x3F) | 0x80);
        return 4;
    } else {
        // Encoding error: beyond the last Unicode code point (0x10FFFF).
        return -1;
    }
}
// Encode the Unicode code point `code` as UTF-16 and store the code units in
// `out`.  The buffer must be large enough to hold the output -- either 1 or 2
// elements; no terminator is written.
//
// Returns the number of elements written (1 or 2), or -1 if `code` has no
// UTF-16 encoding (a surrogate value, or a value above 0x10FFFF).  Nothing is
// written to `out` when -1 is returned.
static inline int encodeUtf16(wchar_t *out, uint32_t code) {
    if (code < 0x10000) {
        if (code >= 0xD800 && code <= 0xDFFF) {
            // The code points 0xD800 to 0xDFFF are reserved for UTF-16
            // surrogate pairs and do not have an encoding in UTF-16.
            return -1;
        }
        out[0] = (wchar_t)code;
        return 1;
    } else if (code < 0x110000) {
        // Remove the 0x10000 bias so that 20 bits remain, then split them
        // into a high (top 10 bits) and a low (bottom 10 bits) surrogate.
        code -= 0x10000;
        out[0] = (wchar_t)(0xD800 | (code >> 10));
        out[1] = (wchar_t)(0xDC00 | (code & 0x3FF));
        return 2;
    } else {
        // Encoding error: beyond the last Unicode code point (0x10FFFF).
        return -1;
    }
}
// Return the byte size of a UTF-8 character using the value of the first
// byte.
//
// Returns 1-4 for a valid lead byte pattern, or -1 if `firstByte` cannot
// start a character (a continuation byte 10xxxxxx, or 11111xxx).  Only the
// bit pattern is examined; overlong and out-of-range lead bytes such as 0xC0
// or 0xF5 are still reported by their pattern length.
static inline int utf8CharLength(char firstByte) {
    // Classify the unsigned value so the masks below do not depend on
    // whether plain `char` is signed on this platform.
    const unsigned char lead = (unsigned char)firstByte;
    // This code would probably be faster if it used __builtin_clz.
    if ((lead & 0x80) == 0) {
        return 1;               // 0xxxxxxx
    } else if ((lead & 0xE0) == 0xC0) {
        return 2;               // 110xxxxx
    } else if ((lead & 0xF0) == 0xE0) {
        return 3;               // 1110xxxx
    } else if ((lead & 0xF8) == 0xF0) {
        return 4;               // 11110xxx
    } else {
        // Malformed UTF-8.
        return -1;
    }
}
98 // The pointer must point to 1-4 bytes, as indicated by the first byte.
99 // Returns -1 on decoding error.
100 static inline uint32_t decodeUtf8(const char *in) {
101 const uint32_t kInvalid = static_cast<uint32_t>(-1);
102 switch (utf8CharLength(in[0])) {
107 if ((in[1] & 0xC0) != 0x80) {
111 tmp = (in[0] & 0x1F) << 6;
112 tmp |= (in[1] & 0x3F);
113 return tmp <= 0x7F ? kInvalid : tmp;
116 if ((in[1] & 0xC0) != 0x80 ||
117 (in[2] & 0xC0) != 0x80) {
121 tmp = (in[0] & 0x0F) << 12;
122 tmp |= (in[1] & 0x3F) << 6;
123 tmp |= (in[2] & 0x3F);
124 if (tmp <= 0x07FF || (tmp >= 0xD800 && tmp <= 0xDFFF)) {
131 if ((in[1] & 0xC0) != 0x80 ||
132 (in[2] & 0xC0) != 0x80 ||
133 (in[3] & 0xC0) != 0x80) {
137 tmp = (in[0] & 0x07) << 18;
138 tmp |= (in[1] & 0x3F) << 12;
139 tmp |= (in[2] & 0x3F) << 6;
140 tmp |= (in[3] & 0x3F);
141 if (tmp <= 0xFFFF || tmp > 0x10FFFF) {
// Combine a UTF-16 surrogate pair into the code point it encodes.
//
// `ch1` is expected to be a high surrogate (0xD800-0xDBFF) and `ch2` a low
// surrogate (0xDC00-0xDFFF); for such a pair the result lies in
// 0x10000-0x10FFFF.  The arguments are not validated.  The arithmetic is
// done in uint32_t so that out-of-range arguments wrap around instead of
// left-shifting a negative signed value, which is undefined behavior.
static inline uint32_t decodeSurrogatePair(wchar_t ch1, wchar_t ch2) {
    const uint32_t high = (uint32_t)ch1 - 0xD800u;  // top 10 bits
    const uint32_t low = (uint32_t)ch2 - 0xDC00u;   // bottom 10 bits
    return (high << 10) + low + 0x10000u;
}
157 #endif // UNICODE_ENCODING_H