2 * Copyright (c) 2019 The xterm.js authors. All rights reserved.
/**
 * Polyfill - convert a single UTF32 codepoint into a JS string.
 *
 * Note: The built-in `String.fromCodePoint` happens to be much slower
 * due to additional sanity checks. We can avoid them since we always
 * operate on legal UTF32 (guaranteed by the input decoders) and use
 * this faster version instead.
 *
 * @param codePoint UTF32 codepoint to convert. No range or surrogate
 *   validation is performed on it.
 * @returns A single UTF16 code unit for codepoints up to 0xFFFF,
 *   a surrogate pair for anything above.
 */
export function stringFromCodePoint(codePoint: number): string {
  if (codePoint > 0xFFFF) {
    // Non-BMP: rebase to the 20 bit offset before splitting it up.
    // Skipping this step yields a high surrogate that is 0x40 too large.
    const offset = codePoint - 0x10000;
    return String.fromCharCode((offset >> 10) + 0xD800) + String.fromCharCode((offset % 0x400) + 0xDC00);
  }
  return String.fromCharCode(codePoint);
}
/**
 * Convert UTF32 char codes into a JS string.
 * Basically the same as `stringFromCodePoint` but for multiple codepoints
 * in a loop (which is a lot faster).
 *
 * @param data Buffer holding the UTF32 codepoints.
 * @param start Index of the first codepoint to convert (defaults to 0).
 * @param end Index one past the last codepoint to convert (defaults to `data.length`).
 *   Neither `start` nor `end` is bounds checked against `data`.
 * @returns The concatenated string; empty if `start >= end`.
 */
export function utf32ToString(data: Uint32Array, start: number = 0, end: number = data.length): string {
  let result = '';
  for (let i = start; i < end; ++i) {
    let codepoint = data[i];
    if (codepoint > 0xFFFF) {
      // JS strings are encoded as UTF16, thus a non BMP codepoint gets converted into a surrogate pair
      // conversion rules:
      //  - subtract 0x10000 from code point, leaving a 20 bit number
      //  - add high 10 bits to 0xD800  --> first surrogate
      //  - add low 10 bits to 0xDC00   --> second surrogate
      codepoint -= 0x10000;
      result += String.fromCharCode((codepoint >> 10) + 0xD800) + String.fromCharCode((codepoint % 0x400) + 0xDC00);
    } else {
      result += String.fromCharCode(codepoint);
    }
  }
  return result;
}
/**
 * StringToUtf32 - decodes UTF16 sequences into UTF32 codepoints.
 * To keep the decoder in line with JS strings it handles single surrogates as UCS2.
 */
export class StringToUtf32 {
  /** High surrogate left over from the previous chunk, 0 if none is pending. */
  private _interim: number = 0;

  /**
   * Clears interim and resets decoder to clean state.
   */
  public clear(): void {
    this._interim = 0;
  }

  /**
   * Decode JS string to UTF32 codepoints.
   * The method assumes stream input and will store partly transmitted
   * surrogate pairs and decode them with the next data chunk.
   * Note: The method does no bound checks for target, therefore make sure
   * the provided input data does not exceed the size of `target`.
   * Returns the number of written codepoints in `target`.
   *
   * @param input Next chunk of the UTF16 string stream.
   * @param target Buffer receiving the decoded codepoints, written from index 0.
   * @returns Number of codepoints written to `target`.
   */
  decode(input: string, target: Uint32Array): number {
    const length = input.length;

    // nothing to consume - keep a pending high surrogate for the next chunk
    if (!length) {
      return 0;
    }

    let size = 0;
    let startPos = 0;

    // handle leftover surrogate high
    if (this._interim) {
      const second = input.charCodeAt(startPos++);
      if (0xDC00 <= second && second <= 0xDFFF) {
        target[size++] = (this._interim - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
      } else {
        // illegal codepoint (UCS2 handling)
        target[size++] = this._interim;
        target[size++] = second;
      }
      this._interim = 0;
    }

    for (let i = startPos; i < length; ++i) {
      const code = input.charCodeAt(i);
      // surrogate pair first
      if (0xD800 <= code && code <= 0xDBFF) {
        if (++i >= length) {
          // chunk ends within a pair - stash the high surrogate for the next call
          this._interim = code;
          return size;
        }
        const second = input.charCodeAt(i);
        if (0xDC00 <= second && second <= 0xDFFF) {
          target[size++] = (code - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
        } else {
          // illegal codepoint (UCS2 handling)
          target[size++] = code;
          target[size++] = second;
        }
        continue;
      }
      target[size++] = code;
    }
    return size;
  }
}
115 * Utf8Decoder - decodes UTF8 byte sequences into UTF32 codepoints.
117 export class Utf8ToUtf32 {
118 public interim: Uint8Array = new Uint8Array(3);
121 * Clears interim bytes and resets decoder to clean state.
123 public clear(): void {
124 this.interim.fill(0);
128 * Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
129 * The method assumes stream input and will store partly transmitted bytes
130 * and decode them with the next data chunk.
131 * Note: The method does no bound checks for target, therefore make sure
132 * the provided data chunk does not exceed the size of `target`.
133 * Returns the number of written codepoints in `target`.
135 decode(input: Uint8Array, target: Uint32Array): number {
136 const length = input.length;
150 // handle leftover bytes
151 if (this.interim[0]) {
152 let discardInterim = false;
153 let cp = this.interim[0];
154 cp &= ((((cp & 0xE0) === 0xC0)) ? 0x1F : (((cp & 0xF0) === 0xE0)) ? 0x0F : 0x07);
157 while ((tmp = this.interim[++pos] & 0x3F) && pos < 4) {
161 // missing bytes - read ahead from input
162 const type = (((this.interim[0] & 0xE0) === 0xC0)) ? 2 : (((this.interim[0] & 0xF0) === 0xE0)) ? 3 : 4;
163 const missing = type - pos;
164 while (startPos < missing) {
165 if (startPos >= length) {
168 tmp = input[startPos++];
169 if ((tmp & 0xC0) !== 0x80) {
170 // wrong continuation, discard interim bytes completely
172 discardInterim = true;
175 // need to save so we can continue short inputs in next call
176 this.interim[pos++] = tmp;
181 if (!discardInterim) {
182 // final test is type dependent
185 // wrong starter byte
190 } else if (type === 3) {
191 if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
197 if (cp < 0x010000 || cp > 0x10FFFF) {
204 this.interim.fill(0);
207 // loop through input
208 const fourStop = length - 4;
212 * ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
213 * This is a compromise between speed gain for ASCII
214 * and penalty for non ASCII:
215 * For best ASCII performance the char should be stored directly into target,
216 * but even a single attempt to write to target and compare afterwards
217 * penalizes non ASCII really bad (-50%), thus we load the char into byteX first,
218 * which reduces ASCII performance by ~15%.
219 * This trial for ASCII reduces non ASCII performance by ~10% which seems acceptable
220 * compared to the gains.
221 * Note that this optimization only takes place for 4 consecutive ASCII chars,
222 * for any shorter it bails out. Worst case - all 4 bytes being read but
223 * thrown away due to the last being a non ASCII char (-10% performance).
226 && !((byte1 = input[i]) & 0x80)
227 && !((byte2 = input[i + 1]) & 0x80)
228 && !((byte3 = input[i + 2]) & 0x80)
229 && !((byte4 = input[i + 3]) & 0x80))
231 target[size++] = byte1;
232 target[size++] = byte2;
233 target[size++] = byte3;
234 target[size++] = byte4;
243 target[size++] = byte1;
246 } else if ((byte1 & 0xE0) === 0xC0) {
248 this.interim[0] = byte1;
252 if ((byte2 & 0xC0) !== 0x80) {
253 // wrong continuation
257 codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F);
258 if (codepoint < 0x80) {
259 // wrong starter byte
263 target[size++] = codepoint;
266 } else if ((byte1 & 0xF0) === 0xE0) {
268 this.interim[0] = byte1;
272 if ((byte2 & 0xC0) !== 0x80) {
273 // wrong continuation
278 this.interim[0] = byte1;
279 this.interim[1] = byte2;
283 if ((byte3 & 0xC0) !== 0x80) {
284 // wrong continuation
288 codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F);
289 if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
290 // illegal codepoint, no i-- here
293 target[size++] = codepoint;
296 } else if ((byte1 & 0xF8) === 0xF0) {
298 this.interim[0] = byte1;
302 if ((byte2 & 0xC0) !== 0x80) {
303 // wrong continuation
308 this.interim[0] = byte1;
309 this.interim[1] = byte2;
313 if ((byte3 & 0xC0) !== 0x80) {
314 // wrong continuation
319 this.interim[0] = byte1;
320 this.interim[1] = byte2;
321 this.interim[2] = byte3;
325 if ((byte4 & 0xC0) !== 0x80) {
326 // wrong continuation
330 codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F);
331 if (codepoint < 0x010000 || codepoint > 0x10FFFF) {
332 // illegal codepoint, no i-- here
335 target[size++] = codepoint;
337 // illegal byte, just skip