node_modules/xterm/src/common/input/TextDecoder.ts

   1 /**
   2  * Copyright (c) 2019 The xterm.js authors. All rights reserved.
   3  * @license MIT
   4  */
   5
   6 /**
   7  * Polyfill - Convert UTF32 codepoint into JS string.
   8  * Note: The built-in String.fromCodePoint happens to be much slower
   9  *       due to additional sanity checks. We can avoid them since
  10  *       we always operate on legal UTF32 (granted by the input decoders)
  11  *       and use this faster version instead.
  12  */
  13 export function stringFromCodePoint(codePoint: number): string {
  14   if (codePoint > 0xFFFF) {
  15     codePoint -= 0x10000;
  16     return String.fromCharCode((codePoint >> 10) + 0xD800) + String.fromCharCode((codePoint % 0x400) + 0xDC00);
  17   }
  18   return String.fromCharCode(codePoint);
  19 }
  20
  21 /**
  22  * Convert UTF32 char codes into JS string.
  23  * Basically the same as `stringFromCodePoint` but for multiple codepoints
  24  * in a loop (which is a lot faster).
  25  */
  26 export function utf32ToString(data: Uint32Array, start: number = 0, end: number = data.length): string {
  27   let result = '';
  28   for (let i = start; i < end; ++i) {
  29     let codepoint = data[i];
  30     if (codepoint > 0xFFFF) {
  31       // JS strings are encoded as UTF16, thus a non BMP codepoint gets converted into a surrogate pair
  32       // conversion rules:
  33       //  - subtract 0x10000 from code point, leaving a 20 bit number
  34       //  - add high 10 bits to 0xD800  --> first surrogate
  35       //  - add low 10 bits to 0xDC00   --> second surrogate
  36       codepoint -= 0x10000;
  37       result += String.fromCharCode((codepoint >> 10) + 0xD800) + String.fromCharCode((codepoint % 0x400) + 0xDC00);
  38     } else {
  39       result += String.fromCharCode(codepoint);
  40     }
  41   }
  42   return result;
  43 }
  44
  45 /**
  46  * StringToUtf32 - decodes UTF16 sequences into UTF32 codepoints.
  47  * To keep the decoder in line with JS strings it handles single surrogates as UCS2.
  48  */
  49 export class StringToUtf32 {
  50   private _interim: number = 0;
  51
  52   /**
  53    * Clears interim and resets decoder to clean state.
  54    */
  55   public clear(): void {
  56     this._interim = 0;
  57   }
  58
  59   /**
  60    * Decode JS string to UTF32 codepoints.
  61    * The methods assumes stream input and will store partly transmitted
  62    * surrogate pairs and decode them with the next data chunk.
  63    * Note: The method does no bound checks for target, therefore make sure
  64    * the provided input data does not exceed the size of `target`.
  65    * Returns the number of written codepoints in `target`.
  66    */
  67   decode(input: string, target: Uint32Array): number {
  68     const length = input.length;
  69
  70     if (!length) {
  71       return 0;
  72     }
  73
  74     let size = 0;
  75     let startPos = 0;
  76
  77     // handle leftover surrogate high
  78     if (this._interim) {
  79       const second = input.charCodeAt(startPos++);
  80       if (0xDC00 <= second && second <= 0xDFFF) {
  81         target[size++] = (this._interim - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
  82       } else {
  83         // illegal codepoint (USC2 handling)
  84         target[size++] = this._interim;
  85         target[size++] = second;
  86       }
  87       this._interim = 0;
  88     }
  89
  90     for (let i = startPos; i < length; ++i) {
  91       const code = input.charCodeAt(i);
  92       // surrogate pair first
  93       if (0xD800 <= code && code <= 0xDBFF) {
  94         if (++i >= length) {
  95           this._interim = code;
  96           return size;
  97         }
  98         const second = input.charCodeAt(i);
  99         if (0xDC00 <= second && second <= 0xDFFF) {
 100           target[size++] = (code - 0xD800) * 0x400 + second - 0xDC00 + 0x10000;
 101         } else {
 102           // illegal codepoint (USC2 handling)
 103           target[size++] = code;
 104           target[size++] = second;
 105         }
 106         continue;
 107       }
 108       target[size++] = code;
 109     }
 110     return size;
 111   }
 112 }
 113
 114 /**
 115  * Utf8Decoder - decodes UTF8 byte sequences into UTF32 codepoints.
 116  */
 117 export class Utf8ToUtf32 {
 118   public interim: Uint8Array = new Uint8Array(3);
 119
 120   /**
 121    * Clears interim bytes and resets decoder to clean state.
 122    */
 123   public clear(): void {
 124     this.interim.fill(0);
 125   }
 126
 127   /**
 128    * Decodes UTF8 byte sequences in `input` to UTF32 codepoints in `target`.
 129    * The methods assumes stream input and will store partly transmitted bytes
 130    * and decode them with the next data chunk.
 131    * Note: The method does no bound checks for target, therefore make sure
 132    * the provided data chunk does not exceed the size of `target`.
 133    * Returns the number of written codepoints in `target`.
 134    */
 135   decode(input: Uint8Array, target: Uint32Array): number {
 136     const length = input.length;
 137
 138     if (!length) {
 139       return 0;
 140     }
 141
 142     let size = 0;
 143     let byte1: number;
 144     let byte2: number;
 145     let byte3: number;
 146     let byte4: number;
 147     let codepoint = 0;
 148     let startPos = 0;
 149
 150     // handle leftover bytes
 151     if (this.interim[0]) {
 152       let discardInterim = false;
 153       let cp = this.interim[0];
 154       cp &= ((((cp & 0xE0) === 0xC0)) ? 0x1F : (((cp & 0xF0) === 0xE0)) ? 0x0F : 0x07);
 155       let pos = 0;
 156       let tmp: number;
 157       while ((tmp = this.interim[++pos] & 0x3F) && pos < 4) {
 158         cp <<= 6;
 159         cp |= tmp;
 160       }
 161       // missing bytes - read ahead from input
 162       const type = (((this.interim[0] & 0xE0) === 0xC0)) ? 2 : (((this.interim[0] & 0xF0) === 0xE0)) ? 3 : 4;
 163       const missing = type - pos;
 164       while (startPos < missing) {
 165         if (startPos >= length) {
 166           return 0;
 167         }
 168         tmp = input[startPos++];
 169         if ((tmp & 0xC0) !== 0x80) {
 170           // wrong continuation, discard interim bytes completely
 171           startPos--;
 172           discardInterim = true;
 173           break;
 174         } else {
 175           // need to save so we can continue short inputs in next call
 176           this.interim[pos++] = tmp;
 177           cp <<= 6;
 178           cp |= tmp & 0x3F;
 179         }
 180       }
 181       if (!discardInterim) {
 182         // final test is type dependent
 183         if (type === 2) {
 184           if (cp < 0x80) {
 185             // wrong starter byte
 186             startPos--;
 187           } else {
 188             target[size++] = cp;
 189           }
 190         } else if (type === 3) {
 191           if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) {
 192             // illegal codepoint
 193           } else {
 194             target[size++] = cp;
 195           }
 196         } else {
 197           if (cp < 0x010000 || cp > 0x10FFFF) {
 198             // illegal codepoint
 199           } else {
 200             target[size++] = cp;
 201           }
 202         }
 203       }
 204       this.interim.fill(0);
 205     }
 206
 207     // loop through input
 208     const fourStop = length - 4;
 209     let i = startPos;
 210     while (i < length) {
 211       /**
 212        * ASCII shortcut with loop unrolled to 4 consecutive ASCII chars.
 213        * This is a compromise between speed gain for ASCII
 214        * and penalty for non ASCII:
 215        * For best ASCII performance the char should be stored directly into target,
 216        * but even a single attempt to write to target and compare afterwards
 217        * penalizes non ASCII really bad (-50%), thus we load the char into byteX first,
 218        * which reduces ASCII performance by ~15%.
 219        * This trial for ASCII reduces non ASCII performance by ~10% which seems acceptible
 220        * compared to the gains.
 221        * Note that this optimization only takes place for 4 consecutive ASCII chars,
 222        * for any shorter it bails out. Worst case - all 4 bytes being read but
 223        * thrown away due to the last being a non ASCII char (-10% performance).
 224        */
 225       while (i < fourStop
 226         && !((byte1 = input[i]) & 0x80)
 227         && !((byte2 = input[i + 1]) & 0x80)
 228         && !((byte3 = input[i + 2]) & 0x80)
 229         && !((byte4 = input[i + 3]) & 0x80))
 230       {
 231         target[size++] = byte1;
 232         target[size++] = byte2;
 233         target[size++] = byte3;
 234         target[size++] = byte4;
 235         i += 4;
 236       }
 237
 238       // reread byte1
 239       byte1 = input[i++];
 240
 241       // 1 byte
 242       if (byte1 < 0x80) {
 243         target[size++] = byte1;
 244
 245         // 2 bytes
 246       } else if ((byte1 & 0xE0) === 0xC0) {
 247         if (i >= length) {
 248           this.interim[0] = byte1;
 249           return size;
 250         }
 251         byte2 = input[i++];
 252         if ((byte2 & 0xC0) !== 0x80) {
 253           // wrong continuation
 254           i--;
 255           continue;
 256         }
 257         codepoint = (byte1 & 0x1F) << 6 | (byte2 & 0x3F);
 258         if (codepoint < 0x80) {
 259           // wrong starter byte
 260           i--;
 261           continue;
 262         }
 263         target[size++] = codepoint;
 264
 265         // 3 bytes
 266       } else if ((byte1 & 0xF0) === 0xE0) {
 267         if (i >= length) {
 268           this.interim[0] = byte1;
 269           return size;
 270         }
 271         byte2 = input[i++];
 272         if ((byte2 & 0xC0) !== 0x80) {
 273           // wrong continuation
 274           i--;
 275           continue;
 276         }
 277         if (i >= length) {
 278           this.interim[0] = byte1;
 279           this.interim[1] = byte2;
 280           return size;
 281         }
 282         byte3 = input[i++];
 283         if ((byte3 & 0xC0) !== 0x80) {
 284           // wrong continuation
 285           i--;
 286           continue;
 287         }
 288         codepoint = (byte1 & 0x0F) << 12 | (byte2 & 0x3F) << 6 | (byte3 & 0x3F);
 289         if (codepoint < 0x0800 || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
 290           // illegal codepoint, no i-- here
 291           continue;
 292         }
 293         target[size++] = codepoint;
 294
 295         // 4 bytes
 296       } else if ((byte1 & 0xF8) === 0xF0) {
 297         if (i >= length) {
 298           this.interim[0] = byte1;
 299           return size;
 300         }
 301         byte2 = input[i++];
 302         if ((byte2 & 0xC0) !== 0x80) {
 303           // wrong continuation
 304           i--;
 305           continue;
 306         }
 307         if (i >= length) {
 308           this.interim[0] = byte1;
 309           this.interim[1] = byte2;
 310           return size;
 311         }
 312         byte3 = input[i++];
 313         if ((byte3 & 0xC0) !== 0x80) {
 314           // wrong continuation
 315           i--;
 316           continue;
 317         }
 318         if (i >= length) {
 319           this.interim[0] = byte1;
 320           this.interim[1] = byte2;
 321           this.interim[2] = byte3;
 322           return size;
 323         }
 324         byte4 = input[i++];
 325         if ((byte4 & 0xC0) !== 0x80) {
 326           // wrong continuation
 327           i--;
 328           continue;
 329         }
 330         codepoint = (byte1 & 0x07) << 18 | (byte2 & 0x3F) << 12 | (byte3 & 0x3F) << 6 | (byte4 & 0x3F);
 331         if (codepoint < 0x010000 || codepoint > 0x10FFFF) {
 332           // illegal codepoint, no i-- here
 333           continue;
 334         }
 335         target[size++] = codepoint;
 336       } else {
 337         // illegal byte, just skip
 338       }
 339     }
 340     return size;
 341   }
 342 }