.config/coc/extensions/node_modules/coc-prettier/node_modules/chardet/encoding/mbcs.js

   1 var util = require('util'),
   2   Match = require ('../match');
   3
   4 /**
   5  * Binary search implementation (recursive)
   6  */
   7 function binarySearch(arr, searchValue) {
   8   function find(arr, searchValue, left, right) {
   9     if (right < left)
  10       return -1;
  11
  12     /*
  13     int mid = mid = (left + right) / 2;
  14     There is a bug in the above line;
  15     Joshua Bloch suggests the following replacement:
  16     */
  17     var mid = Math.floor((left + right) >>> 1);
  18     if (searchValue > arr[mid])
  19       return find(arr, searchValue, mid + 1, right);
  20
  21     if (searchValue < arr[mid])
  22       return find(arr, searchValue, left, mid - 1);
  23
  24     return mid;
  25   };
  26
  27   return find(arr, searchValue, 0, arr.length - 1);
  28 };
  29
  30 // 'Character'  iterated character class.
  31 //    Recognizers for specific mbcs encodings make their 'characters' available
  32 //    by providing a nextChar() function that fills in an instance of iteratedChar
  33 //    with the next char from the input.
  34 //    The returned characters are not converted to Unicode, but remain as the raw
  35 //    bytes (concatenated into an int) from the codepage data.
  36 //
  37 //  For Asian charsets, use the raw input rather than the input that has been
  38 //   stripped of markup.  Detection only considers multi-byte chars, effectively
  39 //   stripping markup anyway, and double byte chars do occur in markup too.
  40 //
  41 function IteratedChar() {
  42
  43   this.charValue = 0; // 1-4 bytes from the raw input data
  44   this.index     = 0;
  45   this.nextIndex = 0;
  46   this.error     = false;
  47   this.done      = false;
  48
  49   this.reset = function() {
  50     this.charValue = 0;
  51     this.index     = -1;
  52     this.nextIndex = 0;
  53     this.error     = false;
  54     this.done      = false;
  55   };
  56
  57   this.nextByte = function(det) {
  58     if (this.nextIndex >= det.fRawLength) {
  59       this.done = true;
  60       return -1;
  61     }
  62     var byteValue = det.fRawInput[this.nextIndex++] & 0x00ff;
  63     return byteValue;
  64   };
  65 };
  66
  67
  68
  69 /**
  70  * Asian double or multi-byte - charsets.
  71  * Match is determined mostly by the input data adhering to the
  72  * encoding scheme for the charset, and, optionally,
  73  * frequency-of-occurrence of characters.
  74  */
  75
  76 function mbcs() {};
  77
  78 /**
  79  * Test the match of this charset with the input text data
  80  *      which is obtained via the CharsetDetector object.
  81  *
  82  * @param det  The CharsetDetector, which contains the input text
  83  *             to be checked for being in this charset.
  84  * @return     Two values packed into one int  (Damn java, anyhow)
  85  *             bits 0-7:  the match confidence, ranging from 0-100
  86  *             bits 8-15: The match reason, an enum-like value.
  87  */
  88 mbcs.prototype.match = function(det) {
  89
  90   var singleByteCharCount = 0,  //TODO Do we really need this?
  91     doubleByteCharCount = 0,
  92     commonCharCount     = 0,
  93     badCharCount        = 0,
  94     totalCharCount      = 0,
  95     confidence          = 0;
  96
  97   var iter = new IteratedChar();
  98
  99   detectBlock: {
 100     for (iter.reset(); this.nextChar(iter, det);) {
 101       totalCharCount++;
 102       if (iter.error) {
 103         badCharCount++;
 104       } else {
 105         var cv = iter.charValue & 0xFFFFFFFF;
 106
 107         if (cv <= 0xff) {
 108           singleByteCharCount++;
 109         } else {
 110           doubleByteCharCount++;
 111           if (this.commonChars != null) {
 112             // NOTE: This assumes that there are no 4-byte common chars.
 113             if (binarySearch(this.commonChars, cv) >= 0) {
 114               commonCharCount++;
 115             }
 116           }
 117         }
 118       }
 119       if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
 120         // console.log('its here!')
 121         // Bail out early if the byte data is not matching the encoding scheme.
 122         break detectBlock;
 123       }
 124     }
 125
 126     if (doubleByteCharCount <= 10 && badCharCount== 0) {
 127       // Not many multi-byte chars.
 128       if (doubleByteCharCount == 0 && totalCharCount < 10) {
 129         // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
 130         // We don't have enough data to have any confidence.
 131         // Statistical analysis of single byte non-ASCII charcters would probably help here.
 132         confidence = 0;
 133       }
 134       else {
 135         //   ASCII or ISO file?  It's probably not our encoding,
 136         //   but is not incompatible with our encoding, so don't give it a zero.
 137         confidence = 10;
 138       }
 139       break detectBlock;
 140     }
 141
 142     //
 143     //  No match if there are too many characters that don't fit the encoding scheme.
 144     //    (should we have zero tolerance for these?)
 145     //
 146     if (doubleByteCharCount < 20 * badCharCount) {
 147       confidence = 0;
 148       break detectBlock;
 149     }
 150
 151     if (this.commonChars == null) {
 152       // We have no statistics on frequently occuring characters.
 153       //  Assess confidence purely on having a reasonable number of
 154       //  multi-byte characters (the more the better
 155       confidence = 30 + doubleByteCharCount - 20 * badCharCount;
 156       if (confidence > 100) {
 157         confidence = 100;
 158       }
 159     } else {
 160       //
 161       // Frequency of occurence statistics exist.
 162       //
 163       var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);
 164       var scaleFactor = 90.0 / maxVal;
 165       confidence = Math.floor(Math.log(commonCharCount + 1) * scaleFactor + 10);
 166       confidence = Math.min(confidence, 100);
 167     }
 168   }   // end of detectBlock:
 169
 170   return confidence == 0 ? null : new Match(det, this, confidence);
 171 };
 172
 173 /**
 174  * Get the next character (however many bytes it is) from the input data
 175  *    Subclasses for specific charset encodings must implement this function
 176  *    to get characters according to the rules of their encoding scheme.
 177  *
 178  *  This function is not a method of class iteratedChar only because
 179  *   that would require a lot of extra derived classes, which is awkward.
 180  * @param it  The iteratedChar 'struct' into which the returned char is placed.
 181  * @param det The charset detector, which is needed to get at the input byte data
 182  *            being iterated over.
 183  * @return    True if a character was returned, false at end of input.
 184  */
 185
 186 mbcs.prototype.nextChar = function(iter, det) {};
 187
 188
 189
 190 /**
 191  * Shift-JIS charset recognizer.
 192  */
 193 module.exports.sjis = function() {
 194   this.name = function() {
 195     return 'Shift-JIS';
 196   };
 197   this.language = function() {
 198     return 'ja';
 199   };
 200
 201   // TODO:  This set of data comes from the character frequency-
 202   //        of-occurence analysis tool.  The data needs to be moved
 203   //        into a resource and loaded from there.
 204   this.commonChars = [
 205     0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
 206     0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
 207     0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
 208     0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
 209     0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
 210     0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
 211   ];
 212
 213   this.nextChar = function(iter, det) {
 214     iter.index = iter.nextIndex;
 215     iter.error = false;
 216
 217     var firstByte;
 218     firstByte = iter.charValue = iter.nextByte(det);
 219     if (firstByte < 0)
 220       return false;
 221
 222     if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf))
 223       return true;
 224
 225     var secondByte = iter.nextByte(det);
 226     if (secondByte < 0)
 227       return false;
 228
 229     iter.charValue = (firstByte << 8) | secondByte;
 230     if (! ((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
 231       // Illegal second byte value.
 232       iter.error = true;
 233     }
 234     return true;
 235   };
 236 };
 237 util.inherits(module.exports.sjis, mbcs);
 238
 239
 240
 241 /**
 242  *   Big5 charset recognizer.
 243  */
 244 module.exports.big5 = function() {
 245   this.name = function() {
 246     return 'Big5';
 247   };
 248   this.language = function() {
 249     return 'zh';
 250   };
 251   // TODO:  This set of data comes from the character frequency-
 252   //        of-occurence analysis tool.  The data needs to be moved
 253   //        into a resource and loaded from there.
 254   this.commonChars = [
 255     0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
 256     0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
 257     0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
 258     0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
 259     0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
 260     0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
 261     0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
 262     0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
 263     0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
 264     0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
 265   ];
 266   this.nextChar = function(iter, det) {
 267     iter.index = iter.nextIndex;
 268     iter.error = false;
 269
 270     var firstByte = iter.charValue = iter.nextByte(det);
 271
 272     if (firstByte < 0)
 273       return false;
 274
 275     // single byte character.
 276     if (firstByte <= 0x7f || firstByte == 0xff)
 277       return true;
 278
 279     var secondByte = iter.nextByte(det);
 280
 281     if (secondByte < 0)
 282       return false;
 283
 284     iter.charValue = (iter.charValue << 8) | secondByte;
 285
 286     if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff)
 287       iter.error = true;
 288
 289     return true;
 290   };
 291 };
 292 util.inherits(module.exports.big5, mbcs);
 293
 294
 295
 296 /**
 297  *  EUC charset recognizers.  One abstract class that provides the common function
 298  *  for getting the next character according to the EUC encoding scheme,
 299  *  and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
 300  *
 301  *  Get the next character value for EUC based encodings.
 302  *  Character 'value' is simply the raw bytes that make up the character
 303  *     packed into an int.
 304  */
 305 function eucNextChar(iter, det) {
 306   iter.index = iter.nextIndex;
 307   iter.error = false;
 308   var firstByte  = 0;
 309   var secondByte = 0;
 310   var thirdByte  = 0;
 311   //int fourthByte = 0;
 312   buildChar: {
 313     firstByte = iter.charValue = iter.nextByte(det);
 314     if (firstByte < 0) {
 315       // Ran off the end of the input data
 316       iter.done = true;
 317       break buildChar;
 318     }
 319     if (firstByte <= 0x8d) {
 320       // single byte char
 321       break buildChar;
 322     }
 323     secondByte = iter.nextByte(det);
 324     iter.charValue = (iter.charValue << 8) | secondByte;
 325     if (firstByte >= 0xA1 && firstByte <= 0xfe) {
 326       // Two byte Char
 327       if (secondByte < 0xa1) {
 328         iter.error = true;
 329       }
 330       break buildChar;
 331     }
 332     if (firstByte == 0x8e) {
 333       // Code Set 2.
 334       //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
 335       //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
 336       // We don't know which we've got.
 337       // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
 338       //   bytes will look like a well formed 2 byte char.
 339       if (secondByte < 0xa1) {
 340         iter.error = true;
 341       }
 342       break buildChar;
 343     }
 344     if (firstByte == 0x8f) {
 345       // Code set 3.
 346       // Three byte total char size, two bytes of actual char value.
 347       thirdByte = iter.nextByte(det);
 348       iter.charValue = (iter.charValue << 8) | thirdByte;
 349       if (thirdByte < 0xa1) {
 350         iter.error = true;
 351       }
 352     }
 353   }
 354   return iter.done == false;
 355 };
 356
 357
 358
 359 /**
 360  * The charset recognize for EUC-JP.  A singleton instance of this class
 361  *    is created and kept by the public CharsetDetector class
 362  */
 363 module.exports.euc_jp = function() {
 364   this.name = function() {
 365     return 'EUC-JP';
 366   };
 367   this.language = function() {
 368     return 'ja';
 369   };
 370
 371   // TODO:  This set of data comes from the character frequency-
 372   //        of-occurence analysis tool.  The data needs to be moved
 373   //        into a resource and loaded from there.
 374   this.commonChars = [
 375     0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
 376     0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
 377     0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
 378     0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
 379     0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
 380     0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
 381     0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
 382     0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
 383     0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
 384     0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
 385   ];
 386
 387   this.nextChar = eucNextChar;
 388 };
 389 util.inherits(module.exports.euc_jp, mbcs);
 390
 391
 392
 393 /**
 394  * The charset recognize for EUC-KR.  A singleton instance of this class
 395  *    is created and kept by the public CharsetDetector class
 396  */
 397 module.exports.euc_kr = function() {
 398   this.name = function() {
 399     return 'EUC-KR';
 400   };
 401   this.language = function() {
 402     return 'ko';
 403   };
 404
 405   // TODO:  This set of data comes from the character frequency-
 406   //        of-occurence analysis tool.  The data needs to be moved
 407   //        into a resource and loaded from there.
 408   this.commonChars = [
 409     0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
 410     0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
 411     0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
 412     0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
 413     0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
 414     0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
 415     0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
 416     0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
 417     0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
 418     0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
 419   ];
 420
 421   this.nextChar = eucNextChar;
 422 };
 423 util.inherits(module.exports.euc_kr, mbcs);
 424
 425
 426
 427 /**
 428  *   GB-18030 recognizer. Uses simplified Chinese statistics.
 429  */
 430 module.exports.gb_18030 = function() {
 431   this.name = function() {
 432     return 'GB18030';
 433   };
 434   this.language = function() {
 435     return 'zh';
 436   };
 437
 438   /*
 439    *  Get the next character value for EUC based encodings.
 440    *  Character 'value' is simply the raw bytes that make up the character
 441    *     packed into an int.
 442    */
 443   this.nextChar = function(iter, det) {
 444     iter.index = iter.nextIndex;
 445     iter.error = false;
 446     var firstByte  = 0;
 447     var secondByte = 0;
 448     var thirdByte  = 0;
 449     var fourthByte = 0;
 450     buildChar: {
 451       firstByte = iter.charValue = iter.nextByte(det);
 452       if (firstByte < 0) {
 453         // Ran off the end of the input data
 454         iter.done = true;
 455         break buildChar;
 456       }
 457       if (firstByte <= 0x80) {
 458         // single byte char
 459         break buildChar;
 460       }
 461       secondByte = iter.nextByte(det);
 462       iter.charValue = (iter.charValue << 8) | secondByte;
 463       if (firstByte >= 0x81 && firstByte <= 0xFE) {
 464         // Two byte Char
 465         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
 466           break buildChar;
 467         }
 468         // Four byte char
 469         if (secondByte >= 0x30 && secondByte <= 0x39) {
 470           thirdByte = iter.nextByte(det);
 471           if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
 472             fourthByte = iter.nextByte(det);
 473             if (fourthByte >= 0x30 && fourthByte <= 0x39) {
 474               iter.charValue = (iter.charValue << 16) | (thirdByte << 8) | fourthByte;
 475               break buildChar;
 476             }
 477           }
 478         }
 479         iter.error = true;
 480         break buildChar;
 481       }
 482     }
 483     return iter.done == false;
 484   };
 485
 486   // TODO:  This set of data comes from the character frequency-
 487   //        of-occurence analysis tool.  The data needs to be moved
 488   //        into a resource and loaded from there.
 489   this.commonChars = [
 490     0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
 491     0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
 492     0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
 493     0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
 494     0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
 495     0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
 496     0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
 497     0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
 498     0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
 499     0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
 500   ];
 501 };
 502 util.inherits(module.exports.gb_18030, mbcs);