1 var util = require('util'),
2 Match = require ('../match');
/**
 * Binary search over a sorted array (iterative).
 *
 * @param {Array} arr Values sorted in ascending order, as ordered by `<` / `>`.
 * @param {*} searchValue Value to look for.
 * @return {number} Index of an element that is neither greater nor smaller
 *     than searchValue, or -1 when there is none (including an empty array).
 */
function binarySearch(arr, searchValue) {
  var left = 0;
  var right = arr.length - 1;

  while (left <= right) {
    // The unsigned shift halves and truncates in one step, and stays correct
    // even when left + right exceeds the signed 32-bit range.
    var mid = (left + right) >>> 1;

    if (searchValue > arr[mid]) {
      left = mid + 1;
    } else if (searchValue < arr[mid]) {
      right = mid - 1;
    } else {
      return mid;
    }
  }

  return -1;
}
30 // 'Character' iterated character class.
31 // Recognizers for specific mbcs encodings make their 'characters' available
32 // by providing a nextChar() function that fills in an instance of iteratedChar
33 // with the next char from the input.
34 // The returned characters are not converted to Unicode, but remain as the raw
35 // bytes (concatenated into an int) from the codepage data.
37 // For Asian charsets, use the raw input rather than the input that has been
38 // stripped of markup. Detection only considers multi-byte chars, effectively
39 // stripping markup anyway, and double byte chars do occur in markup too.
// Cursor over the detector's raw input. The recognizers' nextChar() functions
// fill it in with one encoded character at a time (see charValue).
41 function IteratedChar() {
43 this.charValue = 0; // 1-4 bytes from the raw input data
49 this.reset = function() {
// Reads one byte from det.fRawInput at nextIndex and advances nextIndex.
57 this.nextByte = function(det) {
// End-of-input guard. NOTE(review): the statements inside this branch are not
// visible here; callers appear to expect a sentinel result -- confirm.
58 if (this.nextIndex >= det.fRawLength) {
// Mask to the low 8 bits so the value is always in the range 0-255.
62 var byteValue = det.fRawInput[this.nextIndex++] & 0x00ff;
70 * Asian double or multi-byte - charsets.
71 * Match is determined mostly by the input data adhering to the
72 * encoding scheme for the charset, and, optionally,
73 * frequency-of-occurrence of characters.
79 * Test the match of this charset with the input text data
80 * which is obtained via the CharsetDetector object.
82 * @param det The CharsetDetector, which contains the input text
83 * to be checked for being in this charset.
84 * @return A Match built from det, this recognizer and the computed confidence
85 * (capped at 100), or null when the confidence is 0. The packed-int
86 * return value of the original Java code does not apply to this port.
// Scores how well the input held by `det` fits this multi-byte encoding.
// Returns a Match carrying the confidence, or null when the confidence is 0.
88 mbcs.prototype.match = function(det) {
90 var singleByteCharCount = 0, //TODO Do we really need this?
91 doubleByteCharCount = 0,
97 var iter = new IteratedChar();
// Walk the input one encoded character at a time using the recognizer's nextChar().
100 for (iter.reset(); this.nextChar(iter, det);) {
// NOTE(review): bitwise operators yield signed 32-bit results in JavaScript, so
// this mask cannot produce an unsigned value; a 4-byte character whose top bit
// is set comes out negative here -- confirm how the test that follows treats it.
105 var cv = iter.charValue & 0xFFFFFFFF;
108 singleByteCharCount++;
110 doubleByteCharCount++;
111 if (this.commonChars != null) {
112 // NOTE: This assumes that there are no 4-byte common chars.
// A non-negative result means cv is one of this charset's common characters.
113 if (binarySearch(this.commonChars, cv) >= 0) {
119 if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
120 // console.log('its here!')
121 // Bail out early if the byte data is not matching the encoding scheme.
126 if (doubleByteCharCount <= 10 && badCharCount== 0) {
127 // Not many multi-byte chars.
128 if (doubleByteCharCount == 0 && totalCharCount < 10) {
129 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
130 // We don't have enough data to have any confidence.
131 // Statistical analysis of single byte non-ASCII characters would probably help here.
135 // ASCII or ISO file? It's probably not our encoding,
136 // but is not incompatible with our encoding, so don't give it a zero.
143 // No match if there are too many characters that don't fit the encoding scheme.
144 // (should we have zero tolerance for these?)
146 if (doubleByteCharCount < 20 * badCharCount) {
151 if (this.commonChars == null) {
152 // We have no statistics on frequently occurring characters.
153 // Assess confidence purely on having a reasonable number of
154 // multi-byte characters (the more the better).
155 confidence = 30 + doubleByteCharCount - 20 * badCharCount;
156 if (confidence > 100) {
161 // Frequency of occurrence statistics exist.
// Log-scaled score: commonCharCount is compared against a quarter of the
// multi-byte character count. parseFloat() is redundant on a value that is
// already a number.
163 var maxVal = Math.log(parseFloat(doubleByteCharCount) / 4);
164 var scaleFactor = 90.0 / maxVal;
165 confidence = Math.floor(Math.log(commonCharCount + 1) * scaleFactor + 10);
166 confidence = Math.min(confidence, 100);
168 } // end of detectBlock:
// A confidence of 0 is reported as "no match" (null) rather than as a Match.
170 return confidence == 0 ? null : new Match(det, this, confidence);
174 * Get the next character (however many bytes it is) from the input data
175 * Subclasses for specific charset encodings must implement this function
176 * to get characters according to the rules of their encoding scheme.
178 * This function is not a method of class iteratedChar only because
179 * that would require a lot of extra derived classes, which is awkward.
180 * @param it The iteratedChar 'struct' into which the returned char is placed.
181 * @param det The charset detector, which is needed to get at the input byte data
182 * being iterated over.
183 * @return True if a character was returned, false at end of input.
// Default implementation: an empty placeholder that returns undefined. Each
// recognizer in this file assigns its own nextChar on the instance, and that
// is the function match() calls.
186 mbcs.prototype.nextChar = function(iter, det) {};
191 * Shift-JIS charset recognizer.
193 module.exports.sjis = function() {
194 this.name = function() {
197 this.language = function() {
201 // TODO: This set of data comes from the character frequency-
202 // of-occurrence analysis tool. The data needs to be moved
203 // into a resource and loaded from there.
205 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
206 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
207 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
208 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
209 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
210 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa
// Shift-JIS nextChar: reads the next one- or two-byte character from det into iter.
213 this.nextChar = function(iter, det) {
// Remember where this character starts.
214 iter.index = iter.nextIndex;
218 firstByte = iter.charValue = iter.nextByte(det);
// Lead bytes up to 0x7f or in 0xa1-0xdf: presumably handled as single-byte
// characters (the guarded statement is not visible here -- confirm).
222 if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf))
225 var secondByte = iter.nextByte(det);
// Two-byte value: lead byte in the high 8 bits, trail byte in the low 8 bits.
229 iter.charValue = (firstByte << 8) | secondByte;
// NOTE(review): the two accepted ranges are adjacent, so this is equivalent to
// 0x40 <= secondByte <= 0xff. Shift-JIS trail bytes are normally 0x40-0x7e and
// 0x80-0xfc, so 0x7f and 0xfd-0xff pass this test -- confirm that is intended.
230 if (! ((secondByte >= 0x40 && secondByte <= 0x7f) || (secondByte >= 0x80 && secondByte <= 0xff))) {
231 // Illegal second byte value.
237 util.inherits(module.exports.sjis, mbcs);
242 * Big5 charset recognizer.
244 module.exports.big5 = function() {
245 this.name = function() {
248 this.language = function() {
251 // TODO: This set of data comes from the character frequency-
252 // of-occurrence analysis tool. The data needs to be moved
253 // into a resource and loaded from there.
255 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
256 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
257 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
258 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
259 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
260 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
261 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
262 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
263 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
264 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f
// Big5 nextChar: reads the next one- or two-byte character from det into iter.
266 this.nextChar = function(iter, det) {
// Remember where this character starts.
267 iter.index = iter.nextIndex;
270 var firstByte = iter.charValue = iter.nextByte(det);
275 // single byte character.
276 if (firstByte <= 0x7f || firstByte == 0xff)
279 var secondByte = iter.nextByte(det);
// Two-byte value: lead byte in the high 8 bits, trail byte in the low 8 bits.
284 iter.charValue = (iter.charValue << 8) | secondByte;
// Trail bytes below 0x40, and the values 0x7f and 0xff, are singled out here;
// presumably they mark the character as malformed (the guarded statement is
// not visible here -- confirm).
286 if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff)
292 util.inherits(module.exports.big5, mbcs);
297 * EUC charset recognizers. One abstract class that provides the common function
298 * for getting the next character according to the EUC encoding scheme,
299 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
301 * Get the next character value for EUC based encodings.
302 * Character 'value' is simply the raw bytes that make up the character
303 * packed into an int.
// Shared nextChar implementation for the EUC recognizers; euc_jp and euc_kr
// both install it as their nextChar. Reads one character (up to three bytes)
// from det into iter and reports whether the iterator still has input.
305 function eucNextChar(iter, det) {
// Remember where this character starts.
306 iter.index = iter.nextIndex;
311 //int fourthByte = 0;
313 firstByte = iter.charValue = iter.nextByte(det);
315 // Ran off the end of the input data
// Lead bytes up to 0x8d: presumably single-byte characters (the body of this
// branch is not visible here -- confirm).
319 if (firstByte <= 0x8d) {
323 secondByte = iter.nextByte(det);
// Append the second byte below the first.
324 iter.charValue = (iter.charValue << 8) | secondByte;
// Lead byte 0xa1-0xfe: the second byte is checked against 0xa1.
325 if (firstByte >= 0xA1 && firstByte <= 0xfe) {
327 if (secondByte < 0xa1) {
// Lead byte 0x8e: see the EUC-JP / EUC-TW note below.
332 if (firstByte == 0x8e) {
334 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
335 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
336 // We don't know which we've got.
337 // Treat it like EUC-JP. If the data really was EUC-TW, the following two
338 // bytes will look like a well formed 2 byte char.
339 if (secondByte < 0xa1) {
// Lead byte 0x8f: a third byte is read and appended.
344 if (firstByte == 0x8f) {
346 // Three byte total char size, two bytes of actual char value.
347 thirdByte = iter.nextByte(det);
348 iter.charValue = (iter.charValue << 8) | thirdByte;
349 if (thirdByte < 0xa1) {
// True while the iterator has not been marked done.
354 return iter.done == false;
360 * The charset recognizer for EUC-JP. A singleton instance of this class
361 * is created and kept by the public CharsetDetector class
363 module.exports.euc_jp = function() {
364 this.name = function() {
367 this.language = function() {
371 // TODO: This set of data comes from the character frequency-
372 // of-occurrence analysis tool. The data needs to be moved
373 // into a resource and loaded from there.
375 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
376 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
377 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
378 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
379 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
380 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
381 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
382 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
383 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
384 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1
387 this.nextChar = eucNextChar;
389 util.inherits(module.exports.euc_jp, mbcs);
394 * The charset recognizer for EUC-KR. A singleton instance of this class
395 * is created and kept by the public CharsetDetector class
397 module.exports.euc_kr = function() {
398 this.name = function() {
401 this.language = function() {
405 // TODO: This set of data comes from the character frequency-
406 // of-occurrence analysis tool. The data needs to be moved
407 // into a resource and loaded from there.
409 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
410 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
411 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
412 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
413 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
414 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
415 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
416 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
417 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
418 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad
421 this.nextChar = eucNextChar;
423 util.inherits(module.exports.euc_kr, mbcs);
428 * GB-18030 recognizer. Uses simplified Chinese statistics.
430 module.exports.gb_18030 = function() {
431 this.name = function() {
434 this.language = function() {
439 * Get the next character value for the GB-18030 encoding.
440 * Character 'value' is simply the raw bytes that make up the character
441 * packed into an int.
/**
 * Reads the next GB-18030 character from the detector input into `iter`.
 * iter.charValue receives the raw bytes packed into an int; bitwise operators
 * make that a signed 32-bit value in JavaScript, so a four-byte character is
 * stored as a negative number.
 *
 * Accepted forms:
 *   - one byte:   0x00-0x80
 *   - two bytes:  lead 0x81-0xFE, trail 0x40-0x7E or 0x80-0xFE
 *   - four bytes: 0x81-0xFE, 0x30-0x39, 0x81-0xFE, 0x30-0x39
 *
 * NOTE(review): the end-of-input handling (negative result from nextByte,
 * iter.done) and the iter.error flag follow the upstream ICU recognizer; those
 * statements were not visible in the reviewed excerpt -- confirm against the
 * full file.
 *
 * @param iter The IteratedChar that receives the character.
 * @param det The charset detector holding the input bytes.
 * @return True if a character was returned, false at end of input.
 */
this.nextChar = function(iter, det) {
  iter.index = iter.nextIndex;
  iter.error = false;

  var firstByte = iter.charValue = iter.nextByte(det);

  if (firstByte < 0) {
    // Ran off the end of the input data
    iter.done = true;
  } else if (firstByte > 0x80) {
    // Anything up to 0x80 is a single-byte character and needs no more input.
    var secondByte = iter.nextByte(det);
    iter.charValue = (iter.charValue << 8) | secondByte;

    if (firstByte >= 0x81 && firstByte <= 0xFE) {
      // Two byte char. The lower bound of the second range is 0x80; it was
      // previously written as decimal 80, which let 0x7F through as a valid
      // trail byte.
      var isTwoByte = (secondByte >= 0x40 && secondByte <= 0x7E) ||
                      (secondByte >= 0x80 && secondByte <= 0xFE);

      if (!isTwoByte) {
        // Four byte char: digit, lead-range byte, digit.
        var isFourByte = false;

        if (secondByte >= 0x30 && secondByte <= 0x39) {
          var thirdByte = iter.nextByte(det);

          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
            var fourthByte = iter.nextByte(det);

            if (fourthByte >= 0x30 && fourthByte <= 0x39) {
              iter.charValue = (iter.charValue << 16) | (thirdByte << 8) | fourthByte;
              isFourByte = true;
            }
          }
        }

        if (!isFourByte) {
          // Neither a valid two-byte nor a valid four-byte sequence.
          iter.error = true;
        }
      }
    }
  }

  return iter.done == false;
};
486 // TODO: This set of data comes from the character frequency-
487 // of-occurrence analysis tool. The data needs to be moved
488 // into a resource and loaded from there.
490 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
491 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
492 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
493 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
494 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
495 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
496 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
497 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
498 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
499 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0
502 util.inherits(module.exports.gb_18030, mbcs);