unicode.js

   1 'use strict';
   2 var util = require('util'),
   3   Match = require ('../match');
   4
   5 /**
   6  * This class matches UTF-16 and UTF-32, both big- and little-endian. The
   7  * BOM will be used if it is present.
   8  */
   9 module.exports.UTF_16BE = function() {
  10   this.name = function() {
  11     return 'UTF-16BE';
  12   };
  13   this.match = function(det) {
  14     var input = det.fRawInput;
  15
  16     if (input.length >= 2 && ((input[0] & 0xff) == 0xfe && (input[1] & 0xff) == 0xff)) {
  17       return new Match(det, this, 100); // confidence = 100
  18     }
  19
  20     // TODO: Do some statistics to check for unsigned UTF-16BE
  21     return null;
  22   };
  23 };
  24
  25 module.exports.UTF_16LE = function() {
  26   this.name = function() {
  27     return 'UTF-16LE';
  28   };
  29   this.match = function(det) {
  30     var input = det.fRawInput;
  31
  32     if (input.length >= 2 && ((input[0] & 0xff) == 0xff && (input[1] & 0xff) == 0xfe)) {
  33       // LE BOM is present.
  34       if (input.length >= 4 && input[2] == 0x00 && input[3] == 0x00) {
  35         // It is probably UTF-32 LE, not UTF-16
  36         return null;
  37       }
  38       return new Match(det, this, 100); // confidence = 100
  39     }
  40
  41     // TODO: Do some statistics to check for unsigned UTF-16LE
  42     return null;
  43   }
  44 };
  45
  46 function UTF_32() {};
  47 UTF_32.prototype.match = function(det) {
  48   var input      = det.fRawInput,
  49     limit      = (det.fRawLength / 4) * 4,
  50     numValid   = 0,
  51     numInvalid = 0,
  52     hasBOM     = false,
  53     confidence = 0;
  54
  55   if (limit == 0) {
  56     return null;
  57   }
  58
  59   if (this.getChar(input, 0) == 0x0000FEFF) {
  60     hasBOM = true;
  61   }
  62
  63   for (var i = 0; i < limit; i += 4) {
  64     var ch = this.getChar(input, i);
  65
  66     if (ch < 0 || ch >= 0x10FFFF || (ch >= 0xD800 && ch <= 0xDFFF)) {
  67       numInvalid += 1;
  68     } else {
  69       numValid += 1;
  70     }
  71   }
  72
  73   // Cook up some sort of confidence score, based on presence of a BOM
  74   //    and the existence of valid and/or invalid multi-byte sequences.
  75   if (hasBOM && numInvalid == 0) {
  76     confidence = 100;
  77   } else if (hasBOM && numValid > numInvalid * 10) {
  78     confidence = 80;
  79   } else if (numValid > 3 && numInvalid == 0) {
  80     confidence = 100;
  81   } else if (numValid > 0 && numInvalid == 0) {
  82     confidence = 80;
  83   } else if (numValid > numInvalid * 10) {
  84     // Probably corrupt UTF-32BE data.  Valid sequences aren't likely by chance.
  85     confidence = 25;
  86   }
  87
  88   // return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
  89   return confidence == 0 ? null : new Match(det, this, confidence);
  90 };
  91
  92 module.exports.UTF_32BE = function() {
  93   this.name = function() {
  94     return 'UTF-32BE';
  95   };
  96   this.getChar = function(input, index) {
  97     return (input[index + 0] & 0xff) << 24 | (input[index + 1] & 0xff) << 16 |
  98          (input[index + 2] & 0xff) <<  8 | (input[index + 3] & 0xff);
  99   };
 100 };
 101 util.inherits(module.exports.UTF_32BE, UTF_32);
 102
 103 module.exports.UTF_32LE = function() {
 104   this.name = function() {
 105     return 'UTF-32LE';
 106   };
 107   this.getChar = function(input, index) {
 108     return (input[index + 3] & 0xff) << 24 | (input[index + 2] & 0xff) << 16 |
 109          (input[index + 1] & 0xff) <<  8 | (input[index + 0] & 0xff);
 110   };
 111 };
 112 util.inherits(module.exports.UTF_32LE, UTF_32);