node_modules/node-pty/deps/winpty/src/agent/UnicodeEncoding.h

   1 // Copyright (c) 2015 Ryan Prichard
   2 //
   3 // Permission is hereby granted, free of charge, to any person obtaining a copy
   4 // of this software and associated documentation files (the "Software"), to
   5 // deal in the Software without restriction, including without limitation the
   6 // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
   7 // sell copies of the Software, and to permit persons to whom the Software is
   8 // furnished to do so, subject to the following conditions:
   9 //
  10 // The above copyright notice and this permission notice shall be included in
  11 // all copies or substantial portions of the Software.
  12 //
  13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  18 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  19 // IN THE SOFTWARE.
  20
  21 #ifndef UNICODE_ENCODING_H
  22 #define UNICODE_ENCODING_H
  23
  24 #include <stdint.h>
  25
  26 // Encode the Unicode codepoint with UTF-8.  The buffer must be at least 4
  27 // bytes in size.
  28 static inline int encodeUtf8(char *out, uint32_t code) {
  29     if (code < 0x80) {
  30         out[0] = code;
  31         return 1;
  32     } else if (code < 0x800) {
  33         out[0] = ((code >> 6) & 0x1F) | 0xC0;
  34         out[1] = ((code >> 0) & 0x3F) | 0x80;
  35         return 2;
  36     } else if (code < 0x10000) {
  37         if (code >= 0xD800 && code <= 0xDFFF) {
  38             // The code points 0xD800 to 0xDFFF are reserved for UTF-16
  39             // surrogate pairs and do not have an encoding in UTF-8.
  40             return 0;
  41         }
  42         out[0] = ((code >> 12) & 0x0F) | 0xE0;
  43         out[1] = ((code >>  6) & 0x3F) | 0x80;
  44         out[2] = ((code >>  0) & 0x3F) | 0x80;
  45         return 3;
  46     } else if (code < 0x110000) {
  47         out[0] = ((code >> 18) & 0x07) | 0xF0;
  48         out[1] = ((code >> 12) & 0x3F) | 0x80;
  49         out[2] = ((code >>  6) & 0x3F) | 0x80;
  50         out[3] = ((code >>  0) & 0x3F) | 0x80;
  51         return 4;
  52     } else {
  53         // Encoding error
  54         return 0;
  55     }
  56 }
  57
  58 // Encode the Unicode codepoint with UTF-16.  The buffer must be large enough
  59 // to hold the output -- either 1 or 2 elements.
  60 static inline int encodeUtf16(wchar_t *out, uint32_t code) {
  61     if (code < 0x10000) {
  62         if (code >= 0xD800 && code <= 0xDFFF) {
  63             // The code points 0xD800 to 0xDFFF are reserved for UTF-16
  64             // surrogate pairs and do not have an encoding in UTF-16.
  65             return 0;
  66         }
  67         out[0] = code;
  68         return 1;
  69     } else if (code < 0x110000) {
  70         code -= 0x10000;
  71         out[0] = 0xD800 | (code >> 10);
  72         out[1] = 0xDC00 | (code & 0x3FF);
  73         return 2;
  74     } else {
  75         // Encoding error
  76         return 0;
  77     }
  78 }
  79
  80 // Return the byte size of a UTF-8 character using the value of the first
  81 // byte.
  82 static inline int utf8CharLength(char firstByte) {
  83     // This code would probably be faster if it used __builtin_clz.
  84     if ((firstByte & 0x80) == 0) {
  85         return 1;
  86     } else if ((firstByte & 0xE0) == 0xC0) {
  87         return 2;
  88     } else if ((firstByte & 0xF0) == 0xE0) {
  89         return 3;
  90     } else if ((firstByte & 0xF8) == 0xF0) {
  91         return 4;
  92     } else {
  93         // Malformed UTF-8.
  94         return 0;
  95     }
  96 }
  97
  98 // The pointer must point to 1-4 bytes, as indicated by the first byte.
  99 // Returns -1 on decoding error.
 100 static inline uint32_t decodeUtf8(const char *in) {
 101     const uint32_t kInvalid = static_cast<uint32_t>(-1);
 102     switch (utf8CharLength(in[0])) {
 103         case 1: {
 104             return in[0];
 105         }
 106         case 2: {
 107             if ((in[1] & 0xC0) != 0x80) {
 108                 return kInvalid;
 109             }
 110             uint32_t tmp = 0;
 111             tmp = (in[0] & 0x1F) << 6;
 112             tmp |= (in[1] & 0x3F);
 113             return tmp <= 0x7F ? kInvalid : tmp;
 114         }
 115         case 3: {
 116             if ((in[1] & 0xC0) != 0x80 ||
 117                     (in[2] & 0xC0) != 0x80) {
 118                 return kInvalid;
 119             }
 120             uint32_t tmp = 0;
 121             tmp = (in[0] & 0x0F) << 12;
 122             tmp |= (in[1] & 0x3F) << 6;
 123             tmp |= (in[2] & 0x3F);
 124             if (tmp <= 0x07FF || (tmp >= 0xD800 && tmp <= 0xDFFF)) {
 125                 return kInvalid;
 126             } else {
 127                 return tmp;
 128             }
 129         }
 130         case 4: {
 131             if ((in[1] & 0xC0) != 0x80 ||
 132                     (in[2] & 0xC0) != 0x80 ||
 133                     (in[3] & 0xC0) != 0x80) {
 134                 return kInvalid;
 135             }
 136             uint32_t tmp = 0;
 137             tmp = (in[0] & 0x07) << 18;
 138             tmp |= (in[1] & 0x3F) << 12;
 139             tmp |= (in[2] & 0x3F) << 6;
 140             tmp |= (in[3] & 0x3F);
 141             if (tmp <= 0xFFFF || tmp > 0x10FFFF) {
 142                 return kInvalid;
 143             } else {
 144                 return tmp;
 145             }
 146         }
 147         default: {
 148             return kInvalid;
 149         }
 150     }
 151 }
 152
 153 static inline uint32_t decodeSurrogatePair(wchar_t ch1, wchar_t ch2) {
 154     return ((ch1 - 0xD800) << 10) + (ch2 - 0xDC00) + 0x10000;
 155 }
 156
 157 #endif // UNICODE_ENCODING_H