--- /dev/null
+'use strict'
+
+var legacy = require('character-entities-legacy')
+var invalid = require('character-reference-invalid')
+var decimal = require('is-decimal')
+var hexadecimal = require('is-hexadecimal')
+var alphanumerical = require('is-alphanumerical')
+var decodeEntity = require('./decode-entity')
+
+module.exports = parseEntities
+
+var own = {}.hasOwnProperty // Used to test `legacy` for own keys only.
+var fromCharCode = String.fromCharCode
+var noop = Function.prototype // Callable no-op; stands in for `warning` when no handler is given.
+
+// Default settings; `parseEntities` falls back to these for options that are `null` or `undefined`.
+var defaults = {
+ warning: null, // Handler called with `(message, position, code)` for each warning.
+ reference: null, // Handler called with `(reference, {start, end}, source)` for each decoded reference.
+ text: null, // Handler called with `(text, {start, end})` for each run of plain text.
+ warningContext: null, // `this` for `warning`.
+ referenceContext: null, // `this` for `reference`.
+ textContext: null, // `this` for `text`.
+ position: {}, // Starting point (`line`, `column`, `offset`), or an object with `start` and `indent`.
+ additional: null, // Character (string or character code) that, directly after `&`, prevents a reference.
+ attribute: false, // When on, non-terminated named references followed by `=` or an alphanumerical are left undecoded.
+ nonTerminated: true // Whether references without a closing semicolon are decoded.
+}
+
+// Character codes, compared against the results of `String#charCodeAt`.
+var tab = 9 // '\t'
+var lineFeed = 10 // '\n'
+var formFeed = 12 // '\f'
+var space = 32 // ' '
+var ampersand = 38 // '&'
+var semicolon = 59 // ';'
+var lessThan = 60 // '<'
+var equalsTo = 61 // '='
+var numberSign = 35 // '#'
+var uppercaseX = 88 // 'X'
+var lowercaseX = 120 // 'x'
+var replacementCharacter = 65533 // '�'
+
+// Reference types; used as keys into `bases` and `tests`.
+var name = 'named'
+var hexa = 'hexadecimal'
+var deci = 'decimal'
+
+// Map of numeric reference types to the radix given to `parseInt`.
+var bases = {}
+
+bases[hexa] = 16
+bases[deci] = 10
+
+// Map of types to tests.
+// Each type of character reference accepts different characters.
+// This test is used to detect whether a reference has ended (as the semicolon
+// is not strictly needed).
+var tests = {}
+
+tests[name] = alphanumerical
+tests[deci] = decimal
+tests[hexa] = hexadecimal
+
+// Warning codes; each is handed to the `warning` handler together with its message.
+var namedNotTerminated = 1
+var numericNotTerminated = 2
+var namedEmpty = 3
+var numericEmpty = 4
+var namedUnknown = 5
+var numericDisallowed = 6
+var numericProhibited = 7
+
+// Warning messages, keyed by warning code.
+var messages = {}
+
+messages[namedNotTerminated] =
+ 'Named character references must be terminated by a semicolon'
+messages[numericNotTerminated] =
+ 'Numeric character references must be terminated by a semicolon'
+messages[namedEmpty] = 'Named character references cannot be empty'
+messages[numericEmpty] = 'Numeric character references cannot be empty'
+messages[namedUnknown] = 'Named character references must be known'
+messages[numericDisallowed] =
+ 'Numeric character references cannot be disallowed'
+messages[numericProhibited] =
+ 'Numeric character references cannot be outside the permissible Unicode range'
+
+// Wrap to ensure clean parameters are given to `parse`.
+function parseEntities(value, options) {
+ var settings = {}
+ var option
+ var key
+
+ if (!options) {
+ options = {}
+ }
+
+ for (key in defaults) {
+ option = options[key]
+ settings[key] =
+ option === null || option === undefined ? defaults[key] : option
+ }
+
+ if (settings.position.indent || settings.position.start) {
+ settings.indent = settings.position.indent || []
+ settings.position = settings.position.start
+ }
+
+ return parse(value, settings)
+}
+
+// Parse entities.
+// eslint-disable-next-line complexity
+function parse(value, settings) {
+ var additional = settings.additional
+ var nonTerminated = settings.nonTerminated
+ var handleText = settings.text
+ var handleReference = settings.reference
+ var handleWarning = settings.warning
+ var textContext = settings.textContext
+ var referenceContext = settings.referenceContext
+ var warningContext = settings.warningContext
+ var pos = settings.position
+ var indent = settings.indent || []
+ var length = value.length
+ var index = 0
+ var lines = -1
+ var column = pos.column || 1
+ var line = pos.line || 1
+ var queue = ''
+ var result = []
+ var entityCharacters
+ var namedEntity
+ var terminated
+ var characters
+ var character
+ var reference
+ var following
+ var warning
+ var reason
+ var output
+ var entity
+ var begin
+ var start
+ var type
+ var test
+ var prev
+ var next
+ var diff
+ var end
+
+ if (typeof additional === 'string') {
+ additional = additional.charCodeAt(0)
+ }
+
+ // Cache the current point.
+ prev = now()
+
+ // Wrap `handleWarning`.
+ warning = handleWarning ? parseError : noop
+
+ // Ensure the algorithm walks over the first character and the end (inclusive).
+ index--
+ length++
+
+ while (++index < length) {
+ // If the previous character was a newline.
+ if (character === lineFeed) {
+ column = indent[lines] || 1
+ }
+
+ character = value.charCodeAt(index)
+
+ if (character === ampersand) {
+ following = value.charCodeAt(index + 1)
+
+ // The behaviour depends on the identity of the next character.
+ if (
+ following === tab ||
+ following === lineFeed ||
+ following === formFeed ||
+ following === space ||
+ following === ampersand ||
+ following === lessThan ||
+ following !== following ||
+ (additional && following === additional)
+ ) {
+ // Not a character reference.
+ // No characters are consumed, and nothing is returned.
+ // This is not an error, either.
+ queue += fromCharCode(character)
+ column++
+
+ continue
+ }
+
+ start = index + 1
+ begin = start
+ end = start
+
+ if (following === numberSign) {
+ // Numerical entity.
+ end = ++begin
+
+ // The behaviour further depends on the next character.
+ following = value.charCodeAt(end)
+
+ if (following === uppercaseX || following === lowercaseX) {
+ // ASCII hex digits.
+ type = hexa
+ end = ++begin
+ } else {
+ // ASCII digits.
+ type = deci
+ }
+ } else {
+ // Named entity.
+ type = name
+ }
+
+ entityCharacters = ''
+ entity = ''
+ characters = ''
+ test = tests[type]
+ end--
+
+ while (++end < length) {
+ following = value.charCodeAt(end)
+
+ if (!test(following)) {
+ break
+ }
+
+ characters += fromCharCode(following)
+
+ // Check if we can match a legacy named reference.
+ // If so, we cache that as the last viable named reference.
+ // This ensures we do not need to walk backwards later.
+ if (type === name && own.call(legacy, characters)) {
+ entityCharacters = characters
+ entity = legacy[characters]
+ }
+ }
+
+ terminated = value.charCodeAt(end) === semicolon
+
+ if (terminated) {
+ end++
+
+ namedEntity = type === name ? decodeEntity(characters) : false
+
+ if (namedEntity) {
+ entityCharacters = characters
+ entity = namedEntity
+ }
+ }
+
+ diff = 1 + end - start
+
+ if (!terminated && !nonTerminated) {
+ // Empty.
+ } else if (!characters) {
+ // An empty (possible) entity is valid, unless it’s numeric (thus an
+ // ampersand followed by an octothorp).
+ if (type !== name) {
+ warning(numericEmpty, diff)
+ }
+ } else if (type === name) {
+ // An ampersand followed by anything unknown, and not terminated, is
+ // invalid.
+ if (terminated && !entity) {
+ warning(namedUnknown, 1)
+ } else {
+ // If theres something after an entity name which is not known, cap
+ // the reference.
+ if (entityCharacters !== characters) {
+ end = begin + entityCharacters.length
+ diff = 1 + end - begin
+ terminated = false
+ }
+
+ // If the reference is not terminated, warn.
+ if (!terminated) {
+ reason = entityCharacters ? namedNotTerminated : namedEmpty
+
+ if (settings.attribute) {
+ following = value.charCodeAt(end)
+
+ if (following === equalsTo) {
+ warning(reason, diff)
+ entity = null
+ } else if (alphanumerical(following)) {
+ entity = null
+ } else {
+ warning(reason, diff)
+ }
+ } else {
+ warning(reason, diff)
+ }
+ }
+ }
+
+ reference = entity
+ } else {
+ if (!terminated) {
+ // All non-terminated numeric entities are not rendered, and trigger a
+ // warning.
+ warning(numericNotTerminated, diff)
+ }
+
+ // When terminated and number, parse as either hexadecimal or decimal.
+ reference = parseInt(characters, bases[type])
+
+ // Trigger a warning when the parsed number is prohibited, and replace
+ // with replacement character.
+ if (prohibited(reference)) {
+ warning(numericProhibited, diff)
+ reference = fromCharCode(replacementCharacter)
+ } else if (reference in invalid) {
+ // Trigger a warning when the parsed number is disallowed, and replace
+ // by an alternative.
+ warning(numericDisallowed, diff)
+ reference = invalid[reference]
+ } else {
+ // Parse the number.
+ output = ''
+
+ // Trigger a warning when the parsed number should not be used.
+ if (disallowed(reference)) {
+ warning(numericDisallowed, diff)
+ }
+
+ // Stringify the number.
+ if (reference > 0xffff) {
+ reference -= 0x10000
+ output += fromCharCode((reference >>> (10 & 0x3ff)) | 0xd800)
+ reference = 0xdc00 | (reference & 0x3ff)
+ }
+
+ reference = output + fromCharCode(reference)
+ }
+ }
+
+ // Found it!
+ // First eat the queued characters as normal text, then eat an entity.
+ if (reference) {
+ flush()
+
+ prev = now()
+ index = end - 1
+ column += end - start + 1
+ result.push(reference)
+ next = now()
+ next.offset++
+
+ if (handleReference) {
+ handleReference.call(
+ referenceContext,
+ reference,
+ {start: prev, end: next},
+ value.slice(start - 1, end)
+ )
+ }
+
+ prev = next
+ } else {
+ // If we could not find a reference, queue the checked characters (as
+ // normal characters), and move the pointer to their end.
+ // This is possible because we can be certain neither newlines nor
+ // ampersands are included.
+ characters = value.slice(start - 1, end)
+ queue += characters
+ column += characters.length
+ index = end - 1
+ }
+ } else {
+ // Handle anything other than an ampersand, including newlines and EOF.
+ if (
+ character === 10 // Line feed
+ ) {
+ line++
+ lines++
+ column = 0
+ }
+
+ if (character === character) {
+ queue += fromCharCode(character)
+ column++
+ } else {
+ flush()
+ }
+ }
+ }
+
+ // Return the reduced nodes, and any possible warnings.
+ return result.join('')
+
+ // Get current position.
+ function now() {
+ return {
+ line: line,
+ column: column,
+ offset: index + (pos.offset || 0)
+ }
+ }
+
+ // “Throw” a parse-error: a warning.
+ function parseError(code, offset) {
+ var position = now()
+
+ position.column += offset
+ position.offset += offset
+
+ handleWarning.call(warningContext, messages[code], position, code)
+ }
+
+ // Flush `queue` (normal text).
+ // Macro invoked before each entity and at the end of `value`.
+ // Does nothing when `queue` is empty.
+ function flush() {
+ if (queue) {
+ result.push(queue)
+
+ if (handleText) {
+ handleText.call(textContext, queue, {start: prev, end: now()})
+ }
+
+ queue = ''
+ }
+ }
+}
+
+// Check if `character` is outside the permissible unicode range.
+function prohibited(code) {
+ return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
+}
+
+// Check if `character` is disallowed.
+function disallowed(code) {
+ return (
+ (code >= 0x0001 && code <= 0x0008) ||
+ code === 0x000b ||
+ (code >= 0x000d && code <= 0x001f) ||
+ (code >= 0x007f && code <= 0x009f) ||
+ (code >= 0xfdd0 && code <= 0xfdef) ||
+ (code & 0xffff) === 0xffff ||
+ (code & 0xffff) === 0xfffe
+ )
+}