3 var legacy = require('character-entities-legacy')
4 var invalid = require('character-reference-invalid')
5 var decimal = require('is-decimal')
6 var hexadecimal = require('is-hexadecimal')
7 var alphanumerical = require('is-alphanumerical')
8 var decodeEntity = require('./decode-entity')
10 module.exports = parseEntities
// Shared helpers: an own-property check (used against `legacy`), a
// code-to-string converter, and a no-op that stands in for a missing warning
// handler.
12 var own = {}.hasOwnProperty
13 var fromCharCode = String.fromCharCode
14 var noop = Function.prototype
// NOTE(review): the next line is a property of an object literal whose
// opening line is not visible in this listing — presumably the `defaults`
// object read by `parseEntities`; confirm against the full file.
22 referenceContext: null,
// Character codes, compared against `charCodeAt` results in `parse`.
32 var lineFeed = 10 // '\n'
33 var formFeed = 12 // '\f'
35 var ampersand = 38 // '&'
36 var semicolon = 59 // ';'
37 var lessThan = 60 // '<'
38 var equalsTo = 61 // '='
39 var numberSign = 35 // '#'
40 var uppercaseX = 88 // 'X'
41 var lowercaseX = 120 // 'x'
42 var replacementCharacter = 65533 // '�'
// Reference type label; used below as a key into `tests`.
46 var hexa = 'hexadecimal'
55 // Map of types to tests.
56 // Each type of character reference accepts different characters.
57 // This test is used to detect whether a reference has ended (as the semicolon
58 // is not strictly needed).
61 tests[name] = alphanumerical
63 tests[hexa] = hexadecimal
// Numeric warning codes.
// Each one keys into `messages` and is also handed to the warning handler as
// its `code` argument.
66 var namedNotTerminated = 1
67 var numericNotTerminated = 2
71 var numericDisallowed = 6
72 var numericProhibited = 7
// Human-readable message for each warning code.
77 messages[namedNotTerminated] =
78 'Named character references must be terminated by a semicolon'
79 messages[numericNotTerminated] =
80 'Numeric character references must be terminated by a semicolon'
81 messages[namedEmpty] = 'Named character references cannot be empty'
82 messages[numericEmpty] = 'Numeric character references cannot be empty'
83 messages[namedUnknown] = 'Named character references must be known'
84 messages[numericDisallowed] =
85 'Numeric character references cannot be disallowed'
86 messages[numericProhibited] =
87 'Numeric character references cannot be outside the permissible Unicode range'
89 // Wrap to ensure clean parameters are given to `parse`.
// `value` is passed through to `parse` untouched; `options` supplies per-key
// overrides for `defaults`.
// Returns whatever `parse` returns.
// NOTE(review): some lines of this function are not visible in this listing;
// the comments below describe only the statements shown.
90 function parseEntities(value, options) {
// Visit every key of `defaults` and read the caller's value for that key.
99 for (key in defaults) {
100 option = options[key]
// A `null` or `undefined` option falls back to the default for that key.
102 option === null || option === undefined ? defaults[key] : option
// `position` may carry `indent` and/or `start`: in that case `indent` (or an
// empty list) is kept on `settings.indent` and `settings.position` is narrowed
// to `start`.
105 if (settings.position.indent || settings.position.start) {
106 settings.indent = settings.position.indent || []
107 settings.position = settings.position.start
110 return parse(value, settings)
// Decode the character references in `value`.
// Reads the handlers (`text`, `reference`, `warning`), their contexts, the
// starting `position`, `indent`, `additional`, and `nonTerminated` from
// `settings`, and returns the collected pieces joined into one string.
// NOTE(review): many lines of this function are not visible in this listing;
// the comments below describe only the statements shown.
114 // eslint-disable-next-line complexity
115 function parse(value, settings) {
116 var additional = settings.additional
117 var nonTerminated = settings.nonTerminated
118 var handleText = settings.text
119 var handleReference = settings.reference
120 var handleWarning = settings.warning
121 var textContext = settings.textContext
122 var referenceContext = settings.referenceContext
123 var warningContext = settings.warningContext
124 var pos = settings.position
125 var indent = settings.indent || []
126 var length = value.length
// Column and line default to 1 when the given position does not set them.
129 var column = pos.column || 1
130 var line = pos.line || 1
// A string `additional` is reduced to the code of its first character, so it
// can be compared against `charCodeAt` results below.
153 if (typeof additional === 'string') {
154 additional = additional.charCodeAt(0)
157 // Cache the current point.
160 // Wrap `handleWarning`.
// Without a warning handler, warnings become calls to a no-op.
161 warning = handleWarning ? parseError : noop
163 // Ensure the algorithm walks over the first character and the end (inclusive).
167 while (++index < length) {
168 // If the previous character was a newline.
169 if (character === lineFeed) {
// Restart the column from `indent[lines]`, defaulting to 1.
170 column = indent[lines] || 1
173 character = value.charCodeAt(index)
175 if (character === ampersand) {
176 following = value.charCodeAt(index + 1)
178 // The behaviour depends on the identity of the next character.
181 following === lineFeed ||
182 following === formFeed ||
183 following === space ||
184 following === ampersand ||
185 following === lessThan ||
// Self-inequality is true only for `NaN`, which `charCodeAt` yields when the
// index is past the end of `value`.
186 following !== following ||
187 (additional && following === additional)
189 // Not a character reference.
190 // No characters are consumed, and nothing is returned.
191 // This is not an error, either.
192 queue += fromCharCode(character)
202 if (following === numberSign) {
206 // The behaviour further depends on the next character.
207 following = value.charCodeAt(end)
209 if (following === uppercaseX || following === lowercaseX) {
222 entityCharacters = ''
// Collect characters for as long as the test for this reference type accepts
// them.
228 while (++end < length) {
229 following = value.charCodeAt(end)
231 if (!test(following)) {
235 characters += fromCharCode(following)
237 // Check if we can match a legacy named reference.
238 // If so, we cache that as the last viable named reference.
239 // This ensures we do not need to walk backwards later.
240 if (type === name && own.call(legacy, characters)) {
241 entityCharacters = characters
242 entity = legacy[characters]
// The reference is terminated when the character at `end` is a semicolon.
246 terminated = value.charCodeAt(end) === semicolon
251 namedEntity = type === name ? decodeEntity(characters) : false
254 entityCharacters = characters
259 diff = 1 + end - start
261 if (!terminated && !nonTerminated) {
263 } else if (!characters) {
264 // An empty (possible) entity is valid, unless it’s numeric (thus an
265 // ampersand followed by an octothorp).
267 warning(numericEmpty, diff)
269 } else if (type === name) {
270 // An ampersand followed by anything unknown, and not terminated, is
272 if (terminated && !entity) {
273 warning(namedUnknown, 1)
275 // If there's something after an entity name which is not known, cap
277 if (entityCharacters !== characters) {
278 end = begin + entityCharacters.length
279 diff = 1 + end - begin
283 // If the reference is not terminated, warn.
285 reason = entityCharacters ? namedNotTerminated : namedEmpty
// In attribute mode, the character right after the reference is inspected
// (`=` or an alphanumerical) before deciding how to proceed.
287 if (settings.attribute) {
288 following = value.charCodeAt(end)
290 if (following === equalsTo) {
291 warning(reason, diff)
293 } else if (alphanumerical(following)) {
296 warning(reason, diff)
299 warning(reason, diff)
307 // All non-terminated numeric entities are not rendered, and trigger a
309 warning(numericNotTerminated, diff)
312 // When terminated and number, parse as either hexadecimal or decimal.
313 reference = parseInt(characters, bases[type])
315 // Trigger a warning when the parsed number is prohibited, and replace
316 // with replacement character.
317 if (prohibited(reference)) {
318 warning(numericProhibited, diff)
319 reference = fromCharCode(replacementCharacter)
320 } else if (reference in invalid) {
321 // Trigger a warning when the parsed number is disallowed, and replace
322 // by an alternative.
323 warning(numericDisallowed, diff)
324 reference = invalid[reference]
329 // Trigger a warning when the parsed number should not be used.
330 if (disallowed(reference)) {
331 warning(numericDisallowed, diff)
334 // Stringify the number.
// Codes above 0xffff do not fit in one UTF-16 unit: the high unit is appended
// to `output` and the low unit is kept in `reference`.
// `(10 & 0x3ff)` evaluates to `10`, so the shift below is by ten bits.
335 if (reference > 0xffff) {
337 output += fromCharCode((reference >>> (10 & 0x3ff)) | 0xd800)
338 reference = 0xdc00 | (reference & 0x3ff)
341 reference = output + fromCharCode(reference)
346 // First eat the queued characters as normal text, then eat an entity.
352 column += end - start + 1
353 result.push(reference)
357 if (handleReference) {
358 handleReference.call(
361 {start: prev, end: next},
362 value.slice(start - 1, end)
368 // If we could not find a reference, queue the checked characters (as
369 // normal characters), and move the pointer to their end.
370 // This is possible because we can be certain neither newlines nor
371 // ampersands are included.
372 characters = value.slice(start - 1, end)
374 column += characters.length
378 // Handle anything other than an ampersand, including newlines and EOF.
380 character === 10 // Line feed
// Self-equality is false only for `NaN`, so nothing is queued when there is
// no character at this index.
387 if (character === character) {
388 queue += fromCharCode(character)
396 // Return the reduced nodes, and any possible warnings.
397 return result.join('')
399 // Get current position.
// The reported offset is the current index plus the starting offset (0 when
// the starting position has none).
404 offset: index + (pos.offset || 0)
408 // “Throw” a parse-error: a warning.
409 function parseError(code, offset) {
// Shift the reported point forward by `offset` before handing it over.
412 position.column += offset
413 position.offset += offset
// The handler receives the message text, the position, and the numeric code.
415 handleWarning.call(warningContext, messages[code], position, code)
418 // Flush `queue` (normal text).
419 // Macro invoked before each entity and at the end of `value`.
420 // Does nothing when `queue` is empty.
426 handleText.call(textContext, queue, {start: prev, end: now()})
434 // Check if `code` is outside the permissible Unicode range.
// True for codes from 0xd800 through 0xdfff (inclusive) and for codes above
// 0x10ffff; false otherwise.
435 function prohibited(code) {
436 return (code >= 0xd800 && code <= 0xdfff) || code > 0x10ffff
439 // Check if `character` is disallowed.
440 function disallowed(code) {
442 (code >= 0x0001 && code <= 0x0008) ||
444 (code >= 0x000d && code <= 0x001f) ||
445 (code >= 0x007f && code <= 0x009f) ||
446 (code >= 0xfdd0 && code <= 0xfdef) ||
447 (code & 0xffff) === 0xffff ||
448 (code & 0xffff) === 0xfffe