Skip to content

Commit

Permalink
Correctly translate XML entities and character references (#4417)
Browse files Browse the repository at this point in the history
* Translate XML Entities in tXml.js

XML entities are special character sequences such as "&amp;" (&).

* Rename entity to entityReference

and add link to the appropriate XML specifications

* Translate character references too

* Add "predefined" to function descriptor comment
  • Loading branch information
Andrews54757 committed Mar 22, 2024
1 parent e96f7ce commit dbdd14c
Showing 1 changed file with 59 additions and 2 deletions.
61 changes: 59 additions & 2 deletions externals/tXml.js
Expand Up @@ -38,6 +38,63 @@
* @property {(a: tNode, b: tNode) => boolean} [filter]
*/

/**
* Predefined general entities used in XML, keyed by the full entity
* reference (including the leading '&' and trailing ';') and mapped to the
* literal character each reference stands for.
* See https://www.w3.org/TR/xml/#sec-predefined-ent
*/
export const XML_ENTITIES = {
    '&amp;': '&',
    '&gt;': '>',
    '&lt;': '<',
    '&quot;': '"',
    '&apos;': "'"
};

/**
* Translates XML predefined entities and character references to their respective characters.
*
* Anything that looks like a reference but is not a well-formed character
* reference, is outside the Unicode code point range, or is not a key of
* `entitiesList` is left in the output exactly as it was written.
*
* @param {Object} entitiesList Dictionary mapping full entity references (e.g. "&amp;") to their replacement text
* @param {String} str Text possibly containing entity / character references
* @returns {String} `str` with every recognised reference replaced
*/
function translateEntitiesAndCharacterReferences(entitiesList, str) {
    // The capturing group makes split() keep the matched references, so the
    // resulting array alternates: plain text at even indices, references at odd ones.
    const entitySplit = str.split(/(&[#a-zA-Z0-9]+;)/);
    if (entitySplit.length <= 1) { // No entities. Skip the rest of the function.
        return str;
    }

    /*
     * Character references must be exactly of the form
     *   /&#[0-9]+;/        - Encoded in decimal, or
     *   /&#x[0-9a-fA-F]+;/ - Encoded in hexadecimal
     * See https://www.w3.org/TR/xml/#sec-references
     * Matching the whole reference (instead of relying on parseInt alone)
     * prevents malformed input such as "&#12abc;" from being partially decoded.
     */
    const characterReference = /^&#(?:x([0-9a-fA-F]+)|([0-9]+));$/;

    for (let i = 1; i < entitySplit.length; i += 2) {
        const reference = entitySplit[i];

        if (reference.charAt(1) === '#') {
            const match = characterReference.exec(reference);
            if (match === null) {
                continue; // Malformed character reference: keep the original text.
            }

            const code = match[1] !== undefined
                ? parseInt(match[1], 16)
                : parseInt(match[2], 10);

            // Translate into string according to ISO/IEC 10646.
            // String.fromCodePoint throws a RangeError above U+10FFFF, so guard it.
            if (code <= 0x10FFFF) {
                entitySplit[i] = String.fromCodePoint(code);
            }
        }
        /*
         * Translate entity references using a dictionary.
         * hasOwnProperty is invoked through Object.prototype so that
         * prototype-less dictionaries (Object.create(null)) work too and
         * inherited names such as "&toString;" are never treated as entities.
         */
        else if (Object.prototype.hasOwnProperty.call(entitiesList, reference)) {
            entitySplit[i] = entitiesList[reference];
        }
    }

    return entitySplit.join('');
}

/**
* Parses XML / HTML into a DOM object, with no validation and some failure tolerance
* @param {string} S your XML to parse
Expand Down Expand Up @@ -191,7 +248,7 @@
return parseInt(value);
}

let attrValue = value;
let attrValue = translateEntitiesAndCharacterReferences(XML_ENTITIES, value);
attrMatchers.forEach(matcher => {
if (matcher.test(tagName, attrName, value)) {
attrValue = matcher.converter(value);
Expand All @@ -209,7 +266,7 @@
pos = S.indexOf(openBracket, pos) - 1;
if (pos === -2)
pos = S.length;
return S.slice(start, pos + 1);
return translateEntitiesAndCharacterReferences(XML_ENTITIES, S.slice(start, pos + 1));
}
/**
* returns text until the first nonAlphabetic letter
Expand Down

0 comments on commit dbdd14c

Please sign in to comment.