// ==ClosureCompiler==
// @output_file_name default.js
// @compilation_level SIMPLE_OPTIMIZATIONS
// ==/ClosureCompiler==
// module.exports = {
//     parse: parse,
//     simplify: simplify,
//     simplifyLostLess: simplifyLostLess,
//     filter: filter,
//     stringify: stringify,
//     toContentString: toContentString,
//     getElementById: getElementById,
//     getElementsByClassName: getElementsByClassName,
//     transformStream: transformStream,
// };

/**
 * @author: Tobias Nickel
 * @created: 06.04.2015
 * I needed a small XML parser that can be used in a worker.
 */

/** One parsed element: its tag name, its attributes and its ordered children. */
interface Node {
  tagName: string;
  /** Attribute values; `null` marks an attribute written without a quoted value. */
  attributes: Record<string, string | null>;
  children: (Node | string)[];
}

/** Options accepted by {@link parse}. */
interface ParseOptions {
  /** Attribute to search for when `attrValue` is given (defaults to `id`). */
  attrName?: string;
  /** Regular-expression source the attribute value has to match. */
  attrValue?: string;
  /** When truthy, a single node is parsed at `pos` and returned instead of a list. */
  parseNode?: any;
  /** Offset in the source at which parsing starts. */
  pos?: number;
  /** Tag names that never have children (void elements). */
  noChildNodes?: string[];
  /** When true, the final parser position is stored as `pos` on the result. */
  setPos?: boolean;
  keepComments?: boolean;
  keepWhitespace?: boolean;
  /** When true, the result is passed through `simplify`. */
  simplify?: boolean;
  /** Passed to `filter`, which calls it with (node, index, depth, path). */
  filter?: (a: Node, b: Node) => boolean;
}

const OPEN_BRACKET = '<';
const CLOSE_BRACKET = '>';
const OPEN_BRACKET_CC = OPEN_BRACKET.charCodeAt(0);
const CLOSE_BRACKET_CC = CLOSE_BRACKET.charCodeAt(0);
const MINUS_CC = '-'.charCodeAt(0);
const SLASH_CC = '/'.charCodeAt(0);
const EXCLAMATION_CC = '!'.charCodeAt(0);
const SINGLE_QUOTE_CC = "'".charCodeAt(0);
const DOUBLE_QUOTE_CC = '"'.charCodeAt(0);
const OPEN_CORNER_BRACKET_CC = '['.charCodeAt(0);
const CLOSE_CORNER_BRACKET_CC = ']'.charCodeAt(0);

/** Characters that terminate a tag or attribute name. */
const NAME_SPACER = '\r\n\t>/= ';

/** Tags treated as childless unless `options.noChildNodes` overrides the list. */
const DEFAULT_NO_CHILD_NODES = ['img', 'br', 'input', 'meta', 'link', 'hr'];

/** True for the char codes of A-Z and a-z. */
function isAsciiLetter(code: number): boolean {
  return (code > 64 && code < 91) || (code > 96 && code < 123);
}

/** Escapes every regular-expression metacharacter so `text` matches literally. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/** Wraps a single value into an array; arrays are returned unchanged. */
function asArray(value: any): any[] {
  return Array.isArray(value) ? value : [value];
}

/**
 * Returns the array stored under `key`, creating it as an own property first.
 * The own-property check keeps tag names such as `constructor` or `__proto__`
 * from being confused with members inherited from `Object.prototype`.
 */
function bucketFor(target: Record<string, any>, key: string): any[] {
  if (!Object.prototype.hasOwnProperty.call(target, key)) {
    Object.defineProperty(target, key, {
      value: [],
      writable: true,
      enumerable: true,
      configurable: true,
    });
  }
  return target[key];
}

/**
 * Parses XML / HTML into a DOM-like object, with no validation and some failure tolerance.
 *
 * @param source the XML to parse
 * @param options see {@link ParseOptions}
 * @returns the list of top-level children. With `parseNode` a single node is
 *          returned, and with `simplify` the simplified object is returned.
 * @throws Error when a closing tag does not contain the name of the open element
 */
export function parse(source: string, options: ParseOptions = {}): (Node | string)[] {
  let pos = options.pos || 0;
  const keepComments = !!options.keepComments;
  const keepWhitespace = !!options.keepWhitespace;
  const noChildNodes = options.noChildNodes || DEFAULT_NO_CHILD_NODES;

  /**
   * Parses a list of entries until the closing tag of `tagName` or the end of the source.
   */
  function parseChildren(tagName: string): (Node | string)[] {
    const children: (Node | string)[] = [];
    while (source[pos]) {
      if (source.charCodeAt(pos) !== OPEN_BRACKET_CC) {
        const text = parseText();
        if (keepWhitespace) {
          if (text.length > 0) {
            children.push(text);
          }
        } else {
          const trimmed = text.trim();
          if (trimmed.length > 0) {
            children.push(trimmed);
          }
        }
        pos++;
        continue;
      }

      const next = source.charCodeAt(pos + 1);

      if (next === SLASH_CC) {
        // closing tag: it ends the current child list
        const closeStart = pos + 2;
        pos = source.indexOf(CLOSE_BRACKET, pos);
        const closeTag = source.substring(closeStart, pos);
        if (closeTag.indexOf(tagName) === -1) {
          const parsedText = source.substring(0, pos).split('\n');
          throw new Error(
            'Unexpected close tag\nLine: ' + (parsedText.length - 1) +
            '\nColumn: ' + (parsedText[parsedText.length - 1].length + 1) +
            '\nChar: ' + source[pos]
          );
        }
        // a missing '>' leaves pos at -1, which ends every enclosing loop
        if (pos !== -1) pos += 1;
        return children;
      }

      if (next === EXCLAMATION_CC) {
        if (source.charCodeAt(pos + 2) === MINUS_CC) {
          // comment support: jump from '>' to '>' until one is preceded by '--'
          const startCommentPos = pos;
          while (
            pos !== -1 &&
            !(
              source.charCodeAt(pos) === CLOSE_BRACKET_CC &&
              source.charCodeAt(pos - 1) === MINUS_CC &&
              source.charCodeAt(pos - 2) === MINUS_CC
            )
          ) {
            pos = source.indexOf(CLOSE_BRACKET, pos + 1);
          }
          if (pos === -1) {
            pos = source.length;
          }
          if (keepComments) {
            children.push(source.substring(startCommentPos, pos + 1));
          }
        } else if (
          source.charCodeAt(pos + 2) === OPEN_CORNER_BRACKET_CC &&
          source.charCodeAt(pos + 8) === OPEN_CORNER_BRACKET_CC &&
          source.slice(pos + 3, pos + 8).toLowerCase() === 'cdata'
        ) {
          // cdata: the content is taken verbatim, an unterminated section runs to the end
          const cdataEndIndex = source.indexOf(']]>', pos);
          if (cdataEndIndex === -1) {
            children.push(source.slice(pos + 9));
            pos = source.length;
          } else {
            children.push(source.substring(pos + 9, cdataEndIndex));
            pos = cdataEndIndex + 3;
          }
          continue;
        } else {
          // doctype support: a '>' inside [...] does not end the declaration
          const startDoctype = pos + 1;
          pos += 2;
          let encapsuled = false;
          while ((source.charCodeAt(pos) !== CLOSE_BRACKET_CC || encapsuled) && source[pos]) {
            if (source.charCodeAt(pos) === OPEN_CORNER_BRACKET_CC) {
              encapsuled = true;
            } else if (encapsuled && source.charCodeAt(pos) === CLOSE_CORNER_BRACKET_CC) {
              encapsuled = false;
            }
            pos++;
          }
          children.push(source.substring(startDoctype, pos));
        }
        pos++;
        continue;
      }

      const node = parseNode();
      children.push(node);
      if (node.tagName[0] === '?') {
        // everything parsed "inside" a processing instruction is really a sibling
        children.push(...node.children);
        node.children = [];
      }
    }
    return children;
  }

  /**
   * Returns the text up to (not including) the next '<'.
   * Leaves pos on the last character of the text.
   */
  function parseText(): string {
    const start = pos;
    pos = source.indexOf(OPEN_BRACKET, pos) - 1;
    if (pos === -2) pos = source.length;
    return source.slice(start, pos + 1);
  }

  /**
   * Returns the text up to the first name-terminating character.
   */
  function parseName(): string {
    const start = pos;
    while (source[pos] && NAME_SPACER.indexOf(source[pos]) === -1) {
      pos++;
    }
    return source.slice(start, pos);
  }

  /**
   * Reads the raw content of a <script>/<style> element up to `endTag` and
   * moves pos behind it. An unterminated element consumes the rest of the source.
   */
  function parseRawText(endTag: string): string[] {
    const start = pos + 1;
    const end = source.indexOf(endTag, pos);
    if (end === -1) {
      pos = source.length;
      return [source.slice(start)];
    }
    pos = end + endTag.length;
    return [source.slice(start, end)];
  }

  /**
   * Parses a node including tagName, attributes and children.
   * Children are parsed with parseChildren, which makes the parsing recursive.
   */
  function parseNode(): Node {
    pos++;
    const tagName = parseName();
    const attributes: Record<string, string | null> = {};
    let children: (Node | string)[] = [];

    // parsing attributes
    while (source.charCodeAt(pos) !== CLOSE_BRACKET_CC && source[pos]) {
      if (isAsciiLetter(source.charCodeAt(pos))) {
        const name = parseName();
        // search the beginning of the value string
        let code = source.charCodeAt(pos);
        while (
          code &&
          code !== SINGLE_QUOTE_CC &&
          code !== DOUBLE_QUOTE_CC &&
          !isAsciiLetter(code) &&
          code !== CLOSE_BRACKET_CC
        ) {
          pos++;
          code = source.charCodeAt(pos);
        }
        let value: string | null;
        if (code === SINGLE_QUOTE_CC || code === DOUBLE_QUOTE_CC) {
          value = parseString();
          if (pos === -1) {
            // unterminated attribute value: give up on this node
            return { tagName, attributes, children };
          }
        } else {
          value = null;
          pos--;
        }
        attributes[name] = value;
      }
      pos++;
    }

    // optional parsing of children
    if (source.charCodeAt(pos - 1) === SLASH_CC) {
      // self-closing tag
      pos++;
    } else if (tagName == 'script') {
      children = parseRawText('</script>');
    } else if (tagName == 'style') {
      children = parseRawText('</style>');
    } else if (noChildNodes.indexOf(tagName) === -1) {
      pos++;
      children = parseChildren(tagName);
    } else {
      pos++;
    }
    return { tagName, attributes, children };
  }

  /**
   * Parses a quoted string that starts at pos and ends with the same quote character.
   * Leaves pos on the closing quote, or at -1 when there is none.
   */
  function parseString(): string {
    const startChar = source[pos];
    const startpos = pos + 1;
    pos = source.indexOf(startChar, startpos);
    return source.slice(startpos, pos);
  }

  // The result is a node list, a single node (parseNode) or a simplified
  // object (simplify), hence the loose type.
  let out: any;
  if (options.attrValue !== undefined) {
    // Search mode: collect every element whose attribute matches.
    const attrName = options.attrName || 'id';
    const matcher = new RegExp('\\s' + attrName + '\\s*=[\'"]' + options.attrValue + '[\'"]');
    const found: (Node | string)[] = [];
    let match = matcher.exec(source);
    while (match !== null) {
      pos = source.lastIndexOf('<', match.index);
      if (pos !== -1) {
        found.push(parseNode());
      }
      // continue the search behind the element that was just parsed
      source = source.slice(pos);
      pos = 0;
      match = matcher.exec(source);
    }
    out = found;
  } else if (options.parseNode) {
    out = parseNode();
  } else {
    out = parseChildren('');
  }

  if (options.filter) {
    out = filter(asArray(out), options.filter);
  }

  if (options.simplify) {
    return simplify(asArray(out));
  }

  if (options.setPos) {
    out.pos = pos;
  }

  return out;
}

/**
 * Transforms the DOM object into an object like the one produced by PHP's simplexml_load_*() functions.
 * This format helps you write code that is more likely to keep working, even if there are small changes in the XML schema.
 * Be aware that it is not possible to reproduce the original XML from a simplified version, because the order of elements is not saved.
 *
 * @param children the children list
 * @returns '' for an empty list, the text for a single text child, otherwise an
 *          object keyed by tag name (a tag that occurs once is unwrapped from its array)
 */
export function simplify(children: any[]): any {
  if (!children.length) {
    return '';
  }

  if (children.length === 1 && typeof children[0] == 'string') {
    return children[0];
  }

  const out: Record<string, any> = {};
  // map each object
  children.forEach(function (child: any) {
    if (typeof child !== 'object' || child === null) {
      return;
    }
    const kids = simplify(child.children || []);
    bucketFor(out, child.tagName).push(kids);
    if (Object.keys(child.attributes || {}).length && typeof kids !== 'string') {
      kids._attributes = child.attributes;
    }
  });

  for (const key of Object.keys(out)) {
    if (out[key].length == 1) {
      out[key] = out[key][0];
    }
  }

  return out;
}

/**
 * Similar to simplify, but loses less: every tag maps to an array, and the
 * attributes of text-only elements are kept.
 *
 * @param children the children list
 * @param parentAttributes attributes of the element owning `children`
 */
export function simplifyLostLess(children: any[], parentAttributes = {}): any {
  const out: Record<string, any> = {};
  if (!children.length) {
    return out;
  }

  if (children.length === 1 && typeof children[0] == 'string') {
    return Object.keys(parentAttributes).length
      ? { _attributes: parentAttributes, value: children[0] }
      : children[0];
  }

  // map each object
  children.forEach(function (child: any) {
    if (typeof child !== 'object' || child === null) {
      return;
    }
    const attributes = child.attributes || {};
    const kids = simplifyLostLess(child.children || [], attributes);
    bucketFor(out, child.tagName).push(kids);
    if (Object.keys(attributes).length) {
      kids._attributes = attributes;
    }
  });

  return out;
}

/**
 * Behaves like Array.filter, but walks the whole tree: every node for which the
 * filter method returns a truthy value is put into the flat result list.
 *
 * @param children the children of a node
 * @param f the filter method, called with (node, index, depth, path)
 * @param dept nesting depth of `children`
 * @param path dot-separated `index.tagName` trail of the ancestors
 */
export function filter(
  children: any[],
  f: (arg0: any, arg1: any, arg2: number, arg3: string) => any,
  dept = 0,
  path = ''
): any[] {
  let out: any[] = [];
  children.forEach(function (child: any, i: number) {
    if (typeof child === 'object' && f(child, i, dept, path)) out.push(child);
    if (child && child.children) {
      const kids = filter(child.children, f, dept + 1, (path ? path + '.' : '') + i + '.' + child.tagName);
      out = out.concat(kids);
    }
  });
  return out;
}

/**
 * Stringifies a previously parsed object.
 * This is useful
 *  1. to remove whitespace
 *  2. to recreate XML data with some changed data.
 *
 * @param O the node list to stringify
 */
export function stringify(O: any) {
  let out = '';

  function writeChildren(nodes: any) {
    if (nodes) {
      for (let i = 0; i < nodes.length; i++) {
        if (typeof nodes[i] == 'string') {
          out += nodes[i].trim();
        } else {
          writeNode(nodes[i]);
        }
      }
    }
  }

  function writeNode(N: Node) {
    out += '<' + N.tagName;
    for (const name in N.attributes) {
      const value = N.attributes[name];
      if (value === null) {
        out += ' ' + name;
      } else if (value.indexOf('"') === -1) {
        out += ' ' + name + '="' + value.trim() + '"';
      } else {
        // the value contains double quotes, so it is wrapped in single quotes
        out += ' ' + name + "='" + value.trim() + "'";
      }
    }
    if (N.tagName[0] === '?') {
      // processing instructions have no closing tag
      out += '?>';
      return;
    }
    out += '>';
    writeChildren(N.children);
    out += '</' + N.tagName + '>';
  }
  writeChildren(O);

  return out;
}

/**
 * Use this method to read the text content of some node.
 * It is useful for mixed content, where text and child elements alternate.
 *
 * @param tDom a text, a node or a list of both
 * @returns the concatenated text; a plain string input is returned with a leading space
 */
export function toContentString(tDom: string | Node | (string | Node)[] | null | undefined): string {
  if (tDom == null) {
    return '';
  }
  if (Array.isArray(tDom)) {
    let out = '';
    tDom.forEach(function (e) {
      out += ' ' + toContentString(e);
      out = out.trim();
    });
    return out;
  } else if (typeof tDom === 'object') {
    return toContentString(tDom.children);
  } else {
    return ' ' + tDom;
  }
}

/**
 * Finds the element whose `id` attribute equals `id`.
 *
 * @param S the XML source
 * @param id the id to look for, matched literally
 * @param simplified when truthy the result is passed through simplify
 */
export function getElementById(S: string, id: any, simplified: any) {
  // an undefined id keeps its old meaning: no attribute search, the whole source is parsed
  const out = parse(S, {
    attrValue: id === undefined ? undefined : escapeRegExp(String(id)),
  });
  return simplified ? simplify(out) : out[0];
}

/**
 * Finds the elements whose `class` attribute contains `classname`.
 *
 * @param S the XML source
 * @param classname the class name to look for, matched literally
 * @param simplified when truthy the result is passed through simplify
 */
export function getElementsByClassName(S: string, classname: string, simplified: any) {
  const out = parse(S, {
    attrName: 'class',
    attrValue: '[a-zA-Z0-9- ]*' + escapeRegExp(classname) + '[a-zA-Z0-9- ]*',
  });
  return simplified ? simplify(out) : out;
}