// htmlparser.js
  /*!
   * HTML Parser By John Resig (ejohn.org)
   * Modified by Juriy "kangax" Zaytsev
   * Original code by Erik Arvidsson, Mozilla Public License
   * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
   */
  /*
   * // Use like so (HTMLParser is a class; parse() is async):
   * await new HTMLParser(htmlString, {
   *   start: function(tag, attrs, unary) {},
   *   end: function(tag) {},
   *   chars: function(text) {},
   *   comment: function(text) {}
   * }).parse();
   *
   * // or to get an XML string:
   * HTMLtoXML(htmlString);
   *
   * // or to get an XML DOM Document
   * HTMLtoDOM(htmlString);
   *
   * // or to inject into an existing document/DOM node
   * HTMLtoDOM(htmlString, document);
   * HTMLtoDOM(htmlString, document.body);
   *
   */
  /* global ActiveXObject, DOMDocument */

  'use strict';

  var createMapFromString = require('./utils').createMapFromString;
  var replaceAsync = require('./utils').replaceAsync;

  /**
   * Build a lookup from a comma-separated list of names.
   *
   * Thin wrapper that forwards to `createMapFromString(values, true)`.
   * The rest of this file invokes the result as a predicate
   * (e.g. `empty(tagName)`), so the helper is presumably expected to
   * return a function -- NOTE(review): confirm against ./utils.
   *
   * @param {string} values - comma-separated names
   * @returns {*} whatever `createMapFromString` returns for these values
   */
  function makeMap(values) {
    return createMapFromString(values, true);
  }
  // Regular expressions for parsing tags and attributes.

  // Attribute name: one or more characters that are not whitespace, quotes,
  // angle brackets, a slash or an equals sign.
  const singleAttrIdentifier = /([^\s"'<>/=]+)/;

  // Built-in name/value separators; handlers may contribute more through
  // `customAttrAssign` (see joinSingleAttrAssigns).
  const singleAttrAssigns = [/=/];

  // Regex sources for the three attribute value forms. Each one contributes
  // exactly one capture group.
  const singleAttrValues = [
    // attr value double quotes
    /"([^"]*)"+/.source,
    // attr value, single quotes
    /'([^']*)'+/.source,
    // attr value, no quotes
    /([^ \t\n\f\r"'`=<>]+)/.source
  ];

  // https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
  // Regex source with a single capture group matching `prefix:local` or `local`.
  const qnameCapture = (function() {
    // based on https://www.npmjs.com/package/ncname
    var combiningChar = '\\u0300-\\u0345\\u0360\\u0361\\u0483-\\u0486\\u0591-\\u05A1\\u05A3-\\u05B9\\u05BB-\\u05BD\\u05BF\\u05C1\\u05C2\\u05C4\\u064B-\\u0652\\u0670\\u06D6-\\u06E4\\u06E7\\u06E8\\u06EA-\\u06ED\\u0901-\\u0903\\u093C\\u093E-\\u094D\\u0951-\\u0954\\u0962\\u0963\\u0981-\\u0983\\u09BC\\u09BE-\\u09C4\\u09C7\\u09C8\\u09CB-\\u09CD\\u09D7\\u09E2\\u09E3\\u0A02\\u0A3C\\u0A3E-\\u0A42\\u0A47\\u0A48\\u0A4B-\\u0A4D\\u0A70\\u0A71\\u0A81-\\u0A83\\u0ABC\\u0ABE-\\u0AC5\\u0AC7-\\u0AC9\\u0ACB-\\u0ACD\\u0B01-\\u0B03\\u0B3C\\u0B3E-\\u0B43\\u0B47\\u0B48\\u0B4B-\\u0B4D\\u0B56\\u0B57\\u0B82\\u0B83\\u0BBE-\\u0BC2\\u0BC6-\\u0BC8\\u0BCA-\\u0BCD\\u0BD7\\u0C01-\\u0C03\\u0C3E-\\u0C44\\u0C46-\\u0C48\\u0C4A-\\u0C4D\\u0C55\\u0C56\\u0C82\\u0C83\\u0CBE-\\u0CC4\\u0CC6-\\u0CC8\\u0CCA-\\u0CCD\\u0CD5\\u0CD6\\u0D02\\u0D03\\u0D3E-\\u0D43\\u0D46-\\u0D48\\u0D4A-\\u0D4D\\u0D57\\u0E31\\u0E34-\\u0E3A\\u0E47-\\u0E4E\\u0EB1\\u0EB4-\\u0EB9\\u0EBB\\u0EBC\\u0EC8-\\u0ECD\\u0F18\\u0F19\\u0F35\\u0F37\\u0F39\\u0F3E\\u0F3F\\u0F71-\\u0F84\\u0F86-\\u0F8B\\u0F90-\\u0F95\\u0F97\\u0F99-\\u0FAD\\u0FB1-\\u0FB7\\u0FB9\\u20D0-\\u20DC\\u20E1\\u302A-\\u302F\\u3099\\u309A';
    var digit = '0-9\\u0660-\\u0669\\u06F0-\\u06F9\\u0966-\\u096F\\u09E6-\\u09EF\\u0A66-\\u0A6F\\u0AE6-\\u0AEF\\u0B66-\\u0B6F\\u0BE7-\\u0BEF\\u0C66-\\u0C6F\\u0CE6-\\u0CEF\\u0D66-\\u0D6F\\u0E50-\\u0E59\\u0ED0-\\u0ED9\\u0F20-\\u0F29';
    var extender = '\\xB7\\u02D0\\u02D1\\u0387\\u0640\\u0E46\\u0EC6\\u3005\\u3031-\\u3035\\u309D\\u309E\\u30FC-\\u30FE';
    // NOTE: this literal must stay on ONE line; a hard wrap inside it is a
    // syntax error and splits the \u115F-\u1161 range.
    var letter = 'A-Za-z\\xC0-\\xD6\\xD8-\\xF6\\xF8-\\u0131\\u0134-\\u013E\\u0141-\\u0148\\u014A-\\u017E\\u0180-\\u01C3\\u01CD-\\u01F0\\u01F4\\u01F5\\u01FA-\\u0217\\u0250-\\u02A8\\u02BB-\\u02C1\\u0386\\u0388-\\u038A\\u038C\\u038E-\\u03A1\\u03A3-\\u03CE\\u03D0-\\u03D6\\u03DA\\u03DC\\u03DE\\u03E0\\u03E2-\\u03F3\\u0401-\\u040C\\u040E-\\u044F\\u0451-\\u045C\\u045E-\\u0481\\u0490-\\u04C4\\u04C7\\u04C8\\u04CB\\u04CC\\u04D0-\\u04EB\\u04EE-\\u04F5\\u04F8\\u04F9\\u0531-\\u0556\\u0559\\u0561-\\u0586\\u05D0-\\u05EA\\u05F0-\\u05F2\\u0621-\\u063A\\u0641-\\u064A\\u0671-\\u06B7\\u06BA-\\u06BE\\u06C0-\\u06CE\\u06D0-\\u06D3\\u06D5\\u06E5\\u06E6\\u0905-\\u0939\\u093D\\u0958-\\u0961\\u0985-\\u098C\\u098F\\u0990\\u0993-\\u09A8\\u09AA-\\u09B0\\u09B2\\u09B6-\\u09B9\\u09DC\\u09DD\\u09DF-\\u09E1\\u09F0\\u09F1\\u0A05-\\u0A0A\\u0A0F\\u0A10\\u0A13-\\u0A28\\u0A2A-\\u0A30\\u0A32\\u0A33\\u0A35\\u0A36\\u0A38\\u0A39\\u0A59-\\u0A5C\\u0A5E\\u0A72-\\u0A74\\u0A85-\\u0A8B\\u0A8D\\u0A8F-\\u0A91\\u0A93-\\u0AA8\\u0AAA-\\u0AB0\\u0AB2\\u0AB3\\u0AB5-\\u0AB9\\u0ABD\\u0AE0\\u0B05-\\u0B0C\\u0B0F\\u0B10\\u0B13-\\u0B28\\u0B2A-\\u0B30\\u0B32\\u0B33\\u0B36-\\u0B39\\u0B3D\\u0B5C\\u0B5D\\u0B5F-\\u0B61\\u0B85-\\u0B8A\\u0B8E-\\u0B90\\u0B92-\\u0B95\\u0B99\\u0B9A\\u0B9C\\u0B9E\\u0B9F\\u0BA3\\u0BA4\\u0BA8-\\u0BAA\\u0BAE-\\u0BB5\\u0BB7-\\u0BB9\\u0C05-\\u0C0C\\u0C0E-\\u0C10\\u0C12-\\u0C28\\u0C2A-\\u0C33\\u0C35-\\u0C39\\u0C60\\u0C61\\u0C85-\\u0C8C\\u0C8E-\\u0C90\\u0C92-\\u0CA8\\u0CAA-\\u0CB3\\u0CB5-\\u0CB9\\u0CDE\\u0CE0\\u0CE1\\u0D05-\\u0D0C\\u0D0E-\\u0D10\\u0D12-\\u0D28\\u0D2A-\\u0D39\\u0D60\\u0D61\\u0E01-\\u0E2E\\u0E30\\u0E32\\u0E33\\u0E40-\\u0E45\\u0E81\\u0E82\\u0E84\\u0E87\\u0E88\\u0E8A\\u0E8D\\u0E94-\\u0E97\\u0E99-\\u0E9F\\u0EA1-\\u0EA3\\u0EA5\\u0EA7\\u0EAA\\u0EAB\\u0EAD\\u0EAE\\u0EB0\\u0EB2\\u0EB3\\u0EBD\\u0EC0-\\u0EC4\\u0F40-\\u0F47\\u0F49-\\u0F69\\u10A0-\\u10C5\\u10D0-\\u10F6\\u1100\\u1102\\u1103\\u1105-\\u1107\\u1109\\u110B\\u110C\\u110E-\\u1112\\u113C\\u113E\\u1140\\u114C\\u114E\\u1150\\u1154\\u1155\\u1159\\u115F-\\u1161\\u1163\\u1165\\u1167\\u1169\\u116D\\u116E\\u1172\\u1173\\u1175\\u119E\\u11A8\\u11AB\\u11AE\\u11AF\\u11B7\\u11B8\\u11BA\\u11BC-\\u11C2\\u11EB\\u11F0\\u11F9\\u1E00-\\u1E9B\\u1EA0-\\u1EF9\\u1F00-\\u1F15\\u1F18-\\u1F1D\\u1F20-\\u1F45\\u1F48-\\u1F4D\\u1F50-\\u1F57\\u1F59\\u1F5B\\u1F5D\\u1F5F-\\u1F7D\\u1F80-\\u1FB4\\u1FB6-\\u1FBC\\u1FBE\\u1FC2-\\u1FC4\\u1FC6-\\u1FCC\\u1FD0-\\u1FD3\\u1FD6-\\u1FDB\\u1FE0-\\u1FEC\\u1FF2-\\u1FF4\\u1FF6-\\u1FFC\\u2126\\u212A\\u212B\\u212E\\u2180-\\u2182\\u3007\\u3021-\\u3029\\u3041-\\u3094\\u30A1-\\u30FA\\u3105-\\u312C\\u4E00-\\u9FA5\\uAC00-\\uD7A3';
    // NCName: a letter or underscore followed by any number of name characters.
    var ncname = '[' + letter + '_][' + letter + digit + '\\.\\-_' + combiningChar + extender + ']*';
    // QName: optional `prefix:` followed by the local name, captured as a whole.
    return '((?:' + ncname + '\\:)?' + ncname + ')';
  })();

  // `<name` at the start of the input; group 1 is the qualified tag name.
  const startTagOpen = new RegExp('^<' + qnameCapture);
  // `>` or `/>` at the start of the input; group 1 is the optional slash.
  const startTagClose = /^\s*(\/?)>/;
  // `</name ...>` at the start of the input; group 1 is the qualified tag name.
  const endTag = new RegExp('^<\\/' + qnameCapture + '[^>]*>');
  // `<!DOCTYPE ...>` at the start of the input, case-insensitive.
  const doctype = /^<!DOCTYPE\s?[^>]+>/i;
  // Feature probe: in `/x(.)?/` matched against 'x', the optional group does
  // not participate, so the replace callback should receive `undefined` for
  // it. The flag becomes true only on engines that report '' instead (the
  // Firefox bug worked around where attribute captures are read).
  var IS_REGEX_CAPTURING_BROKEN = false;
  'x'.replace(/x(.)?/g, function(match, group) {
    IS_REGEX_CAPTURING_BROKEN = group === '';
  });
  // Empty Elements
  var empty = makeMap('area,base,basefont,br,col,embed,frame,hr,img,input,isindex,keygen,link,meta,param,source,track,wbr');

  // Inline Elements
  var inline = makeMap('a,abbr,acronym,applet,b,basefont,bdo,big,br,button,cite,code,del,dfn,em,font,i,iframe,img,input,ins,kbd,label,map,noscript,object,q,s,samp,script,select,small,span,strike,strong,sub,sup,svg,textarea,tt,u,var');

  // Elements that you can, intentionally, leave open
  // (and which close themselves)
  var closeSelf = makeMap('colgroup,dd,dt,li,option,p,td,tfoot,th,thead,tr,source');

  // Attributes that have their values filled in disabled='disabled'
  var fillAttrs = makeMap('checked,compact,declare,defer,disabled,ismap,multiple,nohref,noresize,noshade,nowrap,readonly,selected');

  // Special Elements (can contain anything)
  var special = makeMap('script,style');

  // HTML5 tags https://html.spec.whatwg.org/multipage/indices.html#elements-3
  // Phrasing Content https://html.spec.whatwg.org/multipage/dom.html#phrasing-content
  var nonPhrasing = makeMap('address,article,aside,base,blockquote,body,caption,col,colgroup,dd,details,dialog,div,dl,dt,fieldset,figcaption,figure,footer,form,h1,h2,h3,h4,h5,h6,head,header,hgroup,hr,html,legend,li,menuitem,meta,ol,optgroup,option,param,rp,rt,source,style,summary,tbody,td,tfoot,th,thead,title,tr,track,ul');

  // Cache of "everything up to the closing tag" regexes, keyed by the
  // lower-cased name of the special element being closed.
  var reCache = {};
  /**
   * Build the regex that matches one attribute at the start of the input.
   *
   * The base pattern is `name`, optionally followed by an assignment
   * operator and a double-quoted, single-quoted or unquoted value. It has
   * five capture groups: name, assign, and one per value form.
   *
   * When `handler.customAttrSurround` is set, one alternative is emitted per
   * `[open, close]` regex pair (open + base pattern + close = seven capture
   * groups each), followed by the plain base pattern as the last alternative.
   *
   * @param {Object} handler - parser handler/options; reads
   *   `customAttrSurround` here and `customAttrAssign` via joinSingleAttrAssigns
   * @returns {RegExp} regex anchored at the start of the input, allowing
   *   leading whitespace
   */
  function attrForHandler(handler) {
    var basePattern = singleAttrIdentifier.source +
      '(?:\\s*(' + joinSingleAttrAssigns(handler) + ')' +
      '[ \\t\\n\\f\\r]*(?:' + singleAttrValues.join('|') + '))?';
    var pattern = basePattern;

    if (handler.customAttrSurround) {
      var attrClauses = [];
      for (var i = 0; i < handler.customAttrSurround.length; i++) {
        var surround = handler.customAttrSurround[i];
        attrClauses.push(
          '(?:' +
          '(' + surround[0].source + ')\\s*' +
          basePattern +
          '\\s*(' + surround[1].source + ')' +
          ')'
        );
      }
      // Un-surrounded attributes are always accepted as the last alternative.
      attrClauses.push('(?:' + basePattern + ')');
      pattern = '(?:' + attrClauses.join('|') + ')';
    }

    return new RegExp('^\\s*' + pattern);
  }
  /**
   * Build the regex alternation of every accepted attribute assignment
   * operator: the built-in `=` plus any regexes in `handler.customAttrAssign`.
   *
   * @param {Object} handler - parser handler/options; reads `customAttrAssign`
   * @returns {string} regex source, each operator wrapped in a non-capturing
   *   group and joined with `|`
   */
  function joinSingleAttrAssigns(handler) {
    var assigns = singleAttrAssigns.concat(handler.customAttrAssign || []);
    return assigns
      .map(function(assign) {
        return '(?:' + assign.source + ')';
      })
      .join('|');
  }
  /**
   * Handler-driven HTML tokenizer.
   *
   * Construct with the markup and a handler object, then `await parse()`.
   * Handler callbacks (all optional, all awaited, so they may be async):
   *   - start(tagName, attrs, unary, unarySlash[, autoGenerated])
   *   - end(tagName, attrs[, autoGenerated])
   *   - chars(text[, prevTag, nextTag])
   *   - comment(text[, nonStandard])
   *   - doctype(text)
   * Handler options read by the parser: `html5`, `partialMarkup`,
   * `continueOnParseError`, `customAttrAssign`, `customAttrSurround`.
   */
  class HTMLParser {
    /**
     * @param {string} html - markup to parse
     * @param {Object} handler - callbacks and options (see class comment)
     */
    constructor(html, handler) {
      this.html = html;
      this.handler = handler;
    }

    /**
     * Tokenize `this.html`, invoking the handler callbacks in document order.
     *
     * @returns {Promise<void>}
     * @throws {Error} 'Parse Error: ...' when an iteration consumes no input
     */
    async parse() {
      let html = this.html;
      const handler = this.handler;
      const stack = []; // open elements: { tag, attrs }
      let lastTag; // tag of the innermost open element, falsy when none
      const attribute = attrForHandler(handler);
      let last, prevTag, nextTag;

      while (html) {
        last = html;
        // Make sure we're not in a script or style element
        if (!lastTag || !special(lastTag)) {
          let textEnd = html.indexOf('<');
          if (textEnd === 0) {
            // Comment:
            if (/^<!--/.test(html)) {
              const commentEnd = html.indexOf('-->');
              if (commentEnd >= 0) {
                if (handler.comment) {
                  await handler.comment(html.substring(4, commentEnd));
                }
                html = html.substring(commentEnd + 3);
                prevTag = '';
                continue;
              }
            }

            // https://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
            if (/^<!\[/.test(html)) {
              const conditionalEnd = html.indexOf(']>');
              if (conditionalEnd >= 0) {
                if (handler.comment) {
                  await handler.comment(html.substring(2, conditionalEnd + 1), true /* non-standard */);
                }
                html = html.substring(conditionalEnd + 2);
                prevTag = '';
                continue;
              }
            }

            // Doctype:
            const doctypeMatch = html.match(doctype);
            if (doctypeMatch) {
              if (handler.doctype) {
                // Awaited like every other callback so an async handler
                // finishes before the next token is dispatched.
                await handler.doctype(doctypeMatch[0]);
              }
              html = html.substring(doctypeMatch[0].length);
              prevTag = '';
              continue;
            }

            // End tag:
            const endTagMatch = html.match(endTag);
            if (endTagMatch) {
              html = html.substring(endTagMatch[0].length);
              // NOTE(review): relies on replaceAsync passing (match, group1, ...)
              // to the callback like String.prototype.replace -- confirm in ./utils.
              await replaceAsync(endTagMatch[0], endTag, parseEndTag);
              prevTag = '/' + endTagMatch[1].toLowerCase();
              continue;
            }

            // Start tag:
            const startTagMatch = parseStartTag(html);
            if (startTagMatch) {
              html = startTagMatch.rest;
              await handleStartTag(startTagMatch);
              prevTag = startTagMatch.tagName.toLowerCase();
              continue;
            }

            // Treat `<` as text
            if (handler.continueOnParseError) {
              textEnd = html.indexOf('<', 1);
            }
          }

          let text;
          if (textEnd >= 0) {
            text = html.substring(0, textEnd);
            html = html.substring(textEnd);
          }
          else {
            text = html;
            html = '';
          }

          // next tag
          let nextTagMatch = parseStartTag(html);
          if (nextTagMatch) {
            nextTag = nextTagMatch.tagName;
          }
          else {
            nextTagMatch = html.match(endTag);
            if (nextTagMatch) {
              nextTag = '/' + nextTagMatch[1];
            }
            else {
              nextTag = '';
            }
          }

          if (handler.chars) {
            await handler.chars(text, prevTag, nextTag);
          }
          prevTag = '';
        }
        else {
          // Inside a special element: take everything up to its closing tag
          // as raw text.
          const stackedTag = lastTag.toLowerCase();
          const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)</' + stackedTag + '[^>]*>', 'i'));

          html = await replaceAsync(html, reStackedTag, async(_, text) => {
            if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') {
              text = text
                .replace(/<!--([\s\S]*?)-->/g, '$1')
                .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1');
            }

            if (handler.chars) {
              await handler.chars(text);
            }

            return '';
          });

          await parseEndTag('</' + stackedTag + '>', stackedTag);
        }

        // Nothing was consumed this iteration; bail out instead of looping forever.
        if (html === last) {
          throw new Error('Parse Error: ' + html);
        }
      }

      if (!handler.partialMarkup) {
        // Clean up any remaining tags
        await parseEndTag();
      }

      /**
       * Try to read one complete start tag from the beginning of `input`.
       * Returns `{ tagName, attrs, unarySlash, rest }`, or undefined when
       * the input does not begin with a well-formed start tag.
       */
      function parseStartTag(input) {
        const start = input.match(startTagOpen);
        if (start) {
          const match = {
            tagName: start[1],
            attrs: []
          };
          input = input.slice(start[0].length);
          let end, attr;
          while (!(end = input.match(startTagClose)) && (attr = input.match(attribute))) {
            input = input.slice(attr[0].length);
            match.attrs.push(attr);
          }
          if (end) {
            match.unarySlash = end[1];
            match.rest = input.slice(end[0].length);
            return match;
          }
        }
      }

      /** Close `tagName` if it is open; resolves to true when it was. */
      async function closeIfFound(tagName) {
        if (findTag(tagName) >= 0) {
          await parseEndTag('', tagName);
          return true;
        }
      }

      /**
       * Apply implicit-close rules for the incoming tag, normalise its
       * attribute matches, push it on the stack unless unary, and emit `start`.
       */
      async function handleStartTag(match) {
        const tagName = match.tagName;
        let unarySlash = match.unarySlash;

        if (handler.html5) {
          if (lastTag === 'p' && nonPhrasing(tagName)) {
            await parseEndTag('', lastTag);
          }
          else if (tagName === 'tbody') {
            await closeIfFound('thead');
          }
          else if (tagName === 'tfoot') {
            if (!await closeIfFound('tbody')) {
              await closeIfFound('thead');
            }
          }
          // A <col> outside any <colgroup> gets a synthesized one.
          if (tagName === 'col' && findTag('colgroup') < 0) {
            lastTag = 'colgroup';
            stack.push({ tag: lastTag, attrs: [] });
            if (handler.start) {
              await handler.start(lastTag, [], false, '');
            }
          }
        }

        if (!handler.html5 && !inline(tagName)) {
          while (lastTag && inline(lastTag)) {
            await parseEndTag('', lastTag);
          }
        }

        if (closeSelf(tagName) && lastTag === tagName) {
          await parseEndTag('', tagName);
        }

        const unary = empty(tagName) || (tagName === 'html' && lastTag === 'head') || !!unarySlash;

        const attrs = match.attrs.map(function(args) {
          let name, value, customOpen, customClose, customAssign, quote;
          const ncp = 7; // number of captured parts, scalar

          // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
          if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
            if (args[3] === '') { delete args[3]; }
            if (args[4] === '') { delete args[4]; }
            if (args[5] === '') { delete args[5]; }
          }

          // Reads assign + value starting at capture `index`; returns the
          // quote character that delimited the value ('' for none).
          function populate(index) {
            customAssign = args[index];
            value = args[index + 1];
            if (typeof value !== 'undefined') {
              return '"';
            }
            value = args[index + 2];
            if (typeof value !== 'undefined') {
              return '\'';
            }
            value = args[index + 3];
            if (typeof value === 'undefined' && fillAttrs(name)) {
              value = name;
            }
            return '';
          }

          let j = 1;
          if (handler.customAttrSurround) {
            // Each surround alternative owns `ncp` consecutive captures.
            for (let i = 0, l = handler.customAttrSurround.length; i < l; i++, j += ncp) {
              name = args[j + 1];
              if (name) {
                quote = populate(j + 2);
                customOpen = args[j];
                customClose = args[j + 6];
                break;
              }
            }
          }

          // No surround alternative matched: `j` now points at the plain one.
          if (!name && (name = args[j])) {
            quote = populate(j + 1);
          }

          return {
            name: name,
            value: value,
            customAssign: customAssign || '=',
            customOpen: customOpen || '',
            customClose: customClose || '',
            quote: quote || ''
          };
        });

        if (!unary) {
          stack.push({ tag: tagName, attrs: attrs });
          lastTag = tagName;
          unarySlash = '';
        }

        if (handler.start) {
          await handler.start(tagName, attrs, unary, unarySlash);
        }
      }

      /** Index of the innermost open `tagName` (case-insensitive), or -1. */
      function findTag(tagName) {
        let pos;
        const needle = tagName.toLowerCase();
        for (pos = stack.length - 1; pos >= 0; pos--) {
          if (stack[pos].tag.toLowerCase() === needle) {
            break;
          }
        }
        return pos;
      }

      /**
       * Close `tagName` and every element opened inside it. With no
       * `tagName`, closes everything still open. A stray `</br>` is emitted
       * as a `<br>` start; a stray `</p>` as an empty `<p></p>` pair.
       */
      async function parseEndTag(tag, tagName) {
        let pos;
        // Find the closest opened tag of the same type
        if (tagName) {
          pos = findTag(tagName);
        }
        // If no tag name is provided, clean shop
        else {
          pos = 0;
        }

        if (pos >= 0) {
          // Close all the open elements, up the stack
          for (let i = stack.length - 1; i >= pos; i--) {
            if (handler.end) {
              // Third argument is true for closes that have no matching
              // end tag in the source.
              await handler.end(stack[i].tag, stack[i].attrs, i > pos || !tag);
            }
          }

          // Remove the open elements from the stack
          stack.length = pos;
          lastTag = pos && stack[pos - 1].tag;
        }
        else if (tagName.toLowerCase() === 'br') {
          if (handler.start) {
            await handler.start(tagName, [], true, '');
          }
        }
        else if (tagName.toLowerCase() === 'p') {
          if (handler.start) {
            await handler.start(tagName, [], false, '', true);
          }
          if (handler.end) {
            await handler.end(tagName, []);
          }
        }
      }
    }
  }

  exports.HTMLParser = HTMLParser;
  391. exports.HTMLtoXML = function(html) {
  392. var results = '';
  393. new HTMLParser(html, {
  394. start: function(tag, attrs, unary) {
  395. results += '<' + tag;
  396. for (var i = 0, len = attrs.length; i < len; i++) {
  397. results += ' ' + attrs[i].name + '="' + (attrs[i].value || '').replace(/"/g, '&#34;') + '"';
  398. }
  399. results += (unary ? '/' : '') + '>';
  400. },
  401. end: function(tag) {
  402. results += '</' + tag + '>';
  403. },
  404. chars: function(text) {
  405. results += text;
  406. },
  407. comment: function(text) {
  408. results += '<!--' + text + '-->';
  409. },
  410. ignore: function(text) {
  411. results += text;
  412. }
  413. });
  414. return results;
  415. };
  416. exports.HTMLtoDOM = function(html, doc) {
  417. // There can be only one of these elements
  418. var one = {
  419. html: true,
  420. head: true,
  421. body: true,
  422. title: true
  423. };
  424. // Enforce a structure for the document
  425. var structure = {
  426. link: 'head',
  427. base: 'head'
  428. };
  429. if (doc) {
  430. doc = doc.ownerDocument || doc.getOwnerDocument && doc.getOwnerDocument() || doc;
  431. }
  432. else if (typeof DOMDocument !== 'undefined') {
  433. doc = new DOMDocument();
  434. }
  435. else if (typeof document !== 'undefined' && document.implementation && document.implementation.createDocument) {
  436. doc = document.implementation.createDocument('', '', null);
  437. }
  438. else if (typeof ActiveX !== 'undefined') {
  439. doc = new ActiveXObject('Msxml.DOMDocument');
  440. }
  441. var elems = [],
  442. documentElement = doc.documentElement ||
  443. doc.getDocumentElement && doc.getDocumentElement();
  444. // If we're dealing with an empty document then we
  445. // need to pre-populate it with the HTML document structure
  446. if (!documentElement && doc.createElement) {
  447. (function() {
  448. var html = doc.createElement('html');
  449. var head = doc.createElement('head');
  450. head.appendChild(doc.createElement('title'));
  451. html.appendChild(head);
  452. html.appendChild(doc.createElement('body'));
  453. doc.appendChild(html);
  454. })();
  455. }
  456. // Find all the unique elements
  457. if (doc.getElementsByTagName) {
  458. for (var i in one) {
  459. one[i] = doc.getElementsByTagName(i)[0];
  460. }
  461. }
  462. // If we're working with a document, inject contents into
  463. // the body element
  464. var curParentNode = one.body;
  465. new HTMLParser(html, {
  466. start: function(tagName, attrs, unary) {
  467. // If it's a pre-built element, then we can ignore
  468. // its construction
  469. if (one[tagName]) {
  470. curParentNode = one[tagName];
  471. return;
  472. }
  473. var elem = doc.createElement(tagName);
  474. for (var attr in attrs) {
  475. elem.setAttribute(attrs[attr].name, attrs[attr].value);
  476. }
  477. if (structure[tagName] && typeof one[structure[tagName]] !== 'boolean') {
  478. one[structure[tagName]].appendChild(elem);
  479. }
  480. else if (curParentNode && curParentNode.appendChild) {
  481. curParentNode.appendChild(elem);
  482. }
  483. if (!unary) {
  484. elems.push(elem);
  485. curParentNode = elem;
  486. }
  487. },
  488. end: function(/* tag */) {
  489. elems.length -= 1;
  490. // Init the new parentNode
  491. curParentNode = elems[elems.length - 1];
  492. },
  493. chars: function(text) {
  494. curParentNode.appendChild(doc.createTextNode(text));
  495. },
  496. comment: function(/* text */) {
  497. // create comment node
  498. },
  499. ignore: function(/* text */) {
  500. // What to do here?
  501. }
  502. });
  503. return doc;
  504. };
  505. exports.endTag = endTag;