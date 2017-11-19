Now with less explosions!

The purpose of this library is not to be the best XML parsing library ever conceived. Because it's not. It's meant to be an HTML/XML parser that doesn't require valid HTML/XML. It's also meant to act as a sanitizer, which is the main reason for its existence.

For example, you can just shove a blob of text into it, and it will happily parse it as if it were valid XML.

Licensed under MIT.

Installation

npm install html-parser

Callback based parsing

var htmlParser = require ( 'html-parser' ); var html = '<!doctype html><html><body onload="alert(\'hello\');">Hello<br />world</body></html>' ; htmlParser.parse(html, { openElement : function ( name ) { console .log( 'open: %s' , name); }, closeOpenedElement : function ( name, token, unary ) { console .log( 'token: %s, unary: %s' , token, unary); }, closeElement : function ( name ) { console .log( 'close: %s' , name); }, comment : function ( value ) { console .log( 'comment: %s' , value); }, cdata : function ( value ) { console .log( 'cdata: %s' , value); }, attribute : function ( name, value ) { console .log( 'attribute: %s=%s' , name, value); }, docType : function ( value ) { console .log( 'doctype: %s' , value); }, text : function ( value ) { console .log( 'text: %s' , value); } });

Sanitization

var htmlParser = require ( 'html-parser' ); var html = '<script>alert(\'danger!\')</script><p onclick="alert(\'danger!\')">blah blah<!-- useless comment --></p>' ; var sanitized = htmlParser.sanitize(html, { elements : [ 'script' ], attributes : [ 'onclick' ], comments : true }); console .log(sanitized);

Using callbacks

var htmlParser = require ( 'html-parser' ); var html = '<script>alert(\'danger!\')</script><p onclick="alert(\'danger!\')">blah blah<!-- useless comment --></p>' ; var sanitized = htmlParser.sanitize(html, { elements : function ( name ) { return name === 'script' ; }, attributes : function ( name, value ) { return /^on/i .test(name) || /^javascript:/i .test(value); }, comments : true }); console .log(sanitized);

Custom data elements

You can parse custom data elements, such as PHP code or Underscore templates, using the regex.dataElements config.

helpers.parseString( '<div><?= "<div>$var</div>" ?></div>' , { openElement : function ( name ) { console .log(name); }, closeElement : function ( name ) { console .log(name); }, phpEcho : function ( value ) { console .log(value); } }, { dataElements : { phpEcho : { start : '<?=' , data : function ( string ) { var index = string.indexOf( '?>' ), code = string.slice( 0 , index); return code; return { length : code.length, someProperty : code }; }, end : '?>' } } });

API

- parse(htmlString, callbacks, regex)
- parseFile(fileName, encoding, callbacks, callback)
- sanitize(htmlString, removalCallbacks)

