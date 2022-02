Parse Web Archive (WARC) files or create WARC files using chrome-remote-interface, chrome-remote-interface-extra, or Puppeteer.

Run npm install node-warc or yarn add node-warc to get started

Documentation

Full documentation available at n0tan3rd.github.io/node-warc

Parsing

Using async iteration

Requires node 10 or greater

const fs = require ( 'fs' ) const zlib = require ( 'zlib' ) const { recordIterator } = require ( 'node-warc' ) async function iterateRecords ( warcStream ) { for await ( const record of recordIterator(warcStream)) { console .log(record) } } iterateRecords( fs.createReadStream( '<path-to-gzipd-warcfile>' ).pipe(zlib.createGunzip()) ).then( () => { console .log( 'done' ) })

Or using one of the parsers

for await ( const record of new AutoWARCParser( '<path-to-warcfile>' )) { console .log(record) }

Using Stream Transform

const fs = require ( 'fs' ) const { WARCStreamTransform } = require ( 'node-warc' ) fs .createReadStream( '<path-to-warcfile>' ) .pipe( new WARCStreamTransform()) .on( 'data' , record => { console .log(record) })

Both .warc and .warc.gz

const { AutoWARCParser } = require ( 'node-warc' ) const parser = new AutoWARCParser( '<path-to-warcfile>' ) parser.on( 'record' , record => { console .log(record) }) parser.on( 'done' , () => { console .log( 'finished' ) }) parser.on( 'error' , error => { console .error(error) }) parser.start()

Only gzip'd warc files

const { WARCGzParser } = require ( 'node-warc' ) const parser = new WARCGzParser( '<path-to-warcfile>' ) parser.on( 'record' , record => { console .log(record) }) parser.on( 'done' , () => { console .log( 'finished' ) }) parser.on( 'error' , error => { console .error(error) }) parser.start()

Only non gzip'd warc files

const { WARCParser } = require ( 'node-warc' ) const parser = new WARCParser( '<path-to-warcfile>' ) parser.on( 'record' , record => { console .log(record) }) parser.on( 'done' , () => { console .log( 'finished' ) }) parser.on( 'error' , error => { console .error(error) }) parser.start()

WARC Creation

Environment

NODEWARC_WRITE_GZIPPED - enable writing gzipped records to WARC outputs.

Examples

const CRI = require ( 'chrome-remote-interface' ) const { RemoteChromeWARCWriter, RemoteChromeCapturer } = require ( 'node-warc' ) ; ( async ( ) => { const client = await CRI() await Promise .all([ client.Page.enable(), client.Network.enable(), ]) const cap = new RemoteChromeCapturer(client.Network) cap.startCapturing() await client.Page.navigate({ url : 'http://example.com' }); await client.Page.loadEventFired() const warcGen = new RemoteChromeWARCWriter() await warcGen.generateWARC(cap, client.Network, { warcOpts : { warcPath : 'myWARC.warc' }, winfo : { description : 'I created a warc!' , isPartOf : 'My awesome pywb collection' } }) await client.close() })()

const { CRIExtra, Events, Page } = require ( 'chrome-remote-interface-extra' ) const { CRIExtraWARCGenerator, CRIExtraCapturer } = require ( 'node-warc' ) ; ( async ( ) => { let client try { client = await CRIExtra({ host : 'localhost' , port : 9222 }) const page = await Page.create(client) const cap = new CRIExtraCapturer(page, Events.Page.Request) cap.startCapturing() await page.goto( 'https://example.com' , { waitUntil : 'networkIdle' }) const warcGen = new CRIExtraWARCGenerator() await warcGen.generateWARC(cap, { warcOpts : { warcPath : 'myWARC.warc' }, winfo : { description : 'I created a warc!' , isPartOf : 'My awesome pywb collection' } }) } catch (err) { console .error(err) } finally { if (client) { await client.close() } } })()

const puppeteer = require ( 'puppeteer' ) const { Events } = require ( 'puppeteer' ) const { PuppeteerWARCGenerator, PuppeteerCapturer } = require ( 'node-warc' ) ; ( async ( ) => { const browser = await puppeteer.launch() const page = await browser.newPage() const cap = new PuppeteerCapturer(page, Events.Page.Request) cap.startCapturing() await page.goto( 'http://example.com' , { waitUntil : 'networkidle0' }) const warcGen = new PuppeteerWARCGenerator() await warcGen.generateWARC(cap, { warcOpts : { warcPath : 'myWARC.warc' }, winfo : { description : 'I created a warc!' , isPartOf : 'My awesome pywb collection' } }) await page.close() await browser.close() })()

Note

The generateWARC method used in the preceding examples is a helper function for making the WARC generation process simple. See its implementation for a full example of WARC generation using node-warc.

Or see one of the crawler implementations provided by Squidwarc.