nc

nest-crawler

An easiest crawling and scraping module for NestJS

Showing:

Popularity

Downloads/wk

44

GitHub Stars

28

Maintenance

Last Commit

2yrs ago

Contributors

2

Package

Dependencies

1

Size (min+gzip)

262.0KB

License

MIT

Type Definitions

Tree-Shakeable

No?

Categories

Readme

Nest Logo

😎 nest-crawler 😎

Crawler and Scraper Module for NestJS

Package License (MIT)

Installation

$ npm install --save nest-crawler

Usage

First, register it in the application module so that Nest can handle dependencies:

import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';

@Module({
  imports: [
    NestCrawlerModule,
  ],
})
export class AppModule {}

Then, just import it and use it:

crawler.module.ts

import { Module } from '@nestjs/common';
import { NestCrawlerModule } from 'nest-crawler';
@Module({
  imports: [
    NestCrawlerModule,
  ],
})
export class CrawlerModule {}

crawler.service.ts

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  // scraping the specific page
  public async scrape(): Promise<void> {
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const data: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        info: {
          selector: 'p > a',
          attr: 'href',
        },
        content: {
          selector: '.content',
          how: 'html',
        },
      },
    });

    console.log(data);
    // {
    //   title: 'Example Domain',
    //   info: 'http://www.iana.org/domains/example',
    //   content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
    // }
  }

  // crawling multi pages is also supported
  public async crawl(): Promise<void> {
    interface HackerNewsPage {
      title: string;
    }

    const pages: HackerNewsPage[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

Recipe

Single Page Scraping

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async scrape(): Promise<void> {
    interface ExampleCom {
      title: string;
      info: string;
      content: string;
    }

    const data: ExampleCom = await this.crawler.fetch({
      target: 'http://example.com',
      fetch: {
        title: 'h1',
        info: {
          selector: 'p > a',
          attr: 'href',
        },
        content: {
          selector: '.content',
          how: 'html',
        }
      },
    });

    console.log(data);
    // {
    //   title: 'Example Domain',
    //   info: 'http://www.iana.org/domains/example',
    //   content: '<div><h1>Example Heading</h1><p>Example Paragraph</p></div>'
    // }
  }
}

Multi Pages Crawling

You Know the target urls already

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Site {
      title: string;
    }

    const sites: Site[] = await this.crawler.fetch({
      target: [
        'https://example1.com',
        'https://example2.com',
        'https://example3.com',
      ],
      fetch: (data: any, index: number, url: string) => ({
        title: 'h1',
      }),
    });

    console.log(sites);
    // [
    //   { title: 'An easiest crawling and scraping module for NestJS' },
    //   { title: 'A minimalistic boilerplate on top of Webpack, Babel, TypeScript and React' },
    //   { title: '[Experimental] React SSR as a view template engine' }
    // ]
  }
}

You Don't Know the Target Urls so Want to Crawl Dynamically

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const pages: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      // fetch each `https://news.ycombinator.com/${x}` and scrape data
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

You Need to Pass Data Dynamically

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Img {
      src: string;
    }

    const images: Img[] = await this.crawler.fetch({
      target: {
        url: 'https://some.image.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://some.image.com${x}`,
        },
        fetch: {
          imageIds: {
            listItem: 'div.image',
            data: {
              id: {
                selector: 'div.image-wrapper',
                attr: 'data-image-id',
              },
            },
          },
        },
      },
      // fetch each `https://some.image.com${x}`, pass data and scrape data
      fetch: (data: any, index: number, url: string) => ({
        src: {
          convert: () => `https://some.image.com/images/${data.imageIds[index]}.png`,
        },
      }),
    });

    console.log(images);
    // [
    //   { src: 'https://some.image.com/images/1.png' },
    //   { src: 'https://some.image.com/images/2.png' },
    //   ...
    //   ...
    //   { src: 'https://some.image.com/images/100.png' }
    // ]
  }
}

Waitable (by using puppeteer)

import { Injectable } from '@nestjs/common';
import { NestCrawlerService } from 'nest-crawler';

@Injectable()
export class CrawlerService {
  constructor(
    private readonly crawler: NestCrawlerService,
  ) {}

  public async crawl(): Promise<void> {
    interface Page {
      title: string;
    }

    const pages: Page[] = await this.crawler.fetch({
      target: {
        url: 'https://news.ycombinator.com',
        iterator: {
          selector: 'span.age > a',
          convert: (x: string) => `https://news.ycombinator.com/${x}`,
        },
      },
      waitFor: 3 * 1000, // wait for the content loaded! (like single page apps)
      fetch: (data: any, index: number, url: string) => ({
        title: '.title > a',
      }),
    });

    console.log(pages);
    // [
    //   { title: 'Post Title 1' },
    //   { title: 'Post Title 2' },
    //   ...
    //   ...
    //   { title: 'Post Title 30' }
    // ]
  }
}

Rate & Review

Great Documentation0
Easy to Use0
Performant0
Highly Customizable0
Bleeding Edge0
Responsive Maintainers0
Poor Documentation0
Hard to Use0
Slow0
Buggy0
Abandoned0
Unwelcoming Community0
100
No reviews found
Be the first to rate

Alternatives

No alternatives found

Tutorials

No tutorials found
Add a tutorial