├── .editorconfig
├── .eslintrc
├── .github
└── workflows
│ └── build-test.yaml
├── .gitignore
├── .npmignore
├── .prettierrc
├── CHANGELOG.md
├── LICENSE
├── README.md
├── __tests__
├── public
│ └── index.html
└── scraping.test.ts
├── changelog-ci-config.json
├── jest.config.js
├── package.json
├── src
├── DataModeler.ts
├── SchemeInterpreter.ts
├── index.ts
└── typings.ts
├── tsconfig.json
└── yarn.lock
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = space
5 | indent_size = 2
6 | charset = utf-8
7 | trim_trailing_whitespace = true
8 | insert_final_newline = true
9 |
--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
1 | {
2 | "env": {
3 | "es2021": true,
4 | "node": true
5 | },
6 | "parser": "@typescript-eslint/parser",
7 | "parserOptions": {
8 | "ecmaVersion": 2020,
9 | "sourceType": "module",
10 | "impliedStrict": true
11 | },
12 | "extends": [
13 | "standard",
14 | "plugin:@typescript-eslint/recommended",
15 | "prettier/@typescript-eslint",
16 | "plugin:prettier/recommended"
17 | ],
18 | "plugins": [
19 | "@typescript-eslint"
20 | ],
21 | "rules": {},
22 | "ignorePatterns": [
23 | "**/*.js"
24 | ]
25 | }
26 |
--------------------------------------------------------------------------------
/.github/workflows/build-test.yaml:
--------------------------------------------------------------------------------
1 | name: Build & test
2 |
3 | on:
4 | push:
5 | branches-ignore:
6 | - master
7 | pull_request:
8 | branches-ignore:
9 | - master
10 |
11 | jobs:
12 | build:
13 | runs-on: ${{ matrix.os }}
14 |
15 | strategy:
16 | matrix:
17 | node-version: [10.x, 12.x, 14.x]
18 | os: [ubuntu-latest, windows-latest, macos-latest]
19 |
20 | steps:
21 | - name: Checkout repository
22 | uses: actions/checkout@v2
23 | with:
24 | fetch-depth: 0
25 |
26 | - name: Setup Node.js ${{ matrix.node-version }}
27 | uses: actions/setup-node@v1
28 | with:
29 | node-version: ${{ matrix.node-version }}
30 |
31 | - name: Install dependencies
32 | run: yarn --check-files --non-interactive
33 |
34 | - name: Build project
35 | run: yarn run build
36 |
37 | - name: Lint code
38 | run: yarn lintfix
39 |
40 | - name: Run unit tests
41 | run: yarn test
42 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 |
9 | # Diagnostic reports (https://nodejs.org/api/report.html)
10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
11 |
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 | *.pid.lock
17 |
18 | # Directory for instrumented libs generated by jscoverage/JSCover
19 | lib-cov
20 |
21 | # Coverage directory used by tools like istanbul
22 | coverage
23 | *.lcov
24 |
25 | # nyc test coverage
26 | .nyc_output
27 |
28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
29 | .grunt
30 |
31 | # Bower dependency directory (https://bower.io/)
32 | bower_components
33 |
34 | # node-waf configuration
35 | .lock-wscript
36 |
37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
38 | build/Release
39 |
40 | # Dependency directories
41 | node_modules/
42 | jspm_packages/
43 |
44 | # Snowpack dependency directory (https://snowpack.dev/)
45 | web_modules/
46 |
47 | # TypeScript cache
48 | *.tsbuildinfo
49 |
50 | # Optional npm cache directory
51 | .npm
52 |
53 | # Optional eslint cache
54 | .eslintcache
55 |
56 | # Microbundle cache
57 | .rpt2_cache/
58 | .rts2_cache_cjs/
59 | .rts2_cache_es/
60 | .rts2_cache_umd/
61 |
62 | # Optional REPL history
63 | .node_repl_history
64 |
65 | # Output of 'npm pack'
66 | *.tgz
67 |
68 | # Yarn Integrity file
69 | .yarn-integrity
70 |
71 | # dotenv environment variables file
72 | .env
73 | .env.test
74 |
75 | # parcel-bundler cache (https://parceljs.org/)
76 | .cache
77 | .parcel-cache
78 |
79 | # Next.js build output
80 | .next
81 | out
82 |
83 | # Nuxt.js build / generate output
84 | .nuxt
85 | dist
86 |
87 | # Gatsby files
88 | .cache/
89 | # Comment in the public line in if your project uses Gatsby and not Next.js
90 | # https://nextjs.org/blog/next-9-1#public-directory-support
91 | # public
92 |
93 | # vuepress build output
94 | .vuepress/dist
95 |
96 | # Serverless directories
97 | .serverless/
98 |
99 | # FuseBox cache
100 | .fusebox/
101 |
102 | # DynamoDB Local files
103 | .dynamodb/
104 |
105 | # TernJS port file
106 | .tern-port
107 |
108 | # Stores VSCode versions used for testing VSCode extensions
109 | .vscode-test
110 |
111 | # yarn v2
112 | .yarn/cache
113 | .yarn/unplugged
114 | .yarn/build-state.yml
115 | .yarn/install-state.gz
116 | .pnp.*
117 |
118 | # exclusive use of yarn
119 | package-lock.json
120 | local
121 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | src/
2 | __tests__/
3 | .github/
4 | local
5 | .editorconfig
6 | .eslintrc
7 | .prettierrc
8 | changelog-ci-config.json
9 | CHANGELOG.md
10 | jest.config.js
11 | tsconfig.json
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "printWidth": 90,
3 | "trailingComma": "none",
4 | "arrowParens": "always",
5 | "tabWidth": 2,
6 | "semi": false,
7 | "singleQuote": true
8 | }
9 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tanukijs/scrape-them-all/d34eb74f88754006e138e60f7e99321f32fa462f/CHANGELOG.md
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Tanuki
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | **[Scrape-Them-All](http://npmjs.com/package/scrape-them-all)** is a [Cheerio](https://cheerio.js.org) layer which improves your scraping experience.
10 |
11 | **This package is recent. If you have any suggestions, or you notice that something is not working, feel free to open an issue or a pull request; we will be happy to answer as soon as possible.**
12 |
13 | ---
14 |
15 | # 📦 Installation
16 |
17 | ```sh
18 | # Using NPM
19 | npm install --save scrape-them-all
20 | npm install --save fetch-cookie #optional
21 |
22 | # Using Yarn
23 | yarn add scrape-them-all
24 | yarn add fetch-cookie #optional
25 | ```
26 |
27 | `fetch-cookie` is only required if you plan to use the `cookieJar` option on requests.
28 |
29 | **⚠ If you get a `too many redirects` error when you scrape, we recommend installing `fetch-cookie` and using the option `cookieJar: true` in your request. You can also pass an instance of `tough.CookieJar` to this parameter.**
30 |
31 | Example:
32 |
33 | ```js
34 | scrapeTA({ url: 'https://google.com', cookieJar: true }, ...)
35 | ```
36 |
37 | ---
38 |
39 | # 📚 Documentation
40 |
41 | ### `scrapeTA(query, schema)`
42 |
43 | Params:
44 |
45 | - **query** `String` or `Object`: The page url or the page url and node-fetch options.
46 | - **schema** `Object`: the list of elements to scrape and the corresponding HTML tags.
47 |
48 | Returns:
49 |
50 | - `Promise`: A promise resolving to an object that holds the fetch `response` and the scraped `data`.
51 |
52 | ## Schema options
53 |
54 | | Option | Type | Description |
55 | | ------------- | ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
56 | | **selector** | `String` or `Object` | Can be a string expression, DOM Element, array of DOM elements, or cheerio object. |
57 | | **trim** | `Boolean` | Trim whitespaces in the result. **Default as `true`**. |
58 | | **attribute** | `String` | Return the value of the indicated attribute on the selected element. |
59 | | **accessor** | `String` or `Function` | Cheerio access method name (like `html` for returning html code) or a custom function that take a Cheerio instance as first parameter. |
60 | | **transformer** | `Function` | The first parameter is your current value for the selected item. Can return a `Promise`. |
61 | | **listModel** | `Object` | Contains the options stated above in case of a list. |
62 |
63 | ## Example output
64 |
65 | ```json
66 | {
67 | "title": "An amazing game",
68 | "description": "With an amazing description
",
69 | "image": "https://amazing.game/image.jpg",
70 | "price": 10.99,
71 | "users": [
72 | {
73 | "username": "Tanuki",
74 | "badges": [
75 | { "name": "An amazing player" },
76 | ...
77 | ]
78 | },
79 | ...
80 | ]
81 | }
82 | ```
83 |
84 | ## The code that goes with it
85 |
86 | ```js
87 | const { scrapeTA } = require('scrape-them-all')
88 | scrapeTA('url_or_https_options', {
89 | title: '.header h1',
90 | description: {
91 | selector: '.header p',
92 | accessor: 'html',
93 | // accessor: selected => selected.html(),
94 | trim: false
95 | },
96 | image: {
97 | selector: 'img',
98 | attribute: 'src'
99 | },
100 | price: {
101 | selector: '.footer #price',
102 | transformer: (value) => parseFloat(value)
103 | },
104 | users: {
105 | selector: '.body .users',
106 | listModel: {
107 | username: '.username',
108 | badges: {
109 | selector: '.badges',
110 | listModel: {
111 | name: '.badgeName'
112 | }
113 | }
114 | }
115 | }
116 | })
117 | .then((data) => console.log(data))
118 | .catch((error) => console.error(error))
119 | ```
120 |
121 | ---
122 |
123 | # 💪 Contributions
124 |
125 | TODO
126 |
127 | ---
128 |
129 | # 📜 License
130 |
131 | [MIT](https://github.com/tanukijs/scrape-them-all/blob/typescript/LICENSE) © [Tanuki](https://github.com/tanukijs), [Aperrix](https://github.com/Aperrix).
132 |
--------------------------------------------------------------------------------
/__tests__/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | ScrapeThemAll Examples
6 |
7 |
8 |
9 | Title
10 | Lorem ipsum
11 |
12 | Line0
13 | Line1
14 |
15 |
16 |
17 | Line0
18 | Line1
19 |
20 | Line2
21 |
22 | 1988-01-01
23 |
24 | 1
25 | 2
26 | 3
27 | 4
28 | 5
29 | 6
30 |
31 |
32 |
Foo
33 |
34 |
1
35 |
36 | 2
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 | Sydney
45 |
46 |
47 | info
48 |
49 |
50 |
51 |
52 |
53 | one way street
54 |
55 |
56 | info-1
57 |
58 |
59 |
60 | GT Road
61 |
62 |
63 | info-2
64 |
65 |
66 |
67 |
68 |
69 |
70 |
--------------------------------------------------------------------------------
/__tests__/scraping.test.ts:
--------------------------------------------------------------------------------
1 | import { scrapeTA } from '../src'
2 | import { createServer, Server } from 'http'
3 | import { readFileSync } from 'fs'
4 | import { join } from 'path'
5 |
// Local HTTP server that serves the HTML fixture to the tests below.
let server: Server
const port = 8080

// Start the fixture server once, before any test runs.
beforeAll((done) => {
  server = createServer((_req, res) => {
    // Every request, whatever its path, is answered with the same fixture page.
    const indexPath = join(__dirname, 'public/index.html')
    const indexHTML = readFileSync(indexPath)
    res.write(indexHTML)
    res.end()
  })
  // `done()` returns undefined, so `done() && console.log(...)` never logged.
  // Log first, then tell Jest the server is ready.
  server.listen(port, () => {
    console.log(`Server running on port ${port}`)
    done()
  })
})

// Shut the fixture server down once the whole suite has finished.
afterAll((done) => {
  server.close(() => done())
})
21 |
// Scalar extraction: each schema key yields a single value from the fixture page.
describe('Scrape basic data', () => {
  // Plain selector strings, plus a `transformer` that turns the scraped text into a Date.
  test('Directly target the HTML element', async () => {
    const { response, data } = await scrapeTA(`http://localhost:${port}`, {
      title: 'h1.title',
      description: '.description',
      date: {
        selector: '.date',
        transformer: (x) => new Date(x)
      }
    })
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      title: 'Title',
      description: 'Lorem ipsum',
      date: new Date('1988-01-01')
    })
  })

  // A leading underscore lets a reserved option name (`attribute`) be used as a
  // result key; the expected output carries the key without the underscore.
  test('Use a reserved keyword', async () => {
    const { response, data } = await scrapeTA(`http://localhost:${port}`, {
      title: 'h1.title',
      _attribute: {
        selector: 'img',
        attribute: 'src'
      }
    })
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      title: 'Title',
      attribute:
        'https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/ee/ee276885cdbec23bdb9780509210c3c24dc7070e_full.jpg'
    })
  })
})
56 |
// List extraction: `listModel` describes the items found under the outer selector.
describe('Scrape list', () => {
  // Each `li` text is passed through the transformer, so the list holds numbers.
  test('With transform', async () => {
    const { response, data } = await scrapeTA(`http://localhost:${port}`, {
      features: {
        selector: '.features',
        listModel: {
          selector: 'li',
          transformer: (x) => parseInt(x, 10)
        }
      }
    })
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      features: [1, 2, 3, 4, 5, 6]
    })
  })

  // Same schema without a transformer: the items stay as strings.
  test('Without transform', async () => {
    const { response, data } = await scrapeTA(`http://localhost:${port}`, {
      features: {
        selector: '.features',
        listModel: {
          selector: 'li'
        }
      }
    })
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      features: ['1', '2', '3', '4', '5', '6']
    })
  })
})
89 |
// Nested schemas: keys that are not option names become nested result objects.
describe('Scrape nested object', () => {
  // `nested`, `foo` and `level1` carry custom keys, so each produces an object;
  // the leaves (`level2`, `level1Text`, `level2Text`) produce values.
  test('Object nested with multiple custom keys', async () => {
    const { response, data } = await scrapeTA(`http://localhost:${port}`, {
      nested: {
        selector: '.nested',
        foo: {
          level1: {
            selector: '.level1',
            level2: {
              selector: 'span',
              // Custom accessor: take the text of the second matched element.
              accessor: (x) => x.eq(1).text()
            }
          },
          level1Text: {
            selector: 'span'
          },
          level2Text: {
            selector: '.level2'
          }
        }
      }
    })
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      nested: {
        foo: {
          level1: {
            level2: '2'
          },
          level1Text: 'Foo12',
          level2Text: '2'
        }
      }
    })
  })

  // List of objects: every table row yields one object. `city` has no selector,
  // so its accessor receives the row itself and walks up to the table header.
  test('Scrape tables using accessor', async () => {
    const { response, data } = await scrapeTA(`http://localhost:${port}`, {
      addresses: {
        selector: 'table tbody tr',
        listModel: {
          address: '.address',
          city: {
            accessor: (x) => x.closest('table').find('thead .city').text()
          }
        }
      }
    })
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      addresses: [
        { address: 'one way street', city: 'Sydney' },
        { address: 'GT Road', city: 'Sydney' }
      ]
    })
  })
})
147 |
// Request options passed through the query object.
// NOTE(review): both tests target external hosts (krosmoz.com, dofus.com) rather
// than the local fixture server, so they need network access and presumably break
// if those pages change — confirm whether that is acceptable for CI.
describe('Scrape using options', () => {
  // Uses the `cookieJar: true` option on the request.
  test('Store redirections using cookieJar option', async () => {
    const { response, data } = await scrapeTA(
      { url: 'http://www.krosmoz.com/en/almanax/2020-01-01', cookieJar: true },
      {
        month: {
          selector: '#almanax_day .day-text'
        }
      }
    )
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      month: 'Javian'
    })
  })

  // Sends custom request headers (XMLHttpRequest / PJAX) along with the URL.
  test('Get data modified by AJAX using headers option', async () => {
    const { response, data } = await scrapeTA(
      {
        url:
          'https://www.dofus.com/en/mmorpg/encyclopedia/pets/11950-ankascraper?level=100&_pjax=.ak-item-details-container',
        headers: {
          'x-requested-with': 'XMLHttpRequest',
          'x-pjax': 'true',
          'x-pjax-container': '.ak-item-details-container'
        }
      },
      {
        effect: {
          selector: '.ak-container.ak-content-list.ak-displaymode-col .ak-title',
          // Only the first matching title is kept.
          accessor: (x) => x.eq(0).text()
        }
      }
    )
    expect(response.ok).toBe(true)
    expect(data).toEqual({
      effect: '120 Chance'
    })
  })
})
188 |
// Failure paths: the returned promise must reject.
describe('Error handling', () => {
  // The request itself fails, so any rejection is accepted.
  test('Scrape invalid URL', async () => {
    await expect(
      scrapeTA('http://gertkafgzngegzegerj.com', {
        title: 'h1.title'
      })
    ).rejects.toThrow()
  })

  // `attribute` is an option name, so the root schema has no custom keys left.
  test('Use reserved keyword directly', async () => {
    await expect(
      scrapeTA(`http://localhost:${port}`, {
        attribute: 'h1.title'
      })
    ).rejects.toThrow('Root object must be a nested object.')
  })

  // `accessor` is given an object instead of a string or function, which is rejected.
  test('Use reserved keyword in nested object', async () => {
    await expect(
      scrapeTA(`http://localhost:${port}`, {
        accessor: {
          img: {
            selector: 'img',
            attribute: 'src'
          }
        }
      })
    ).rejects.toThrow(
      'The property "accessor" expects a string or a function. If you want to use "accessor" as a result key, prefix it with an underscore (the first will be stripped automatically).'
    )
  })
})
221 |
--------------------------------------------------------------------------------
/changelog-ci-config.json:
--------------------------------------------------------------------------------
1 | {
2 | "header_prefix": "Release:",
3 | "group_config": [
4 | {
5 | "title": ":bug: Bug Fixes",
6 | "labels": ["bug", "bugfix"]
7 | },
8 | {
9 | "title": ":rocket: Code Improvements",
10 | "labels": ["improvements", "enhancement"]
11 | },
12 | {
13 | "title": ":sparkles: New Features",
14 | "labels": ["feature"]
15 | },
16 | {
17 | "title": ":books: Documentation Updates",
18 | "labels": ["docs", "documentation", "doc"]
19 | }
20 | ]
21 | }
22 |
--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
1 | // For a detailed explanation regarding each configuration property, visit:
2 | // https://jestjs.io/docs/en/configuration.html
3 |
4 | module.exports = {
5 | // All imported modules in your tests should be mocked automatically
6 | // automock: false,
7 |
8 | // Stop running tests after `n` failures
9 | // bail: 0,
10 |
11 | // The directory where Jest should store its cached dependency information
12 | // cacheDirectory: "/tmp/jest_rs",
13 |
14 | // Automatically clear mock calls and instances between every test
15 | clearMocks: true,
16 |
17 | // Indicates whether the coverage information should be collected while executing the test
18 | // collectCoverage: false,
19 |
20 | // An array of glob patterns indicating a set of files for which coverage information should be collected
21 | // collectCoverageFrom: undefined,
22 |
23 | // The directory where Jest should output its coverage files
24 | coverageDirectory: "coverage",
25 |
26 | // An array of regexp pattern strings used to skip coverage collection
27 | // coveragePathIgnorePatterns: [
28 | // "/node_modules/"
29 | // ],
30 |
31 | // Indicates which provider should be used to instrument code for coverage
32 | coverageProvider: "v8",
33 |
34 | // A list of reporter names that Jest uses when writing coverage reports
35 | // coverageReporters: [
36 | // "json",
37 | // "text",
38 | // "lcov",
39 | // "clover"
40 | // ],
41 |
42 | // An object that configures minimum threshold enforcement for coverage results
43 | // coverageThreshold: undefined,
44 |
45 | // A path to a custom dependency extractor
46 | // dependencyExtractor: undefined,
47 |
48 | // Make calling deprecated APIs throw helpful error messages
49 | // errorOnDeprecated: false,
50 |
51 | // Force coverage collection from ignored files using an array of glob patterns
52 | // forceCoverageMatch: [],
53 |
54 | // A path to a module which exports an async function that is triggered once before all test suites
55 | // globalSetup: undefined,
56 |
57 | // A path to a module which exports an async function that is triggered once after all test suites
58 | // globalTeardown: undefined,
59 |
60 | // A set of global variables that need to be available in all test environments
61 | // globals: {},
62 |
63 | // The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers.
64 | // maxWorkers: "50%",
65 |
66 | // An array of directory names to be searched recursively up from the requiring module's location
67 | // moduleDirectories: [
68 | // "node_modules"
69 | // ],
70 |
71 | // An array of file extensions your modules use
72 | // moduleFileExtensions: [
73 | // "js",
74 | // "json",
75 | // "jsx",
76 | // "ts",
77 | // "tsx",
78 | // "node"
79 | // ],
80 |
81 | // A map from regular expressions to module names or to arrays of module names that allow to stub out resources with a single module
82 | // moduleNameMapper: {},
83 |
84 | // An array of regexp pattern strings, matched against all module paths before considered 'visible' to the module loader
85 | // modulePathIgnorePatterns: [],
86 |
87 | // Activates notifications for test results
88 | // notify: false,
89 |
90 | // An enum that specifies notification mode. Requires { notify: true }
91 | // notifyMode: "failure-change",
92 |
93 | // A preset that is used as a base for Jest's configuration
94 | // preset: undefined,
95 |
96 | // Run tests from one or more projects
97 | // projects: undefined,
98 |
99 | // Use this configuration option to add custom reporters to Jest
100 | // reporters: undefined,
101 |
102 | // Automatically reset mock state between every test
103 | // resetMocks: false,
104 |
105 | // Reset the module registry before running each individual test
106 | // resetModules: false,
107 |
108 | // A path to a custom resolver
109 | // resolver: undefined,
110 |
111 | // Automatically restore mock state between every test
112 | // restoreMocks: false,
113 |
114 | // The root directory that Jest should scan for tests and modules within
115 | // rootDir: undefined,
116 |
117 | // A list of paths to directories that Jest should use to search for files in
118 | // roots: [
119 | // ""
120 | // ],
121 |
122 | // Allows you to use a custom runner instead of Jest's default test runner
123 | // runner: "jest-runner",
124 |
125 | // The paths to modules that run some code to configure or set up the testing environment before each test
126 | // setupFiles: [],
127 |
128 | // A list of paths to modules that run some code to configure or set up the testing framework before each test
129 | // setupFilesAfterEnv: [],
130 |
131 | // The number of seconds after which a test is considered as slow and reported as such in the results.
132 | // slowTestThreshold: 5,
133 |
134 | // A list of paths to snapshot serializer modules Jest should use for snapshot testing
135 | // snapshotSerializers: [],
136 |
137 | // The test environment that will be used for testing
138 | testEnvironment: "node",
139 |
140 | // Options that will be passed to the testEnvironment
141 | // testEnvironmentOptions: {},
142 |
143 | // Adds a location field to test results
144 | // testLocationInResults: false,
145 |
146 | // The glob patterns Jest uses to detect test files
147 | testMatch: [
148 | "**/__tests__/**/*.+(ts|tsx|js)",
149 | "**/?(*.)+(spec|test).+(ts|tsx|js)"
150 | ],
151 |
152 | // An array of regexp pattern strings that are matched against all test paths, matched tests are skipped
153 | testPathIgnorePatterns: [
154 | "/node_modules/",
155 | "/dist/"
156 | ],
157 |
158 | // The regexp pattern or array of patterns that Jest uses to detect test files
159 | // testRegex: [],
160 |
161 | // This option allows the use of a custom results processor
162 | // testResultsProcessor: undefined,
163 |
164 | // This option allows use of a custom test runner
165 | // testRunner: "jasmine2",
166 |
167 | // This option sets the URL for the jsdom environment. It is reflected in properties such as location.href
168 | // testURL: "http://localhost",
169 |
170 | // Setting this value to "fake" allows the use of fake timers for functions such as "setTimeout"
171 | // timers: "real",
172 |
173 | // A map from regular expressions to paths to transformers
174 | transform: {
175 | "^.+\\.(ts|tsx)$": "ts-jest"
176 | },
177 |
178 | // An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
179 | // transformIgnorePatterns: [
180 | // "/node_modules/",
181 | // "\\.pnp\\.[^\\/]+$"
182 | // ],
183 |
184 | // An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
185 | // unmockedModulePathPatterns: undefined,
186 |
187 | // Indicates whether each individual test should be reported during the run
188 | // verbose: undefined,
189 |
190 | // An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode
191 | // watchPathIgnorePatterns: [],
192 |
193 | // Whether to use watchman for file crawling
194 | // watchman: true,
195 | };
196 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "scrape-them-all",
3 | "description": "🚀 An easy-to-handle Node.js scraper that allow you to scrape them all in a record time.",
4 | "version": "1.0.4",
5 | "license": "MIT",
6 | "author": "Tanuki (https://github.com/tanukijs)",
7 | "contributors": [
8 | "Aperrix (https://github.com/Aperrix)"
9 | ],
10 | "repository": "https://github.com/tanukijs/scrape-them-all.git",
11 | "source": "src/index.ts",
12 | "main": "dist/index.js",
13 | "types": "dist/index.d.ts",
14 | "engines": {
15 | "node": ">=10"
16 | },
17 | "keywords": [
18 | "scraper",
19 | "node-scraper",
20 | "scraping",
21 | "web-scraping",
22 | "htmlparser",
23 | "parser",
24 | "parsing",
25 | "crawler",
26 | "crawling"
27 | ],
28 | "scripts": {
29 | "lint": "eslint --ext .ts --ignore-path .gitignore .",
30 | "lintfix": "eslint --fix --ext .ts --ignore-path .gitignore .",
31 | "build": "rimraf dist && tsc -p tsconfig.json",
32 | "test": "jest",
33 | "prepare": "npm run build && npm run test"
34 | },
35 | "dependencies": {
36 | "cheerio": "^1.0.0-rc.3",
37 | "node-fetch": "^2.6.1"
38 | },
39 | "devDependencies": {
40 | "@types/cheerio": "^0.22.22",
41 | "@types/jest": "^26.0.14",
42 | "@types/node": "^14.11.8",
43 | "@types/node-fetch": "^2.5.7",
44 | "@typescript-eslint/eslint-plugin": "^4.4.1",
45 | "@typescript-eslint/parser": "^4.4.1",
46 | "eslint": "^7.11.0",
47 | "eslint-config-prettier": "^6.12.0",
48 | "eslint-config-standard": "^14.1.1",
49 | "eslint-plugin-import": "^2.22.1",
50 | "eslint-plugin-node": "^11.1.0",
51 | "eslint-plugin-prettier": "^3.1.4",
52 | "eslint-plugin-promise": "^4.2.1",
53 | "eslint-plugin-standard": "^4.0.1",
54 | "fetch-cookie": "^0.10.1",
55 | "jest": "^26.5.3",
56 | "prettier": "^2.1.2",
57 | "rimraf": "^3.0.2",
58 | "ts-jest": "^26.4.1",
59 | "tslib": "^2.0.3",
60 | "typescript": "^4.0.3"
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/DataModeler.ts:
--------------------------------------------------------------------------------
1 | import cheerio from 'cheerio'
2 | import { EOptionType, SchemeInterpreter } from './SchemeInterpreter'
3 |
/**
 * Builds result objects from an HTML document by walking a user-designed
 * scheme and querying the document with cheerio.
 */
export class DataModeler {
  private readonly $root: cheerio.Root

  /**
   * @param {string} body HTML source loaded into cheerio
   */
  constructor(body: string) {
    this.$root = cheerio.load(body)
  }

  /**
   * Generate data from HTML body & user-designed JSON scheme
   *
   * @param {SchemeInterpreter} opts scheme node; it must be of type OBJECT
   * @param {cheerio.Cheerio} [context] selection the child selectors are resolved
   * against; when omitted they are resolved against the whole document
   *
   * @returns {Promise<Record<string, unknown>>} one entry per child key of `opts`
   * @throws {Error} when `opts` is not of type OBJECT
   */
  async generate(
    opts: SchemeInterpreter,
    context?: cheerio.Cheerio
  ): Promise<Record<string, unknown>> {
    if (opts.type !== EOptionType.OBJECT)
      throw new Error('Root object must be a nested object.')

    const mappedResult: Record<string, unknown> = {}

    for (const key in opts.children) {
      const value = new SchemeInterpreter(opts.children[key])
      // With a context and a selector, search inside the context; with a context
      // but no selector, reuse the context itself; otherwise query the document.
      const cheerioRoot =
        context && value.selector
          ? context.find(value.selector)
          : context || this.$root(value.selector)

      const valueType = value.type
      if (valueType === EOptionType.OBJECT) {
        mappedResult[key] = await this.generate(value, cheerioRoot)
        continue
      }

      // The SchemeInterpreter constructor wraps any provided listModel in a
      // SchemeInterpreter, hence the assertions below.
      let result: unknown
      switch (valueType) {
        case EOptionType.VALUE:
          result = this.processValue(cheerioRoot, value)
          break
        case EOptionType.ARRAY:
          result = this.processArray(cheerioRoot, value.listModel as SchemeInterpreter)
          break
        case EOptionType.OBJECT_ARRAY:
          result = this.processObjectArray(
            cheerioRoot,
            value.listModel as SchemeInterpreter
          )
          break
        default:
          result = undefined
      }
      // List items may be promises (async transformers, nested generate calls).
      mappedResult[key] = await (Array.isArray(result) ? Promise.all(result) : result)
    }

    return mappedResult
  }

  /**
   * Process single item
   *
   * The accessor produces the raw value, `attribute` (when set) replaces it with
   * that attribute's value, then trimming and the transformer are applied.
   *
   * @param {cheerio.Cheerio} element
   * @param {SchemeInterpreter} opts
   *
   * @returns {unknown}
   */
  private processValue(element: cheerio.Cheerio, opts: SchemeInterpreter): unknown {
    let value: unknown = null

    if (typeof opts.accessor === 'function') {
      value = opts.accessor(element)
    } else if (typeof opts.accessor === 'string') {
      // The original check `typeof x !== undefined` compared a string with
      // `undefined` and was always true, so an unknown accessor name threw a
      // TypeError. Only call the member when it really is a method.
      const method: unknown = (element as unknown as Record<string, unknown>)[
        opts.accessor
      ]
      value = typeof method === 'function' ? method.call(element) : null
    }

    if (opts.attribute) value = element.attr(opts.attribute)
    if (opts.trim && typeof value === 'string') value = value.trim()
    // The transformer is declared as taking a string; the value is handed over as-is.
    if (opts.transformer) value = opts.transformer(value as string)

    return value
  }

  /**
   * Process basic list
   *
   * @param {cheerio.Cheerio} element selection searched for list items
   * @param {SchemeInterpreter} listModel options applied to every item
   *
   * @returns {unknown[]} one processed value per element matching the listModel selector
   */
  private processArray(
    element: cheerio.Cheerio,
    listModel: SchemeInterpreter
  ): unknown[] {
    const values: unknown[] = []
    const children = element.find(listModel.selector)
    for (let i = 0; i < children.length; i++) {
      values.push(this.processValue(children.eq(i), listModel))
    }
    return values
  }

  /**
   * Process list of objects
   *
   * @param {cheerio.Cheerio} element selection whose elements each become one object
   * @param {SchemeInterpreter} listModel scheme applied to every element
   *
   * @returns {Promise<Record<string, unknown>>[]} one pending object per element
   */
  private processObjectArray(
    element: cheerio.Cheerio,
    listModel: SchemeInterpreter
  ): Promise<Record<string, unknown>>[] {
    const values: Promise<Record<string, unknown>>[] = []
    for (let i = 0; i < element.length; i++) {
      values.push(this.generate(listModel, element.eq(i)))
    }
    return values
  }
}
118 |
--------------------------------------------------------------------------------
/src/SchemeInterpreter.ts:
--------------------------------------------------------------------------------
1 | import { ScrapeTAScheme } from './typings'
2 |
/**
 * Kinds of scheme entry, as reported by `SchemeInterpreter.type`.
 */
export const enum EOptionType {
  /** Single value: no custom keys and no listModel */
  VALUE = 0,
  /** List of values: a listModel without custom keys */
  ARRAY = 1,
  /** Nested object: the entry has custom (non-option) keys */
  OBJECT = 2,
  /** List of objects: a listModel that itself has custom keys */
  OBJECT_ARRAY = 3
}
9 |
/**
 * Normalized, validated view of one node of a scraping scheme.
 *
 * A node is built either from a bare selector string or from an options
 * object. Option keys matching a reserved property (`selector`, `trim`,
 * `accessor`, `attribute`, `transformer`, `listModel`, `children`) configure
 * the node itself; every other key is stored in `children` as a nested result
 * key. A key starting with an underscore is always stored as a child, with
 * that first underscore stripped from its name.
 */
export class SchemeInterpreter {
  /** Selector of the node; empty string when none was given. */
  readonly selector: string = ''
  /** Whether extracted string values get trimmed; defaults to `true`. */
  readonly trim: boolean = true
  /** Name of a method to call on the matched element, or a custom extractor. */
  readonly accessor: string | ((node: cheerio.Cheerio) => unknown) = 'text'
  /** When set, the value is read from this attribute of the element. */
  readonly attribute?: string
  /** Optional post-processing applied to the extracted value. */
  readonly transformer?: (value: string) => unknown
  /** Model describing each item when the node represents a list. */
  readonly listModel?: string | ScrapeTAScheme | SchemeInterpreter
  /**
   * Nested result keys, stored exactly as received.
   * NOTE(review): the value type was missing in the reviewed text; `unknown`
   * is assumed — confirm against the consumers of `children`.
   */
  readonly children: Record<string, unknown> = {}

  /**
   * @param {string | Partial<SchemeInterpreter>} opts Selector string or options object
   *   (NOTE(review): the type argument of `Partial` is reconstructed — confirm)
   * @throws {Error} When a reserved property has an unexpected type
   */
  constructor(opts: string | Partial<SchemeInterpreter> = '') {
    if (typeof opts === 'string') {
      this.selector = opts
    } else {
      this.selector = opts.selector || ''
      // `opts.trim || true` always evaluated to true, so `trim: false` could
      // never be honored; only fall back to the default when trim is unset.
      this.trim = opts.trim == null ? true : opts.trim
      this.accessor = opts.accessor || 'text'
      this.attribute = opts.attribute
      this.transformer = opts.transformer
      this.listModel = opts.listModel ? new SchemeInterpreter(opts.listModel) : undefined

      // Every own property assigned so far is a reserved option name.
      const reservedKeys = Object.keys(this)
      for (const key in opts) {
        const normalizedKey = key[0] === '_' ? key.slice(1) : key
        // Reserved names configure the node; the underscore-prefixed form
        // (e.g. `_selector`) is not reserved and lands in children as `selector`.
        if (reservedKeys.includes(key) && normalizedKey === key) continue
        this.children[normalizedKey] = opts[key]
      }
    }
    this.validate()
  }

  /**
   * Kind of result this node produces
   *
   * Nested keys take precedence (OBJECT); without a list model the node is a
   * plain VALUE; a list model that itself has nested keys yields
   * OBJECT_ARRAY, any other list model yields ARRAY.
   *
   * @returns {EOptionType}
   */
  public get type(): EOptionType {
    if (Object.keys(this.children).length > 0) return EOptionType.OBJECT
    if (!this.listModel) return EOptionType.VALUE

    if (
      this.listModel instanceof SchemeInterpreter &&
      Object.keys(this.listModel.children).length > 0
    )
      return EOptionType.OBJECT_ARRAY
    return EOptionType.ARRAY
  }

  /**
   * Validate current SchemeInterpreter object
   *
   * Checks the runtime type of every reserved property that holds a truthy
   * value; falsy (unset) properties are skipped.
   *
   * @returns {void}
   * @throws {Error} When a property holds a value of an unexpected type
   */
  public validate(): void {
    const expected = [
      { property: 'selector', equalsTo: ['string'] },
      { property: 'trim', equalsTo: ['boolean'] },
      { property: 'accessor', equalsTo: ['string', 'function'] },
      { property: 'attribute', equalsTo: ['string'] },
      { property: 'transformer', equalsTo: ['function'] },
      { property: 'listModel', equalsTo: ['string', 'object'] }
    ]

    for (const { property, equalsTo } of expected) {
      if (!this[property]) continue
      if (equalsTo.some((type) => typeof this[property] === type)) continue
      const errorTypes = equalsTo.join(' or a ')
      const errorMessage = [
        `The property "${property}" expects a ${errorTypes}.`,
        `If you want to use "${property}" as a result key, prefix it with an underscore (the first will be stripped automatically).`
      ].join(' ')
      throw new Error(errorMessage)
    }
  }
}
87 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | import nodeFetch, { RequestInfo, RequestInit } from 'node-fetch'
2 | import { SchemeInterpreter } from './SchemeInterpreter'
3 | import { DataModeler } from './DataModeler'
4 | import {
5 | ScrapeTAExtraParams,
6 | ScrapeTARequest,
7 | ScrapeTAScheme,
8 | ScrapeTAResult
9 | } from './typings'
10 |
/**
 * Create an instance of node-fetch with managed cookies
 *
 * `fetch-cookie` is imported dynamically, so it is only loaded when this
 * function is actually called.
 *
 * @param {ScrapeTAExtraParams} query Request parameters carrying the `cookieJar` option
 * @returns {Promise<typeof nodeFetch>} node-fetch wrapped by fetch-cookie
 * @throws {Error} When loading or applying `fetch-cookie` fails
 */
async function withCookies(query: ScrapeTAExtraParams): Promise<typeof nodeFetch> {
  try {
    const { default: fetchCookie } = await import('fetch-cookie/node-fetch')
    // A boolean only switches the feature on; in that case no jar is forwarded
    // (presumably fetch-cookie then falls back to a jar of its own — confirm).
    const cookieJar = typeof query.cookieJar === 'boolean' ? undefined : query.cookieJar
    return fetchCookie(nodeFetch, cookieJar) as typeof nodeFetch
  } catch (e) {
    throw new Error('Please run `npm install fetch-cookie` to use the cookieJar option.')
  }
}
26 |
/**
 * Get HTML body and transform it as user-designed object
 *
 * @param {ScrapeTARequest} request Either something node-fetch accepts directly,
 *   or an object combining `url`, an optional `cookieJar` and fetch init options
 * @param {ScrapeTAScheme} scheme Description of the data to extract from the page
 *
 * @returns {Promise<ScrapeTAResult>} The fetch response together with the extracted data
 */
export async function scrapeTA(
  request: ScrapeTARequest,
  scheme: ScrapeTAScheme
): Promise<ScrapeTAResult> {
  // Cookie handling is opt-in: only an object request with a truthy
  // `cookieJar` goes through the fetch-cookie wrapper.
  const fetch =
    typeof request === 'object' && 'cookieJar' in request && request.cookieJar
      ? await withCookies(request)
      : nodeFetch
  // Prefer the explicit `url` of an object request; otherwise the request
  // itself is what gets handed to fetch.
  const requestInfo = ((typeof request === 'object' && 'url' in request && request.url) ||
    request) as RequestInfo
  // An object request doubles as the fetch init options.
  const requestInit = typeof request === 'object' ? (request as RequestInit) : undefined
  const response = await fetch(requestInfo, requestInit)
  const responseHTML = await response.text()
  const dataModeler = new DataModeler(responseHTML)
  const usableScheme = new SchemeInterpreter(scheme)
  const data = await dataModeler.generate(usableScheme)
  return { response, data }
}
53 |
54 | export * from './typings'
55 |
--------------------------------------------------------------------------------
/src/typings.ts:
--------------------------------------------------------------------------------
1 | import { RequestInfo, RequestInit, Response } from 'node-fetch'
2 | import { CookieJar } from 'fetch-cookie'
3 | import { SchemeInterpreter } from './SchemeInterpreter'
4 |
/**
 * Request parameters specific to this library, on top of the regular fetch options.
 */
export type ScrapeTAExtraParams = {
  /** What to fetch. */
  url: RequestInfo
  /** Enables cookie management: a jar instance to use, or a boolean switch. */
  cookieJar?: CookieJar | boolean
}
9 |
/**
 * Accepted request shapes: an options object carrying `url` (plus fetch init
 * options and the extra parameters), or anything node-fetch accepts as its
 * first argument.
 */
export type ScrapeTARequest = (RequestInit & ScrapeTAExtraParams) | RequestInfo
11 |
/**
 * Outcome of a scrape: the fetch response and the data extracted from its body.
 */
export type ScrapeTAResult = {
  /** Response object returned by the fetch call. */
  response: Response
  /**
   * Extracted data.
   * NOTE(review): the type arguments of `Record` were missing in the reviewed
   * text; `<string, unknown>` is assumed — confirm against `DataModeler#generate`.
   */
  data: Record<string, unknown>
}
16 |
/** Names of the SchemeInterpreter properties a scheme entry may set. */
type TSchemeInterpreterKey =
  | 'selector'
  | 'accessor'
  | 'attribute'
  | 'trim'
  | 'listModel'
  | 'transformer'

/** Options-object form of a scheme entry; every option is optional. */
type TSchemeInterpreter = Partial<Pick<SchemeInterpreter, TSchemeInterpreterKey>>
23 |
/**
 * User-facing scheme: maps each result key to a selector string, an options
 * object, or a nested scheme.
 */
export type ScrapeTAScheme = {
  [resultKey: string]: ScrapeTAScheme | TSchemeInterpreter | string
}
27 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | /* Basic Options */
4 | "target": "ESNext",
5 | "module": "CommonJS",
6 | "lib": ["ESNext", "DOM"],
7 | "allowJs": true,
8 | "checkJs": true,
9 | "declaration": true,
10 | "declarationMap": false,
11 | "sourceMap": false,
12 | "outDir": "dist",
13 | "rootDir": "src",
14 | "removeComments": true,
15 | "importHelpers": true,
16 | "downlevelIteration": true,
17 | "esModuleInterop": true,
18 |
19 | /* Strict Type-Checking Options */
20 | "strict": true,
21 | "noImplicitAny": true,
22 | "strictNullChecks": true,
23 | "strictFunctionTypes": true,
24 | "strictBindCallApply": true,
25 | "strictPropertyInitialization": true,
26 | "noImplicitThis": true,
27 | "alwaysStrict": true,
28 |
29 | /* Additional Checks */
30 | "noUnusedLocals": true,
31 | "noUnusedParameters": true,
32 | "noImplicitReturns": true,
33 | "noFallthroughCasesInSwitch": true,
34 | "suppressImplicitAnyIndexErrors": true,
35 |
36 | /* Module Resolution Options */
37 | "moduleResolution": "node",
38 | "resolveJsonModule": true,
39 | "baseUrl": "src",
40 | "paths": {
41 | "@/*": ["*"],
42 | },
43 |
44 | /* Advanced Options */
45 | "forceConsistentCasingInFileNames": true,
46 | },
47 | "include": ["src"],
48 | "exclude": ["node_modules", "dist"]
49 | }
50 |
--------------------------------------------------------------------------------