├── .editorconfig ├── .eslintrc ├── .github └── workflows │ └── build-test.yaml ├── .gitignore ├── .npmignore ├── .prettierrc ├── CHANGELOG.md ├── LICENSE ├── README.md ├── __tests__ ├── public │ └── index.html └── scraping.test.ts ├── changelog-ci-config.json ├── jest.config.js ├── package.json ├── src ├── DataModeler.ts ├── SchemeInterpreter.ts ├── index.ts └── typings.ts ├── tsconfig.json └── yarn.lock /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | -------------------------------------------------------------------------------- /.eslintrc: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "es2021": true, 4 | "node": true 5 | }, 6 | "parser": "@typescript-eslint/parser", 7 | "parserOptions": { 8 | "ecmaVersion": 2020, 9 | "sourceType": "module", 10 | "impliedStrict": true 11 | }, 12 | "extends": [ 13 | "standard", 14 | "plugin:@typescript-eslint/recommended", 15 | "prettier/@typescript-eslint", 16 | "plugin:prettier/recommended" 17 | ], 18 | "plugins": [ 19 | "@typescript-eslint" 20 | ], 21 | "rules": {}, 22 | "ignorePatterns": [ 23 | "**/*.js" 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /.github/workflows/build-test.yaml: -------------------------------------------------------------------------------- 1 | name: Build & test 2 | 3 | on: 4 | push: 5 | branches-ignore: 6 | - master 7 | pull_request: 8 | branches-ignore: 9 | - master 10 | 11 | jobs: 12 | build: 13 | runs-on: ${{ matrix.os }} 14 | 15 | strategy: 16 | matrix: 17 | node-version: [10.x, 12.x, 14.x] 18 | os: [ubuntu-latest, windows-latest, macos-latest] 19 | 20 | steps: 21 | - name: Checkout repository 22 | uses: actions/checkout@v2 23 | with: 24 | fetch-depth: 0 25 | 26 | - name: Setup Node.js ${{ matrix.node-version }} 27 | uses: actions/setup-node@v1 28 | with: 29 | node-version: ${{ matrix.node-version }} 30 | 31 | - name: Install dependencies 32 | run: yarn --check-files --non-interactive 33 | 34 | - name: Build project 35 | run: yarn run build 36 | 37 | - name: Lint code 38 | run: yarn lintfix 39 | 40 | - name: Run unit tests 41 | run: yarn test 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # Snowpack dependency directory (https://snowpack.dev/) 45 | web_modules/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | .parcel-cache 78 | 79 | # Next.js build output 80 | .next 81 | out 82 | 83 | # Nuxt.js build / generate output 84 | .nuxt 85 | dist 86 | 87 | # Gatsby files 88 | .cache/ 89 | # Comment in the public line in if your project uses Gatsby and not Next.js 90 | # https://nextjs.org/blog/next-9-1#public-directory-support 91 | # public 92 | 93 | # vuepress build output 94 | .vuepress/dist 95 | 96 | # Serverless directories 97 | .serverless/ 98 | 99 | # FuseBox cache 100 | .fusebox/ 101 | 102 | # DynamoDB Local files 103 | .dynamodb/ 104 | 105 | # TernJS port file 106 | .tern-port 107 | 108 | # Stores VSCode versions used for testing VSCode extensions 109 | .vscode-test 110 | 111 | # yarn v2 112 | .yarn/cache 113 | .yarn/unplugged 114 | .yarn/build-state.yml 115 | .yarn/install-state.gz 116 | .pnp.* 117 | 118 | # exclusive use of yarn 119 | package-lock.json 120 | local 121 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | src/ 2 | __tests__/ 3 | .github/ 4 | local 5 | .editorconfig 6 | .eslintrc 7 | .prettierrc 8 | changelog-ci-config.json 9 | CHANGELOG.md 10 | jest.config.js 11 | tsconfig.json -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "printWidth": 90, 3 | "trailingComma": "none", 4 | "arrowParens": "always", 5 | "tabWidth": 2, 6 | "semi": false, 7 | "singleQuote": true 8 | } 9 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tanukijs/scrape-them-all/d34eb74f88754006e138e60f7e99321f32fa462f/CHANGELOG.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Tanuki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Scrape Them All 3 |

4 |

5 | License 6 | Build & tests 7 |

8 | 9 | **[Scrape-Them-All](http://npmjs.com/package/scrape-them-all)** is a [Cheerio](https://cheerio.js.org) layer which improves your scraping experience. 10 | 11 | **This package is recent, if you have any suggestions or you notice that something is not working, feel free to open an issue or a pull-request, we will be happy to answer them as soon as possible** 12 | 13 | --- 14 | 15 | # 📦 Installation 16 | 17 | ```sh 18 | # Using NPM 19 | npm install --save scrape-them-all 20 | npm install --save fetch-cookie #optional 21 | 22 | # Using Yarn 23 | yarn add scrape-them-all 24 | yarn add fetch-cookie #optional 25 | ``` 26 | 27 | `fetch-cookie` is only required if you plan to use the `cookieJar` option on requests. 28 | 29 | **⚠ If you get a `too many redirects` error when you scrape, we recommend to install `fetch-cookie` and use the option `cookieJar: true` in your request. You can also pass an instance of `tough.CookieJar` to this parameter.** 30 | 31 | Example: 32 | 33 | ```js 34 | scrapeTA({ url: 'https://google.com', cookieJar: true }, ...) 35 | ``` 36 | 37 | --- 38 | 39 | # 📚 Documentation 40 | 41 | ### `scrapeTA(query, schema)` 42 | 43 | Params: 44 | 45 | - **query** `String` or `Object`: The page url or the page url and node-fetch options. 46 | - **schema** `Object`: the list of elements to scrape and the corresponding HTML tags. 47 | 48 | Returns: 49 | 50 | - `Promise`: A promise containing the result as JSON. 51 | 52 | ## Schema options 53 | 54 | | Option | Type | Description | 55 | | ------------- | ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------- | 56 | | **selector** | `String` or `Object` | Can be a string expression, DOM Element, array of DOM elements, or cheerio object. | | 57 | | **trim** | `Boolean` | Trim whitespaces in the result. **Default as `true`**. | 58 | | **attribute** | `String` | Return the value of the indicated attribute on the selected element. | 59 | | **accessor** | `String` or `Function` | Cheerio access method name (like `html` for returning html code) or a custom function that take a Cheerio instance as first parameter. | 60 | | **transformer** | `Function` | The first parameter is your current value for the selected item. Can return a `Promise`. | 61 | | **listModel** | `Object` | Contains the options stated above in case of a list. | 62 | 63 | ## Example output 64 | 65 | ```json 66 | { 67 | "title": "An amazing game", 68 | "description": "

With an amazing description

", 69 | "image": "https://amazing.game/image.jpg", 70 | "price": 10.99, 71 | "users": [ 72 | { 73 | "username": "Tanuki", 74 | "badges": [ 75 | { "name": "An amazing player" }, 76 | ... 77 | ] 78 | }, 79 | ... 80 | ] 81 | } 82 | ``` 83 | 84 | ## The code that goes with it 85 | 86 | ```js 87 | const { ScrapeTA } = require('scrape-them-all') 88 | ScrapeTA('url_or_https_options', { 89 | title: '.header h1', 90 | description: { 91 | selector: '.header p', 92 | accessor: 'html', 93 | // accessor: selected => selected.html(), 94 | trim: false 95 | }, 96 | image: { 97 | selector: 'img', 98 | attribute: 'src' 99 | }, 100 | price: { 101 | selector: '.footer #price', 102 | transformer: (value) => parseFloat(value) 103 | }, 104 | users: { 105 | selector: '.body .users', 106 | listModel: { 107 | username: '.username', 108 | badges: { 109 | selector: '.badges', 110 | listModel: { 111 | name: '.badgeName' 112 | } 113 | } 114 | } 115 | } 116 | }) 117 | .then((data) => console.log(data)) 118 | .catch((error) => console.error(error)) 119 | ``` 120 | 121 | --- 122 | 123 | # 💪 Contributions 124 | 125 | TODO 126 | 127 | --- 128 | 129 | # 📜 License 130 | 131 | [MIT](https://github.com/tanukijs/scrape-them-all/blob/typescript/LICENSE) © [Tanuki](https://github.com/tanukijs), [Aperrix](https://github.com/Aperrix). 132 | -------------------------------------------------------------------------------- /__tests__/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | ScrapeThemAll Exampels 6 | 7 | 8 | dog 9 |

Title

10 |

Lorem ipsum

11 |

12 | Line0
13 | Line1 14 |

15 |

16 | 17 | Line0
18 | Line1 19 |
20 | Line2 21 |

22 |

1988-01-01

23 | 31 |
32 | Foo 33 |
34 | 1 35 |
36 | 2 37 |
38 |
39 |
40 | 41 | 42 | 43 | 46 | 49 | 50 | 51 | 52 | 53 | 55 | 58 | 59 | 60 | 62 | 65 | 66 | 67 |
44 |
Sydney
45 |
47 |
info
48 |
one way street 54 | 56 |
info-1
57 |
GT Road 61 | 63 |
info-2
64 |
68 | 69 | 70 | -------------------------------------------------------------------------------- /__tests__/scraping.test.ts: -------------------------------------------------------------------------------- 1 | import { scrapeTA } from '../src' 2 | import { createServer, Server } from 'http' 3 | import { readFileSync } from 'fs' 4 | import { join } from 'path' 5 | 6 | let server: Server 7 | const port = 8080 8 | 9 | beforeAll((done) => { 10 | server = createServer((_req, res) => { 11 | const indexPath = join(__dirname, 'public/index.html') 12 | const indexHTML = readFileSync(indexPath) 13 | res.write(indexHTML) 14 | res.end() 15 | }) 16 | server.listen(port, () => done() && console.log('Server running on port 8080')) 17 | }) 18 | afterAll((done) => { 19 | server.close(() => done()) 20 | }) 21 | 22 | describe('Scrape basic data', () => { 23 | test('Directly target the HTML element', async () => { 24 | const { response, data } = await scrapeTA(`http://localhost:${port}`, { 25 | title: 'h1.title', 26 | description: '.description', 27 | date: { 28 | selector: '.date', 29 | transformer: (x) => new Date(x) 30 | } 31 | }) 32 | expect(response.ok).toBe(true) 33 | expect(data).toEqual({ 34 | title: 'Title', 35 | description: 'Lorem ipsum', 36 | date: new Date('1988-01-01') 37 | }) 38 | }) 39 | 40 | test('Use a reserved keyword', async () => { 41 | const { response, data } = await scrapeTA(`http://localhost:${port}`, { 42 | title: 'h1.title', 43 | _attribute: { 44 | selector: 'img', 45 | attribute: 'src' 46 | } 47 | }) 48 | expect(response.ok).toBe(true) 49 | expect(data).toEqual({ 50 | title: 'Title', 51 | attribute: 52 | 'https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/ee/ee276885cdbec23bdb9780509210c3c24dc7070e_full.jpg' 53 | }) 54 | }) 55 | }) 56 | 57 | describe('Scrape list', () => { 58 | test('With transform', async () => { 59 | const { response, data } = await scrapeTA(`http://localhost:${port}`, { 60 | features: { 61 | selector: '.features', 62 | listModel: { 63 | selector: 'li', 64 | transformer: (x) => parseInt(x, 10) 65 | } 66 | } 67 | }) 68 | expect(response.ok).toBe(true) 69 | expect(data).toEqual({ 70 | features: [1, 2, 3, 4, 5, 6] 71 | }) 72 | }) 73 | 74 | test('Without transform', async () => { 75 | const { response, data } = await scrapeTA(`http://localhost:${port}`, { 76 | features: { 77 | selector: '.features', 78 | listModel: { 79 | selector: 'li' 80 | } 81 | } 82 | }) 83 | expect(response.ok).toBe(true) 84 | expect(data).toEqual({ 85 | features: ['1', '2', '3', '4', '5', '6'] 86 | }) 87 | }) 88 | }) 89 | 90 | describe('Scrape nested object', () => { 91 | test('Object nested with multiple custom keys', async () => { 92 | const { response, data } = await scrapeTA(`http://localhost:${port}`, { 93 | nested: { 94 | selector: '.nested', 95 | foo: { 96 | level1: { 97 | selector: '.level1', 98 | level2: { 99 | selector: 'span', 100 | accessor: (x) => x.eq(1).text() 101 | } 102 | }, 103 | level1Text: { 104 | selector: 'span' 105 | }, 106 | level2Text: { 107 | selector: '.level2' 108 | } 109 | } 110 | } 111 | }) 112 | expect(response.ok).toBe(true) 113 | expect(data).toEqual({ 114 | nested: { 115 | foo: { 116 | level1: { 117 | level2: '2' 118 | }, 119 | level1Text: 'Foo12', 120 | level2Text: '2' 121 | } 122 | } 123 | }) 124 | }) 125 | 126 | test('Scrape tables using accessor', async () => { 127 | const { response, data } = await scrapeTA(`http://localhost:${port}`, { 128 | addresses: { 129 | selector: 'table tbody tr', 130 | listModel: { 131 | address: '.address', 132 | city: { 133 | accessor: (x) => x.closest('table').find('thead .city').text() 134 | } 135 | } 136 | } 137 | }) 138 | expect(response.ok).toBe(true) 139 | expect(data).toEqual({ 140 | addresses: [ 141 | { address: 'one way street', city: 'Sydney' }, 142 | { address: 'GT Road', city: 'Sydney' } 143 | ] 144 | }) 145 | }) 146 | }) 147 | 148 | describe('Scrape using options', () => { 149 | test('Store redirections using cookieJar option', async () => { 150 | const { response, data } = await scrapeTA( 151 | { url: 'http://www.krosmoz.com/en/almanax/2020-01-01', cookieJar: true }, 152 | { 153 | month: { 154 | selector: '#almanax_day .day-text' 155 | } 156 | } 157 | ) 158 | expect(response.ok).toBe(true) 159 | expect(data).toEqual({ 160 | month: 'Javian' 161 | }) 162 | }) 163 | 164 | test('Get data modified by AJAX using headers option', async () => { 165 | const { response, data } = await scrapeTA( 166 | { 167 | url: 168 | 'https://www.dofus.com/en/mmorpg/encyclopedia/pets/11950-ankascraper?level=100&_pjax=.ak-item-details-container', 169 | headers: { 170 | 'x-requested-with': 'XMLHttpRequest', 171 | 'x-pjax': 'true', 172 | 'x-pjax-container': '.ak-item-details-container' 173 | } 174 | }, 175 | { 176 | effect: { 177 | selector: '.ak-container.ak-content-list.ak-displaymode-col .ak-title', 178 | accessor: (x) => x.eq(0).text() 179 | } 180 | } 181 | ) 182 | expect(response.ok).toBe(true) 183 | expect(data).toEqual({ 184 | effect: '120 Chance' 185 | }) 186 | }) 187 | }) 188 | 189 | describe('Herror handling', () => { 190 | test('Scrape invalid URL', async () => { 191 | await expect( 192 | scrapeTA('http://gertkafgzngegzegerj.com', { 193 | title: 'h1.title' 194 | }) 195 | ).rejects.toThrow() 196 | }) 197 | 198 | test('Use reserved keyword directly', async () => { 199 | await expect( 200 | scrapeTA(`http://localhost:${port}`, { 201 | attribute: 'h1.title' 202 | }) 203 | ).rejects.toThrow('Root object must be a nested object.') 204 | }) 205 | 206 | test('Use reserved keyword in nested object', async () => { 207 | await expect( 208 | scrapeTA(`http://localhost:${port}`, { 209 | accessor: { 210 | img: { 211 | selector: 'img', 212 | attribute: 'src' 213 | } 214 | } 215 | }) 216 | ).rejects.toThrow( 217 | 'The property "accessor" expects a string or a function. If you want to use "accessor" as a result key, prefix it with an underscore (the first will be stripped automatically).' 218 | ) 219 | }) 220 | }) 221 | -------------------------------------------------------------------------------- /changelog-ci-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "header_prefix": "Release:", 3 | "group_config": [ 4 | { 5 | "title": ":bug: Bug Fixes", 6 | "labels": ["bug", "bugfix"] 7 | }, 8 | { 9 | "title": ":rocket: Code Improvements", 10 | "labels": ["improvements", "enhancement"] 11 | }, 12 | { 13 | "title": ":sparkles: New Features", 14 | "labels": ["feature"] 15 | }, 16 | { 17 | "title": ":books: Documentation Updates", 18 | "labels": ["docs", "documentation", "doc"] 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | // For a detailed explanation regarding each configuration property, visit: 2 | // https://jestjs.io/docs/en/configuration.html 3 | 4 | module.exports = { 5 | // All imported modules in your tests should be mocked automatically 6 | // automock: false, 7 | 8 | // Stop running tests after `n` failures 9 | // bail: 0, 10 | 11 | // The directory where Jest should store its cached dependency information 12 | // cacheDirectory: "/tmp/jest_rs", 13 | 14 | // Automatically clear mock calls and instances between every test 15 | clearMocks: true, 16 | 17 | // Indicates whether the coverage information should be collected while executing the test 18 | // collectCoverage: false, 19 | 20 | // An array of glob patterns indicating a set of files for which coverage information should be collected 21 | // collectCoverageFrom: undefined, 22 | 23 | // The directory where Jest should output its coverage files 24 | coverageDirectory: "coverage", 25 | 26 | // An array of regexp pattern strings used to skip coverage collection 27 | // coveragePathIgnorePatterns: [ 28 | // "/node_modules/" 29 | // ], 30 | 31 | // Indicates which provider should be used to instrument code for coverage 32 | coverageProvider: "v8", 33 | 34 | // A list of reporter names that Jest uses when writing coverage reports 35 | // coverageReporters: [ 36 | // "json", 37 | // "text", 38 | // "lcov", 39 | // "clover" 40 | // ], 41 | 42 | // An object that configures minimum threshold enforcement for coverage results 43 | // coverageThreshold: undefined, 44 | 45 | // A path to a custom dependency extractor 46 | // dependencyExtractor: undefined, 47 | 48 | // Make calling deprecated APIs throw helpful error messages 49 | // errorOnDeprecated: false, 50 | 51 | // Force coverage collection from ignored files using an array of glob patterns 52 | // forceCoverageMatch: [], 53 | 54 | // A path to a module which exports an async function that is triggered once before all test suites 55 | // globalSetup: undefined, 56 | 57 | // A path to a module which exports an async function that is triggered once after all test suites 58 | // globalTeardown: undefined, 59 | 60 | // A set of global variables that need to be available in all test environments 61 | // globals: {}, 62 | 63 | // The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers. 64 | // maxWorkers: "50%", 65 | 66 | // An array of directory names to be searched recursively up from the requiring module's location 67 | // moduleDirectories: [ 68 | // "node_modules" 69 | // ], 70 | 71 | // An array of file extensions your modules use 72 | // moduleFileExtensions: [ 73 | // "js", 74 | // "json", 75 | // "jsx", 76 | // "ts", 77 | // "tsx", 78 | // "node" 79 | // ], 80 | 81 | // A map from regular expressions to module names or to arrays of module names that allow to stub out resources with a single module 82 | // moduleNameMapper: {}, 83 | 84 | // An array of regexp pattern strings, matched against all module paths before considered 'visible' to the module loader 85 | // modulePathIgnorePatterns: [], 86 | 87 | // Activates notifications for test results 88 | // notify: false, 89 | 90 | // An enum that specifies notification mode. Requires { notify: true } 91 | // notifyMode: "failure-change", 92 | 93 | // A preset that is used as a base for Jest's configuration 94 | // preset: undefined, 95 | 96 | // Run tests from one or more projects 97 | // projects: undefined, 98 | 99 | // Use this configuration option to add custom reporters to Jest 100 | // reporters: undefined, 101 | 102 | // Automatically reset mock state between every test 103 | // resetMocks: false, 104 | 105 | // Reset the module registry before running each individual test 106 | // resetModules: false, 107 | 108 | // A path to a custom resolver 109 | // resolver: undefined, 110 | 111 | // Automatically restore mock state between every test 112 | // restoreMocks: false, 113 | 114 | // The root directory that Jest should scan for tests and modules within 115 | // rootDir: undefined, 116 | 117 | // A list of paths to directories that Jest should use to search for files in 118 | // roots: [ 119 | // "" 120 | // ], 121 | 122 | // Allows you to use a custom runner instead of Jest's default test runner 123 | // runner: "jest-runner", 124 | 125 | // The paths to modules that run some code to configure or set up the testing environment before each test 126 | // setupFiles: [], 127 | 128 | // A list of paths to modules that run some code to configure or set up the testing framework before each test 129 | // setupFilesAfterEnv: [], 130 | 131 | // The number of seconds after which a test is considered as slow and reported as such in the results. 132 | // slowTestThreshold: 5, 133 | 134 | // A list of paths to snapshot serializer modules Jest should use for snapshot testing 135 | // snapshotSerializers: [], 136 | 137 | // The test environment that will be used for testing 138 | testEnvironment: "node", 139 | 140 | // Options that will be passed to the testEnvironment 141 | // testEnvironmentOptions: {}, 142 | 143 | // Adds a location field to test results 144 | // testLocationInResults: false, 145 | 146 | // The glob patterns Jest uses to detect test files 147 | testMatch: [ 148 | "**/__tests__/**/*.+(ts|tsx|js)", 149 | "**/?(*.)+(spec|test).+(ts|tsx|js)" 150 | ], 151 | 152 | // An array of regexp pattern strings that are matched against all test paths, matched tests are skipped 153 | testPathIgnorePatterns: [ 154 | "/node_modules/", 155 | "/dist/" 156 | ], 157 | 158 | // The regexp pattern or array of patterns that Jest uses to detect test files 159 | // testRegex: [], 160 | 161 | // This option allows the use of a custom results processor 162 | // testResultsProcessor: undefined, 163 | 164 | // This option allows use of a custom test runner 165 | // testRunner: "jasmine2", 166 | 167 | // This option sets the URL for the jsdom environment. It is reflected in properties such as location.href 168 | // testURL: "http://localhost", 169 | 170 | // Setting this value to "fake" allows the use of fake timers for functions such as "setTimeout" 171 | // timers: "real", 172 | 173 | // A map from regular expressions to paths to transformers 174 | transform: { 175 | "^.+\\.(ts|tsx)$": "ts-jest" 176 | }, 177 | 178 | // An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation 179 | // transformIgnorePatterns: [ 180 | // "/node_modules/", 181 | // "\\.pnp\\.[^\\/]+$" 182 | // ], 183 | 184 | // An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them 185 | // unmockedModulePathPatterns: undefined, 186 | 187 | // Indicates whether each individual test should be reported during the run 188 | // verbose: undefined, 189 | 190 | // An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode 191 | // watchPathIgnorePatterns: [], 192 | 193 | // Whether to use watchman for file crawling 194 | // watchman: true, 195 | }; 196 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrape-them-all", 3 | "description": "🚀 An easy-to-handle Node.js scraper that allow you to scrape them all in a record time.", 4 | "version": "1.0.4", 5 | "license": "MIT", 6 | "author": "Tanuki (https://github.com/tanukijs)", 7 | "contributors": [ 8 | "Aperrix (https://github.com/Aperrix)" 9 | ], 10 | "repository": "https://github.com/tanukijs/scrape-them-all.git", 11 | "source": "src/index.ts", 12 | "main": "dist/index.js", 13 | "types": "dist/index.d.ts", 14 | "engines": { 15 | "node": ">=10" 16 | }, 17 | "keywords": [ 18 | "scraper", 19 | "node-scraper", 20 | "scraping", 21 | "web-scraping", 22 | "htmlparser", 23 | "parser", 24 | "parsing", 25 | "crawler", 26 | "crawling" 27 | ], 28 | "scripts": { 29 | "lint": "eslint --ext .ts --ignore-path .gitignore .", 30 | "lintfix": "eslint --fix --ext .ts --ignore-path .gitignore .", 31 | "build": "rimraf dist && tsc -p tsconfig.json", 32 | "test": "jest", 33 | "prepare": "npm run build && npm run test" 34 | }, 35 | "dependencies": { 36 | "cheerio": "^1.0.0-rc.3", 37 | "node-fetch": "^2.6.1" 38 | }, 39 | "devDependencies": { 40 | "@types/cheerio": "^0.22.22", 41 | "@types/jest": "^26.0.14", 42 | "@types/node": "^14.11.8", 43 | "@types/node-fetch": "^2.5.7", 44 | "@typescript-eslint/eslint-plugin": "^4.4.1", 45 | "@typescript-eslint/parser": "^4.4.1", 46 | "eslint": "^7.11.0", 47 | "eslint-config-prettier": "^6.12.0", 48 | "eslint-config-standard": "^14.1.1", 49 | "eslint-plugin-import": "^2.22.1", 50 | "eslint-plugin-node": "^11.1.0", 51 | "eslint-plugin-prettier": "^3.1.4", 52 | "eslint-plugin-promise": "^4.2.1", 53 | "eslint-plugin-standard": "^4.0.1", 54 | "fetch-cookie": "^0.10.1", 55 | "jest": "^26.5.3", 56 | "prettier": "^2.1.2", 57 | "rimraf": "^3.0.2", 58 | "ts-jest": "^26.4.1", 59 | "tslib": "^2.0.3", 60 | "typescript": "^4.0.3" 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/DataModeler.ts: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio' 2 | import { EOptionType, SchemeInterpreter } from './SchemeInterpreter' 3 | 4 | export class DataModeler { 5 | private $root: cheerio.Root 6 | 7 | constructor(body: string) { 8 | this.$root = cheerio.load(body) 9 | } 10 | 11 | /** 12 | * Generate data from HTML body & user-designed JSON scheme 13 | * 14 | * @param {SchemeInterpreter} opts 15 | * @param {cheerio.Cheerio} [context] 16 | * 17 | * @returns {Promise>} 18 | */ 19 | async generate( 20 | opts: SchemeInterpreter, 21 | context?: cheerio.Cheerio 22 | ): Promise> { 23 | if (opts.type !== EOptionType.OBJECT) 24 | throw new Error('Root object must be a nested object.') 25 | 26 | const mappedResult = {} 27 | 28 | for (const key in opts.children) { 29 | const value = new SchemeInterpreter(opts.children[key]) 30 | const cheerioRoot = 31 | context && value.selector 32 | ? context.find(value.selector) 33 | : context || this.$root(value.selector) 34 | 35 | if (value.type === EOptionType.OBJECT) { 36 | mappedResult[key] = await this.generate(value, cheerioRoot) 37 | continue 38 | } 39 | 40 | const result = 41 | value.type === EOptionType.VALUE 42 | ? this.processValue(cheerioRoot, value) 43 | : value.type === EOptionType.ARRAY 44 | ? this.processArray(cheerioRoot, value.listModel as SchemeInterpreter) 45 | : value.type === EOptionType.OBJECT_ARRAY 46 | ? this.processObjectArray(cheerioRoot, value.listModel as SchemeInterpreter) 47 | : undefined 48 | mappedResult[key] = await (Array.isArray(result) ? Promise.all(result) : result) 49 | } 50 | 51 | return mappedResult 52 | } 53 | 54 | /** 55 | * Process single item 56 | * 57 | * @param {cheerio.Cheerio} element 58 | * @param {SchemeInterpreter} opts 59 | * 60 | * @returns {unknown} 61 | */ 62 | private processValue(element: cheerio.Cheerio, opts: SchemeInterpreter): unknown { 63 | let value = 64 | typeof opts.accessor === 'function' 65 | ? opts.accessor(element) 66 | : typeof opts.accessor === 'string' && typeof element[opts.accessor] !== undefined 67 | ? element[opts.accessor]() 68 | : null 69 | 70 | if (opts.attribute) value = element.attr(opts.attribute) 71 | if (opts.trim && value && typeof value === 'string') value = value.trim() 72 | if (opts.transformer) value = opts.transformer(value) 73 | 74 | return value 75 | } 76 | 77 | /** 78 | * Process basic list 79 | * 80 | * @param {cheerio.Cheerio} element 81 | * @param {SchemeInterpreter} listModel 82 | * 83 | * @returns {unknown[]} 84 | */ 85 | private processArray( 86 | element: cheerio.Cheerio, 87 | listModel: SchemeInterpreter 88 | ): unknown[] { 89 | const values = [] 90 | const children = element.find(listModel.selector) 91 | for (let i = 0; i < children.length; i++) { 92 | const value = this.processValue(children.eq(i), listModel) 93 | values.push(value) 94 | } 95 | return values 96 | } 97 | 98 | /** 99 | * Process list of objects 100 | * 101 | * @param {cheerio.Cheerio} element 102 | * @param {SchemeInterpreter} listModel 103 | * 104 | * @returns {Promise>[]} 105 | */ 106 | private processObjectArray( 107 | element: cheerio.Cheerio, 108 | listModel: SchemeInterpreter 109 | ): Promise>[] { 110 | const values = [] 111 | for (let i = 0; i < element.length; i++) { 112 | const value = this.generate(listModel, element.eq(i)) 113 | values.push(value) 114 | } 115 | return values 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/SchemeInterpreter.ts: -------------------------------------------------------------------------------- 1 | import { ScrapeTAScheme } from './typings' 2 | 3 | export const enum EOptionType { 4 | VALUE, 5 | ARRAY, 6 | OBJECT, 7 | OBJECT_ARRAY 8 | } 9 | 10 | export class SchemeInterpreter { 11 | readonly selector: string = '' 12 | readonly trim: boolean = true 13 | readonly accessor: string | ((node: cheerio.Cheerio) => unknown) = 'text' 14 | readonly attribute?: string 15 | readonly transformer?: (value: string) => unknown 16 | readonly listModel?: string | ScrapeTAScheme | SchemeInterpreter 17 | readonly children: Record = {} 18 | 19 | constructor(opts: string | Partial = '') { 20 | if (typeof opts === 'string') { 21 | this.selector = opts 22 | } else { 23 | this.selector = opts.selector || '' 24 | this.trim = opts.trim || true 25 | this.accessor = opts.accessor || 'text' 26 | this.attribute = opts.attribute 27 | this.transformer = opts.transformer 28 | this.listModel = opts.listModel ? new SchemeInterpreter(opts.listModel) : undefined 29 | 30 | const reservedKeys = Object.keys(this) 31 | for (const key in opts) { 32 | const normalizedKey = key[0] === '_' ? key.slice(1) : key 33 | if (reservedKeys.includes(key) && normalizedKey === key) continue 34 | this.children[normalizedKey] = opts[key] 35 | } 36 | } 37 | this.validate() 38 | } 39 | 40 | /** 41 | * Get type of an input 42 | * 43 | * @param {ScrapeTAScheme[K]} scheme 44 | * @returns {EOptionType} 45 | */ 46 | public get type(): EOptionType { 47 | if (Object.keys(this.children).length > 0) return EOptionType.OBJECT 48 | if (!this.listModel) return EOptionType.VALUE 49 | 50 | if ( 51 | this.listModel instanceof SchemeInterpreter && 52 | Object.keys(this.listModel.children).length > 0 53 | ) 54 | return EOptionType.OBJECT_ARRAY 55 | return EOptionType.ARRAY 56 | } 57 | 58 | /** 59 | * Validate current SchemeInterpreter object 60 | * 61 | * @returns {void} 62 | * @throws {Error} 63 | */ 64 | public validate(): void { 65 | const expected = [ 66 | { property: 'selector', equalsTo: ['string'] }, 67 | { property: 'trim', equalsTo: ['boolean'] }, 68 | { property: 'accessor', equalsTo: ['string', 'function'] }, 69 | { property: 'attribute', equalsTo: ['string'] }, 70 | { property: 'transformer', equalsTo: ['function'] }, 71 | { property: 'listModel', equalsTo: ['string', 'object'] } 72 | ] 73 | 74 | for (const { property, equalsTo } of expected) { 75 | if (!this[property]) continue 76 | const asExpectedValue = equalsTo.map((type) => typeof this[property] === type) 77 | if (asExpectedValue.includes(true)) continue 78 | const errorTypes = equalsTo.join(' or a ') 79 | const errorMessage = [ 80 | `The property "${property}" expects a ${errorTypes}.`, 81 | `If you want to use "${property}" as a result key, prefix it with an underscore (the first will be stripped automatically).` 82 | ].join(' ') 83 | throw new Error(errorMessage) 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | import nodeFetch, { RequestInfo, RequestInit } from 'node-fetch' 2 | import { SchemeInterpreter } from './SchemeInterpreter' 3 | import { DataModeler } from './DataModeler' 4 | import { 5 | ScrapeTAExtraParams, 6 | ScrapeTARequest, 7 | ScrapeTAScheme, 8 | ScrapeTAResult 9 | } from './typings' 10 | 11 | /** 12 | * Create an instance of node-fetch with managed cookies 13 | * 14 | * @param {ScrapeTAExtraParams} query 15 | * @returns {Promise} 16 | */ 17 | async function withCookies(query: ScrapeTAExtraParams): Promise { 18 | try { 19 | const { default: fetchCookie } = await import('fetch-cookie/node-fetch') 20 | const cookieJar = typeof query.cookieJar === 'boolean' ? undefined : query.cookieJar 21 | return fetchCookie(nodeFetch, cookieJar) as typeof nodeFetch 22 | } catch (e) { 23 | throw new Error('Please run `npm install fetch-cookie` to use the cookieJar option.') 24 | } 25 | } 26 | 27 | /** 28 | * Get HTML body and transform it as user-designed object 29 | * 30 | * @param {ScrapeTARequest} query 31 | * @param {ScrapeTAScheme} scheme 32 | * 33 | * @returns {Promise} 34 | */ 35 | export async function scrapeTA( 36 | request: ScrapeTARequest, 37 | scheme: ScrapeTAScheme 38 | ): Promise { 39 | const fetch = 40 | typeof request === 'object' && 'cookieJar' in request && request.cookieJar 41 | ? await withCookies(request) 42 | : nodeFetch 43 | const requestInfo = ((typeof request === 'object' && 'url' in request && request.url) || 44 | request) as RequestInfo 45 | const requestInit = typeof request === 'object' ? (request as RequestInit) : undefined 46 | const response = await fetch(requestInfo, requestInit) 47 | const responseHTML = await response.text() 48 | const dataModeler = new DataModeler(responseHTML) 49 | const usableScheme = new SchemeInterpreter(scheme) 50 | const data = await dataModeler.generate(usableScheme) 51 | return { response, data } 52 | } 53 | 54 | export * from './typings' 55 | -------------------------------------------------------------------------------- /src/typings.ts: -------------------------------------------------------------------------------- 1 | import { RequestInfo, RequestInit, Response } from 'node-fetch' 2 | import { CookieJar } from 'fetch-cookie' 3 | import { SchemeInterpreter } from './SchemeInterpreter' 4 | 5 | export type ScrapeTAExtraParams = { 6 | url: RequestInfo 7 | cookieJar?: boolean | CookieJar 8 | } 9 | 10 | export type ScrapeTARequest = RequestInfo | (ScrapeTAExtraParams & RequestInit) 11 | 12 | export type ScrapeTAResult = { 13 | response: Response 14 | data: Record 15 | } 16 | 17 | type TSchemeInterpreter = Partial< 18 | Pick< 19 | SchemeInterpreter, 20 | 'selector' | 'accessor' | 'attribute' | 'trim' | 'listModel' | 'transformer' 21 | > 22 | > 23 | 24 | export type ScrapeTAScheme = { 25 | [key: string]: string | TSchemeInterpreter | ScrapeTAScheme 26 | } 27 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Basic Options */ 4 | "target": "ESNext", 5 | "module": "CommonJS", 6 | "lib": ["ESNext", "DOM"], 7 | "allowJs": true, 8 | "checkJs": true, 9 | "declaration": true, 10 | "declarationMap": false, 11 | "sourceMap": false, 12 | "outDir": "dist", 13 | "rootDir": "src", 14 | "removeComments": true, 15 | "importHelpers": true, 16 | "downlevelIteration": true, 17 | "esModuleInterop": true, 18 | 19 | /* Strict Type-Checking Options */ 20 | "strict": true, 21 | "noImplicitAny": true, 22 | "strictNullChecks": true, 23 | "strictFunctionTypes": true, 24 | "strictBindCallApply": true, 25 | "strictPropertyInitialization": true, 26 | "noImplicitThis": true, 27 | "alwaysStrict": true, 28 | 29 | /* Additional Checks */ 30 | "noUnusedLocals": true, 31 | "noUnusedParameters": true, 32 | "noImplicitReturns": true, 33 | "noFallthroughCasesInSwitch": true, 34 | "suppressImplicitAnyIndexErrors": true, 35 | 36 | /* Module Resolution Options */ 37 | "moduleResolution": "node", 38 | "resolveJsonModule": true, 39 | "baseUrl": "src", 40 | "paths": { 41 | "@/*": ["*"], 42 | }, 43 | 44 | /* Advanced Options */ 45 | "forceConsistentCasingInFileNames": true, 46 | }, 47 | "include": ["src"], 48 | "exclude": ["node_modules", "dist"] 49 | } 50 | --------------------------------------------------------------------------------