├── .editorconfig
├── .eslintrc
├── .github
    └── workflows
    │   └── build-test.yaml
├── .gitignore
├── .npmignore
├── .prettierrc
├── CHANGELOG.md
├── LICENSE
├── README.md
├── __tests__
    ├── public
    │   └── index.html
    └── scraping.test.ts
├── changelog-ci-config.json
├── jest.config.js
├── package.json
├── src
    ├── DataModeler.ts
    ├── SchemeInterpreter.ts
    ├── index.ts
    └── typings.ts
├── tsconfig.json
└── yarn.lock


/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 | 
3 | [*]
4 | indent_style = space
5 | indent_size = 2
6 | charset = utf-8
7 | trim_trailing_whitespace = true
8 | insert_final_newline = true
9 | 


--------------------------------------------------------------------------------
/.eslintrc:
--------------------------------------------------------------------------------
 1 | {
 2 |   "env": {
 3 |     "es2021": true,
 4 |     "node": true
 5 |   },
 6 |   "parser": "@typescript-eslint/parser",
 7 |   "parserOptions": {
 8 |     "ecmaVersion": 2020,
 9 |     "sourceType": "module",
10 |     "impliedStrict": true
11 |   },
12 |   "extends": [
13 |     "standard",
14 |     "plugin:@typescript-eslint/recommended",
15 |     "prettier/@typescript-eslint",
16 |     "plugin:prettier/recommended"
17 |   ],
18 |   "plugins": [
19 |     "@typescript-eslint"
20 |   ],
21 |   "rules": {},
22 |   "ignorePatterns": [
23 |     "**/*.js"
24 |   ]
25 | }
26 | 


--------------------------------------------------------------------------------
/.github/workflows/build-test.yaml:
--------------------------------------------------------------------------------
 1 | name: Build & test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches-ignore:
 6 |       - master
 7 |   pull_request:
 8 |     branches-ignore:
 9 |       - master
10 | 
11 | jobs:
12 |   build:
13 |     runs-on: ${{ matrix.os }}
14 | 
15 |     strategy:
16 |       matrix:
17 |         node-version: [10.x, 12.x, 14.x]
18 |         os: [ubuntu-latest, windows-latest, macos-latest]
19 | 
20 |     steps:
21 |       - name: Checkout repository
22 |         uses: actions/checkout@v2
23 |         with:
24 |           fetch-depth: 0
25 | 
26 |       - name: Setup Node.js ${{ matrix.node-version }}
27 |         uses: actions/setup-node@v1
28 |         with:
29 |           node-version: ${{ matrix.node-version }}
30 | 
31 |       - name: Install dependencies
32 |         run: yarn --check-files --non-interactive
33 |       
34 |       - name: Build project
35 |         run: yarn run build
36 |       
37 |       - name: Lint code
38 |         run: yarn lint # check only: fixes applied on a CI runner are discarded and would hide violations
39 | 
40 |       - name: Run unit tests
41 |         run: yarn test
42 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | logs
  3 | *.log
  4 | npm-debug.log*
  5 | yarn-debug.log*
  6 | yarn-error.log*
  7 | lerna-debug.log*
  8 | 
  9 | # Diagnostic reports (https://nodejs.org/api/report.html)
 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 11 | 
 12 | # Runtime data
 13 | pids
 14 | *.pid
 15 | *.seed
 16 | *.pid.lock
 17 | 
 18 | # Directory for instrumented libs generated by jscoverage/JSCover
 19 | lib-cov
 20 | 
 21 | # Coverage directory used by tools like istanbul
 22 | coverage
 23 | *.lcov
 24 | 
 25 | # nyc test coverage
 26 | .nyc_output
 27 | 
 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 29 | .grunt
 30 | 
 31 | # Bower dependency directory (https://bower.io/)
 32 | bower_components
 33 | 
 34 | # node-waf configuration
 35 | .lock-wscript
 36 | 
 37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 38 | build/Release
 39 | 
 40 | # Dependency directories
 41 | node_modules/
 42 | jspm_packages/
 43 | 
 44 | # Snowpack dependency directory (https://snowpack.dev/)
 45 | web_modules/
 46 | 
 47 | # TypeScript cache
 48 | *.tsbuildinfo
 49 | 
 50 | # Optional npm cache directory
 51 | .npm
 52 | 
 53 | # Optional eslint cache
 54 | .eslintcache
 55 | 
 56 | # Microbundle cache
 57 | .rpt2_cache/
 58 | .rts2_cache_cjs/
 59 | .rts2_cache_es/
 60 | .rts2_cache_umd/
 61 | 
 62 | # Optional REPL history
 63 | .node_repl_history
 64 | 
 65 | # Output of 'npm pack'
 66 | *.tgz
 67 | 
 68 | # Yarn Integrity file
 69 | .yarn-integrity
 70 | 
 71 | # dotenv environment variables file
 72 | .env
 73 | .env.test
 74 | 
 75 | # parcel-bundler cache (https://parceljs.org/)
 76 | .cache
 77 | .parcel-cache
 78 | 
 79 | # Next.js build output
 80 | .next
 81 | out
 82 | 
 83 | # Nuxt.js build / generate output
 84 | .nuxt
 85 | dist
 86 | 
 87 | # Gatsby files
 88 | .cache/
 89 | # Comment in the public line in if your project uses Gatsby and not Next.js
 90 | # https://nextjs.org/blog/next-9-1#public-directory-support
 91 | # public
 92 | 
 93 | # vuepress build output
 94 | .vuepress/dist
 95 | 
 96 | # Serverless directories
 97 | .serverless/
 98 | 
 99 | # FuseBox cache
100 | .fusebox/
101 | 
102 | # DynamoDB Local files
103 | .dynamodb/
104 | 
105 | # TernJS port file
106 | .tern-port
107 | 
108 | # Stores VSCode versions used for testing VSCode extensions
109 | .vscode-test
110 | 
111 | # yarn v2
112 | .yarn/cache
113 | .yarn/unplugged
114 | .yarn/build-state.yml
115 | .yarn/install-state.gz
116 | .pnp.*
117 | 
118 | # exclusive use of yarn
119 | package-lock.json
120 | local
121 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
 1 | src/
 2 | __tests__/
 3 | .github/
 4 | local
 5 | .editorconfig
 6 | .eslintrc
 7 | .prettierrc
 8 | changelog-ci-config.json
 9 | CHANGELOG.md
10 | jest.config.js
11 | tsconfig.json


--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |     "printWidth": 90,
3 |     "trailingComma": "none",
4 |     "arrowParens": "always",
5 |     "tabWidth": 2,
6 |     "semi": false,
7 |     "singleQuote": true
8 | }
9 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tanukijs/scrape-them-all/d34eb74f88754006e138e60f7e99321f32fa462f/CHANGELOG.md


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Tanuki
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <img src="https://i.imgur.com/rhrbozr.png" alt="Scrape Them All">
  3 | </p>
  4 | <p align="center">
  5 |   <img src="https://img.shields.io/github/license/tanukijs/scrape-them-all" alt="License">
  6 |   <img src="https://github.com/tanukijs/scrape-them-all/workflows/Build%20%26%20test/badge.svg" alt="Build & test">
  7 | </p>
  8 | 
  9 | **[Scrape-Them-All](http://npmjs.com/package/scrape-them-all)** is a [Cheerio](https://cheerio.js.org) layer which improves your scraping experience.
 10 | 
 11 | **This package is recent. If you have any suggestions or notice that something is not working, feel free to open an issue or a pull request; we will be happy to respond as soon as possible.**
 12 | 
 13 | ---
 14 | 
 15 | # 📦 Installation
 16 | 
 17 | ```sh
 18 | # Using NPM
 19 | npm install --save scrape-them-all
 20 | npm install --save fetch-cookie #optional
 21 | 
 22 | # Using Yarn
 23 | yarn add scrape-them-all
 24 | yarn add fetch-cookie #optional
 25 | ```
 26 | 
 27 | `fetch-cookie` is only required if you plan to use the `cookieJar` option on requests.
 28 | 
 29 | **⚠ If you get a `too many redirects` error when you scrape, we recommend installing `fetch-cookie` and setting the option `cookieJar: true` in your request. You can also pass an instance of `tough.CookieJar` to this parameter.**
 30 | 
 31 | Example:
 32 | 
 33 | ```js
 34 | scrapeTA({ url: 'https://google.com', cookieJar: true }, ...)
 35 | ```
 36 | 
 37 | ---
 38 | 
 39 | # 📚 Documentation
 40 | 
 41 | ### `scrapeTA(query, schema)`
 42 | 
 43 | Params:
 44 | 
 45 | - **query** `String` or `Object`: The page URL, or an object containing the page `url` and node-fetch options.
 46 | - **schema** `Object`: the list of elements to scrape and the corresponding HTML tags.
 47 | 
 48 | Returns:
 49 | 
 50 | - `Promise<Object>`: A promise resolving to an object with the fetch `response` and the scraped `data`.
 51 | 
 52 | ## Schema options
 53 | 
 54 | | Option        | Type                   | Description                                                                                                                            |
 55 | | ------------- | ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
 56 | | **selector**  | `String` or `Object`   | Can be a string expression, DOM Element, array of DOM elements, or cheerio object.                                                     |
 57 | | **trim**      | `Boolean`              | Trim whitespaces in the result. **Default as `true`**.                                                                                 |
 58 | | **attribute** | `String`               | Return the value of the indicated attribute on the selected element.                                                                   |
 59 | | **accessor**    | `String` or `Function` | Cheerio access method name (like `html` for returning html code) or a custom function that takes a Cheerio instance as its first parameter. |
 60 | | **transformer** | `Function`             | The first parameter is your current value for the selected item. Can return a `Promise`.                                               |
 61 | | **listModel** | `Object`               | Contains the options stated above in case of a list.                                                                                   |
 62 | 
 63 | ## Example output
 64 | 
 65 | ```json
 66 | {
 67 |     "title": "An amazing game",
 68 |     "description": "<p>With an amazing description</p>",
 69 |     "image": "https://amazing.game/image.jpg",
 70 |     "price": 10.99,
 71 |     "users": [
 72 |         {
 73 |             "username": "Tanuki",
 74 |             "badges": [
 75 |                 { "name": "An amazing player" },
 76 |                 ...
 77 |             ]
 78 |         },
 79 |         ...
 80 |     ]
 81 | }
 82 | ```
 83 | 
 84 | ## The code that goes with it
 85 | 
 86 | ```js
 87 | const { scrapeTA } = require('scrape-them-all')
 88 | scrapeTA('url_or_https_options', {
 89 |   title: '.header h1',
 90 |   description: {
 91 |     selector: '.header p',
 92 |     accessor: 'html',
 93 |     //  accessor: selected => selected.html(),
 94 |     trim: false
 95 |   },
 96 |   image: {
 97 |     selector: 'img',
 98 |     attribute: 'src'
 99 |   },
100 |   price: {
101 |     selector: '.footer #price',
102 |     transformer: (value) => parseFloat(value)
103 |   },
104 |   users: {
105 |     selector: '.body .users',
106 |     listModel: {
107 |       username: '.username',
108 |       badges: {
109 |         selector: '.badges',
110 |         listModel: {
111 |           name: '.badgeName'
112 |         }
113 |       }
114 |     }
115 |   }
116 | })
117 |   .then(({ data }) => console.log(data))
118 |   .catch((error) => console.error(error))
119 | ```
120 | 
121 | ---
122 | 
123 | # 💪 Contributions
124 | 
125 | TODO
126 | 
127 | ---
128 | 
129 | # 📜 License
130 | 
131 | [MIT](https://github.com/tanukijs/scrape-them-all/blob/typescript/LICENSE) © [Tanuki](https://github.com/tanukijs), [Aperrix](https://github.com/Aperrix).
132 | 


--------------------------------------------------------------------------------
/__tests__/public/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 |   <head>
 4 |       <meta charset="UTF-8">
 5 |       <title>ScrapeThemAll Examples</title>
 6 |   </head>
 7 |   <body>
 8 |     <img src="https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/ee/ee276885cdbec23bdb9780509210c3c24dc7070e_full.jpg" alt="dog">
 9 |     <h1 class="title">Title</h1>
10 |     <p class="description">Lorem ipsum</p>
11 |     <p class="textnodes">
12 |       Line0<br>
13 |       Line1
14 |     </p>
15 |     <p class="deep-textnodes">
16 |       <span>
17 |         Line0<br>
18 |         Line1
19 |       </span>
20 |       Line2
21 |     </p>
22 |     <p class="date">1988-01-01</p>
23 |     <ul class="features">
24 |       <li>1</li>
25 |       <li>2</li>
26 |       <li>3</li>
27 |       <li>4</li>
28 |       <li>5</li>
29 |       <li>6</li>
30 |     </ul>
31 |     <div class="nested">
32 |       <span>Foo</span>
33 |       <div class="level1">
34 |         <span>1</span>
35 |         <div class="level2">
36 |           <span>2</span>
37 |         </div>
38 |       </div>
39 |     </div>
40 |     <table id='detail'>
41 |       <thead>
42 |         <tr>
43 |           <th>
44 |             <div id="start-A" class="city">Sydney</div>
45 |           </th>
46 |           <th>
47 |             <div class="col-two-head">info</div>
48 |           </th>
49 |         </tr>
50 |       </thead>
51 |       <tbody>
52 |           <tr>
53 |             <td><a href="../1234" class="address">one way street</a>
54 |             </td>
55 |             <td>
56 |               <div class="col-two">info-1</div>
57 |             </td>
58 |           </tr>
59 |           <tr>
60 |             <td><a href="../4567" class="address">GT Road</a>
61 |             </td>
62 |             <td>
63 |               <div class="col-two">info-2</div>
64 |             </td>
65 |           </tr>
66 |       </tbody>
67 |     </table>
68 |   </body>
69 | </html>
70 | 


--------------------------------------------------------------------------------
/__tests__/scraping.test.ts:
--------------------------------------------------------------------------------
  1 | import { scrapeTA } from '../src'
  2 | import { createServer, Server } from 'http'
  3 | import { readFileSync } from 'fs'
  4 | import { join } from 'path'
  5 | 
  6 | let server: Server
  7 | const port = 8080
  8 | 
  9 | beforeAll((done) => {
 10 |   server = createServer((_req, res) => {
 11 |     const indexPath = join(__dirname, 'public/index.html') // fixture served for every request
 12 |     const indexHTML = readFileSync(indexPath)
 13 |     res.write(indexHTML)
 14 |     res.end()
 15 |   })
 16 |   server.listen(port, () => done()) // tell Jest to start the tests once the server is listening
 17 | })
 18 | afterAll((done) => {
 19 |   server.close(() => done()) // finish the run only after the fixture server has shut down
 20 | })
 21 | 
 22 | describe('Scrape basic data', () => {
 23 |   test('Directly target the HTML element', async () => {
 24 |     const scraped = await scrapeTA(`http://localhost:${port}`, {
 25 |       title: 'h1.title',
 26 |       description: '.description',
 27 |       date: {
 28 |         selector: '.date',
 29 |         transformer: (rawDate) => new Date(rawDate) // scraped text -> Date
 30 |       }
 31 |     })
 32 |     expect(scraped.response.ok).toBe(true)
 33 |     expect(scraped.data).toEqual({
 34 |       title: 'Title',
 35 |       description: 'Lorem ipsum',
 36 |       date: new Date('1988-01-01')
 37 |     })
 38 |   })
 39 | 
 40 |   test('Use a reserved keyword', async () => {
 41 |     const scraped = await scrapeTA(`http://localhost:${port}`, {
 42 |       title: 'h1.title',
 43 |       _attribute: { // underscore-prefixed reserved word; expected back as plain `attribute`
 44 |         selector: 'img',
 45 |         attribute: 'src'
 46 |       }
 47 |     })
 48 |     expect(scraped.response.ok).toBe(true)
 49 |     expect(scraped.data).toEqual({
 50 |       title: 'Title',
 51 |       attribute:
 52 |         'https://steamcdn-a.akamaihd.net/steamcommunity/public/images/avatars/ee/ee276885cdbec23bdb9780509210c3c24dc7070e_full.jpg'
 53 |     })
 54 |   })
 55 | })
 56 | 
 57 | describe('Scrape list', () => {
 58 |   test('With transform', async () => {
 59 |     const scraped = await scrapeTA(`http://localhost:${port}`, {
 60 |       features: {
 61 |         selector: '.features',
 62 |         listModel: {
 63 |           selector: 'li',
 64 |           transformer: (itemText) => parseInt(itemText, 10) // each <li> text -> integer
 65 |         }
 66 |       }
 67 |     })
 68 |     expect(scraped.response.ok).toBe(true)
 69 |     expect(scraped.data).toEqual({
 70 |       features: [1, 2, 3, 4, 5, 6]
 71 |     })
 72 |   })
 73 | 
 74 |   test('Without transform', async () => {
 75 |     const scraped = await scrapeTA(`http://localhost:${port}`, {
 76 |       features: {
 77 |         selector: '.features',
 78 |         listModel: {
 79 |           selector: 'li' // no transformer: items are expected as raw strings
 80 |         }
 81 |       }
 82 |     })
 83 |     expect(scraped.response.ok).toBe(true)
 84 |     expect(scraped.data).toEqual({
 85 |       features: ['1', '2', '3', '4', '5', '6']
 86 |     })
 87 |   })
 88 | })
 89 | 
 90 | describe('Scrape nested object', () => {
 91 |   test('Object nested with multiple custom keys', async () => {
 92 |     const scraped = await scrapeTA(`http://localhost:${port}`, {
 93 |       nested: {
 94 |         selector: '.nested',
 95 |         foo: {
 96 |           level1: {
 97 |             selector: '.level1',
 98 |             level2: {
 99 |               selector: 'span',
100 |               accessor: ($spans) => $spans.eq(1).text() // text of the second matched span
101 |             }
102 |           },
103 |           level1Text: {
104 |             selector: 'span'
105 |           },
106 |           level2Text: {
107 |             selector: '.level2'
108 |           }
109 |         }
110 |       }
111 |     })
112 |     expect(scraped.response.ok).toBe(true)
113 |     expect(scraped.data).toEqual({
114 |       nested: {
115 |         foo: {
116 |           level1: {
117 |             level2: '2'
118 |           },
119 |           level1Text: 'Foo12',
120 |           level2Text: '2'
121 |         }
122 |       }
123 |     })
124 |   })
125 | 
126 |   test('Scrape tables using accessor', async () => {
127 |     const scraped = await scrapeTA(`http://localhost:${port}`, {
128 |       addresses: {
129 |         selector: 'table tbody tr',
130 |         listModel: {
131 |           address: '.address',
132 |           city: {
133 |             accessor: ($row) => $row.closest('table').find('thead .city').text()
134 |           }
135 |         }
136 |       }
137 |     })
138 |     expect(scraped.response.ok).toBe(true)
139 |     expect(scraped.data).toEqual({
140 |       addresses: [
141 |         { address: 'one way street', city: 'Sydney' },
142 |         { address: 'GT Road', city: 'Sydney' }
143 |       ]
144 |     })
145 |   })
146 | })
147 | 
148 | describe('Scrape using options', () => { // NOTE: these tests request live external sites
149 |   test('Store redirections using cookieJar option', async () => {
150 |     const scraped = await scrapeTA(
151 |       { url: 'http://www.krosmoz.com/en/almanax/2020-01-01', cookieJar: true },
152 |       {
153 |         month: {
154 |           selector: '#almanax_day .day-text'
155 |         }
156 |       }
157 |     )
158 |     expect(scraped.response.ok).toBe(true)
159 |     expect(scraped.data).toEqual({
160 |       month: 'Javian'
161 |     })
162 |   })
163 | 
164 |   test('Get data modified by AJAX using headers option', async () => {
165 |     const scraped = await scrapeTA(
166 |       {
167 |         url:
168 |           'https://www.dofus.com/en/mmorpg/encyclopedia/pets/11950-ankascraper?level=100&_pjax=.ak-item-details-container',
169 |         headers: {
170 |           'x-requested-with': 'XMLHttpRequest',
171 |           'x-pjax': 'true',
172 |           'x-pjax-container': '.ak-item-details-container'
173 |         }
174 |       },
175 |       {
176 |         effect: {
177 |           selector: '.ak-container.ak-content-list.ak-displaymode-col .ak-title',
178 |           accessor: ($titles) => $titles.eq(0).text() // text of the first matched title
179 |         }
180 |       }
181 |     )
182 |     expect(scraped.response.ok).toBe(true)
183 |     expect(scraped.data).toEqual({
184 |       effect: '120 Chance'
185 |     })
186 |   })
187 | })
188 | 
189 | describe('Error handling', () => { // every case below expects scrapeTA to reject
190 |   test('Scrape invalid URL', async () => {
191 |     await expect(
192 |       scrapeTA('http://gertkafgzngegzegerj.com', {
193 |         title: 'h1.title'
194 |       })
195 |     ).rejects.toThrow()
196 |   })
197 | 
198 |   test('Use reserved keyword directly', async () => {
199 |     await expect(
200 |       scrapeTA(`http://localhost:${port}`, {
201 |         attribute: 'h1.title' // reserved option name used as a top-level result key
202 |       })
203 |     ).rejects.toThrow('Root object must be a nested object.')
204 |   })
205 | 
206 |   test('Use reserved keyword in nested object', async () => {
207 |     await expect(
208 |       scrapeTA(`http://localhost:${port}`, {
209 |         accessor: { // reserved option name given an object instead of a string/function
210 |           img: {
211 |             selector: 'img',
212 |             attribute: 'src'
213 |           }
214 |         }
215 |       })
216 |     ).rejects.toThrow(
217 |       'The property "accessor" expects a string or a function. If you want to use "accessor" as a result key, prefix it with an underscore (the first will be stripped automatically).'
218 |     )
219 |   })
220 | })
221 | 


--------------------------------------------------------------------------------
/changelog-ci-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "header_prefix": "Release:",
 3 |   "group_config": [
 4 |     {
 5 |       "title": ":bug: Bug Fixes",
 6 |       "labels": ["bug", "bugfix"]
 7 |     },
 8 |     {
 9 |       "title": ":rocket: Code Improvements",
10 |       "labels": ["improvements", "enhancement"]
11 |     },
12 |     {
13 |       "title": ":sparkles: New Features",
14 |       "labels": ["feature"]
15 |     },
16 |     {
17 |       "title": ":books: Documentation Updates",
18 |       "labels": ["docs", "documentation", "doc"]
19 |     }
20 |   ]
21 | }
22 | 


--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
  1 | // For a detailed explanation regarding each configuration property, visit:
  2 | // https://jestjs.io/docs/en/configuration.html
  3 | 
  4 | module.exports = {
  5 |   // All imported modules in your tests should be mocked automatically
  6 |   // automock: false,
  7 | 
  8 |   // Stop running tests after `n` failures
  9 |   // bail: 0,
 10 | 
 11 |   // The directory where Jest should store its cached dependency information
 12 |   // cacheDirectory: "/tmp/jest_rs",
 13 | 
 14 |   // Automatically clear mock calls and instances between every test
 15 |   clearMocks: true,
 16 | 
 17 |   // Indicates whether the coverage information should be collected while executing the test
 18 |   // collectCoverage: false,
 19 | 
 20 |   // An array of glob patterns indicating a set of files for which coverage information should be collected
 21 |   // collectCoverageFrom: undefined,
 22 | 
 23 |   // The directory where Jest should output its coverage files
 24 |   coverageDirectory: "coverage",
 25 | 
 26 |   // An array of regexp pattern strings used to skip coverage collection
 27 |   // coveragePathIgnorePatterns: [
 28 |   //   "/node_modules/"
 29 |   // ],
 30 | 
 31 |   // Indicates which provider should be used to instrument code for coverage
 32 |   coverageProvider: "v8",
 33 | 
 34 |   // A list of reporter names that Jest uses when writing coverage reports
 35 |   // coverageReporters: [
 36 |   //   "json",
 37 |   //   "text",
 38 |   //   "lcov",
 39 |   //   "clover"
 40 |   // ],
 41 | 
 42 |   // An object that configures minimum threshold enforcement for coverage results
 43 |   // coverageThreshold: undefined,
 44 | 
 45 |   // A path to a custom dependency extractor
 46 |   // dependencyExtractor: undefined,
 47 | 
 48 |   // Make calling deprecated APIs throw helpful error messages
 49 |   // errorOnDeprecated: false,
 50 | 
 51 |   // Force coverage collection from ignored files using an array of glob patterns
 52 |   // forceCoverageMatch: [],
 53 | 
 54 |   // A path to a module which exports an async function that is triggered once before all test suites
 55 |   // globalSetup: undefined,
 56 | 
 57 |   // A path to a module which exports an async function that is triggered once after all test suites
 58 |   // globalTeardown: undefined,
 59 | 
 60 |   // A set of global variables that need to be available in all test environments
 61 |   // globals: {},
 62 | 
 63 |   // The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers.
 64 |   // maxWorkers: "50%",
 65 | 
 66 |   // An array of directory names to be searched recursively up from the requiring module's location
 67 |   // moduleDirectories: [
 68 |   //   "node_modules"
 69 |   // ],
 70 | 
 71 |   // An array of file extensions your modules use
 72 |   // moduleFileExtensions: [
 73 |   //   "js",
 74 |   //   "json",
 75 |   //   "jsx",
 76 |   //   "ts",
 77 |   //   "tsx",
 78 |   //   "node"
 79 |   // ],
 80 | 
 81 |   // A map from regular expressions to module names or to arrays of module names that allow to stub out resources with a single module
 82 |   // moduleNameMapper: {},
 83 | 
 84 |   // An array of regexp pattern strings, matched against all module paths before considered 'visible' to the module loader
 85 |   // modulePathIgnorePatterns: [],
 86 | 
 87 |   // Activates notifications for test results
 88 |   // notify: false,
 89 | 
 90 |   // An enum that specifies notification mode. Requires { notify: true }
 91 |   // notifyMode: "failure-change",
 92 | 
 93 |   // A preset that is used as a base for Jest's configuration
 94 |   // preset: undefined,
 95 | 
 96 |   // Run tests from one or more projects
 97 |   // projects: undefined,
 98 | 
 99 |   // Use this configuration option to add custom reporters to Jest
100 |   // reporters: undefined,
101 | 
102 |   // Automatically reset mock state between every test
103 |   // resetMocks: false,
104 | 
105 |   // Reset the module registry before running each individual test
106 |   // resetModules: false,
107 | 
108 |   // A path to a custom resolver
109 |   // resolver: undefined,
110 | 
111 |   // Automatically restore mock state between every test
112 |   // restoreMocks: false,
113 | 
114 |   // The root directory that Jest should scan for tests and modules within
115 |   // rootDir: undefined,
116 | 
117 |   // A list of paths to directories that Jest should use to search for files in
118 |   // roots: [
119 |   //   "<rootDir>"
120 |   // ],
121 | 
122 |   // Allows you to use a custom runner instead of Jest's default test runner
123 |   // runner: "jest-runner",
124 | 
125 |   // The paths to modules that run some code to configure or set up the testing environment before each test
126 |   // setupFiles: [],
127 | 
128 |   // A list of paths to modules that run some code to configure or set up the testing framework before each test
129 |   // setupFilesAfterEnv: [],
130 | 
131 |   // The number of seconds after which a test is considered as slow and reported as such in the results.
132 |   // slowTestThreshold: 5,
133 | 
134 |   // A list of paths to snapshot serializer modules Jest should use for snapshot testing
135 |   // snapshotSerializers: [],
136 | 
137 |   // The test environment that will be used for testing
138 |   testEnvironment: "node",
139 | 
140 |   // Options that will be passed to the testEnvironment
141 |   // testEnvironmentOptions: {},
142 | 
143 |   // Adds a location field to test results
144 |   // testLocationInResults: false,
145 | 
146 |   // The glob patterns Jest uses to detect test files
147 |   testMatch: [
148 |     "**/__tests__/**/*.+(ts|tsx|js)",
149 |     "**/?(*.)+(spec|test).+(ts|tsx|js)"
150 |   ],
151 | 
152 |   // An array of regexp pattern strings that are matched against all test paths, matched tests are skipped
153 |   testPathIgnorePatterns: [
154 |     "/node_modules/",
155 |     "/dist/"
156 |   ],
157 | 
158 |   // The regexp pattern or array of patterns that Jest uses to detect test files
159 |   // testRegex: [],
160 | 
161 |   // This option allows the use of a custom results processor
162 |   // testResultsProcessor: undefined,
163 | 
164 |   // This option allows use of a custom test runner
165 |   // testRunner: "jasmine2",
166 | 
167 |   // This option sets the URL for the jsdom environment. It is reflected in properties such as location.href
168 |   // testURL: "http://localhost",
169 | 
170 |   // Setting this value to "fake" allows the use of fake timers for functions such as "setTimeout"
171 |   // timers: "real",
172 | 
173 |   // A map from regular expressions to paths to transformers
174 |   transform: {
175 |     "^.+\\.(ts|tsx)$": "ts-jest"
176 |   },
177 | 
178 |   // An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation
179 |   // transformIgnorePatterns: [
180 |   //   "/node_modules/",
181 |   //   "\\.pnp\\.[^\\/]+$"
182 |   // ],
183 | 
184 |   // An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them
185 |   // unmockedModulePathPatterns: undefined,
186 | 
187 |   // Indicates whether each individual test should be reported during the run
188 |   // verbose: undefined,
189 | 
190 |   // An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode
191 |   // watchPathIgnorePatterns: [],
192 | 
193 |   // Whether to use watchman for file crawling
194 |   // watchman: true,
195 | };
196 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "scrape-them-all",
 3 |   "description": "🚀 An easy-to-handle Node.js scraper that allows you to scrape them all in record time.",
 4 |   "version": "1.0.4",
 5 |   "license": "MIT",
 6 |   "author": "Tanuki <tanuki.contact@gmail.com> (https://github.com/tanukijs)",
 7 |   "contributors": [
 8 |     "Aperrix <aperrix@gmail.com> (https://github.com/Aperrix)"
 9 |   ],
10 |   "repository": "https://github.com/tanukijs/scrape-them-all.git",
11 |   "source": "src/index.ts",
12 |   "main": "dist/index.js",
13 |   "types": "dist/index.d.ts",
14 |   "engines": {
15 |     "node": ">=10"
16 |   },
17 |   "keywords": [
18 |     "scraper",
19 |     "node-scraper",
20 |     "scraping",
21 |     "web-scraping",
22 |     "htmlparser",
23 |     "parser",
24 |     "parsing",
25 |     "crawler",
26 |     "crawling"
27 |   ],
28 |   "scripts": {
29 |     "lint": "eslint --ext .ts --ignore-path .gitignore .",
30 |     "lintfix": "eslint --fix --ext .ts --ignore-path .gitignore .",
31 |     "build": "rimraf dist && tsc -p tsconfig.json",
32 |     "test": "jest",
33 |     "prepare": "npm run build && npm run test"
34 |   },
35 |   "dependencies": {
36 |     "cheerio": "^1.0.0-rc.3",
37 |     "node-fetch": "^2.6.1"
38 |   },
39 |   "devDependencies": {
40 |     "@types/cheerio": "^0.22.22",
41 |     "@types/jest": "^26.0.14",
42 |     "@types/node": "^14.11.8",
43 |     "@types/node-fetch": "^2.5.7",
44 |     "@typescript-eslint/eslint-plugin": "^4.4.1",
45 |     "@typescript-eslint/parser": "^4.4.1",
46 |     "eslint": "^7.11.0",
47 |     "eslint-config-prettier": "^6.12.0",
48 |     "eslint-config-standard": "^14.1.1",
49 |     "eslint-plugin-import": "^2.22.1",
50 |     "eslint-plugin-node": "^11.1.0",
51 |     "eslint-plugin-prettier": "^3.1.4",
52 |     "eslint-plugin-promise": "^4.2.1",
53 |     "eslint-plugin-standard": "^4.0.1",
54 |     "fetch-cookie": "^0.10.1",
55 |     "jest": "^26.5.3",
56 |     "prettier": "^2.1.2",
57 |     "rimraf": "^3.0.2",
58 |     "ts-jest": "^26.4.1",
59 |     "tslib": "^2.0.3",
60 |     "typescript": "^4.0.3"
61 |   }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/DataModeler.ts:
--------------------------------------------------------------------------------
  1 | import cheerio from 'cheerio'
  2 | import { EOptionType, SchemeInterpreter } from './SchemeInterpreter'
  3 | 
  4 | export class DataModeler {
  5 |   private $root: cheerio.Root // handle returned by cheerio.load for the given body
  6 | 
  7 |   constructor(body: string) {
  8 |     this.$root = cheerio.load(body) // parse once; document-wide lookups reuse it
  9 |   }
 10 | 
 11 |   /**
 12 |    * Build the result object described by `opts`, one entry per child scheme
 13 |    *
 14 |    * @param {SchemeInterpreter} opts Must be of type OBJECT, otherwise an Error is thrown
 15 |    * @param {cheerio.Cheerio} [context] Selection to search in instead of the whole document
 16 |    *
 17 |    * @returns {Promise<Record<string, unknown>>} Extracted values keyed like `opts.children`
 18 |    */
 19 |   async generate(
 20 |     opts: SchemeInterpreter,
 21 |     context?: cheerio.Cheerio
 22 |   ): Promise<Record<string, unknown>> {
 23 |     if (opts.type !== EOptionType.OBJECT)
 24 |       throw new Error('Root object must be a nested object.')
 25 | 
 26 |     const mappedResult: Record<string, unknown> = {}
 27 |     for (const key in opts.children) {
 28 |       const value = new SchemeInterpreter(opts.children[key])
 29 |       // Search inside the context when given, otherwise query the whole document
 30 |       const cheerioRoot =
 31 |         context && value.selector
 32 |           ? context.find(value.selector)
 33 |           : context || this.$root(value.selector)
 34 | 
 35 |       if (value.type === EOptionType.OBJECT) {
 36 |         mappedResult[key] = await this.generate(value, cheerioRoot)
 37 |         continue
 38 |       }
 39 |       // Object arrays come back as an array of promises, hence the Promise.all below
 40 |       const result =
 41 |         value.type === EOptionType.VALUE
 42 |           ? this.processValue(cheerioRoot, value)
 43 |           : value.type === EOptionType.ARRAY
 44 |           ? this.processArray(cheerioRoot, value.listModel as SchemeInterpreter)
 45 |           : value.type === EOptionType.OBJECT_ARRAY
 46 |           ? this.processObjectArray(cheerioRoot, value.listModel as SchemeInterpreter)
 47 |           : undefined
 48 |       mappedResult[key] = await (Array.isArray(result) ? Promise.all(result) : result)
 49 |     }
 50 | 
 51 |     return mappedResult
 52 |   }
 53 | 
 54 |   /**
 55 |    * Read one value from a selection: accessor, then attribute, trim and transformer
 56 |    *
 57 |    * @param {cheerio.Cheerio} element Selection the value is read from
 58 |    * @param {SchemeInterpreter} opts Supplies accessor, attribute, trim and transformer
 59 |    *
 60 |    * @returns {unknown} The value left after every applicable step
 61 |    */
 62 |   private processValue(element: cheerio.Cheerio, opts: SchemeInterpreter): unknown {
 63 |     let value =
 64 |       typeof opts.accessor === 'function'
 65 |         ? opts.accessor(element)
 66 |         : typeof element[opts.accessor] === 'function'
 67 |         ? element[opts.accessor]()
 68 |         : null // the accessor does not name a method of the selection
 69 | 
 70 |     if (opts.attribute) value = element.attr(opts.attribute) // overrides the accessor
 71 |     if (opts.trim && value && typeof value === 'string') value = value.trim()
 72 |     if (opts.transformer) value = opts.transformer(value)
 73 | 
 74 |     return value
 75 |   }
 76 | 
 77 |   /**
 78 |    * Collect one value per descendant of `element` matching the list model's selector
 79 |    *
 80 |    * @param {cheerio.Cheerio} element Selection whose descendants are searched
 81 |    * @param {SchemeInterpreter} listModel Selector and value options applied to each match
 82 |    *
 83 |    * @returns {unknown[]} Values in match order
 84 |    */
 85 |   private processArray(
 86 |     element: cheerio.Cheerio,
 87 |     listModel: SchemeInterpreter
 88 |   ): unknown[] {
 89 |     const values: unknown[] = []
 90 |     const children = element.find(listModel.selector)
 91 |     for (let i = 0; i < children.length; i++) {
 92 |       const value = this.processValue(children.eq(i), listModel)
 93 |       values.push(value)
 94 |     }
 95 |     return values
 96 |   }
 97 | 
 98 |   /**
 99 |    * Run `generate` once per element of the selection, each element acting as the context
100 |    *
101 |    * @param {cheerio.Cheerio} element Selection holding one node per list entry
102 |    * @param {SchemeInterpreter} listModel Object scheme applied to every entry
103 |    *
104 |    * @returns {Promise<Record<string, unknown>>[]} Pending results, not awaited here
105 |    */
106 |   private processObjectArray(
107 |     element: cheerio.Cheerio,
108 |     listModel: SchemeInterpreter
109 |   ): Promise<Record<string, unknown>>[] {
110 |     const values: Promise<Record<string, unknown>>[] = []
111 |     for (let i = 0; i < element.length; i++) {
112 |       const value = this.generate(listModel, element.eq(i))
113 |       values.push(value)
114 |     }
115 |     return values
116 |   }
117 | }
118 | 


--------------------------------------------------------------------------------
/src/SchemeInterpreter.ts:
--------------------------------------------------------------------------------
 1 | import { ScrapeTAScheme } from './typings'
 2 | 
 3 | export const enum EOptionType {
 4 |   VALUE = 0, // single value: no nested keys, no listModel
 5 |   ARRAY = 1, // listModel without nested keys
 6 |   OBJECT = 2, // nested result keys present
 7 |   OBJECT_ARRAY = 3 // listModel that itself has nested keys
 8 | }
 9 | 
10 | export class SchemeInterpreter {
11 |   readonly selector: string = ''
12 |   readonly trim: boolean = true
13 |   readonly accessor: string | ((node: cheerio.Cheerio) => unknown) = 'text'
14 |   readonly attribute?: string
15 |   readonly transformer?: (value: string) => unknown
16 |   readonly listModel?: string | ScrapeTAScheme | SchemeInterpreter
17 |   readonly children: Record<string, string | ScrapeTAScheme> = {}
18 |   /** A string is taken as the selector; an object sets options and nested result keys */
19 |   constructor(opts: string | Partial<SchemeInterpreter> = '') {
20 |     if (typeof opts === 'string') {
21 |       this.selector = opts
22 |     } else {
23 |       this.selector = opts.selector || ''
24 |       this.trim = opts.trim == null ? true : opts.trim // keep an explicit `false`
25 |       this.accessor = opts.accessor || 'text'
26 |       this.attribute = opts.attribute
27 |       this.transformer = opts.transformer
28 |       this.listModel = opts.listModel ? new SchemeInterpreter(opts.listModel) : undefined
29 |       // Every other key is a nested result key; a leading "_" frees a reserved name
30 |       const reservedKeys = Object.keys(this)
31 |       for (const key in opts) {
32 |         const normalizedKey = key[0] === '_' ? key.slice(1) : key
33 |         if (reservedKeys.includes(key) && normalizedKey === key) continue
34 |         this.children[normalizedKey] = opts[key]
35 |       }
36 |     }
37 |     this.validate()
38 |   }
39 | 
40 |   /**
41 |    * Kind of result this scheme describes, derived from `children` and `listModel`
42 |    *
43 |    * Children mean OBJECT; otherwise a listModel means ARRAY or OBJECT_ARRAY, else VALUE.
44 |    * @returns {EOptionType}
45 |    */
46 |   public get type(): EOptionType {
47 |     if (Object.keys(this.children).length > 0) return EOptionType.OBJECT
48 |     if (!this.listModel) return EOptionType.VALUE
49 | 
50 |     if (
51 |       this.listModel instanceof SchemeInterpreter &&
52 |       Object.keys(this.listModel.children).length > 0
53 |     )
54 |       return EOptionType.OBJECT_ARRAY
55 |     return EOptionType.ARRAY
56 |   }
57 | 
58 |   /**
59 |    * Check the `typeof` of every reserved option that is set; falsy values are skipped
60 |    *
61 |    * @returns {void}
62 |    * @throws {Error} When an option holds a value of an unexpected type
63 |    */
64 |   public validate(): void {
65 |     const expected = [
66 |       { property: 'selector', equalsTo: ['string'] },
67 |       { property: 'trim', equalsTo: ['boolean'] },
68 |       { property: 'accessor', equalsTo: ['string', 'function'] },
69 |       { property: 'attribute', equalsTo: ['string'] },
70 |       { property: 'transformer', equalsTo: ['function'] },
71 |       { property: 'listModel', equalsTo: ['string', 'object'] }
72 |     ]
73 | 
74 |     for (const { property, equalsTo } of expected) {
75 |       if (!this[property]) continue
76 |       const actualType = typeof this[property]
77 |       if (equalsTo.includes(actualType)) continue
78 |       const errorTypes = equalsTo.join(' or a ')
79 |       const errorMessage = [
80 |         `The property "${property}" expects a ${errorTypes}.`,
81 |         `If you want to use "${property}" as a result key, prefix it with an underscore (the first will be stripped automatically).`
82 |       ].join(' ')
83 |       throw new Error(errorMessage)
84 |     }
85 |   }
86 | }
87 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
 1 | import nodeFetch, { RequestInfo, RequestInit } from 'node-fetch'
 2 | import { SchemeInterpreter } from './SchemeInterpreter'
 3 | import { DataModeler } from './DataModeler'
 4 | import {
 5 |   ScrapeTAExtraParams,
 6 |   ScrapeTARequest,
 7 |   ScrapeTAScheme,
 8 |   ScrapeTAResult
 9 | } from './typings'
10 | 
11 | /**
12 |  * Wrap node-fetch with the optional `fetch-cookie` package so cookies are managed
13 |  * @param {ScrapeTAExtraParams} query Its `cookieJar` is forwarded unless it is a boolean
14 |  * @returns {Promise<typeof nodeFetch>} Rejects with an install hint if anything here throws
15 |  */
16 | async function withCookies(query: ScrapeTAExtraParams): Promise<typeof nodeFetch> {
17 |   try {
18 |     const wrapFetch = (await import('fetch-cookie/node-fetch')).default
19 |     let jar = query.cookieJar
20 |     if (typeof jar === 'boolean') jar = undefined // a boolean is a flag, not a jar instance
21 |     return wrapFetch(nodeFetch, jar) as typeof nodeFetch
22 |   } catch (e) {
23 |     throw new Error('Please run `npm install fetch-cookie` to use the cookieJar option.')
24 |   }
25 | }
26 | 
27 | /**
28 |  * Fetch a page and map its HTML onto the shape described by `scheme`
29 |  *
30 |  * @param {ScrapeTARequest} request URL-like value, or fetch options carrying a `url`
31 |  * @param {ScrapeTAScheme} scheme Description of the data to extract
32 |  *
33 |  * @returns {Promise<ScrapeTAResult>} The raw response alongside the extracted data
34 |  */
35 | export async function scrapeTA(
36 |   request: ScrapeTARequest,
37 |   scheme: ScrapeTAScheme
38 | ): Promise<ScrapeTAResult> {
39 |   // Plain node-fetch unless the request carries a truthy cookieJar
40 |   let fetch = nodeFetch
41 |   if (typeof request === 'object' && 'cookieJar' in request && request.cookieJar)
42 |     fetch = await withCookies(request)
43 | 
44 |   // Any object request is also passed along as the fetch init; strings get none
45 |   const target = ((typeof request === 'object' && 'url' in request && request.url) ||
46 |     request) as RequestInfo
47 |   const init = typeof request === 'object' ? (request as RequestInit) : undefined
48 |   const response = await fetch(target, init)
49 |   const markup = await response.text()
50 |   const data = await new DataModeler(markup).generate(new SchemeInterpreter(scheme))
51 |   return { response, data }
52 | }
53 | 
54 | export * from './typings'
55 | 


--------------------------------------------------------------------------------
/src/typings.ts:
--------------------------------------------------------------------------------
 1 | import { RequestInfo, RequestInit, Response } from 'node-fetch'
 2 | import { CookieJar } from 'fetch-cookie'
 3 | import { SchemeInterpreter } from './SchemeInterpreter'
 4 | 
 5 | export type ScrapeTAExtraParams = {
 6 |   url: RequestInfo // used by scrapeTA as the first argument of fetch
 7 |   cookieJar?: boolean | CookieJar // truthy switches to fetch-cookie; a jar is forwarded
 8 | }
 9 | // Either a bare fetch target or fetch options extended with the extras above
10 | export type ScrapeTARequest = RequestInfo | (ScrapeTAExtraParams & RequestInit)
11 | // What scrapeTA resolves with: the fetch response and the extracted data
12 | export type ScrapeTAResult = {
13 |   response: Response
14 |   data: Record<string, unknown>
15 | }
16 | // Options a scheme entry may set, picked from SchemeInterpreter's fields
17 | type TSchemeInterpreter = Partial<
18 |   Pick<
19 |     SchemeInterpreter,
20 |     'selector' | 'accessor' | 'attribute' | 'trim' | 'listModel' | 'transformer'
21 |   >
22 | >
23 | // A scheme maps result keys to a selector string, an options object or a nested scheme
24 | export type ScrapeTAScheme = {
25 |   [key: string]: string | TSchemeInterpreter | ScrapeTAScheme
26 | }
27 | 


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "compilerOptions": {
 3 |     /* Basic Options */
 4 |     "target": "ESNext",
 5 |     "module": "CommonJS",
 6 |     "lib": ["ESNext", "DOM"],
 7 |     "allowJs": true,
 8 |     "checkJs": true,
 9 |     "declaration": true,
10 |     "declarationMap": false,
11 |     "sourceMap": false,
12 |     "outDir": "dist",
13 |     "rootDir": "src",
14 |     "removeComments": true,
15 |     "importHelpers": true,
16 |     "downlevelIteration": true,
17 |     "esModuleInterop": true,
18 | 
19 |     /* Strict Type-Checking Options */
20 |     "strict": true,
21 |     "noImplicitAny": true,
22 |     "strictNullChecks": true,
23 |     "strictFunctionTypes": true,
24 |     "strictBindCallApply": true,
25 |     "strictPropertyInitialization": true,
26 |     "noImplicitThis": true,
27 |     "alwaysStrict": true,
28 | 
29 |     /* Additional Checks */
30 |     "noUnusedLocals": true,
31 |     "noUnusedParameters": true,
32 |     "noImplicitReturns": true,
33 |     "noFallthroughCasesInSwitch": true,
34 |     "suppressImplicitAnyIndexErrors": true,
35 | 
36 |     /* Module Resolution Options */
37 |     "moduleResolution": "node",
38 |     "resolveJsonModule": true,
39 |     "baseUrl": "src",
40 |     "paths": {
41 |         "@/*": ["*"],
42 |     },
43 | 
44 |     /* Advanced Options */
45 |     "forceConsistentCasingInFileNames": true,
46 |   },
47 |   "include": ["src"],
48 |   "exclude": ["node_modules", "dist"]
49 | }
50 | 


--------------------------------------------------------------------------------