├── .actrc ├── .eslintrc.json ├── .github ├── act │ └── event.json ├── actions │ └── acceptance │ │ └── action.yml └── workflows │ ├── audit.yml │ └── test.yml ├── .gitignore ├── .nycrc.json ├── LICENSE ├── README.md ├── bin └── gsfscrape ├── changelog.md ├── cloud ├── ansible │ ├── gsf-postgresql-logs │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-postgresql-setup │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-benchmark │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-export │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-logs │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-queue │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── queue.j2 │ ├── gsf-scraper-setup │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── getsetfetch.service.j2 │ └── gsf-scraper-stats │ │ └── tasks │ │ └── main.yml └── terraform │ ├── main.tf │ ├── provider.tf │ ├── templates │ └── hosts.tpl │ ├── user_data_pg.yml │ ├── user_data_scraper.yml │ └── variables.tf ├── datasets ├── .gitignore ├── javascript-libs-from-top-1mm-sites │ ├── ansible │ │ ├── files │ │ │ └── ExtractScriptsPlugin.js │ │ ├── inventory │ │ │ └── .gitkeep │ │ ├── pg-setup.yml │ │ ├── scraper-setup.yml │ │ ├── templates │ │ │ └── js-scripts-config.json.j2 │ │ └── vault.yml │ ├── charts │ │ ├── extract │ │ │ ├── CategoryExtractor.ts │ │ │ ├── ScriptParser.ts │ │ │ └── summarize-js-libs.ts │ │ ├── most-used-js-libs-2022-06-05-thumb.png │ │ ├── most-used-js-libs-2022-06-05.csv │ │ ├── most-used-js-libs-2022-06-05.png │ │ ├── most-used-js-libs-2022-06-05.svg │ │ └── most-used-js-libs.html │ ├── exports │ │ └── .gitkeep │ ├── readme.md │ └── terraform │ │ ├── main.tf │ │ ├── provider.tf │ │ └── variables.tf ├── readme.md ├── tsconfig.datasets.json └── utils │ ├── map.ts │ └── serve-static.ts ├── development.md ├── docker ├── Dockerfile ├── data │ ├── chromium-security-profile.json │ ├── config-pg-puppeteer.json │ ├── config-sqlite-cheerio.json │ └── config-sqlite-puppeteer.json ├── docker.md └── pg-puppeteer │ └── docker-compose.yml ├── examples ├── article-excerpts │ ├── ReadabilityPlugin.ts │ ├── article-excerpts-config.json │ └── article-excerpts.ts ├── cloud │ ├── ansible │ │ ├── files │ │ │ ├── ExtractScriptsPlugin.js │ │ │ ├── gsf-config.json │ │ │ └── gsf.service │ │ ├── pg-setup.yml │ │ ├── pg-stats.yml │ │ ├── scraper-export.yml │ │ ├── scraper-logs.yml │ │ ├── scraper-setup.yml │ │ ├── scraper-systemd-logs.yml │ │ └── vars.yml │ ├── readme.md │ └── terraform │ │ ├── provider.tf │ │ ├── resource-ansible-inventory.tf │ │ ├── resource-pg.sh │ │ ├── resource-pg.tf │ │ ├── resource-scraper.sh │ │ ├── resource-scraper.tf │ │ ├── resource-vpc.tf │ │ └── templates │ │ └── hosts.tpl ├── console-content │ ├── ConsoleBrowserFetchPlugin.ts │ ├── ConsolePuppeteerClient.ts │ ├── console-content-config.json │ └── console-content.ts ├── in-memory-queue │ ├── InMemoryConnection.ts │ ├── InMemoryQueue.ts │ ├── in-memory-queue-config.json │ └── in-memory-queue.ts ├── infinite-scrolling │ ├── infinite-scrolling-config.json │ └── infinite-scrolling.ts ├── pdf-extraction │ ├── pdf-extraction-config.json │ └── pdf-extraction.ts ├── product-details │ ├── product-details-config.json │ └── product-details.ts ├── sitemap │ ├── ExtractSameHostUrlsPlugin.ts │ ├── SitemapExporter.ts │ ├── SkipExtractHtmlContentPlugin.ts │ ├── scrape-config.json │ └── sitemap.ts ├── tabular-data │ ├── 
tabular-data-config.json │ └── tabular-data.ts ├── tls-fingerprinting │ ├── RandomTlsFingerprintFetch.ts │ ├── readme.md │ ├── tls-fingerprinting-config.json │ └── tls-fingerprinting.ts └── tsconfig.examples.json ├── package-lock.json ├── package.json ├── src ├── browserclient │ ├── BrowserClient.ts │ ├── PlaywrightClient.ts │ └── PuppeteerClient.ts ├── cli │ └── cli.ts ├── confighash │ ├── config-hash.ts │ └── dictionary-v1.json ├── domclient │ ├── CheerioClient.ts │ ├── DomClient.ts │ ├── JsdomClient.ts │ ├── NativeClient.ts │ └── client-utils.ts ├── export │ ├── CsvExporter.ts │ ├── Exporter.ts │ ├── MimeTypes.json │ └── ZipExporter.ts ├── index.ts ├── logger │ └── Logger.ts ├── pipelines │ ├── BrowserStaticContentPipeline.ts │ ├── DomStaticContentPipeline.ts │ └── pipelines.ts ├── plugins │ ├── Plugin.ts │ ├── default │ │ ├── BaseFetchPlugin.ts │ │ ├── BrowserFetchPlugin.ts │ │ ├── ExtractHtmlContentPlugin.ts │ │ ├── ExtractUrlsPlugin.ts │ │ ├── InsertResourcesPlugin.ts │ │ ├── NodeFetchPlugin.ts │ │ ├── ScrollPlugin.ts │ │ └── UpsertResourcePlugin.ts │ ├── dom-utils.ts │ ├── file-utils.ts │ └── url-utils.ts ├── pluginstore │ └── PluginStore.ts ├── schema │ └── SchemaHelper.ts ├── scraper │ ├── ConcurrencyManager.ts │ ├── QueueBuffer.ts │ ├── RuntimeMetrics.ts │ └── Scraper.ts └── storage │ ├── ConnectionManager.ts │ ├── base │ ├── Connection.ts │ ├── Entity.ts │ ├── Project.ts │ ├── Queue.ts │ ├── Resource.ts │ └── Storage.ts │ └── knex │ ├── KnexConnection.ts │ ├── KnexProject.ts │ ├── KnexQueue.ts │ ├── KnexResource.ts │ └── KnexStorage.ts ├── test ├── .mocharc.js ├── acceptance │ ├── acceptance-suite.ts │ ├── cheerio.ts │ ├── cli │ │ ├── config │ │ │ ├── config-single-page-single-content-entry-custom-plugin.json │ │ │ ├── config-single-page-single-content-entry.json │ │ │ ├── config-with-external-resources.json │ │ │ └── config-with-invalid-external-resources.json │ │ ├── plugins │ │ │ └── h1-counter-plugin.js │ │ ├── resources │ │ │ ├── resources-single-entry.csv │ │ │ ├── resources.csv │ │ │ └── unnormalized-resources.csv │ │ └── test-cli.ts │ ├── docker │ │ ├── config │ │ │ └── base-config.json │ │ └── test-docker.ts │ ├── jsdom.ts │ ├── playwright_chromium.ts │ └── puppeteer_chromium.ts ├── config │ ├── browserclient │ │ ├── playwright │ │ │ └── playwright-chromium.json │ │ └── puppeteer │ │ │ └── puppeteer-chromium.json │ └── storage │ │ ├── mysql │ │ ├── mysql-conn.json │ │ └── mysql.yml │ │ ├── pg │ │ ├── pg-conn.json │ │ └── pg.yml │ │ └── sqlite │ │ └── sqlite-conn.json ├── tmp │ └── .gitkeep ├── tsconfig.test.json ├── unit │ ├── confighash │ │ └── test-config-hash.ts │ ├── domclients │ │ ├── test-cheerio-client.ts │ │ └── test-jsdom-client.ts │ ├── exporter │ │ ├── test-csv-exporter.ts │ │ └── test-zip-exporter.ts │ ├── logwrapper │ │ └── test-log-wrapper.ts │ ├── pipelines │ │ └── test-merge-plugin-opts.ts │ ├── plugins │ │ ├── test-browser-fetch-plugin.ts │ │ ├── test-dom-utils.ts │ │ ├── test-extract-html-content-plugin.ts │ │ ├── test-extract-urls-plugin.ts │ │ ├── test-insert-resources-plugin.ts │ │ ├── test-node-fetch-plugin.ts │ │ ├── test-scroll-plugin.ts │ │ ├── test-upsert-resource-plugin.ts │ │ └── test-url-utils.ts │ ├── pluginstore │ │ ├── input-cjs-js │ │ │ ├── BaseJs.js │ │ │ ├── Extended.js │ │ │ ├── ExtendedDomRead.js │ │ │ └── expected-extended-dom-read-bundle.txt │ │ ├── input-esm-js │ │ │ ├── BaseJs.js │ │ │ ├── ExtendedDomRead.js │ │ │ └── expected-extended-dom-read-bundle.txt │ │ ├── input-esm-ts │ │ │ ├── BaseTs.ts │ │ │ ├── Extended.ts │ │ │ ├── 
ExtendedDomRead.ts │ │ │ └── expected-extended-dom-read-bundle.txt │ │ ├── input-mixed-esm-cjs-ts-js │ │ │ ├── BaseJs.js │ │ │ ├── BaseTs.ts │ │ │ ├── ExtendedDomRead.ts │ │ │ └── expected-extended-dom-read-bundle.txt │ │ └── test-plugin-store.ts │ ├── schema │ │ └── test-schema-helper.ts │ ├── scraper │ │ ├── test-concurrency-manager.ts │ │ ├── test-runtime-metrics.ts │ │ ├── test-scraper-concurrency-constraints.ts │ │ ├── test-scraper-discovery.ts │ │ └── test-scraper-single-project.ts │ └── storage │ │ ├── mysql-unit-suite.ts │ │ ├── pg-unit-suite.ts │ │ ├── sqlite3-unit-suite.ts │ │ ├── test-project-crud.ts │ │ ├── test-resource-crud.ts │ │ └── unit-suite.ts └── utils │ ├── shims.js │ └── ts-node-config.js ├── tsconfig.debug.json ├── tsconfig.esm.json └── tsconfig.json /.actrc: -------------------------------------------------------------------------------- 1 | -P ubuntu-latest=nektos/act-environments-ubuntu:20.04 2 | -e .github/act/event.json -------------------------------------------------------------------------------- /.github/act/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "act": true, 3 | "pull_request": { 4 | "head": { 5 | "ref": "next" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /.github/actions/acceptance/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Acceptance Tests' 2 | description: 'Acceptance Tests using dynamic inputs: storage, client, docker file, ..' 3 | inputs: 4 | storage: 5 | description: oneOf - sqlite, mysql, pg 6 | required: true 7 | storage_deps: 8 | description: storage npm dependencies 9 | required: true 10 | client: 11 | description: oneOf - cheerio, jsdom, puppeteer_chromium, playwright_chromium 12 | required: true 13 | client_deps: 14 | description: client npm dependencies 15 | required: true 16 | concurrency: 17 | description: oneOf - sequential, parallel 18 | required: true 19 | docker_file: 20 | description: if present starts/stops a corresponding docker container 21 | coveralls_token: 22 | required: true 23 | run_storage_unit_tests: 24 | description: whether or not to also run unit tests for the selected storage 25 | required: true 26 | default: false 27 | 28 | runs: 29 | using: "composite" 30 | steps: 31 | - uses: actions/checkout@v2 32 | 33 | - name: Cache node modules 34 | uses: actions/cache@v2 35 | with: 36 | # npm cache files are stored in `~/.npm` on Linux/macOS 37 | # can't cache based on package-lock.json as it doesn't contain the peerDependencies we want to cache 38 | path: ~/.npm 39 | key: ${{ runner.os }}-npm-${{ hashFiles('**/package.json') }} 40 | 41 | - name: Docker ${{ inputs.storage }} up 42 | if: ${{ inputs.docker_file }} 43 | run: /bin/sh -c 'docker_file="${{ inputs.docker_file }}"; if [ $docker_file ]; then docker-compose -f ${{ inputs.docker_file }} up -d; else echo "ignored, no docker file"; fi' 44 | 45 | - name: Setup node 16 46 | uses: actions/setup-node@v3 47 | with: 48 | node-version: 16 49 | 50 | - name: Install dependencies 51 | run: npm ci 52 | 53 | - name: Install storage peer dependencies for ${{ inputs.storage }} 54 | run: npm install ${{ inputs.storage_deps }} --save 55 | 56 | - name: Install client peer dependencies for ${{ inputs.client }} 57 | run: npm install ${{ inputs.client_deps }} --save 58 | 59 | - name: ${{ inputs.storage }} unit tests 60 | if: ${{ inputs.run_storage_unit_tests }} 61 | run: npx nyc mocha --config test/.mocharc.js 
\"test/unit/storage/${{ inputs.storage }}-unit-suite.ts\" 62 | 63 | - name: Coveralls for unit tests 64 | if: ${{ inputs.run_storage_unit_tests }} 65 | uses: coverallsapp/github-action@master 66 | with: 67 | github-token: ${{ inputs.coveralls_token }} 68 | flag-name: unit - ${{ inputs.storage }} 69 | parallel: true 70 | 71 | - name: Acceptance tests - ${{ inputs.client }} - ${{ inputs.storage }} - ${{ inputs.concurrency }} 72 | run: | 73 | npx nyc --exclude=**/BrowserFetchPlugin.ts --exclude=**/utils.ts mocha --config test/.mocharc.js test/acceptance/${{ inputs.client }}.ts --grep '${{ inputs.storage }} - concurrency: ${{ inputs.concurrency }}'", 74 | 75 | - name: Coveralls for acceptance tests 76 | uses: coverallsapp/github-action@master 77 | with: 78 | github-token: ${{ inputs.coveralls_token }} 79 | flag-name: acceptance - ${{ inputs.storage }} - ${{ inputs.client }} - ${{ inputs.concurrency }} 80 | parallel: true 81 | 82 | - name: Docker ${{ inputs.storage }} down 83 | if: ${{ inputs.docker_file }} 84 | run: /bin/sh -c 'docker_file="${{ inputs.docker_file }}"; if [ $docker_file ]; then docker-compose -f ${{ inputs.docker_file }} down; else echo "ignored, no docker file"; fi' 85 | 86 | -------------------------------------------------------------------------------- /.github/workflows/audit.yml: -------------------------------------------------------------------------------- 1 | name: audit 2 | 3 | on: 4 | push: 5 | branches: [ main, next ] 6 | pull_request: 7 | branches: [ main, next ] 8 | 9 | jobs: 10 | audit: 11 | if: ${{ !github.event.act }} 12 | runs-on: ubuntu-18.04 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Setup node 16 17 | uses: actions/setup-node@v3 18 | with: 19 | node-version: 16 20 | - name: NPM Audit 21 | run: npm audit --audit-level=high -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build 
output 79 | .next 80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 88 | # https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | 106 | # notes 107 | *.txt 108 | 109 | # typescript outdir 110 | .dist 111 | 112 | # examples data 113 | examples/**/*.csv 114 | examples/**/*.zip 115 | examples/**/*.sqlite 116 | examples/**/*.log 117 | !examples/*.gitignore 118 | 119 | # test tmp dir 120 | test/tmp 121 | !test/tmp/*.gitignore 122 | 123 | # vscode settings 124 | .vscode 125 | 126 | # act workflows 127 | workflow 128 | 129 | .terraform 130 | *.tfstate* 131 | *.hcl 132 | 133 | .private 134 | 135 | majestic-million* 136 | -------------------------------------------------------------------------------- /.nycrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "reporter": ["lcov"] 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 get-set-fetch 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bin/gsfscrape: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const { default: cli } = require('../dist/cjs/cli/cli.js'); 4 | cli(process.argv); -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-logs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: "Retrieve systemd log messages since last boot" 3 | ansible.builtin.shell: "journalctl -u postgresql -b" 4 | register: journalctl 5 | 6 | - name: "Retrieve service status" 7 | ansible.builtin.shell: systemctl status postgresql 8 | register: systemctl 9 | 10 | - name: "Copy output to local file" 11 | delegate_to: localhost 12 | ansible.builtin.copy: 13 | dest: "{{export_dir}}/pg-{{ inventory_hostname }}-systemd.log" 14 | content: "{{ systemctl.stdout }}\n\n{{ journalctl.stdout }}" 15 | 16 | - name: "Copy output to local file" 17 | ansible.builtin.fetch: 18 | src: "/var/log/postgresql/postgresql-14-main.log" 19 | dest: "{{export_dir}}/pg-{{ ansible_host }}-main.log" 20 | validate_checksum: false # it keeps changing as resources are scraped 21 | flat: true 22 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-setup/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # pg tuning for 4 vCPU, 8 GB RAM, using pgtune as base config 3 | pg_config: 4 | max_connections: 100 5 | shared_buffers: 2GB 6 | effective_cache_size: 6GB 7 | maintenance_work_mem: 512MB 8 | checkpoint_completion_target: 0.9 9 | wal_buffers: 16MB 10 | default_statistics_target: 100 11 | random_page_cost: 1.1 12 | effective_io_concurrency: 200 13 | work_mem: 10485kB 14 | min_wal_size: 1GB 15 | max_wal_size: 4GB 16 | max_worker_processes: 4 17 | max_parallel_workers_per_gather: 2 18 | max_parallel_workers: 4 19 | max_parallel_maintenance_workers: 2 20 | 21 | # log sql statements duration 22 | # log_destination: stderr 23 | # log_min_duration_statement: 20 24 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-setup/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart postgres 3 | service: name=postgresql state=restarted 4 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-setup/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Load defaults 3 | include_vars: 4 | file: './defaults/main.yml' 5 | name: defaults 6 | 7 | - name: In-place merge of input dicts (pg_config) with the default ones 8 | set_fact: 9 | pg_config: "{{ defaults.pg_config | combine(pg_config, recursive=True) }}" 10 | 11 | - name: "Create database" 12 | postgresql_db: 13 | state: present 14 | name: "{{ db.name }}" 15 | become: true 16 | become_user: postgres 17 | 18 | - name: "Create user" 19 | postgresql_user: 20 | state: present 21 | name: "{{ db.user }}" 22 | password: "{{ db.password }}" 23 | become: true 24 | become_user: postgres 25 | 26 | - name: "Grant user access to database" 27 | postgresql_privs: 28 | type: database 29 | database: "{{ db.name }}" 30 | roles: "{{ db.user }}" 31 | grant_option: false 32 | privs: all 33 | become: true 34 | become_user: 
postgres 35 | 36 | - name: "Allow remote connections on private network" 37 | postgresql_set: 38 | name: listen_addresses 39 | value: 'localhost, {{ private_ip_address }}' 40 | become: true 41 | become_user: postgres 42 | notify: restart postgres 43 | 44 | - name: "Allow md5 connection for user" 45 | postgresql_pg_hba: 46 | dest: /etc/postgresql/14/main/pg_hba.conf 47 | contype: host 48 | address: all 49 | databases: all 50 | method: md5 51 | users: "{{ db.user }}" 52 | create: true 53 | become: true 54 | become_user: postgres 55 | notify: restart postgres 56 | 57 | - name: "Tunning server for 8GB RAM" 58 | postgresql_set: 59 | name: "{{ item.key }}" 60 | value: "{{ item.value }}" 61 | become: true 62 | become_user: postgres 63 | notify: restart postgres 64 | with_dict: "{{ pg_config }}" 65 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-benchmark/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: "sql: project id" 2 | postgresql_query: 3 | db: "{{ db_name }}" 4 | login_user: "{{ db_user }}" 5 | login_password: "{{ db_password }}" 6 | login_host: "localhost" 7 | query: select id from "projects" where name = %(project_name)s; 8 | named_args: 9 | project_name: "{{ project_name }}" 10 | register: sql_project_id 11 | 12 | - name: "sql: group and sort asc scrapedAt timestamps" 13 | postgresql_query: 14 | db: "{{ db_name }}" 15 | login_user: "{{ db_user }}" 16 | login_password: "{{ db_password }}" 17 | login_host: "localhost" 18 | query: > 19 | select count(*), truncated_time from ( 20 | select date_trunc('minute', "scrapedAt") as truncated_time from "{{ sql_project_id.query_result[0].id }}-resources" order by truncated_time 21 | ) as truncated group by truncated_time; 22 | register: sql_group_status 23 | 24 | - name: "generate csv rows" 25 | set_fact: 26 | csv_rows: | 27 | {% for entry in (sql_group_status.query_result) %} 28 | {{ entry.truncated_time }},{{ entry.count }} 29 | {% endfor %} 30 | delegate_to: localhost 31 | 32 | - name: write to file 33 | delegate_to: localhost 34 | copy: 35 | content: "{{ csv_rows }}" 36 | dest: "{{ export_file }}" 37 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-export/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: "export results as csv" 2 | ansible.builtin.shell: gsfscrape --config gsf-config.json --loglevel {{log_level}} --logdestination {{log_destination}} --export {{ export_file | basename }} 3 | args: 4 | chdir: "{{ work_dir }}" 5 | 6 | - name: fetch results 7 | ansible.builtin.fetch: 8 | src: "{{ work_dir }}/{{ export_file | basename }}" 9 | dest: "{{ export_file }}" 10 | flat: true 11 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-logs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Fetch scrape log 3 | ansible.builtin.fetch: 4 | src: "{{ work_dir }}/{{ log_destination }}" 5 | dest: "{{export_dir}}/{{ ansible_host }}-scrape.log" 6 | validate_checksum: false # it keeps changing as resources are scraped 7 | flat: true 8 | 9 | - name: "Retrieve getsetfetch.service status" 10 | ansible.builtin.shell: systemctl status getsetfetch.service 11 | register: systemctl 12 | 13 | - name: "Copy getsetfetch.service status output to local file" 14 | delegate_to: localhost 15 | ansible.builtin.copy: 
16 | dest: "{{export_dir}}/{{ inventory_hostname }}-systemd-status.log" 17 | # content: "{{ systemctl.stdout }}\n\n{{ journalctl.stdout }}" 18 | content: "{{ systemctl.stdout }}" 19 | 20 | - name: Fetch getsetfetch.service output and error logs 21 | ansible.builtin.fetch: 22 | src: "{{ work_dir }}/{{item}}.log" 23 | dest: "{{export_dir}}/{{ ansible_host }}-systemd-{{item}}.log" 24 | validate_checksum: false # it keeps changing as resources are scraped 25 | flat: true 26 | with_items: 27 | - output 28 | - error -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-queue/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: "sql: project id" 2 | postgresql_query: 3 | db: "{{ db_name }}" 4 | login_user: "{{ db_user }}" 5 | login_password: "{{ db_password }}" 6 | login_host: "localhost" 7 | query: select id from "projects" where name = %(project_name)s; 8 | named_args: 9 | project_name: "{{ project_name }}" 10 | register: sql_project_id 11 | 12 | - name: "sql: queue filtered by status" 13 | postgresql_query: 14 | db: "{{ db_name }}" 15 | login_user: "{{ db_user }}" 16 | login_password: "{{ db_password }}" 17 | login_host: "localhost" 18 | query: > 19 | select url, status, error 20 | from "{{ sql_project_id.query_result[0].id }}-queue" 21 | where status is not null and status / 100 = %(status)s; 22 | named_args: 23 | status: "{{ status }}" 24 | register: sql_queue 25 | no_log: true 26 | 27 | - name: "stats_file: write csv header" 28 | delegate_to: localhost 29 | template: 30 | src: templates/queue.j2 31 | dest: "{{ export_file }}" 32 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-queue/templates/queue.j2: -------------------------------------------------------------------------------- 1 | url,status,error 2 | {% for item in sql_queue.query_result %} 3 | {{ item.url }},{{ item.status }},{{ item.error }} 4 | {% endfor %} -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-setup/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | db: 3 | pool: 4 | min: 1 5 | max: 5 6 | 7 | scraper: 8 | npm_install: [] 9 | # 4 (node default) + db.pool.max + files/gsf-config.json->concurrency.maxRequests 10 | uv_threadpool_size: 15 11 | work_dir: /srv/gsf 12 | log: 13 | level: info 14 | destination: scrape.log 15 | files: 16 | gsf_config: templates/gsf-config.json.j2 17 | scrape_urls: '' 18 | additional: [] 19 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-setup/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: start scraper 3 | systemd: 4 | name: getsetfetch 5 | daemon_reload: true 6 | state: started 7 | enabled: yes 8 | 9 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-setup/templates/getsetfetch.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=get-set-fetch-scraper 3 | After=network.target 4 | 5 | [Service] 6 | ExecStart=gsfscrape --config gsf-config.json {{args}} --loglevel {{scraper.log.level}} --logdestination {{scraper.log.destination}} 7 | Restart=always 8 | User=gsf 9 | Group=nogroup 10 | Environment=PATH=/usr/bin:/usr/local/bin 11 | 
Environment=UV_THREADPOOL_SIZE={{scraper.uv_threadpool_size}} 12 | WorkingDirectory={{scraper.work_dir}} 13 | StandardOutput=append:{{scraper.work_dir}}/output.log 14 | StandardError=append:{{scraper.work_dir}}/error.log 15 | 16 | [Install] 17 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /cloud/terraform/main.tf: -------------------------------------------------------------------------------- 1 | resource "digitalocean_vpc" "gsf" { 2 | name = "getsetfetch-vpc" 3 | region = var.region 4 | } 5 | 6 | resource "digitalocean_ssh_key" "gsf" { 7 | name = var.public_key_name 8 | public_key = file(var.public_key_file) 9 | } 10 | 11 | resource "digitalocean_droplet" "gsf_pg" { 12 | image = var.pg.image 13 | name = var.pg.name 14 | region = var.region 15 | size = var.pg.size 16 | monitoring = true 17 | resize_disk = false 18 | vpc_uuid = digitalocean_vpc.gsf.id 19 | 20 | ssh_keys = [ 21 | digitalocean_ssh_key.gsf.id 22 | ] 23 | 24 | user_data = file("${path.module}/user_data_pg.yml") 25 | 26 | provisioner "remote-exec" { 27 | inline = [ 28 | "cloud-init status --wait" 29 | ] 30 | 31 | connection { 32 | host = self.ipv4_address 33 | type = "ssh" 34 | user = "root" 35 | private_key = file(var.private_key_file) 36 | } 37 | } 38 | 39 | provisioner "local-exec" { 40 | command = < /etc/apt/sources.list.d/pgdg.list' 9 | - wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - 10 | - sudo apt-get update 11 | - sudo apt-get -y install postgresql -------------------------------------------------------------------------------- /cloud/terraform/user_data_scraper.yml: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | package_update: true 3 | package_upgrade: true 4 | runcmd: 5 | - 'curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -' 6 | - 'sudo apt-get install -y nodejs' -------------------------------------------------------------------------------- /cloud/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "public_key_name" { 2 | type = string 3 | } 4 | variable "public_key_file" { 5 | type = string 6 | } 7 | variable "private_key_file" { 8 | type = string 9 | } 10 | variable "ansible_inventory_file" { 11 | type = string 12 | } 13 | 14 | variable "region" { 15 | type = string 16 | } 17 | 18 | 19 | variable "pg" { 20 | type = object({ 21 | name = string 22 | image = string 23 | size = string 24 | ansible_playbook_file = string 25 | }) 26 | } 27 | 28 | variable "scraper" { 29 | type = object({ 30 | count = number 31 | name = string 32 | image = string 33 | size = string 34 | ansible_playbook_file = string 35 | }) 36 | } 37 | 38 | -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | # exports holds lots of csv, log data for each scraper node 2 | # ignore ansible csv files like majestic-million 3 | *.log 4 | *.csv 5 | *.gz 6 | 7 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/files/ExtractScriptsPlugin.js: -------------------------------------------------------------------------------- 1 | class ExtractScriptsPlugin { 2 | // defines csv export columns 3 | getContentKeys() { 4 | return [ 'scripts' ]; 5 | } 6 | 7 | test(project, resource) { 8 | if (!resource) return 
false; 9 | return (/html/i).test(resource.contentType); 10 | } 11 | 12 | apply(project, resource, DomClient) { 13 | const doc = new DomClient(resource.data); 14 | 15 | const scripts = []; 16 | Array.from(doc.querySelectorAll('script')).forEach(script => { 17 | let src = script.getAttribute('src'); 18 | let isInvalidScript; 19 | if (src) { 20 | src = src.trim(); 21 | 22 | // src may contain actual js code, or just url fragments like "http://", "//", ... 23 | isInvalidScript = src.startsWith('data:') || /function\s*\(|^(http)*:*[/\\]+$/.test(src); 24 | } 25 | else { 26 | src = ''; 27 | } 28 | 29 | if (!isInvalidScript && !scripts.includes(src)) { 30 | scripts.push(src); 31 | } 32 | }); 33 | 34 | /* 35 | a content entry is represented by an array containing one or multiple scraped values 36 | we can have multiple content entries for a single resources due to dom selectors returning multiple results 37 | */ 38 | return { content: [ scripts ] }; 39 | } 40 | } 41 | 42 | module.exports = ExtractScriptsPlugin; 43 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/inventory/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/ansible/inventory/.gitkeep -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/pg-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | vars_files: 3 | - vault.yml 4 | 5 | roles: 6 | - role: gsf-postgresql-setup 7 | vars: 8 | db: 9 | name: getsetfetch 10 | user: "{{ vault_db_user }}" 11 | password: "{{ vault_db_password }}" 12 | pg_config: 13 | max_connections: 210 # 20 scrapers * 10 max connection pool + 10 14 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/scraper-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | vars_files: 3 | - vault.yml 4 | 5 | roles: 6 | - role: gsf-scraper-setup 7 | vars: 8 | db: 9 | name: getsetfetch 10 | user: "{{ vault_db_user }}" 11 | password: "{{ vault_db_password }}" 12 | pool: 13 | min: 10 14 | max: 10 15 | scraper: 16 | uv_threadpool_size: 34 # 4 (default) + 30 (max concurrent dns.lookups) 17 | npm_install: 18 | - knex@1.0.7 19 | - pg@8.7.3 20 | - cheerio@1.0.0-rc.10 21 | - "@get-set-fetch/scraper@0.11.0" 22 | # - get-set-fetch-scraper-0.10.0.tgz 23 | log: 24 | level: info 25 | files: 26 | scrape_urls: majestic-million-compact.csv 27 | gsf_config: templates/js-scripts-config.json.j2 28 | additional: 29 | - ExtractScriptsPlugin.js -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/templates/js-scripts-config.json.j2: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "pg", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "host": "{{ db_host }}", 7 | "port": "5432", 8 | "user": "{{ db.user }}", 9 | "password": "{{ db.password }}", 10 | "database": "{{ db.name }}" 11 | }, 12 | "pool": { 13 | "min": {{ db.pool.min }}, 14 | "max": {{ db.pool.max }} 15 | }, 16 | "debug": false 17 | }, 18 | "client": { 19 | "name": "cheerio" 20 | }, 21 | "project": { 22 
| "name": "js-scripts1", 23 | "resourcePath": "", 24 | "pipeline": "dom-static-content", 25 | "pluginOpts": [ 26 | { 27 | "name": "ExtractUrlsPlugin", 28 | "maxDepth": 0 29 | }, 30 | { 31 | "name": "NodeFetchPlugin", 32 | "headers": { 33 | "Accept-Encoding": "br,gzip,deflate", 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0" 35 | }, 36 | "tlsCheck": false, 37 | "dnsResolution": "lookup" 38 | }, 39 | { 40 | "name": "ExtractScriptsPlugin", 41 | "path": "ExtractScriptsPlugin.js", 42 | "replace": "ExtractHtmlContentPlugin" 43 | } 44 | ] 45 | }, 46 | "concurrency": { 47 | "domain": { 48 | "maxRequests": 30, 49 | "delay": 50 50 | }, 51 | "proxy": { 52 | "maxRequests": 30, 53 | "delay": 50 54 | }, 55 | "session": { 56 | "maxRequests": 30, 57 | "delay": 50 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/vault.yml: -------------------------------------------------------------------------------- 1 | vault_db_user: 2 | vault_db_password: -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/extract/summarize-js-libs.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import CategoryExtractor from './CategoryExtractor'; 3 | import ScriptParser from './ScriptParser'; 4 | import { getTotals } from '../../../utils/map'; 5 | 6 | (async () => { 7 | const prefix = 'getsetfetch-dataset'; 8 | 9 | // get script data as Map> 10 | const scriptParser = new ScriptParser(); 11 | const scripts = await scriptParser.parse(`../../exports/${prefix}-javascript-libraries.csv`); 12 | 13 | // extract pathname (script name) counts with a min count of 10 14 | const { pathnameTotal } = getTotals(scripts); 15 | fs.writeFileSync( 16 | `../${prefix}-javascript-libraries-frequency-count.csv`, 17 | pathnameTotal 18 | .filter(([ script, count ]) => count >= 10) 19 | .map(([ script, count ]) => `${script},${count}`).join('\n'), 20 | ); 21 | 22 | const categoryExtractor = new CategoryExtractor(); 23 | categoryExtractor.parse(`../${prefix}-javascript-libraries-frequency-count.csv`); 24 | fs.writeFileSync('../most-used-js-libs.csv', categoryExtractor.toCsv()); 25 | })(); 26 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05-thumb.png -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05.csv: -------------------------------------------------------------------------------- 1 | category,script,value 2 | Utils,jQuery,419366 3 | Utils,jQuery Migrate,207334 4 | Utils,Google reCAPTCHA,54779 5 | Utils,slick,43628 6 | Utils,Modernizr,35714 7 | Utils,Owl Carousel,35043 8 | Utils,i18n,34633 9 | Utils,Underscore,34011 10 | Utils,Cloudflare Email Decode,29961 11 | Analytics,Google Analytics,191205 12 | Analytics,WordPress Stats and Insights,30127 13 | Analytics,Cloudflare Insights,11924 14 | Analytics,gtm4wp-form-move-tracker,395 15 | Analytics,Google Analytics For 
Wordpress,331 16 | CMS,WordPress Core,150323 17 | CMS,WordPress Contact Form 7,68478 18 | CMS,WordPress Elementor,45632 19 | CMS,WordPress WooCommerce,26727 20 | CMS,WordPress Slider Revolution,23339 21 | CMS,WordPress Utilities,16101 22 | CMS,Gravity Forms,487 23 | CMS,WordPress Visual Composer,25 24 | UI Widgets,Bootstrap,97579 25 | UI Widgets,jQuery UI,98659 26 | UI Widgets,imagesLoaded,44877 27 | UI Widgets,jQuery FitVids,19770 28 | UI Widgets,Popper,19495 29 | UI Widgets,Google Maps,19283 30 | UI Widgets,jQuery FlexSlider,18303 31 | UI Widgets,jQuery Magnific Popup,16463 32 | UI Widgets,jQuery Fancybox,15192 33 | UI Widgets,Animate on Scroll,4947 34 | Advertising,Google Adsense,52970 35 | Advertising,Google Publisher Tags,15515 36 | Cookies,jQuery Cookie,16461 37 | Cookies,Cookie Consent,10335 38 | Cookies,JavaScript Cookie,9972 39 | Cookies,OneTrust Cookies Consent,9493 40 | Cookies,consent.cookiebot.com/uc,6692 41 | Cookies,Cookie Law Info,307 42 | Cookies,Cookie Notice,248 43 | Optimization,Cloudflare Rocket Loader,13218 44 | Optimization,WordPress Autoptimize,13151 45 | Optimization,LazySizes,11695 46 | Optimization,optimize,6584 47 | Optimization,LazyLoad,5669 48 | Optimization,smush-lazy-load,127 -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05.png -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/exports/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/exports/.gitkeep -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/readme.md: -------------------------------------------------------------------------------- 1 | ### Javascript Libraries From Top 1 Million Sites 2 | 3 | CSV files available as [open access dataset](https://zenodo.org/record/6617972) 4 | - getsetfetch-dataset-javascript-libraries.csv.gz (146 MB) 5 | - Each row contains a page URL followed by script source URLs (absolute or relative) encountered in that page. Inline scripts have an \"\" value. \ 6 | ex: https:// sitemaps.org/,"\","/lang.js" 7 | 8 | - getsetfetch-dataset-javascript-libraries-frequency-count.csv.gz (214 KB) 9 | - Each row contains a partial script pathname followed by a frequency count. 10 | The pathname is split in fragments based on "/" and expanded from right to left until the first non-generic fragment is found. If the full pathname contains only generic keywords (index, main, dist, etc...) the script hostname is added as well. Common suffixes like .min, .min.js are removed. \ 11 | ex: jquery/ui/core,62554 12 | 13 | 14 | #### Get Input Data 15 | The project scrapes URLs from Majestic 1 Million (June 5th, 2022). \ 16 | Download the csv from the [official site](https://majestic.com/reports/majestic-million). \ 17 | Keep 3rd column with the domain name. Manually remove 1st row containing labels. 
18 | ```bash 19 | cd ansible/files 20 | cut -d, -f 3 downloaded-majestic-million.csv > majestic-million-compact.csv 21 | sed -i '1d' majestic-million-compact.csv 22 | ``` 23 | 24 | majestic-million-compact.csv is referenced by ansible playbook [scraper-setup.yml](ansible/scraper-setup.yml). It will be used to add the URLs to the initial scraping queue. 25 | 26 | #### Scrape in Cloud 27 | See [getsetfetch.org/blog/cloud-scraping-running-existing-projects.html](https://getsetfetch.org/blog/cloud-scraping-running-existing-projects.html) on detailed instructions on how to setup Terraform and Ansible, start scraping, monitor progress and export scraped content. 28 | 29 | The defined terraform module [main.tf](terraform/main.tf) provisions one central PostgreSQL instance and 20 scraper instances deployed on DigitalOcean Frankfurt FRA1 datacenter. 30 | 31 | ```bash 32 | terraform apply \ 33 | -var "api_token=${API_TOKEN}" \ 34 | -var "public_key_file=" \ 35 | -var "private_key_file=" \ 36 | -parallelism=30 37 | ``` 38 | 39 | #### Summarize Scraped Data 40 | ```bash 41 | cd charts/extract 42 | npx ts-node summarize-js-libs.ts 43 | ``` 44 | 45 | #### Generate Chart(s) 46 | Start a basic http server serving static files from current directory on localhost:9000. 47 | ```bash 48 | cd charts 49 | npx ts-node ../../utils/serve-static.ts 50 | ``` 51 | #### Most Used Javascript Libraries (percentage) 52 | [![Most Used Javascript Libraries](./charts/most-used-js-libs-2022-06-05-thumb.png)](./charts/most-used-js-libs-2022-06-05.svg) 53 | 54 | - http://localhost:9000/most-used-js-libs.html 55 | - filters out libraries with less than 1% usage 56 | - groups libraries into categories with each category having a maximum of 9 entries 57 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/terraform/main.tf: -------------------------------------------------------------------------------- 1 | module "js_libs" { 2 | source = "../../../cloud/terraform" 3 | 4 | region = "fra1" 5 | public_key_name = "get-set-fetch" 6 | public_key_file = var.public_key_file 7 | private_key_file = var.private_key_file 8 | ansible_inventory_file = "../ansible/inventory/hosts.cfg" 9 | 10 | pg = { 11 | name = "pg" 12 | image = "ubuntu-20-04-x64" 13 | size = "s-4vcpu-8gb" 14 | ansible_playbook_file = "../ansible/pg-setup.yml" 15 | } 16 | 17 | scraper = { 18 | count = 20 19 | name = "scraper" 20 | image = "ubuntu-20-04-x64" 21 | size = "s-1vcpu-1gb" 22 | ansible_playbook_file = "../ansible/scraper-setup.yml" 23 | } 24 | } 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | digitalocean = { 4 | source = "digitalocean/digitalocean" 5 | version = "~> 2.4" 6 | } 7 | } 8 | } 9 | 10 | provider "digitalocean" { 11 | token = var.api_token 12 | } 13 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "api_token" {} 2 | variable "public_key_file" {} 3 | variable "private_key_file" {} 4 | -------------------------------------------------------------------------------- /datasets/readme.md: 
-------------------------------------------------------------------------------- 1 | ### Datasets 2 | 3 | Each directory represents a scraping project to be run in the cloud using Terraform and Ansible. Unless otherwise specified each project defines a central PostgreSQL instance and 20 scraper instances deployed on DigitalOcean Frankfurt FRA1 datacenter. 4 | 5 | Check [getsetfetch.org/node/cloud.html](https://getsetfetch.org/node/cloud.html) for details on available Terraform modules and Ansible roles. 6 | 7 | Check [getsetfetch.org/blog/cloud-scraping-running-existing-projects.html](https://getsetfetch.org/blog/cloud-scraping-running-existing-projects.html) for detailed info on how to run the projects. 8 | 9 | Available datasets: 10 | - [Javascript Libraries From Top 1 Million Sites](javascript-libs-from-top-1mm-sites/) 11 | -------------------------------------------------------------------------------- /datasets/tsconfig.datasets.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "esnext", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "newLine": "LF", 11 | }, 12 | "include": [ 13 | "./" 14 | ], 15 | } -------------------------------------------------------------------------------- /datasets/utils/map.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/prefer-default-export */ 2 | 3 | export function getTotals(map: Map>) 4 | : {hostnameTotal:[string, number][], pathnameTotal: [string, number][]} { 5 | const hostnameTotalMap:Map = new Map(); 6 | const pathnameTotalMap:Map = new Map(); 7 | 8 | let totalScriptCount = 0; 9 | map.forEach((pathnames, hostname) => { 10 | let hostnameCount = 0; 11 | 12 | // sum pathnames (script names) across all hostnames 13 | pathnames.forEach((count, pathname) => { 14 | addToMap(pathnameTotalMap, pathname, count); 15 | hostnameCount += count; 16 | }); 17 | 18 | // record each hostname total scripts 19 | hostnameTotalMap.set(hostname, hostnameCount); 20 | totalScriptCount += hostnameCount; 21 | }); 22 | 23 | const avgScriptCount = totalScriptCount / pathnameTotalMap.size; 24 | 25 | // order descending 26 | const hostnameTotal = Array.from(hostnameTotalMap.entries()).sort((a: [string, number], b: [string, number]) => b[1] - a[1]); 27 | const pathnameTotal = Array.from(pathnameTotalMap.entries()).sort((a: [string, number], b: [string, number]) => b[1] - a[1]); 28 | 29 | return { hostnameTotal, pathnameTotal }; 30 | } 31 | 32 | export function getTopEntries(map: Map>, topHostnames:number = 20, topPathnames:number = 20) 33 | : {hostnames: string[], pathnames: string[]} { 34 | const { hostnameTotal, pathnameTotal } = getTotals(map); 35 | 36 | const hostnames:string[] = hostnameTotal.slice(0, topHostnames).map(([ key ]) => key); 37 | const pathnames:string[] = pathnameTotal.slice(0, topPathnames).map(([ key ]) => key); 38 | 39 | return { hostnames, pathnames }; 40 | } 41 | 42 | export function addToMap(map: Map, key: string, val: number = 1) { 43 | const count = map.get(key); 44 | if (!count) { 45 | map.set(key, val); 46 | } 47 | else { 48 | map.set(key, count + val); 49 | } 50 | } 51 | 52 | export function addToNestedMap(map: Map>, mainKey: string, subKey: string, val: number = 1) { 53 | const subMap = map.get(mainKey); 54 | if (!subMap) { 55 | map.set(mainKey, new Map([ [ subKey, 1 ] ])); 56 | } 57 | else 
{ 58 | addToMap(subMap, subKey, val); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /datasets/utils/serve-static.ts: -------------------------------------------------------------------------------- 1 | import http from 'http'; 2 | import url from 'url'; 3 | import fs from 'fs'; 4 | import path from 'path'; 5 | 6 | const port = process.argv[2] ? parseInt(process.argv[2], 10) : 9000; 7 | 8 | http.createServer((req, res) => { 9 | // console.log(`${req.method} ${req.url}`); 10 | 11 | // parse URL 12 | const parsedUrl = url.parse(req.url); 13 | // extract URL path 14 | let pathname = `.${parsedUrl.pathname}`; 15 | // based on the URL path, extract the file extension. e.g. .js, .doc, ... 16 | const { ext } = path.parse(pathname); 17 | // maps file extension to MIME type 18 | const map = { 19 | '.ico': 'image/x-icon', 20 | '.html': 'text/html', 21 | '.js': 'text/javascript', 22 | '.json': 'application/json', 23 | '.css': 'text/css', 24 | '.png': 'image/png', 25 | '.jpg': 'image/jpeg', 26 | '.wav': 'audio/wav', 27 | '.mp3': 'audio/mpeg', 28 | '.svg': 'image/svg+xml', 29 | '.pdf': 'application/pdf', 30 | '.doc': 'application/msword', 31 | }; 32 | 33 | fs.exists(pathname, exist => { 34 | if (!exist) { 35 | // if the file is not found, return 404 36 | res.statusCode = 404; 37 | res.end(`File ${pathname} not found!`); 38 | return; 39 | } 40 | 41 | // if it's a directory, search for an index file matching the extension 42 | if (fs.statSync(pathname).isDirectory()) pathname += `/index${ext}`; 43 | 44 | // read file from file system 45 | fs.readFile(pathname, (err, data) => { 46 | if (err) { 47 | res.statusCode = 500; 48 | res.end(`Error getting the file: ${err}.`); 49 | } 50 | else { 51 | // if the file is found, set Content-type and send data 52 | res.setHeader('Content-type', map[ext] || 'text/plain'); 53 | res.end(data); 54 | } 55 | }); 56 | }); 57 | }).listen(port); 58 | 59 | console.log(`Server listening on port ${port}`); 60 | -------------------------------------------------------------------------------- /development.md: -------------------------------------------------------------------------------- 1 | ## Debugging in VSCode 2 | 3 | ### All unit tests except command line 4 | Use VSCode's default "Run and Debug" settings. From the "Run and Debug" panel select the "Node.js" option; when the "Select Launch Configuration" command palette appears, select one of the "Run Script: test:" options. 5 | 6 | ### Command line unit tests 7 | Use the VSCode launch configuration below, modifying `args` with the command line arguments you want to debug against. 
8 | ```json 9 | { 10 | "type": "pwa-node", 11 | "request": "launch", 12 | "name": "Launch Get-Set-Fetch Cli", 13 | "program": "${workspaceFolder}/dist/cjs/cli/cli.js", 14 | "args": [ "--version"], 15 | "skipFiles": [ 16 | "/**" 17 | ], 18 | "preLaunchTask": "tsc: build - tsconfig.debug.json", 19 | "outFiles": [ 20 | "${workspaceFolder}/dist/cjs/**/*.js" 21 | ] 22 | } 23 | ``` -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.15 2 | 3 | # 1000 is the first UID assigned to a non root user (debian, ubuntu) 4 | # this can mitigate permissions issues when mapping volumes between host and container 5 | ARG USER_ID=1000 6 | ARG GROUP_ID=1000 7 | 8 | ARG STORAGE 9 | ARG BROWSER_CLIENT 10 | ARG DOM_CLIENT 11 | ARG VERSION 12 | ARG BRANCH=main 13 | 14 | # core apk packages 15 | RUN apk add --no-cache nodejs npm git 16 | 17 | # node-gyp required for some packages like @vscode/sqlite3, 18 | # remove the virtual pkg group at the end 19 | RUN apk add --no-cache --virtual .gyp g++ make py3-pip 20 | 21 | # puppeteer apk packages 22 | # install chromium (91.0.4472.164-r0) package, https://pkgs.alpinelinux.org/packages?name=chromium&branch=v3.14 23 | # puppeteer v9.1.1 works with this chromium version, https://github.com/puppeteer/puppeteer/releases 24 | RUN if [ "$BROWSER_CLIENT" = "puppeteer" ] ; then apk add --no-cache \ 25 | chromium \ 26 | nss \ 27 | freetype \ 28 | harfbuzz \ 29 | ca-certificates \ 30 | ttf-freefont; fi 31 | 32 | # puppeteer env variables 33 | # skip installing chromium, puppeteer will be using the installed package 34 | ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ 35 | PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser 36 | 37 | # add user so we don't need --no-sandbox, https://developers.google.com/web/tools/puppeteer/troubleshooting#running-on-alpine 38 | # match uid, gid coming from the host user 39 | RUN addgroup --system --gid $GROUP_ID gsfuser 40 | RUN adduser --system --uid $USER_ID --ingroup gsfuser gsfuser 41 | 42 | # run everything after as non-privileged user 43 | USER gsfuser 44 | 45 | RUN mkdir -p /home/gsfuser/Downloads /home/gsfuser/scraper 46 | 47 | # install and build get-set-fetch/scraper from github sources 48 | RUN if [ "$VERSION" = "source" ] ; then \ 49 | git clone -b "$BRANCH" --single-branch https://github.com/get-set-fetch/scraper.git /home/gsfuser/scraper \ 50 | && cd /home/gsfuser/scraper \ 51 | && npm ci \ 52 | && npm run build; fi 53 | 54 | WORKDIR /home/gsfuser/scraper 55 | 56 | # associative arrays not available in sh or ash 57 | # use some nested case statements for linking storage, browser and dom client npm packages to gsf versions 58 | RUN case "$VERSION" in \ 59 | 'source') \ 60 | case "$STORAGE" in \ 61 | 'sqlite') npm install knex@1.0.7 @vscode/sqlite3@5.0.8 ;; \ 62 | 'pg') npm install knex@1.0.7 pg@8.7.3 ;; \ 63 | 'mysql') npm install knex@1.0.7 mysql@2.18.1 ;; \ 64 | esac; \ 65 | case "$BROWSER_CLIENT" in \ 66 | 'puppeteer') npm install puppeteer@14.3.0 ;; \ 67 | 'playwright') npm install playwright-core@1.13.1 playwright-chromium@1.13.1 ;; \ 68 | esac; \ 69 | case "$DOM_CLIENT" in \ 70 | 'cheerio') npm install cheerio@1.0.0-rc.10 ;; \ 71 | 'jsdom') npm install jsdom@16.7.0 ;; \ 72 | esac \ 73 | ;; \ 74 | esac 75 | 76 | # remove node-gyp related packages and switch back to gsfuser 77 | USER root 78 | RUN apk del .gyp 79 | USER gsfuser 80 | 81 | # invoke entrypoint as exec form, gsfscrape will 
receive signals such as SIGTERM 82 | ENTRYPOINT ["/home/gsfuser/scraper/bin/gsfscrape"] 83 | 84 | # default arguments 85 | CMD [ "--version" ] -------------------------------------------------------------------------------- /docker/data/config-pg-puppeteer.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "pg", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "host": "pg", 7 | "port": "5432", 8 | "user": "gsf-user", 9 | "password": "gsf-pswd", 10 | "database": "gsf-db" 11 | }, 12 | "debug": false 13 | }, 14 | "client": { 15 | "name": "puppeteer", 16 | "opts": { 17 | "ignoreHTTPSErrors": true, 18 | "args": [ 19 | "--ignore-certificate-errors", 20 | "--no-first-run", 21 | "--single-process" 22 | ] 23 | } 24 | }, 25 | "project": { 26 | "name": "myProj", 27 | "pipeline": "browser-static-content", 28 | "pluginOpts": [ 29 | { 30 | "name": "ExtractHtmlContentPlugin", 31 | "selectorPairs": [ 32 | { 33 | "contentSelector": "h3", 34 | "label": "headline" 35 | } 36 | ] 37 | }, 38 | { 39 | "name": "InsertResourcesPlugin", 40 | "maxResources": 1 41 | }, 42 | { 43 | "name": "UpsertResourcePlugin", 44 | "keepHtmlData": true 45 | } 46 | ], 47 | "resources": [ 48 | { 49 | "url": "https://getsetfetch.org/index.html" 50 | } 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /docker/data/config-sqlite-cheerio.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "myProj", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractHtmlContentPlugin", 19 | "selectorPairs": [ 20 | { 21 | "contentSelector": "h3" 22 | } 23 | ] 24 | }, 25 | { 26 | "name": "InsertResourcesPlugin", 27 | "maxResources": 1 28 | } 29 | ], 30 | "resources": [ 31 | { 32 | "url": "https://getsetfetch.org/index.html" 33 | } 34 | ] 35 | } 36 | } -------------------------------------------------------------------------------- /docker/data/config-sqlite-puppeteer.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer", 12 | "opts": { 13 | "ignoreHTTPSErrors": true, 14 | "args": [ 15 | "--ignore-certificate-errors", 16 | "--no-first-run", 17 | "--single-process" 18 | ] 19 | } 20 | }, 21 | "project": { 22 | "name": "myProj", 23 | "pipeline": "browser-static-content", 24 | "pluginOpts": [ 25 | { 26 | "name": "ExtractHtmlContentPlugin", 27 | "selectorPairs": [ 28 | { 29 | "contentSelector": "h3" 30 | } 31 | ] 32 | }, 33 | { 34 | "name": "InsertResourcesPlugin", 35 | "maxResources": 1 36 | } 37 | ], 38 | "resources": [ 39 | { 40 | "url": "https://getsetfetch.org/index.html" 41 | } 42 | ] 43 | } 44 | } -------------------------------------------------------------------------------- /docker/docker.md: -------------------------------------------------------------------------------- 1 | For both docker build and run commands make this repo directory the current working directory. 2 | 3 | ## Build 4 | All scraper images are based on alpine:3.14 docker image. 
5 | You have to build the images locally; they're not published on Docker Hub. 6 | A set of build-time variables allows you to customize the docker image. 7 | 8 | | Build-time variable | Values | Default | 9 | | ------- | ------- | -- | 10 | | BROWSER_CLIENT | puppeteer | - | 11 | | DOM_CLIENT | cheerio, jsdom | - | 12 | | STORAGE | sqlite, pg, mysql | - | 13 | | VERSION | source | - | 14 | | USER_ID | | 1000 | 15 | | GROUP_ID | | 1000 | 16 | 17 | The `BROWSER_CLIENT` and `DOM_CLIENT` variables are mutually exclusive. You either scrape using a headless browser or an HTML/DOM parser library. 18 | 19 | `USER_ID` and `GROUP_ID` are used to add the `gsfuser` user to the container. This non-root user runs the scraper and reads and writes data under the `/home/gsfuser/scraper/data` container path mounted from the host. Use `--build-arg USER_ID=$(id -u)`, `--build-arg GROUP_ID=$(id -g)` to provide the same uid/gid as the currently logged-in user. If you're on Windows you can ignore these two variables. 20 | 21 | Create an image using cheerio, sqlite and the latest source code. 22 | ```bash 23 | docker build \ 24 | --tag getsetfetch \ 25 | --build-arg DOM_CLIENT=cheerio \ 26 | --build-arg STORAGE=sqlite \ 27 | --build-arg VERSION=source \ 28 | --build-arg USER_ID=$(id -u) \ 29 | --build-arg GROUP_ID=$(id -g) . 30 | ``` 31 | 32 | Create an image using puppeteer, sqlite and the latest source code. 33 | ```bash 34 | docker build \ 35 | --tag getsetfetch \ 36 | --build-arg BROWSER_CLIENT=puppeteer \ 37 | --build-arg STORAGE=sqlite \ 38 | --build-arg VERSION=source \ 39 | --build-arg USER_ID=$(id -u) \ 40 | --build-arg GROUP_ID=$(id -g) . 41 | ``` 42 | 43 | 44 | ## Run 45 | All examples keep config, log, sqlite and csv files under the `/home/gsfuser/scraper/data` container path mounted from the host, for easy access to logs and exported scraped content. The remaining arguments are [CLI arguments](/get-set-fetch/scraper#command-line-interface). 46 | 47 | 48 | Log, scrape and export data using [config-sqlite-cheerio.json](data/config-sqlite-cheerio.json). 49 | ```bash 50 | docker run \ 51 | -v /scraper/docker/data:/home/gsfuser/scraper/data getsetfetch:latest \ 52 | --version \ 53 | --config data/config-sqlite-cheerio.json \ 54 | --save \ 55 | --overwrite \ 56 | --scrape \ 57 | --loglevel info \ 58 | --logdestination data/scrape.log \ 59 | --export data/export.csv 60 | ``` 61 | 62 | Log, scrape and export data using [config-sqlite-puppeteer.json](data/config-sqlite-puppeteer.json). Use either `--security-opt seccomp=unconfined` or `--security-opt seccomp=data/chromium-security-profile.json` ([source blog](https://blog.jessfraz.com/post/how-to-use-new-docker-seccomp-profiles/)) to allow Chromium syscalls. 63 | ```bash 64 | docker run \ 65 | --security-opt seccomp=unconfined \ 66 | -v /scraper/docker/data:/home/gsfuser/scraper/data getsetfetch:latest \ 67 | --version \ 68 | --config data/config-sqlite-puppeteer.json \ 69 | --save \ 70 | --overwrite \ 71 | --scrape \ 72 | --loglevel info \ 73 | --logdestination data/scrape.log \ 74 | --export data/export.csv 75 | ``` 76 | 77 | You can also start the scraper as a [docker-compose service](pg-puppeteer/docker-compose.yml). This example scrapes using puppeteer and postgresql.
Remember to build the corresponding image `--build-arg STORAGE=pg --build-arg BROWSER_CLIENT=puppeteer` first :) 78 | 79 | ```bash 80 | cd ./pg-puppeteer 81 | 82 | # start 83 | docker-compose up -d 84 | 85 | # stop 86 | docker-compose down 87 | ``` 88 | -------------------------------------------------------------------------------- /docker/pg-puppeteer/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | services: 3 | pg: 4 | image: postgres:11-alpine 5 | environment: 6 | POSTGRES_USER: gsf-user 7 | POSTGRES_PASSWORD: gsf-pswd 8 | POSTGRES_DB: gsf-db 9 | 10 | gsf: 11 | image: getsetfetch:latest 12 | command: > 13 | --version 14 | --config data/config-pg-puppeteer.json 15 | --save 16 | --overwrite 17 | --scrape 18 | --loglevel info 19 | --logdestination data/scrape.log 20 | --export data/export.csv 21 | 22 | volumes: 23 | - ../data:/home/gsfuser/scraper/data 24 | security_opt: 25 | - seccomp:"../data/chromium-security-profile.json" 26 | depends_on: 27 | - pg 28 | 29 | volumes: 30 | data: -------------------------------------------------------------------------------- /examples/article-excerpts/ReadabilityPlugin.ts: -------------------------------------------------------------------------------- 1 | import { Readability } from '@mozilla/readability'; 2 | import { Plugin, Project, Resource } from '../../src/index'; 3 | 4 | /** 5 | * IMPORTANT NOTE ! 6 | * if you're using plain javascript besides removing Project and Resource types, don't extend the abstract Plugin class 7 | * @rollup/plugin-commonjs will bundle the entire @get-set-fetch/scraper project including fs, jszip, ... imports 8 | */ 9 | export default class ReadabilityPlugin extends Plugin { 10 | opts = { 11 | domRead: true, 12 | } 13 | 14 | test(project:Project, resource:Resource) { 15 | if (!resource) return false; 16 | return (/html/i).test(resource.contentType); 17 | } 18 | 19 | apply() { 20 | const article = new Readability(document).parse(); 21 | return { content: [ [ article.excerpt ] ] }; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /examples/article-excerpts/article-excerpts-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "article-excerpts.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer", 12 | "opts": { 13 | "args": [ 14 | "--disable-gpu", 15 | "--disable-dev-shm-usage", 16 | "--disable-setuid-sandbox", 17 | "--no-first-run", 18 | "--no-sandbox", 19 | "--no-zygote", 20 | "--single-process" 21 | ] 22 | } 23 | }, 24 | "project": { 25 | "name": "BBCTechNews", 26 | "pipeline": "browser-static-content", 27 | "pluginOpts": [ 28 | { 29 | "name": "ExtractUrlsPlugin", 30 | "maxDepth": 1, 31 | "selectorPairs": [ 32 | { 33 | "urlSelector": "a[href ^= \"/news/technology-\"]" 34 | } 35 | ] 36 | }, 37 | { 38 | "name": "ReadabilityPlugin", 39 | "path": "ReadabilityPlugin.ts", 40 | "replace": "ExtractHtmlContentPlugin", 41 | "domRead": true 42 | }, 43 | { 44 | "name": "InsertResourcesPlugin", 45 | "maxResources": 5 46 | } 47 | ], 48 | "resources": [ 49 | { 50 | "url": "https://www.bbc.com/news/technology" 51 | } 52 | ] 53 | }, 54 | "concurrency": { 55 | "session": { 56 | "maxRequests": 1, 57 | "delay": 3000 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- 
/examples/article-excerpts/article-excerpts.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import path from 'path'; 3 | import { destination } from 'pino'; 4 | import { PluginStore, Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 5 | 6 | /* scrape configuration */ 7 | import ScrapeConfig from './article-excerpts-config.json'; 8 | 9 | // write all INFO and above messages to 'scrape.log' 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | (async () => { 13 | /* 14 | manually register external plugin 15 | not really needed in this case since the external config file contains a 'path' property to the ReadabilityPlugin 16 | enabling automatic plugin registration 17 | if config file is loaded from cli only js plugin files can be imported 18 | */ 19 | await PluginStore.init(); 20 | await PluginStore.addEntry(path.join(__dirname, 'ReadabilityPlugin.ts')); 21 | 22 | /* create a scraper instance with the above settings */ 23 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 24 | 25 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 26 | const exporter = new CsvExporter({ filepath: 'article-excerpts.csv' }); 27 | await exporter.export(project); 28 | }); 29 | 30 | /* start scraping by specifying project and concurrency settings */ 31 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 32 | })(); 33 | -------------------------------------------------------------------------------- /examples/cloud/ansible/files/ExtractScriptsPlugin.js: -------------------------------------------------------------------------------- 1 | class ExtractScriptsPlugin { 2 | // defines csv export columns 3 | getContentKeys() { 4 | return [ 'scripts' ]; 5 | } 6 | 7 | test(project, resource) { 8 | if (!resource) return false; 9 | return (/html/i).test(resource.contentType); 10 | } 11 | 12 | apply(project, resource, DomClient) { 13 | const doc = new DomClient(resource.data); 14 | 15 | const scripts = []; 16 | Array.from(doc.querySelectorAll('script')).forEach(script => { 17 | const src = script.getAttribute('src') ? 
script.getAttribute('src') : ''; 18 | if (!scripts.includes(src)) { 19 | scripts.push(src); 20 | } 21 | }); 22 | 23 | /* 24 | a content entry is represented by an array containing one or multiple scraped values 25 | we can have multiple content entries for a single resources due to dom selectors returning multiple results 26 | */ 27 | return { content: [ scripts ] }; 28 | } 29 | } 30 | 31 | module.exports = ExtractScriptsPlugin; 32 | -------------------------------------------------------------------------------- /examples/cloud/ansible/files/gsf-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "pg", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "host": "", 7 | "port": "5432", 8 | "user": "", 9 | "password": "", 10 | "database": "" 11 | }, 12 | "pool": { 13 | "min": 2, 14 | "max": 50 15 | }, 16 | "debug": false 17 | }, 18 | "client": { 19 | "name": "cheerio" 20 | }, 21 | "project": { 22 | "name": "top-1", 23 | "resourcePath": "", 24 | "pipeline": "dom-static-content", 25 | "pluginOpts": [ 26 | { 27 | "name": "ExtractUrlsPlugin", 28 | "maxDepth": 0 29 | }, 30 | { 31 | "name": "NodeFetchPlugin", 32 | "headers": { 33 | "Accept-Encoding": "br,gzip,deflate", 34 | "User-Agent": "" 35 | } 36 | }, 37 | { 38 | "name": "ExtractScriptsPlugin", 39 | "path": "ExtractScriptsPlugin.js", 40 | "replace": "ExtractHtmlContentPlugin" 41 | } 42 | ] 43 | }, 44 | "concurrency": { 45 | "domain": { 46 | "maxRequests": 100, 47 | "delay": 1 48 | }, 49 | "proxy": { 50 | "maxRequests": 100, 51 | "delay": 1 52 | }, 53 | "session": { 54 | "maxRequests": 100, 55 | "delay": 1 56 | } 57 | } 58 | } -------------------------------------------------------------------------------- /examples/cloud/ansible/files/gsf.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=get-set-fetch-scraper 3 | After=network.target 4 | 5 | [Service] 6 | ExecStart=+gsfscrape --config gsf-config.json --loglevel --logdestination 7 | Restart=always 8 | User=nobody 9 | Group=nogroup 10 | Environment=PATH=/usr/bin:/usr/local/bin 11 | WorkingDirectory= 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /examples/cloud/ansible/pg-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | become: true 3 | become_user: root 4 | 5 | vars_files: 6 | - vars.yml 7 | 8 | tasks: 9 | - name: "Create app database" 10 | postgresql_db: 11 | state: present 12 | name: "{{ db_name }}" 13 | become: true 14 | become_user: postgres 15 | 16 | - name: "Create db user" 17 | postgresql_user: 18 | state: present 19 | name: "{{ db_user }}" 20 | password: "{{ db_password }}" 21 | become: true 22 | become_user: postgres 23 | 24 | - name: "Grant db user access to app db" 25 | postgresql_privs: 26 | type: database 27 | database: "{{ db_name }}" 28 | roles: "{{ db_user }}" 29 | grant_option: false 30 | privs: all 31 | become: true 32 | become_user: postgres 33 | 34 | - name: "Allow remote connections on private network" 35 | postgresql_set: 36 | name: listen_addresses 37 | value: 'localhost, {{ private_ip_address }}' 38 | become: true 39 | become_user: postgres 40 | notify: restart postgres 41 | 42 | - name: "Allow md5 connection for the db user" 43 | postgresql_pg_hba: 44 | dest: /etc/postgresql/14/main/pg_hba.conf 45 | contype: host 46 | address: all 47 | databases: all 48 | method: md5 49 | 
users: "{{ db_user }}" 50 | create: true 51 | become: true 52 | become_user: postgres 53 | notify: restart postgres 54 | 55 | - name: "Tunning for 8GB RAM" 56 | postgresql_set: 57 | name: "{{ item.key }}" 58 | value: "{{ item.value }}" 59 | become: true 60 | become_user: postgres 61 | notify: restart postgres 62 | with_dict: "{{ pg_config }}" 63 | 64 | handlers: 65 | - name: restart postgres 66 | service: name=postgresql state=restarted 67 | -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-export.yml: -------------------------------------------------------------------------------- 1 | - hosts: scraper[0] 2 | 3 | vars_files: 4 | - vars.yml 5 | 6 | tasks: 7 | - name: "export results as csv" 8 | ansible.builtin.shell: gsfscrape --config gsf-config.json --loglevel {{scrape_log_level}} --logdestination {{scrape_log_destination}} --scrape --export {{ scrape_export_file }} 9 | args: 10 | chdir: "{{ scrape_dir }}" 11 | 12 | - name: fetch results 13 | ansible.builtin.fetch: 14 | src: "{{ scrape_dir }}/{{ scrape_export_file }}" 15 | dest: "../exports/{{ scrape_export_file }}" 16 | flat: true 17 | -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-logs.yml: -------------------------------------------------------------------------------- 1 | - hosts: scraper 2 | 3 | vars_files: 4 | - vars.yml 5 | 6 | tasks: 7 | - name: fetch scrape log 8 | ansible.builtin.fetch: 9 | src: "{{ scrape_dir }}/{{ scrape_log_destination }}" 10 | dest: "../exports/{{ ansible_host }}-scrape.log" 11 | validate_checksum: false # it keeps changing as resources are scraped 12 | flat: true 13 | -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | become: true 3 | become_user: root 4 | 5 | vars_files: 6 | - vars.yml 7 | 8 | tasks: 9 | # community.general.npm is not installing cheerio@rc version for some reason 10 | - name: "Install get-set-fetch scraper and peer dependencies" 11 | ansible.builtin.shell: npm install -g {{ item.name }}@{{ item.version }} 12 | with_items: 13 | - { name: "@get-set-fetch/scraper", version: "0.9.0" } 14 | - { name: "knex", version: "1.0.5" } 15 | - { name: "pg", version: "8.7.1" } 16 | - { name: "cheerio", version: "1.0.0-rc.10" } 17 | 18 | - name: Create a directory if it does not exist 19 | ansible.builtin.file: 20 | path: "{{ scrape_dir }}" 21 | state: directory 22 | mode: 0644 23 | 24 | - name: "Copy input csv file(s)" 25 | ansible.builtin.copy: 26 | src: "files/{{ item }}" 27 | dest: "{{ scrape_dir }}/{{ item }}" 28 | owner: root 29 | group: root 30 | mode: 0644 31 | with_items: 32 | - "{{ scrape_resource_file }}" 33 | when: scraper_idx == "0" 34 | 35 | - name: "Copy input config and plugin files" 36 | ansible.builtin.copy: 37 | src: "files/{{ item }}" 38 | dest: "{{ scrape_dir }}/{{ item }}" 39 | owner: root 40 | group: root 41 | mode: 0644 42 | with_items: 43 | - gsf-config.json 44 | - ExtractScriptsPlugin.js 45 | 46 | - name: "Update scrape config" 47 | ansible.builtin.replace: 48 | path: "{{ scrape_dir }}/gsf-config.json" 49 | regexp: "{{ item.regexp }}" 50 | replace: "{{ item.replace }}" 51 | with_items: 52 | - { regexp: "", replace: "{{ db_host }}" } 53 | - { regexp: "", replace: "{{ db_user }}" } 54 | - { regexp: "", replace: "{{ db_password }}" } 55 | - { regexp: "", replace: "{{ db_name }}" } 56 | 
- { regexp: "", replace: "{{ scrape_user_agent }}" } 57 | 58 | - name: "Update scrape config external resource path" 59 | ansible.builtin.replace: 60 | path: "{{ scrape_dir }}/gsf-config.json" 61 | regexp: "" 62 | replace: "{{ scrape_resource_file }}" 63 | when: scraper_idx == "0" 64 | 65 | - name: "Remove scrape config external resource path" 66 | ansible.builtin.replace: 67 | path: "{{ scrape_dir }}/gsf-config.json" 68 | regexp: ".+resourcePath.+" 69 | replace: "" 70 | when: scraper_idx != "0" 71 | 72 | - name: Copy systemd service file 73 | ansible.builtin.copy: 74 | src: files/gsf.service 75 | dest: /etc/systemd/system 76 | owner: root 77 | group: root 78 | 79 | - name: "Update systemd service file" 80 | ansible.builtin.replace: 81 | path: "/etc/systemd/system/gsf.service" 82 | regexp: "{{ item.regexp }}" 83 | replace: "{{ item.replace }}" 84 | with_items: 85 | - { regexp: "", replace: "{{ scrape_log_level }}" } 86 | - { regexp: "", replace: "{{ scrape_log_destination }}" } 87 | - { regexp: "", replace: "{{ scrape_dir }}" } 88 | - { regexp: "", replace: "{{ '--save --discover --retry 30' if scraper_idx == '0' else '--discover --retry 30' }}" } 89 | notify: 90 | - start scraper 91 | 92 | handlers: 93 | - name: start scraper 94 | systemd: 95 | name: gsf 96 | state: started 97 | enabled: yes -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-systemd-logs.yml: -------------------------------------------------------------------------------- 1 | - hosts: scraper 2 | 3 | vars_files: 4 | - vars.yml 5 | 6 | tasks: 7 | - name: "Retrieve systemd log messages since last boot" 8 | ansible.builtin.shell: "journalctl -u gsf.service -b" 9 | register: journalctl 10 | 11 | - name: "Retrieve service status" 12 | ansible.builtin.shell: systemctl status gsf.service 13 | register: systemctl 14 | 15 | - name: "Copy output to local file" 16 | delegate_to: localhost 17 | ansible.builtin.copy: 18 | dest: "../exports/{{ inventory_hostname }}-systemd.log" 19 | content: "{{ systemctl.stdout }}\n\n{{ journalctl.stdout }}" 20 | -------------------------------------------------------------------------------- /examples/cloud/ansible/vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | db_user: 3 | db_password: 4 | db_name: 5 | db_stats_file: pg-stats.csv 6 | 7 | # pg tunning for 4 vCPU, 8 GB RAM, using pgtune as base config 8 | pg_config: 9 | max_connections: 100 10 | shared_buffers: 2GB 11 | effective_cache_size: 6GB 12 | maintenance_work_mem: 512MB 13 | checkpoint_completion_target: 0.9 14 | wal_buffers: 16MB 15 | default_statistics_target: 100 16 | random_page_cost: 1.1 17 | effective_io_concurrency: 200 18 | work_mem: 10485kB 19 | min_wal_size: 1GB 20 | max_wal_size: 4GB 21 | max_worker_processes: 4 22 | max_parallel_workers_per_gather: 2 23 | max_parallel_workers: 4 24 | max_parallel_maintenance_workers: 2 25 | 26 | scrape_dir: /srv/gsf 27 | scrape_log_level: debug 28 | scrape_log_destination: scrape.log 29 | scrape_resource_file: majestic_million-29-dec-2021.csv 30 | scrape_user_agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0 31 | scrape_export_file: project.csv 32 | 33 | -------------------------------------------------------------------------------- /examples/cloud/readme.md: -------------------------------------------------------------------------------- 1 | distributed scraping using multiple get-set-fetch scraper instances and a central postgresql instance. 
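each scraper instance runs the same config against the central postgresql storage; the ansible playbooks start the first node with `--save --discover` (it also receives the seed url csv) and the remaining nodes with `--discover` only, so every node pulls work from the shared queue. below is a rough typescript sketch of a single node using the programmatic api rather than the `gsfscrape` cli invoked by the systemd unit; the connection details and plugin options are placeholders, not the exact values from vars.yml.

```ts
/* hypothetical node.ts - illustration only, the cloud example actually runs the gsfscrape cli via systemd */
import { Scraper, ScrapeEvent, Project, CsvExporter } from '@get-set-fetch/scraper';

/* every node points at the same central postgresql database (placeholder host and credentials) */
const storage = {
  client: 'pg',
  useNullAsDefault: true,
  connection: { host: '10.0.0.2', port: '5432', user: 'gsf-user', password: 'gsf-pswd', database: 'gsf-db' },
  pool: { min: 2, max: 50 },
};

const scraper = new Scraper(storage, { name: 'cheerio' });

/* export scraped content once the project finishes */
scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => {
  await new CsvExporter({ filepath: 'project.csv' }).export(project);
});

/* same project definition on every node; only the first node also supplies the seed csv */
scraper.scrape(
  {
    name: 'top-1',
    pipeline: 'dom-static-content',
    pluginOpts: [
      { name: 'ExtractUrlsPlugin', maxDepth: 0 },
      { name: 'ExtractScriptsPlugin', path: 'ExtractScriptsPlugin.js', replace: 'ExtractHtmlContentPlugin' },
    ],
  },
  { domain: { maxRequests: 100, delay: 1 } },
);
```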
2 | 3 | terraform creates the instances while ansible configures them. 4 | 5 | scrape status - systemd status and logs for scraper instances, sql queries for postgresql - is monitored via ansible playbooks. 6 | 7 | the scrape configuration referring ExtractScriptsPlugin is responsible for extracting js script urls from top 1 million sites as reported by majestic. -------------------------------------------------------------------------------- /examples/cloud/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | digitalocean = { 4 | source = "digitalocean/digitalocean" 5 | version = "~> 2.0" 6 | } 7 | } 8 | } 9 | 10 | variable "do_token" {} 11 | variable "pvt_key" {} 12 | variable "pub_key" {} 13 | 14 | provider "digitalocean" { 15 | token = var.do_token 16 | } 17 | -------------------------------------------------------------------------------- /examples/cloud/terraform/resource-ansible-inventory.tf: -------------------------------------------------------------------------------- 1 | resource "local_file" "ansible_inventory" { 2 | content = templatefile("${path.root}/templates/hosts.tpl", 3 | { 4 | postgresql_ip = digitalocean_droplet.getsetfetch_pg.ipv4_address 5 | scraper_ips = digitalocean_droplet.getsetfetch_scraper.*.ipv4_address 6 | } 7 | ) 8 | filename = "${path.root}/../ansible/inventory/hosts.cfg" 9 | } 10 | -------------------------------------------------------------------------------- /examples/cloud/terraform/resource-pg.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' 3 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - 4 | sudo apt-get update 5 | sudo apt-get -y install postgresql 6 | 7 | sudo apt-get -y install libpq-dev python3-psycopg2 8 | -------------------------------------------------------------------------------- /examples/cloud/terraform/resource-pg.tf: -------------------------------------------------------------------------------- 1 | resource "digitalocean_droplet" "getsetfetch_pg" { 2 | image = "ubuntu-20-04-x64" 3 | name = "getsetfetch-pg" 4 | region = "fra1" 5 | size = "s-4vcpu-8gb" 6 | monitoring = true 7 | resize_disk = false 8 | vpc_uuid = digitalocean_vpc.getsetfetch_vpc.id 9 | 10 | 11 | ssh_keys = [ 12 | data.digitalocean_ssh_key.terraform.id 13 | ] 14 | 15 | user_data = file("resource-pg.sh") 16 | 17 | provisioner "remote-exec" { 18 | inline = [ 19 | "cloud-init status --wait", 20 | "echo 'Connected!'" 21 | ] 22 | 23 | connection { 24 | host = self.ipv4_address 25 | type = "ssh" 26 | user = "root" 27 | private_key = file(var.pvt_key) 28 | } 29 | } 30 | 31 | 32 | provisioner "local-exec" { 33 | command = <> { 11 | const result: Partial = await super.openInTab(resource, client); 12 | result.content = client.consoleContent; 13 | return result; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /examples/console-content/ConsolePuppeteerClient.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { ConsoleMessage, HTTPResponse, WaitForOptions } from 'puppeteer'; 3 | import { PuppeteerClient } from '../../src/index'; 4 | 5 | export default class 
ConsolePuppeteerClient extends PuppeteerClient { 6 | consoleContent: string[][]; 7 | 8 | async launch(): Promise { 9 | await super.launch(); 10 | 11 | const consoleHandler = (evt: ConsoleMessage) => { 12 | this.consoleContent.push([ 13 | evt.type(), 14 | evt.text(), 15 | ]); 16 | }; 17 | 18 | this.page.on('console', consoleHandler); 19 | } 20 | 21 | goto(url: string, opts: WaitForOptions): Promise { 22 | this.consoleContent = []; 23 | return super.goto(url, opts); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /examples/console-content/console-content-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "console-content.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer" 12 | }, 13 | "project": { 14 | "name": "ConsoleContent", 15 | "pipeline": "browser-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ConsoleBrowserFetchPlugin", 19 | "path": "ConsoleBrowserFetchPlugin.ts", 20 | "replace": "BrowserFetchPlugin" 21 | }, 22 | { 23 | "name": "ExtractUrlsPlugin", 24 | "maxDepth": 1, 25 | "selectorPairs": [ 26 | { 27 | "urlSelector": "nav a" 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "InsertResourcesPlugin", 33 | "maxResources": 5 34 | } 35 | ], 36 | "resources": [ 37 | { 38 | "url": "https://en.wikipedia.org/wiki/Main_Page" 39 | } 40 | ] 41 | }, 42 | "concurrency": { 43 | "session": { 44 | "maxRequests": 1, 45 | "delay": 1000 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/console-content/console-content.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter, BrowserClient } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './console-content-config.json'; 7 | import ConsolePuppeteerClient from './ConsolePuppeteerClient'; 8 | 9 | // write all INFO and above messages to 'scrape.log' 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | /* create a scraper instance with the above settings */ 13 | const browserClient: BrowserClient = new ConsolePuppeteerClient(); 14 | const scraper = new Scraper(ScrapeConfig.storage, browserClient); 15 | 16 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 17 | const exporter = new CsvExporter({ filepath: 'console.csv' }); 18 | await exporter.export(project); 19 | }); 20 | 21 | /* start scraping by specifying project and concurrency settings */ 22 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 23 | -------------------------------------------------------------------------------- /examples/in-memory-queue/InMemoryConnection.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-empty-function */ 2 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 3 | import { IProjectStorage, IResourceStorage, IQueueStorage, Connection } from '../../src/index'; 4 | import InMemoryQueue from './InMemoryQueue'; 5 | 6 | export default class InMemoryConnection extends Connection { 7 | /* 8 | by default each connection type is established based on some 
config, 9 | there are no settings for in-memory storage, just specify a client value 10 | */ 11 | constructor() { 12 | super({ client: 'in-memory' }); 13 | } 14 | 15 | async open() {} 16 | async close() {} 17 | 18 | getProjectStorage():IProjectStorage { 19 | throw new Error('In-Memory Project not supported'); 20 | } 21 | 22 | getResourceStorage():IResourceStorage { 23 | throw new Error('In-Memory Resource not supported'); 24 | } 25 | 26 | getQueueStorage():IQueueStorage { 27 | return new InMemoryQueue(this); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /examples/in-memory-queue/InMemoryQueue.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { IQueueStorage, QueueEntry, Storage } from '../../src/index'; 3 | 4 | export default class InMemoryQueue extends Storage implements IQueueStorage { 5 | queue:Map; 6 | 7 | async drop() { 8 | delete this.queue; 9 | } 10 | 11 | async init() { 12 | this.queue = new Map(); 13 | } 14 | 15 | async filterExistingEntries(urls: string[]) { 16 | return urls 17 | .filter(url => this.queue.has(url)) 18 | .map(url => ({ url })); 19 | } 20 | 21 | async add(entries: QueueEntry[]) { 22 | entries.forEach(entry => { 23 | if (!this.queue.has(entry.url)) { 24 | this.queue.set(entry.url, { ...entry, id: entry.url }); 25 | } 26 | }); 27 | } 28 | 29 | async count() { 30 | return this.queue.size; 31 | } 32 | 33 | async getResourcesToScrape(limit:number = 10) { 34 | const queueEntries:QueueEntry[] = []; 35 | 36 | const queueIt = this.queue.values(); 37 | let result: IteratorResult = queueIt.next(); 38 | 39 | while (queueEntries.length < limit && !result.done) { 40 | const queueEntry:QueueEntry = result.value; 41 | 42 | if (queueEntry.status === undefined) { 43 | queueEntry.status = 1; 44 | queueEntries.push(queueEntry); 45 | } 46 | 47 | result = queueIt.next(); 48 | } 49 | 50 | return queueEntries; 51 | } 52 | 53 | async getAll() { 54 | return Array.from(this.queue.values()); 55 | } 56 | 57 | async updateStatus(url: string, status: number) { 58 | const queueEntry = this.queue.get(url); 59 | if (queueEntry) { 60 | queueEntry.status = status; 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /examples/in-memory-queue/in-memory-queue-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "sitemap", 15 | "resources": [ 16 | { 17 | "url": "https://getsetfetch.org/node/storage.html" 18 | } 19 | ], 20 | "pipeline": "dom-static-content", 21 | "pluginOpts": [ 22 | { 23 | "name": "ExtractHtmlContentPlugin", 24 | "selectorPairs": [ 25 | { 26 | "contentSelector": "h2.card-header-title", 27 | "label": "title" 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "InsertResourcesPlugin", 33 | "maxResources": 3 34 | } 35 | ] 36 | }, 37 | "concurrency": { 38 | "session": { 39 | "maxRequests": 1, 40 | "delay": 1000 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /examples/in-memory-queue/in-memory-queue.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace 
'../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, setLogger, ScrapeEvent, Project, CsvExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './in-memory-queue-config.json'; 7 | import InMemoryConnection from './InMemoryConnection'; 8 | 9 | /* write all INFO and above messages to 'gsf.logs' */ 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | /* create a scraper instance with the above settings */ 13 | const conn = { 14 | Project: ScrapeConfig.storage, 15 | Queue: new InMemoryConnection(), 16 | Resource: ScrapeConfig.storage, 17 | }; 18 | const scraper = new Scraper(conn, ScrapeConfig.client); 19 | 20 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 21 | const exporter = new CsvExporter({ filepath: 'in-memory-queue.csv' }); 22 | await exporter.export(project); 23 | }); 24 | 25 | /* start scraping by specifying project and concurrency settings */ 26 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 27 | -------------------------------------------------------------------------------- /examples/infinite-scrolling/infinite-scrolling-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "infinite-scrolling.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer", 12 | "opts": { 13 | "args": [ 14 | "--disable-gpu", 15 | "--disable-dev-shm-usage", 16 | "--disable-setuid-sandbox", 17 | "--no-first-run", 18 | "--no-sandbox", 19 | "--no-zygote", 20 | "--single-process" 21 | ] 22 | } 23 | }, 24 | "project": { 25 | "name": "HistoricalFigures", 26 | "pipeline": "browser-static-content", 27 | "pluginOpts": [ 28 | { 29 | "name": "BrowserFetchPlugin", 30 | "stabilityCheck": 2000, 31 | "stabilityTimeout": 5000 32 | }, 33 | { 34 | "name": "ExtractUrlsPlugin", 35 | "maxDepth": 0 36 | }, 37 | { 38 | "name": "ExtractHtmlContentPlugin", 39 | "selectorPairs": [ 40 | { 41 | "contentSelector": "li > a[data-galabel=grid-item] > span > span span:first-child", 42 | "label": "name" 43 | }, 44 | { 45 | "contentSelector": "li > a[data-galabel=grid-item] > span > span span:last-child", 46 | "label": "items" 47 | } 48 | ] 49 | }, 50 | { 51 | "name": "ScrollPlugin", 52 | "after": "UpsertResourcePlugin", 53 | "maxActions": 3, 54 | "delay": 1000, 55 | "stabilityCheck": 2000, 56 | "stabilityTimeout": 3000 57 | } 58 | ], 59 | "resources": [ 60 | { 61 | "url": "https://artsandculture.google.com/incognito/category/historical-figure" 62 | } 63 | ] 64 | }, 65 | "concurrency": { 66 | "session": { 67 | "maxRequests": 1, 68 | "delay": 3000 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /examples/infinite-scrolling/infinite-scrolling.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './infinite-scrolling-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }, destination('scrape.log')); 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = 
new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 15 | const exporter = new CsvExporter({ filepath: 'historical-figures.csv' }); 16 | await exporter.export(project); 17 | }); 18 | 19 | /* start scraping by specifying project and concurrency settings */ 20 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 21 | -------------------------------------------------------------------------------- /examples/pdf-extraction/pdf-extraction-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "pdf-extraction.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "CovidUpdates", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 2, 20 | "selectorPairs": [ 21 | { 22 | "urlSelector": ".sf-meeting-report-list:nth-child(5) > a.sf-meeting-report-list__item" 23 | }, 24 | { 25 | "urlSelector": ".button-blue-background > a", 26 | "titleSelector": "h1.dynamic-content__heading" 27 | } 28 | ] 29 | } 30 | ], 31 | "resources": [ 32 | { 33 | "url": "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports" 34 | } 35 | ] 36 | }, 37 | "concurrency": { 38 | "session": { 39 | "maxRequests": 1, 40 | "delay": 3000 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /examples/pdf-extraction/pdf-extraction.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, ZipExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './pdf-extraction-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }, destination('scrape.log')); 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 15 | const exporter = new ZipExporter({ filepath: 'covid-updates.zip' }); 16 | await exporter.export(project); 17 | }); 18 | 19 | /* start scraping by specifying project and concurrency settings */ 20 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 21 | -------------------------------------------------------------------------------- /examples/product-details/product-details-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "product-details.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "AsimovBooks", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 3, 20 | "selectorPairs": [ 21 | { 22 | "urlSelector": "#searchResults ~ .pagination > a.ChoosePage:nth-child(2)" 23 | }, 24 | { 25 | "urlSelector": "h3.booktitle a.results" 26 | }, 27 | { 28 | "urlSelector": "a.coverLook > img.cover" 29 | } 30 | ] 31 | }, 32 | { 33 | "name": 
"ExtractHtmlContentPlugin", 34 | "selectorPairs": [ 35 | { 36 | "contentSelector": "h1.work-title", 37 | "label": "title" 38 | }, 39 | { 40 | "contentSelector": "h2.edition-byline a", 41 | "label": "author" 42 | }, 43 | { 44 | "contentSelector": "ul.readers-stats > li.avg-ratings > span[itemProp=\"ratingValue\"]", 45 | "label": "rating value" 46 | }, 47 | { 48 | "contentSelector": "ul.readers-stats > li > span[itemProp=\"reviewCount\"]", 49 | "label": "review count" 50 | } 51 | ] 52 | } 53 | ], 54 | "resources": [ 55 | { 56 | "url": "https://openlibrary.org/authors/OL34221A/Isaac_Asimov?page=1" 57 | } 58 | ] 59 | }, 60 | "concurrency": { 61 | "session": { 62 | "maxRequests": 1, 63 | "delay": 3000 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /examples/product-details/product-details.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter, ZipExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './product-details-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }, destination('scrape.log')); 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | /* export books details as csv */ 15 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 16 | const exporter = new CsvExporter({ filepath: 'books.csv' }); 17 | await exporter.export(project); 18 | }); 19 | 20 | /* export book covers as zip */ 21 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 22 | const exporter = new ZipExporter({ filepath: 'book-covers.zip' }); 23 | await exporter.export(project); 24 | }); 25 | 26 | /* start scraping by specifying project and concurrency settings */ 27 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 28 | -------------------------------------------------------------------------------- /examples/sitemap/ExtractSameHostUrlsPlugin.ts: -------------------------------------------------------------------------------- 1 | import { ExtractUrlsPlugin, PluginOpts } from '../../src/index'; 2 | 3 | export default class ExtractSameHostUrlsPlugin extends ExtractUrlsPlugin { 4 | constructor(opts:Partial = {}) { 5 | super(opts); 6 | 7 | /* parent plugin runs in browser by default, the current one doesn't */ 8 | this.opts.domRead = false; 9 | } 10 | 11 | /* only extract URLs from the sitemap domain */ 12 | isValidUrl(url: URL) { 13 | return url.hostname === 'www.getsetfetch.org'; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /examples/sitemap/SitemapExporter.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | import fs from 'fs'; 3 | import { Exporter, Resource, getLogger } from '../../src/index'; 4 | 5 | export default class SitemapExporter extends Exporter { 6 | logger = getLogger('SitemapExporter'); 7 | 8 | wstream: fs.WriteStream; 9 | 10 | getResourceQuery() { 11 | return { cols: [ 'url' ], where: { contentType: 'text/html' } }; 12 | } 13 | 14 | async preParse() { 15 | this.wstream = fs.createWriteStream(this.opts.filepath); 16 | this.wstream.write('\n'); 17 | 
this.wstream.write('\n'); 18 | } 19 | 20 | async parse(resource: Partial) { 21 | this.wstream.write(`${resource.url}\n`); 22 | } 23 | 24 | async postParse() { 25 | this.wstream.write(''); 26 | this.wstream.close(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /examples/sitemap/SkipExtractHtmlContentPlugin.ts: -------------------------------------------------------------------------------- 1 | import { Plugin } from '../../src/index'; 2 | 3 | /** 4 | * if you're using plain javascript besides removing Project and Resource types, don't extend the abstract Plugin class 5 | * @rollup/plugin-commonjs will bundle the entire @get-set-fetch/scraper project including fs, jszip, ... imports 6 | */ 7 | export default class SkipExtractHtmlContentPlugin extends Plugin { 8 | /* 9 | never invoke the plugin, it's just an empty placeholder for ExtractHtmlContentPlugin 10 | since we're not interested in scraping content 11 | */ 12 | test() { 13 | return false; 14 | } 15 | 16 | // eslint-disable-next-line @typescript-eslint/no-empty-function 17 | apply() {} 18 | } 19 | -------------------------------------------------------------------------------- /examples/sitemap/scrape-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "sitemap", 15 | "resources": [ 16 | { 17 | "url": "https://www.getsetfetch.org/index.html" 18 | } 19 | ], 20 | "pipeline": "dom-static-content", 21 | "pluginOpts": [ 22 | { 23 | "name": "ExtractSameHostUrlsPlugin", 24 | "path": "ExtractSameHostUrlsPlugin.ts", 25 | "replace": "ExtractUrlsPlugin" 26 | }, 27 | { 28 | "name": "SkipExtractHtmlContentPlugin", 29 | "path": "SkipExtractHtmlContentPlugin.ts", 30 | "replace": "ExtractHtmlContentPlugin" 31 | }, 32 | { 33 | "name": "InsertResourcesPlugin", 34 | "maxResources": 100 35 | } 36 | ] 37 | }, 38 | "concurrency": { 39 | "session": { 40 | "maxRequests": 1, 41 | "delay": 3000 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /examples/sitemap/sitemap.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, setLogger, ScrapeEvent, Project } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './scrape-config.json'; 7 | import SitemapExporter from './SitemapExporter'; 8 | 9 | /* write all INFO and above messages to 'gsf.logs' */ 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | /* create a scraper instance with the above settings */ 13 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 14 | 15 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 16 | const exporter = new SitemapExporter({ filepath: 'sitemap.xml' }); 17 | await exporter.export(project); 18 | }); 19 | 20 | /* start scraping by specifying project and concurrency settings */ 21 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 22 | -------------------------------------------------------------------------------- /examples/tabular-data/tabular-data-config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "tabular-data.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "LanguageList", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 0 20 | }, 21 | { 22 | "name": "ExtractHtmlContentPlugin", 23 | "selectorPairs": [ 24 | { 25 | "contentSelector": "table:nth-of-type(2) td:nth-child(2) > a:first-child", 26 | "label": "language" 27 | }, 28 | { 29 | "contentSelector": "table:nth-of-type(2) td:nth-child(3)", 30 | "label": "speakers (milions)" 31 | } 32 | ] 33 | } 34 | ], 35 | "resources": [ 36 | { 37 | "url": "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers" 38 | } 39 | ] 40 | }, 41 | "concurrency": { 42 | "session": { 43 | "maxRequests": 1, 44 | "delay": 3000 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /examples/tabular-data/tabular-data.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 3 | 4 | /* scrape configuration */ 5 | import ScrapeConfig from './tabular-data-config.json'; 6 | 7 | // write all INFO and above messages to 'scrape.log' 8 | setLogger({ level: 'info' }); 9 | 10 | /* create a scraper instance with the above settings */ 11 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 12 | 13 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 14 | const exporter = new CsvExporter({ filepath: 'languages.csv' }); 15 | await exporter.export(project); 16 | }); 17 | 18 | /* start scraping by specifying project and concurrency settings */ 19 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 20 | -------------------------------------------------------------------------------- /examples/tls-fingerprinting/RandomTlsFingerprintFetch.ts: -------------------------------------------------------------------------------- 1 | import crypto from 'crypto'; 2 | import { NodeFetchPlugin, Resource } from '../../src/index'; 3 | import { Protocol } from '../../src/plugins/url-utils'; 4 | 5 | export default class RandomTlsFingerprintFetch extends NodeFetchPlugin { 6 | shuffledCipherList: string[]; 7 | 8 | getShuffledCipherList():string[] { 9 | const nodeOrderedCipherList = crypto.constants.defaultCipherList.split(':'); 10 | 11 | // keep the most important ciphers in the same order 12 | const fixedCipherList = nodeOrderedCipherList.slice(0, 3); 13 | 14 | // shuffle the rest 15 | const shuffledCipherList = nodeOrderedCipherList.slice(3) 16 | .map(cipher => ({ cipher, sort: Math.random() })) 17 | .sort((a, b) => a.sort - b.sort) 18 | .map(({ cipher }) => cipher); 19 | 20 | return [ 21 | ...fixedCipherList, 22 | ...shuffledCipherList, 23 | ]; 24 | } 25 | 26 | async getRequestOptions(url:URL, resource: Resource) { 27 | const reqOpts = await super.getRequestOptions(url, resource); 28 | 29 | if (url.protocol === Protocol.HTTPS) { 30 | // one time initialization of randomly ordered ciphers 31 | if (!this.shuffledCipherList) { 32 | this.shuffledCipherList = this.getShuffledCipherList(); 33 | this.logger.info(this.shuffledCipherList, 
'using shuffled cipherlist'); 34 | } 35 | 36 | reqOpts.ciphers = this.shuffledCipherList.join(':'); 37 | } 38 | return reqOpts; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /examples/tls-fingerprinting/readme.md: -------------------------------------------------------------------------------- 1 | Details available on [getsetfetch.org/blog/tls-fingerprint.html](https://getsetfetch.org/blog/tls-fingerprint.html). -------------------------------------------------------------------------------- /examples/tls-fingerprinting/tls-fingerprinting-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "tls-fingerprinting-data.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "TlsFingerprinting", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 0 20 | }, 21 | { 22 | "name": "RandomTlsFingerprintFetch", 23 | "path": "RandomTlsFingerprintFetch.ts", 24 | "replace": "NodeFetchPlugin" 25 | }, 26 | { 27 | "name": "ExtractHtmlContentPlugin", 28 | "selectorPairs": [ 29 | { 30 | "contentSelector": "table:nth-of-type(2) td:nth-child(2) > a:first-child", 31 | "label": "language" 32 | }, 33 | { 34 | "contentSelector": "table:nth-of-type(2) td:nth-child(3)", 35 | "label": "speakers (milions)" 36 | } 37 | ] 38 | } 39 | ], 40 | "resources": [ 41 | { 42 | "url": "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers" 43 | } 44 | ] 45 | }, 46 | "concurrency": { 47 | "session": { 48 | "maxRequests": 1, 49 | "delay": 3000 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /examples/tls-fingerprinting/tls-fingerprinting.ts: -------------------------------------------------------------------------------- 1 | import { destination } from 'pino'; 2 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './tls-fingerprinting-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }); // destination('scrape.log') 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 15 | const exporter = new CsvExporter({ filepath: 'languages.csv' }); 16 | await exporter.export(project); 17 | }); 18 | 19 | /* start scraping by specifying project and concurrency settings */ 20 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 21 | -------------------------------------------------------------------------------- /examples/tsconfig.examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "esnext", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "newLine": "LF", 11 | }, 12 | "include": [ 13 | "./" 14 | ], 15 | } -------------------------------------------------------------------------------- 
/src/browserclient/BrowserClient.ts: -------------------------------------------------------------------------------- 1 | /** Provides a common API to interact with various browser clients. */ 2 | 3 | export type BaseResponse = { 4 | status(): number; 5 | url(): string; 6 | request(): {}; 7 | ok(): boolean; 8 | headers(); 9 | } 10 | 11 | export default abstract class BrowserClient { 12 | /** 13 | * whether or not the browser is launched 14 | */ 15 | isLaunched: boolean; 16 | 17 | /** 18 | * options for launching the browser 19 | * if not specified {headless: true} is added 20 | */ 21 | opts: { 22 | browser?: string; 23 | [key: string]:any; 24 | }; 25 | 26 | constructor(opts) { 27 | this.opts = opts; 28 | this.isLaunched = false; 29 | } 30 | 31 | abstract launch():Promise; 32 | abstract close():Promise; 33 | abstract closePage():Promise; 34 | 35 | /* 36 | puppeteer supports evaluate with multiple arguments 37 | playwright supports evaluate with a single argument object 38 | use object destructuring to support both APIs 39 | */ 40 | abstract evaluate(fnc, argObj?):Promise; 41 | 42 | abstract getRedirectResponse(req):Promise; 43 | 44 | abstract goto(url: string, opts):Promise; 45 | abstract getUrl():string; 46 | } 47 | -------------------------------------------------------------------------------- /src/browserclient/PuppeteerClient.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | // eslint-disable-next-line import/no-unresolved 3 | import { Browser, LaunchOptions, launch as plaunch, Page, WaitForOptions, HTTPResponse, HTTPRequest, BrowserLaunchArgumentOptions, BrowserConnectOptions } from 'puppeteer'; 4 | import { getLogger } from '../logger/Logger'; 5 | import BrowserClient from './BrowserClient'; 6 | 7 | /** Puppeteer Client. */ 8 | export default class PuppeteerClient extends BrowserClient { 9 | logger = getLogger('PuppeteerClient'); 10 | 11 | browser: Browser; 12 | page: Page; 13 | opts: LaunchOptions; 14 | 15 | constructor(opts:LaunchOptions & BrowserLaunchArgumentOptions & BrowserConnectOptions = {}) { 16 | super({ headlesss: true, ...opts }); 17 | } 18 | 19 | async launch():Promise { 20 | this.browser = await plaunch(this.opts); 21 | this.page = await this.browser.newPage(); 22 | 23 | this.isLaunched = true; 24 | } 25 | 26 | async close():Promise { 27 | this.page = null; 28 | await this.browser.close(); 29 | this.isLaunched = false; 30 | } 31 | 32 | goto(url: string, opts: WaitForOptions):Promise { 33 | return this.page.goto(url, opts); 34 | } 35 | 36 | async getRedirectResponse(req:HTTPRequest):Promise { 37 | const redirectChain = req.redirectChain(); 38 | return redirectChain.length > 0 ? redirectChain[0].response() : null; 39 | } 40 | 41 | getUrl() { 42 | return this.page.url(); 43 | } 44 | 45 | async closePage() { 46 | if (this.page) { 47 | await this.page.close(); 48 | this.page = null; 49 | } 50 | } 51 | 52 | async evaluate(pageFunction, argObj?) 
{ 53 | // if there's an error in the async fnc to be evaluated the page.evaluate return promise may never resolve 54 | // listen to page errors and reject accordingly 55 | return new Promise(async (resolve, reject) => { 56 | const logConsole = this.logger.logger.level === 'trace' || this.logger.logger.level === 'debug'; 57 | const consoleHandler = msg => { 58 | for (let i = 0; i < msg.args().length; i += 1) { 59 | this.logger.debug(`DOM console: ${msg.args()[i]}`); 60 | } 61 | }; 62 | 63 | if (logConsole) { 64 | this.page.on('console', consoleHandler); 65 | } 66 | 67 | const errorHandler = err => { 68 | this.logger.error(err); 69 | reject(err); 70 | this.page.off('pageerror', errorHandler); 71 | this.page.off('error', errorHandler); 72 | if (logConsole) { 73 | this.page.off('console', consoleHandler); 74 | } 75 | }; 76 | this.page.on('pageerror', errorHandler); 77 | this.page.on('error', errorHandler); 78 | 79 | this.logger.trace({ pageFunction: pageFunction.toString(), argObj }, 'evaluate call'); 80 | const result = await this.page.evaluate(pageFunction, argObj); 81 | resolve(result); 82 | this.page.off('pageerror', errorHandler); 83 | this.page.off('error', errorHandler); 84 | this.page.off('console', consoleHandler); 85 | }); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/confighash/config-hash.ts: -------------------------------------------------------------------------------- 1 | import { Buffer } from 'buffer'; 2 | import { deflateSync, inflateSync, constants } from 'zlib'; 3 | import * as dictionaryV1 from './dictionary-v1.json'; 4 | 5 | /** 6 | * Converts a project configuration to a deflated based64 string. 7 | * @param input - project configuration 8 | */ 9 | function encode(input: object):string { 10 | const deflatedIntArr = deflateSync(JSON.stringify(input), { dictionary: Buffer.from(JSON.stringify(dictionaryV1)), level: constants.Z_BEST_COMPRESSION }); 11 | return Buffer.from(deflatedIntArr).toString('base64'); 12 | } 13 | 14 | /** 15 | * Converts a deflated based64 string to a project configuration. 
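 * Round-trip example: decode(encode(config)) yields an object deep-equal to the original config, as both functions share the same v1 dictionary.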
16 | * @param deflatedBase64String - project configuration 17 | */ 18 | function decode(deflatedBase64String: string) { 19 | if (!deflatedBase64String || deflatedBase64String.length === 0) return null; 20 | 21 | let inflatedInstance = null; 22 | const buffer = Buffer.from(deflatedBase64String, 'base64'); 23 | const inflatedString = inflateSync(buffer, { dictionary: Buffer.from(JSON.stringify(dictionaryV1)) }); 24 | inflatedInstance = JSON.parse(inflatedString.toString('utf-8')); 25 | 26 | return inflatedInstance; 27 | } 28 | 29 | export { 30 | encode, decode, 31 | }; 32 | -------------------------------------------------------------------------------- /src/confighash/dictionary-v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "pipeline": "browser-static-content|dom-static-content", 4 | "pluginOpts": [ 5 | { 6 | "name": "ExtractHtmlContentPlugin", 7 | "selectorPairs": [ 8 | { 9 | "label": "", 10 | "contentSelector": "", 11 | "contentProperty": "innerText" 12 | } 13 | ], 14 | "domRead": true 15 | }, 16 | { 17 | "name": "ExtractUrlsPlugin", 18 | "selectorPairs": [ 19 | { 20 | "urlSelector": "a", 21 | "titleSelector": "" 22 | } 23 | ], 24 | "maxDepth": -1 25 | }, 26 | { 27 | "name": "BrowserFetchPlugin", 28 | "gotoOptions": { 29 | "timeout": 0, 30 | "waitUntil": 0 31 | }, 32 | "stabilityCheck": 0, 33 | "stabilityTimeout": 0 34 | }, 35 | { 36 | "name": "NodeFetchPlugin", 37 | "headers": {} 38 | }, 39 | { 40 | "name": "InsertResourcesPlugin", 41 | "maxResources": -1 42 | }, 43 | { 44 | "name": "ScrollPlugin", 45 | "domWrite": true, 46 | "delay": 1000, 47 | "maxActions": -1 48 | }, 49 | { 50 | "name": "UpsertResourcePlugin" 51 | } 52 | ], 53 | "resources": [ 54 | { 55 | "url": "https://.com/.html" 56 | } 57 | ] 58 | } -------------------------------------------------------------------------------- /src/domclient/CheerioClient.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | // eslint-disable-next-line import/no-unresolved 3 | import cheerio from 'cheerio'; 4 | import { IDomNode } from './DomClient'; 5 | 6 | export default class CheerioClient implements IDomNode { 7 | root: cheerio.Root; 8 | elm:cheerio.Element; 9 | 10 | constructor(bufferOrRoot: Buffer|cheerio.Root, elm?: cheerio.Element) { 11 | this.root = bufferOrRoot instanceof Buffer ? cheerio.load(bufferOrRoot.toString('utf8')) : bufferOrRoot; 12 | this.elm = elm; 13 | } 14 | 15 | querySelectorAll(selector: string):IDomNode[] { 16 | const elms = this.elm ? 
this.root(selector, this.elm) : this.root(selector); 17 | return elms.toArray().map(elm => new CheerioClient(this.root, elm)); 18 | } 19 | 20 | getAttribute(prop:string) { 21 | if (prop === 'innerText') { 22 | return this.root(this.elm).text(); 23 | } 24 | 25 | return this.root(this.elm).attr(prop); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/domclient/DomClient.ts: -------------------------------------------------------------------------------- 1 | export interface IDomNode { 2 | querySelectorAll(selector: string):IDomNode[]; 3 | getAttribute(prop: string); 4 | } 5 | 6 | export interface IDomClientConstructor { 7 | new(...args): IDomNode; 8 | } 9 | -------------------------------------------------------------------------------- /src/domclient/JsdomClient.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | // eslint-disable-next-line import/no-unresolved 3 | import { JSDOM } from 'jsdom'; 4 | import { IDomNode } from './DomClient'; 5 | 6 | export default class JsdomClient implements IDomNode { 7 | elm: Element; 8 | 9 | constructor(bufferOrElm: Buffer|Element) { 10 | this.elm = bufferOrElm instanceof Buffer ? new JSDOM(bufferOrElm.toString('utf8')).window.document.querySelector('body') : bufferOrElm; 11 | } 12 | 13 | querySelectorAll(selector: string):IDomNode[] { 14 | return Array.from(this.elm.querySelectorAll(selector)).map(elm => new JsdomClient(elm)); 15 | } 16 | 17 | getAttribute(prop:string) { 18 | if (prop === 'innerText') { 19 | return this.elm.textContent; 20 | } 21 | 22 | return this.elm.getAttribute(prop); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/domclient/NativeClient.ts: -------------------------------------------------------------------------------- 1 | import { IDomNode } from './DomClient'; 2 | 3 | export default class NativeClient implements IDomNode { 4 | document: Element; 5 | 6 | constructor(document: Element) { 7 | this.document = document; 8 | } 9 | 10 | querySelectorAll(selector: string):IDomNode[] { 11 | return Array.from(this.document.querySelectorAll(selector)).map(elm => new NativeClient(elm)); 12 | } 13 | 14 | getAttribute(prop:string) { 15 | return this.document[prop] || this.document.getAttribute(prop); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/domclient/client-utils.ts: -------------------------------------------------------------------------------- 1 | import { PlaywrightClient, PuppeteerClient, CheerioClient, JsdomClient, BrowserClient, 2 | IDomClientConstructor } from '../index'; 3 | import { ClientOptions } from '../scraper/Scraper'; 4 | 5 | export default function initClient(clientOpts:ClientOptions):BrowserClient|IDomClientConstructor { 6 | if (!clientOpts) throw new Error('missing DOM options'); 7 | if (!clientOpts.name) throw new Error('missing DOM client'); 8 | 9 | let client; 10 | switch (clientOpts.name) { 11 | case 'cheerio': 12 | if (!CheerioClient) throw new Error('cheerio package not installed'); 13 | client = CheerioClient; 14 | break; 15 | case 'jsdom': 16 | if (!JsdomClient) throw new Error('jsdom package not installed'); 17 | client = JsdomClient; 18 | break; 19 | case 'puppeteer': 20 | if (!PuppeteerClient) throw new Error('puppeteer package not installed'); 21 | client = new PuppeteerClient(clientOpts.opts); 22 | break; 23 | case 'playwright': 24 | if (!PlaywrightClient) throw new 
Error('playwright-core package not installed'); 25 | client = new PlaywrightClient(clientOpts.opts); 26 | break; 27 | default: 28 | throw new Error(`invalid client ${clientOpts.name}`); 29 | } 30 | 31 | return client; 32 | } 33 | -------------------------------------------------------------------------------- /src/export/CsvExporter.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import Resource, { ResourceQuery } from '../storage/base/Resource'; 3 | import Exporter, { ExportOptions } from './Exporter'; 4 | import { getLogger } from '../logger/Logger'; 5 | 6 | export type CsvExportOptions = ExportOptions & { 7 | fieldSeparator?: string; 8 | lineSeparator?: string; 9 | } 10 | 11 | /** Provides CSV export capabilities. */ 12 | export default class CsvExporter extends Exporter { 13 | logger = getLogger('CsvExporter'); 14 | 15 | opts: CsvExportOptions; 16 | 17 | wstream: fs.WriteStream; 18 | 19 | getResourceQuery(): Partial { 20 | return { whereNotNull: [ 'content' ], cols: [ 'url', 'content' ] }; 21 | } 22 | 23 | async preParse(): Promise { 24 | this.wstream = fs.createWriteStream(this.opts.filepath); 25 | 26 | // write csv header 27 | this.wstream.write([ 'url', ...this.getContentKeys() ].join(this.opts.fieldSeparator)); 28 | } 29 | 30 | async parse(resource: Partial): Promise { 31 | const { lineSeparator } = this.opts; 32 | const csvRows = this.resourceToCsvRows(resource); 33 | this.wstream.write(lineSeparator); 34 | this.wstream.write(csvRows.join(lineSeparator)); 35 | } 36 | 37 | async postParse() { 38 | this.wstream.close(); 39 | } 40 | 41 | getContentKeys(): string[] { 42 | return this.project.plugins 43 | .map(plugin => plugin.getContentKeys()) 44 | .find(contentKeys => contentKeys) 45 | || []; 46 | } 47 | 48 | resourceToCsvRows(resource: Partial): string[][] { 49 | const { url, content } = resource; 50 | 51 | const csvRows: string[][] = []; 52 | content.forEach(contentRowVal => { 53 | const csvRow = [ url ]; 54 | contentRowVal.forEach(contentColVal => { 55 | csvRow.push(this.getCsvVal(contentColVal)); 56 | }); 57 | csvRows.push(csvRow); 58 | }); 59 | 60 | // no content for current resource, add a [url] entry 61 | if (csvRows.length === 0) { 62 | csvRows.push([ url ]); 63 | } 64 | 65 | return csvRows; 66 | } 67 | 68 | getCsvVal(contentVal: string) { 69 | /* 70 | quotes handling 71 | RFC-4180 "If double-quotes are used to enclose fields, 72 | then a double-quote appearing inside a field must be escaped by preceding it with another double quote." 
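      e.g. (illustrative): he said "hi"  ->  "he said ""hi"""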
73 | */ 74 | if (contentVal === undefined) { 75 | return '""'; 76 | } 77 | 78 | if (typeof contentVal === 'string') { 79 | const quotedVal = contentVal.replace(/"/g, '""'); 80 | return `"${quotedVal}"`; 81 | } 82 | 83 | return contentVal; 84 | } 85 | 86 | getDefaultOptions(): Partial { 87 | return { 88 | fieldSeparator: ',', 89 | lineSeparator: '\n', 90 | pageLimit: 10000, 91 | }; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/export/Exporter.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | import { isAbsolute, join } from 'path'; 3 | import { LogWrapper } from '../logger/Logger'; 4 | import Project from '../storage/base/Project'; 5 | import Resource, { ResourceQuery } from '../storage/base/Resource'; 6 | import ConnectionManager from '../storage/ConnectionManager'; 7 | 8 | export type ExportOptions = { 9 | pageLimit?: number; 10 | filepath: string; 11 | } 12 | 13 | /** Scraped data exporters should extend this class. */ 14 | export default abstract class Exporter { 15 | logger: LogWrapper; 16 | 17 | project: Project; 18 | opts: ExportOptions; 19 | 20 | constructor(opts: ExportOptions) { 21 | this.opts = Object.assign(this.getDefaultOptions(), opts); 22 | this.opts.filepath = isAbsolute(opts.filepath) ? opts.filepath : join(process.cwd(), opts.filepath); 23 | } 24 | 25 | getPagedResources(offset: number, limit: number): Promise[]> { 26 | return this.project.getPagedResources({ ...this.getResourceQuery(), offset, limit }); 27 | } 28 | 29 | async export(project: Project) { 30 | let connManager: ConnectionManager; 31 | 32 | try { 33 | // use a separate db connection, scrape and export have different db lifecycles and may run in parallel 34 | connManager = ConnectionManager.clone(project); 35 | await connManager.connect(); 36 | const ExtProject = await connManager.getProject(); 37 | 38 | // retrieve the project from the currently active db connection 39 | this.project = await ExtProject.get(project.id); 40 | if (!this.project) { 41 | throw new Error(`could not find project ${project.name}`); 42 | } 43 | 44 | // need to init the plugins as one of the plugins may contain info related to the exported columns 45 | this.project.plugins = await this.project.initPlugins(true); 46 | 47 | let resources: Partial[]; 48 | const { pageLimit: limit } = this.opts; 49 | let offset = 0; 50 | 51 | do { 52 | resources = await this.getPagedResources(offset, limit); 53 | 54 | if (offset === 0) { 55 | if (resources.length === 0) { 56 | this.logger.warn('No content to export.'); 57 | break; 58 | } 59 | 60 | this.logger.info(`Exporting under ${this.opts.filepath} ...`); 61 | await this.preParse(); 62 | } 63 | 64 | // eslint-disable-next-line no-loop-func 65 | await Promise.all(resources.map((resource, idx) => this.parse(resource, offset + idx))); 66 | offset += limit; 67 | } 68 | while (resources.length > 0); 69 | 70 | if (offset > 0) { 71 | await this.postParse(); 72 | this.logger.info(`Exporting under ${this.opts.filepath} ... 
done`); 73 | } 74 | } 75 | catch (err) { 76 | this.logger.error(err, `error exporting using options ${JSON.stringify(this.opts)}`); 77 | } 78 | finally { 79 | if (connManager) { 80 | await connManager.close(); 81 | } 82 | } 83 | } 84 | 85 | getDefaultOptions(): Partial { 86 | return { 87 | pageLimit: 10000, 88 | }; 89 | } 90 | 91 | abstract getResourceQuery(): Partial; 92 | 93 | abstract preParse(): Promise; 94 | abstract parse(resource: Partial, resourceIdx: number): Promise; 95 | abstract postParse(): Promise; 96 | } 97 | -------------------------------------------------------------------------------- /src/export/MimeTypes.json: -------------------------------------------------------------------------------- 1 | { 2 | "audio/aac": "aac", 3 | "application/x-abiword": "abw", 4 | "application/x-freearc": "arc", 5 | "video/x-msvideo": "avi", 6 | "application/vnd.amazon.ebook": "azw", 7 | "application/octet-stream": "bin", 8 | "image/bmp": "bmp", 9 | "application/x-bzip": "bz", 10 | "application/x-bzip2": "bz2", 11 | "application/x-csh": "csh", 12 | "text/css": "css", 13 | "text/csv": "csv", 14 | "application/msword": "doc", 15 | "application/vnd.openxmlformats": "docx", 16 | "application/vnd.ms-fontobject": "eot", 17 | "application/epub+zip": "epub", 18 | "application/gzip": "gz", 19 | "image/gif": "gif", 20 | "text/html": "html", 21 | "image/vnd.microsoft.icon": "ico", 22 | "text/calendar": "ics", 23 | "application/java-archive": "jar", 24 | "image/jpeg": "jpg", 25 | "text/javascript": "js", 26 | "application/json": "json", 27 | "application/ld+json": "jsonld", 28 | "audio/midi": "midi", 29 | "audio/x-midi": "midi", 30 | "audio/mpeg": "mp3", 31 | "video/mpeg": "mpeg", 32 | "application/vnd.apple.installer+xml": "mpkg", 33 | "application/vnd.oasis.opendocument.presentation": "odp", 34 | "application/vnd.oasis.opendocument.spreadsheet": "ods", 35 | "application/vnd.oasis.opendocument.text": "odt", 36 | "audio/ogg": "oga", 37 | "video/ogg": "ogv", 38 | "application/ogg": "ogx", 39 | "audio/opus": "opus", 40 | "font/otf": "otf", 41 | "image/png": "png", 42 | "application/pdf": "pdf", 43 | "application/vnd.ms-powerpoint": "ppt", 44 | "officedocument.presentationml.presentation": "pptx", 45 | "application/vnd.rar": "rar", 46 | "application/rtf": "rtf", 47 | "application/x-sh": "sh", 48 | "image/svg+xml": "svg", 49 | "application/x-shockwave-flash": "swf", 50 | "application/x-tar": "tar", 51 | "image/tiff": "tiff", 52 | "video/mp2t": "ts", 53 | "font/ttf": "ttf", 54 | "text/plain": "txt", 55 | "application/vnd.visio": "vsd", 56 | "audio/wav": "wav", 57 | "audio/webm": "weba", 58 | "video/webm": "webm", 59 | "image/webp": "webp", 60 | "font/woff": "woff", 61 | "font/woff2": "woff2", 62 | "application/xhtml+xml": "xhtml", 63 | "application/vnd.ms-excel": "xls", 64 | "officedocument.spreadsheetml.sheet": "xlsx", 65 | "application/xml": "xml", 66 | "text/xml": "xml", 67 | "application/vnd.mozilla.xul+xml": "xul", 68 | "application/zip": "zip", 69 | "video/3gpp": "3gp", 70 | "audio/3gpp": "3gp", 71 | "video/3gpp2": "3g2", 72 | "audio/3gpp2": "3g2", 73 | "application/x-7z-compressed": "7z" 74 | } -------------------------------------------------------------------------------- /src/export/ZipExporter.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path, { parse } from 'path'; 3 | import JSZip from 'jszip'; 4 | import Exporter from './Exporter'; 5 | import Resource, { ResourceQuery } from '../storage/base/Resource'; 6 | 7 | import * 
as MimeTypes from './MimeTypes.json'; 8 | import { getLogger } from '../logger/Logger'; 9 | 10 | /** Provides ZIP export capabilities. */ 11 | export default class ZipExporter extends Exporter { 12 | logger = getLogger('ZipExporter'); 13 | 14 | zip: JSZip; 15 | zipIdx: number; 16 | 17 | getResourceQuery(): Partial { 18 | return { whereNotNull: [ 'data' ], cols: [ 'url', 'data', 'parent', 'contentType' ] }; 19 | } 20 | 21 | async preParse(): Promise { 22 | this.zipIdx = 0; 23 | } 24 | 25 | async parse(resource: Partial, idx: number): Promise { 26 | // for each bulk resource read do a separate archive 27 | if (idx % this.opts.pageLimit === 0) { 28 | // close the prev archive if present 29 | if (this.zip) { 30 | await this.writeZip(); 31 | this.zipIdx += 1; 32 | } 33 | 34 | // create a new archive 35 | this.zip = new JSZip(); 36 | } 37 | 38 | const name = `${this.getName(resource)}.${this.getExtension(resource)}`; 39 | this.zip.file(name, resource.data); 40 | } 41 | 42 | async postParse() { 43 | await this.writeZip(); 44 | } 45 | 46 | async writeZip() { 47 | const content = await this.zip.generateAsync({ 48 | type: 'uint8array', 49 | compression: 'STORE', 50 | }); 51 | fs.writeFileSync(this.getPath(), content); 52 | } 53 | 54 | getPath() { 55 | const { dir, name, ext } = parse(this.opts.filepath); 56 | 57 | const idxSuffix = this.zipIdx === 0 ? '' : `-${this.zipIdx}`; 58 | const zipPath = path.join(dir, `${name}${idxSuffix}${ext}`); 59 | return zipPath; 60 | } 61 | 62 | getName(resource: Partial): string { 63 | const nameParts: string[] = []; 64 | 65 | // get resource name from parent metadata 66 | if (resource.parent) { 67 | const { title, linkText, imgAlt } = resource.parent; 68 | nameParts.push(title); 69 | nameParts.push(linkText); 70 | nameParts.push(imgAlt); 71 | 72 | const nonEmptyNameParts = nameParts.filter(namePart => namePart); 73 | if (nonEmptyNameParts.length > 0) { 74 | return nonEmptyNameParts.map(namePart => namePart.substr(0, 100)).join('-'); 75 | } 76 | } 77 | 78 | // get resource name just from its url 79 | const nameMatch = /.+\/([^.?]+).*($|\?)/.exec(resource.url); 80 | if (nameMatch) { 81 | return nameMatch[1]; 82 | } 83 | 84 | // failsafe, just return the last part of url 85 | return resource.url.substr(-30); 86 | } 87 | 88 | getExtension(resource: Partial): string { 89 | // extension can be identified based on mime type 90 | if (MimeTypes[resource.contentType]) { 91 | return MimeTypes[resource.contentType]; 92 | } 93 | 94 | // extension can be identified based on regex against url 95 | // have at least 2 ".", one from domain, one from extension 96 | const extensionMatch = /\..+.\.([^.?]+)($|\?)/.exec(resource.url); 97 | if (extensionMatch) { 98 | return extensionMatch[1]; 99 | } 100 | 101 | // failed to find extension 102 | return 'unknown'; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/pipelines/BrowserStaticContentPipeline.ts: -------------------------------------------------------------------------------- 1 | import { Pipeline } from './pipelines'; 2 | 3 | const pipeline:Pipeline = { 4 | defaultPluginOpts: [ 5 | { 6 | name: 'BrowserFetchPlugin', 7 | }, 8 | { 9 | name: 'ExtractUrlsPlugin', 10 | }, 11 | { 12 | name: 'ExtractHtmlContentPlugin', 13 | }, 14 | { 15 | name: 'InsertResourcesPlugin', 16 | }, 17 | { 18 | name: 'UpsertResourcePlugin', 19 | }, 20 | ], 21 | 22 | }; 23 | 24 | export default pipeline; 25 | -------------------------------------------------------------------------------- 
/src/pipelines/DomStaticContentPipeline.ts: -------------------------------------------------------------------------------- 1 | import { Pipeline } from './pipelines'; 2 | 3 | const pipeline:Pipeline = { 4 | defaultPluginOpts: [ 5 | { 6 | name: 'NodeFetchPlugin', 7 | }, 8 | { 9 | name: 'ExtractUrlsPlugin', 10 | domRead: false, 11 | }, 12 | { 13 | name: 'ExtractHtmlContentPlugin', 14 | domRead: false, 15 | }, 16 | { 17 | name: 'InsertResourcesPlugin', 18 | }, 19 | { 20 | name: 'UpsertResourcePlugin', 21 | }, 22 | ], 23 | 24 | }; 25 | 26 | export default pipeline; 27 | -------------------------------------------------------------------------------- /src/pipelines/pipelines.ts: -------------------------------------------------------------------------------- 1 | import { PluginOpts } from '../plugins/Plugin'; 2 | import BrowserStaticContentPipeline from './BrowserStaticContentPipeline'; 3 | import DomStaticContentPipeline from './DomStaticContentPipeline'; 4 | 5 | export type Pipeline = { 6 | defaultPluginOpts:PluginOpts[]; 7 | } 8 | 9 | export type Pipelines = { 10 | [key: string] : Pipeline 11 | } 12 | 13 | /** 14 | * Built-in, predefined pipelines. 15 | * Each one defines a series of plugins with default options to be executed against each to be scraped resource. 16 | */ 17 | const pipelines: Pipelines = { 18 | 'browser-static-content': BrowserStaticContentPipeline, 19 | 'dom-static-content': DomStaticContentPipeline, 20 | }; 21 | 22 | /** 23 | * Takes starting default options and overrides them with custom ones. 24 | * @param defaultOpts - default starting options 25 | * @param customOpts - override options 26 | */ 27 | const mergePluginOpts = (defaultOpts: PluginOpts[], customOpts: PluginOpts[] = []):PluginOpts[] => { 28 | const mergeOpts:PluginOpts[] = [ ...defaultOpts ]; 29 | customOpts.forEach(pluginCustomOpts => { 30 | if (pluginCustomOpts.before) { 31 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.before); 32 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.before} as before anchor`); 33 | mergeOpts.splice(idx, 0, pluginCustomOpts); 34 | return; 35 | } 36 | 37 | if (pluginCustomOpts.replace) { 38 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.replace); 39 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.before} as replace anchor`); 40 | mergeOpts[idx] = pluginCustomOpts; 41 | return; 42 | } 43 | 44 | if (pluginCustomOpts.after) { 45 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.after); 46 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.before} as after anchor`); 47 | mergeOpts.splice(idx + 1, 0, pluginCustomOpts); 48 | return; 49 | } 50 | 51 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.name); 52 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.name} as merge anchor`); 53 | mergeOpts[idx] = { ...mergeOpts[idx], ...pluginCustomOpts }; 54 | }); 55 | 56 | return mergeOpts; 57 | }; 58 | 59 | export { 60 | mergePluginOpts, 61 | pipelines, 62 | }; 63 | -------------------------------------------------------------------------------- /src/plugins/Plugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-prototype-builtins */ 2 | /* eslint-disable no-param-reassign */ 3 | /* eslint-disable no-restricted-syntax */ 4 | import { 
JSONSchema7 } from 'json-schema'; 5 | import SchemaHelper from '../schema/SchemaHelper'; 6 | import Project from '../storage/base/Project'; 7 | import Resource from '../storage/base/Resource'; 8 | import BrowserClient from '../browserclient/BrowserClient'; 9 | import { IDomClientConstructor } from '../domclient/DomClient'; 10 | 11 | export type PluginOpts = { 12 | name: string; 13 | domRead?: boolean; 14 | domWrite?: boolean; 15 | [key: string]: unknown; 16 | 17 | // position options within a plugin list 18 | before?: string; 19 | replace?: string; 20 | after?: string; 21 | path?: string; 22 | } 23 | 24 | /** All plugins should extend this class implementing the test and apply methods. */ 25 | export default abstract class Plugin { 26 | static get schema() { 27 | return {}; 28 | } 29 | 30 | opts: Partial; 31 | 32 | constructor(opts: Partial = {}) { 33 | const { schema } = this.constructor; 34 | this.opts = SchemaHelper.instantiate(schema, opts); 35 | } 36 | 37 | /** 38 | * Relevant for a pipeline plugin responsible for actual content scraping. 39 | * @returns keys the scraped data will be exported under 40 | */ 41 | getContentKeys(): string[] { 42 | return undefined; 43 | } 44 | 45 | /** 46 | * Tests if the plugin should be executed or not against the current resource. 47 | * @param project - current scrape project 48 | * @param resource - current scrape resource 49 | */ 50 | abstract test(project: Project, resource: Resource): Promise | boolean; 51 | 52 | /** 53 | * Executes the plugin against the current resource, either in node.js or browser environment. 54 | * The result will be merged into the currently scraped resource at scraper level. 55 | * @param project - current scrape project 56 | * @param resource - current scrape resource 57 | * @param client - current browser client 58 | */ 59 | abstract apply(project: Project, resource: Resource, client: BrowserClient | IDomClientConstructor): Promise> | void | Partial; 60 | } 61 | 62 | export interface IPlugin { 63 | new(kwArgs: Partial): Plugin; 64 | schema: JSONSchema7; 65 | } 66 | -------------------------------------------------------------------------------- /src/plugins/default/BaseFetchPlugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable max-classes-per-file */ 2 | import { Project } from '../..'; 3 | import BrowserClient from '../../browserclient/BrowserClient'; 4 | import Resource from '../../storage/base/Resource'; 5 | import Plugin from '../Plugin'; 6 | 7 | export class FetchError extends Error { 8 | status: number; 9 | redirectUrl: string; 10 | 11 | constructor(status: number, redirectUrl?: string) { 12 | super(); 13 | this.status = status; 14 | this.redirectUrl = redirectUrl; 15 | } 16 | } 17 | 18 | export default abstract class BaseFetchPlugin extends Plugin { 19 | /** 20 | * check against 2xx codes and an optional list of allowed status 21 | * @param status response status code 22 | */ 23 | isValidStatus(status: number, allowedStatus: number[] = []) { 24 | return Math.floor(status / 100) === 2 || allowedStatus.includes(status); 25 | } 26 | 27 | /** 28 | * check against 3xx codes 29 | * @param status response status code 30 | */ 31 | isRedirectStatus(status: number) { 32 | return Math.floor(status / 100) === 3; 33 | } 34 | 35 | test(project: Project, resource: Resource) { 36 | if (!resource || !resource.url) return false; 37 | 38 | // only fetch a resource that hasn't been fetched yet 39 | if (resource.contentType) return false; 40 | 41 | // only http/https 
supported 42 | const { protocol } = new URL(resource.url); 43 | return protocol === 'http:' || protocol === 'https:'; 44 | } 45 | 46 | async apply(project: Project, resource: Resource, client: BrowserClient): Promise> { 47 | let result: Partial; 48 | 49 | try { 50 | result = await this.fetch(resource, client); 51 | } 52 | catch (err) { 53 | return this.fetchErrResult(err); 54 | } 55 | 56 | return result; 57 | } 58 | 59 | fetchErrResult(err: Error) { 60 | if (err instanceof FetchError) { 61 | const { status, redirectUrl } = err; 62 | /* 63 | redirect detected 64 | for the current resource return redirect status 65 | also add the final url as a new resource to be scraped 66 | don't return contentType as many plugin use it as testing condition and we don't want the original redirect url to be scraped 67 | */ 68 | if (this.isRedirectStatus(status)) { 69 | return { 70 | status, 71 | resourcesToAdd: [ { url: redirectUrl } ], 72 | }; 73 | } 74 | 75 | /* 76 | all other fetch errors 77 | don't return contentType as many plugins use it as testing condition and we don't want the original redirect url to be scraped 78 | */ 79 | return { 80 | status, 81 | }; 82 | } 83 | 84 | // errors not related to fetch status code 85 | throw err; 86 | } 87 | 88 | /** 89 | * Extract just the content type, not the full header value 90 | * @param rawContentType : like 'text/html; charset=UTF-8' 91 | */ 92 | getContentType(rawContentType: string): string { 93 | if (rawContentType) { 94 | const matchArr = rawContentType.match(/^[^;]+/); 95 | return matchArr ? matchArr[0] : null; 96 | } 97 | return null; 98 | } 99 | 100 | abstract fetch(resource: Resource, client?: BrowserClient, opts?: RequestInit); 101 | } 102 | -------------------------------------------------------------------------------- /src/plugins/default/InsertResourcesPlugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-param-reassign */ 2 | /* eslint-disable no-await-in-loop */ 3 | import Plugin from '../Plugin'; 4 | import Project from '../../storage/base/Project'; 5 | import Resource from '../../storage/base/Resource'; 6 | import { SchemaType } from '../../schema/SchemaHelper'; 7 | import { getLogger } from '../../logger/Logger'; 8 | 9 | /** Saves in database newly identified resources within the current project. */ 10 | export default class InsertResourcesPlugin extends Plugin { 11 | static get schema() { 12 | return { 13 | type: 'object', 14 | title: 'Insert Resources Plugin', 15 | description: 'saves new resources within the current project based on newly identified urls.', 16 | properties: { 17 | maxResources: { 18 | type: 'integer', 19 | default: -1, 20 | title: 'Max Resources', 21 | description: 'Maximum number of resources to be saved and scraped. A value of -1 disables this check.', 22 | }, 23 | }, 24 | } as const; 25 | } 26 | 27 | logger = getLogger('InsertResourcesPlugin'); 28 | 29 | opts: SchemaType; 30 | 31 | constructor(opts: SchemaType = {}) { 32 | super(opts); 33 | } 34 | 35 | test(project: Project, resource: Resource) { 36 | if (!resource) return false; 37 | 38 | // only save new urls if there's something to save 39 | return resource.resourcesToAdd && resource.resourcesToAdd.length > 0; 40 | } 41 | 42 | /** 43 | * Uses project.queue to INSERT to-be-scraped resources with IGNORE on 'url' CONFLICT. 
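   * When a maxResources threshold is set, only enough new urls are inserted to stay within it;
   * the remaining capacity is derived from the current queue count (see the threshold branch below).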
44 | */ 45 | async apply(project: Project, resource: Resource) { 46 | const { resourcesToAdd } = resource; 47 | 48 | this.logger.debug(resourcesToAdd, 'adding newly discovered resources'); 49 | 50 | // each 'child' resource has an increased 'depth' relative to its parent 51 | resourcesToAdd.forEach(resourceToAdd => { 52 | resourceToAdd.depth = resource.depth + 1; 53 | }); 54 | 55 | // a threshold is defined, take it into account 56 | if (this.opts.maxResources > 0) { 57 | const resourceCount = await project.queue.count(); 58 | const maxResourcesToAdd = Math.max(0, this.opts.maxResources - resourceCount); 59 | 60 | // add resources below the threshold 61 | if (maxResourcesToAdd > 0) { 62 | // inserting all resources doesn't exceed the threshold 63 | if (maxResourcesToAdd >= resourcesToAdd.length) { 64 | await project.queue.add(resourcesToAdd); 65 | } 66 | // inserting all resources exceeds the threshold, only insert a subset 67 | else { 68 | const toCheckUrls = resourcesToAdd.map(resourceToAdd => resourceToAdd.url); 69 | const newUrls = await project.queue.filterNewUrls(toCheckUrls); 70 | let newResourcesNotInStorage = resourcesToAdd.filter(resourceToAdd => newUrls.includes(resourceToAdd.url)); 71 | 72 | if (newResourcesNotInStorage.length > 0) { 73 | newResourcesNotInStorage = newResourcesNotInStorage.slice(0, Math.min(maxResourcesToAdd, newResourcesNotInStorage.length)); 74 | } 75 | await project.queue.add(newResourcesNotInStorage); 76 | } 77 | } 78 | } 79 | // no threshold, insert all resources 80 | else { 81 | await project.queue.add(resourcesToAdd); 82 | } 83 | 84 | return { resourcesToAdd: null }; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/plugins/default/UpsertResourcePlugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | /* eslint-disable no-param-reassign */ 3 | import Plugin from '../Plugin'; 4 | import Project from '../../storage/base/Project'; 5 | import Resource from '../../storage/base/Resource'; 6 | import { SchemaType } from '../../schema/SchemaHelper'; 7 | import { getLogger } from '../../logger/Logger'; 8 | 9 | /** Updates a resource in the database after its scraping completes. 
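 * Unless the resource was produced by dynamic browser actions, the matching queue entry status is updated as well.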
*/ 10 | export default class UpsertResourcePlugin extends Plugin { 11 | static get schema() { 12 | return { 13 | type: 'object', 14 | title: 'Upsert Resource Plugin', 15 | description: 'updates a static resource or inserts a dynamic one after scraping it.', 16 | properties: { 17 | keepHtmlData: { 18 | type: 'boolean', 19 | default: false, 20 | title: 'Keep Html Data', 21 | description: 'Whether or not to save html buffer response (if present) under resource.data', 22 | }, 23 | }, 24 | } as const; 25 | } 26 | 27 | logger = getLogger('UpsertResourcePlugin'); 28 | opts: SchemaType; 29 | 30 | constructor(opts: SchemaType = {}) { 31 | super(opts); 32 | } 33 | 34 | test(project: Project, resource: Resource) { 35 | return !!(resource); 36 | } 37 | 38 | async apply(project: Project, resource: Resource) { 39 | // guard against incomplete resources not capable of updating the scrape queue 40 | if (!resource.status || !resource.queueEntryId) { 41 | throw new Error('incomplete resource'); 42 | } 43 | 44 | /* 45 | scrape complete, update queue entry, save scraped resource 46 | a resource generated from dynamic actions doesn't update the corresponding queue entry, it has already been updated by the `parent` static resource 47 | 48 | at some point, treat differently: 49 | - scraped in error resources: don't add them to the resource table as they don't contain succesfull scraped content 50 | */ 51 | if (!resource.actions) { 52 | await Promise.all([ 53 | this.saveResource(resource), 54 | project.queue.updateStatus(resource.queueEntryId, resource.status), 55 | ]); 56 | } 57 | else { 58 | await this.saveResource(resource); 59 | } 60 | 61 | /* 62 | after a resource is updated, remove its dynamic actions 63 | this allows for other dynamic plugins to be triggered 64 | */ 65 | return { actions: null }; 66 | } 67 | 68 | async saveResource(resource: Resource) { 69 | // scrape complete, remove inProgress flag, set scrape date 70 | resource.scrapedAt = new Date(Date.now()); 71 | 72 | // only save html response under resource.data (Uint8Array) if the corresponding flag is set 73 | if (!this.opts.keepHtmlData && (/html/i).test(resource.contentType) && resource.data) { 74 | resource.data = null; 75 | } 76 | 77 | await resource.save(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/plugins/dom-utils.ts: -------------------------------------------------------------------------------- 1 | export const enum DomStabilityStatus { 2 | Stable, 3 | Unstable, 4 | Unchanged 5 | } 6 | 7 | /** 8 | * Useful for bypassing preloader content. 9 | * @param stabilityCheck - Considers the page loaded and ready to be scraped when there are no more DOM changes within the specified amount of time (milliseconds). 10 | * @param stabilityTimeout - Maximum waiting time (miliseconds) for achieving DOM stability in case of a continuously updated DOM (ex: timers, countdowns). 11 | */ 12 | function waitForDomStability({ stabilityCheck, stabilityTimeout }:{stabilityCheck: number, stabilityTimeout: number}):Promise { 13 | return new Promise(resolve => { 14 | let stabilityCheckId:number; 15 | let stabilityTimeoutId:number; 16 | let domChanged = false; 17 | 18 | // if this is reached, DOM is stable 19 | const waitStableResolve = observer => { 20 | window.clearTimeout(stabilityTimeoutId); 21 | observer.disconnect(); 22 | resolve(domChanged ? 
DomStabilityStatus.Stable : DomStabilityStatus.Unchanged); 23 | }; 24 | 25 | const observer = new MutationObserver((mutationList, observer) => { 26 | for (let i = 0; i < mutationList.length; i += 1) { 27 | // we only care if new nodes have been added 28 | if (mutationList[i].type === 'childList') { 29 | // restart the stabilityCheck timer 30 | domChanged = true; 31 | window.clearTimeout(stabilityCheckId); 32 | stabilityCheckId = window.setTimeout(waitStableResolve, stabilityCheck, observer); 33 | break; 34 | } 35 | } 36 | }); 37 | 38 | // start stability check countdown 39 | stabilityCheckId = window.setTimeout(waitStableResolve, stabilityCheck, observer); 40 | 41 | // start observing document.body 42 | observer.observe(document.body, { attributes: true, childList: true, subtree: true }); 43 | 44 | // enforce stability timeout 45 | stabilityTimeoutId = window.setTimeout( 46 | () => { 47 | // clear in progress stability check 48 | window.clearTimeout(stabilityCheckId); 49 | 50 | observer.disconnect(); 51 | resolve(DomStabilityStatus.Unstable); 52 | }, 53 | stabilityTimeout, 54 | ); 55 | }); 56 | } 57 | 58 | export { 59 | waitForDomStability, 60 | }; 61 | -------------------------------------------------------------------------------- /src/plugins/file-utils.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path from 'path'; 3 | 4 | /** 5 | * Get closest parent dir containing a package.json file 6 | */ 7 | // eslint-disable-next-line import/prefer-default-export 8 | export function getPackageDir(startPath: string):string { 9 | const startDirPath = path.dirname(startPath); 10 | const parentPath: string[] = []; 11 | while (!fs.existsSync(path.join(startDirPath, ...parentPath, 'package.json')) || parentPath.length > 10) { 12 | parentPath.push('..'); 13 | } 14 | 15 | return path.join(startDirPath, ...parentPath); 16 | } 17 | 18 | export function moduleExists(name) { 19 | try { 20 | return require.resolve(name); 21 | } 22 | catch (e) { 23 | return false; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/plugins/url-utils.ts: -------------------------------------------------------------------------------- 1 | export const enum Protocol { 2 | HTTPS = 'https:', HTTP = 'http:' 3 | } 4 | 5 | /** 6 | * URL normalization including adding protocol prefix if missing. 7 | * Mostly used in batch insert jobs. 8 | * @param rawUrl - input url 9 | * @param defaultProtocol - protocol to add if one is not present, defaults to https 10 | * @throws error on invalid urls 11 | * @returns normalized url 12 | */ 13 | export function normalizeUrl(rawUrl: string, defaultProtocol:string = Protocol.HTTPS):string { 14 | if (!this.isURL(rawUrl)) throw new Error(`error normalizing url: ${rawUrl}`); 15 | 16 | // if protocol is missing, add default one 17 | const fullUrl = rawUrl.split('//').length === 1 ? `${defaultProtocol}//${rawUrl}` : rawUrl; 18 | return new URL(fullUrl).toString(); 19 | } 20 | 21 | /** 22 | * Identify the csv column containing an url 23 | * @param csvRow - csv row with columns separated by ',' 24 | */ 25 | export function getUrlColIdx(csvRow: string):number { 26 | const urlIdx = csvRow.split(',').map(col => col.trim()).findIndex(col => this.isURL(col)); 27 | if (urlIdx === -1) throw new Error(`could not detect url column from ${csvRow}`); 28 | return urlIdx; 29 | } 30 | 31 | /** 32 | * Check if a url is valid based on regex. Protocol prefix is optional. 
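 * e.g. (illustrative) isURL('www.example.com') === true, isURL('https://example.com/page') === true, isURL('no-dot-here') === false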
33 | * @param url - input candidate 34 | * @returns - whether or not the input url is valid 35 | */ 36 | export function isURL(url: string):boolean { 37 | return /([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}/.test(url.toLowerCase()); 38 | } 39 | -------------------------------------------------------------------------------- /src/scraper/QueueBuffer.ts: -------------------------------------------------------------------------------- 1 | import Project from '../storage/base/Project'; 2 | import Resource from '../storage/base/Resource'; 3 | 4 | export default class QueueBuffer { 5 | project: Project; 6 | resources: Resource[]; 7 | size: number; 8 | 9 | /** 10 | * Prevents parallel project.queue.getResourcesToScrape calls 11 | */ 12 | refillInProgress:boolean; 13 | 14 | error; 15 | 16 | constructor(size: number) { 17 | this.size = size; 18 | this.resources = []; 19 | } 20 | 21 | init(project, resources: Resource[]) { 22 | this.project = project; 23 | this.resources = resources; 24 | } 25 | 26 | async refill():Promise { 27 | // buffer is only filled sequentially 28 | if (this.refillInProgress) return; 29 | 30 | try { 31 | this.refillInProgress = true; 32 | const toBeScrapedResources = await this.project.queue.getResourcesToScrape(this.size - this.resources.length); 33 | this.addResources(toBeScrapedResources); 34 | this.refillInProgress = false; 35 | } 36 | catch (err) { 37 | // parent call doesn't wait for this async to finish thus can't catch it, store err separately 38 | this.error = err; 39 | } 40 | } 41 | 42 | async getResource(stop:boolean):Promise { 43 | /* 44 | stop signal has been received 45 | gracefully stop scraping, allow all scrape-in-progress resources to be scraped 46 | */ 47 | if (stop) { 48 | if (this.resources.length > 0) { 49 | // re-make to-be-scraped buffered resources eligible for scraping by reseting their status flag 50 | await Promise.all(this.resources.map(resource => this.project.queue.updateStatus(resource.queueEntryId, null))); 51 | this.resources = []; 52 | } 53 | 54 | return null; 55 | } 56 | 57 | /* 58 | stop signal was received due to buffer error in an independent async call, throw the error up 59 | parent scraper will catch any errors and stop the process via the `stop` flag 60 | */ 61 | if (this.error) throw (this.error); 62 | 63 | // attemp to re-fill buffer before it's completely empty 64 | if (this.resources.length < this.size / 2) { 65 | // buffer needs to be refilled now, can't refill it independently, we risk isScrapingComplete condition to pass 66 | if (this.resources.length === 0) { 67 | await this.refill(); 68 | // take advantage of waiting for refillBuffer, directly thrown the error if one was caught 69 | // avoid isScrapingComplete returning true on empty buffer due to refillBuffer error 70 | if (this.error) throw (this.error); 71 | } 72 | // refill buffer independently 73 | else { 74 | this.refill(); 75 | } 76 | } 77 | 78 | // in the future, don't just retrieve the 1st resource, attempt to search for one meeting the concurrency conditions 79 | return this.resources.length > 0 ? 
this.resources.shift() : null; 80 | } 81 | 82 | addResources(resources: Resource[]) { 83 | this.resources.push(...resources); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/storage/base/Connection.ts: -------------------------------------------------------------------------------- 1 | import { IProjectStorage } from './Project'; 2 | import { IQueueStorage } from './Queue'; 3 | import { IResourceStorage } from './Resource'; 4 | 5 | export type ConnectionConfig = { 6 | client: string, 7 | [key: string]: any 8 | } 9 | 10 | export default abstract class Connection { 11 | config: ConnectionConfig; 12 | 13 | constructor(config:ConnectionConfig) { 14 | this.config = config; 15 | } 16 | 17 | abstract open():Promise; 18 | abstract close():Promise; 19 | 20 | abstract getProjectStorage():IProjectStorage; 21 | abstract getQueueStorage():IQueueStorage; 22 | abstract getResourceStorage():IResourceStorage; 23 | } 24 | -------------------------------------------------------------------------------- /src/storage/base/Entity.ts: -------------------------------------------------------------------------------- 1 | /** Base class for all entities. */ 2 | export default abstract class Entity { 3 | id: string | number; 4 | 5 | abstract save():Promise; 6 | abstract del():Promise; 7 | abstract toJSON(); 8 | 9 | abstract get dbCols(): string[]; 10 | 11 | constructor(kwArgs: Partial) { 12 | Object.keys(kwArgs).forEach(kwArgKey => { 13 | this[kwArgKey] = kwArgs[kwArgKey]; 14 | }); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/storage/base/Storage.ts: -------------------------------------------------------------------------------- 1 | import Connection from './Connection'; 2 | 3 | /** 4 | * Each storage option (db, in-memory) extends this class. 
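 * e.g. the knex-backed storages (KnexProject, KnexQueue, KnexResource) are each constructed with the same KnexConnection instance.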
5 | */ 6 | export default abstract class Storage { 7 | conn: Connection; 8 | 9 | constructor(conn:Connection) { 10 | this.conn = conn; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/storage/knex/KnexConnection.ts: -------------------------------------------------------------------------------- 1 | import { Knex, knex } from 'knex'; 2 | import Connection, { ConnectionConfig } from '../base/Connection'; 3 | import { IProjectStorage } from '../base/Project'; 4 | import { IQueueStorage } from '../base/Queue'; 5 | import { IResourceStorage } from '../base/Resource'; 6 | import KnexProject from './KnexProject'; 7 | import KnexQueue from './KnexQueue'; 8 | import KnexResource from './KnexResource'; 9 | 10 | export default class KnexConnection extends Connection { 11 | knex: Knex; 12 | config: Knex.Config & {client: string}; 13 | 14 | constructor(config?:ConnectionConfig) { 15 | // if no config present, use in memory sqlite 16 | super(config || { 17 | client: 'sqlite3', 18 | useNullAsDefault: true, 19 | connection: { 20 | filename: ':memory:', 21 | }, 22 | debug: false, 23 | }); 24 | } 25 | 26 | async open() { 27 | if (!this.knex) { 28 | this.knex = knex(this.config); 29 | } 30 | } 31 | 32 | async close():Promise { 33 | if (this.knex) { 34 | const { knex } = this; 35 | delete this.knex; 36 | await knex.destroy(); 37 | } 38 | } 39 | 40 | getProjectStorage():IProjectStorage { 41 | return new KnexProject(this); 42 | } 43 | 44 | getQueueStorage():IQueueStorage { 45 | return new KnexQueue(this); 46 | } 47 | 48 | getResourceStorage():IResourceStorage { 49 | return new KnexResource(this); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/storage/knex/KnexProject.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | import Project, { IProjectStorage } from '../base/Project'; 3 | import KnexStorage from './KnexStorage'; 4 | 5 | export default class KnexProject extends KnexStorage implements IProjectStorage { 6 | get builder() { 7 | return this.knex('projects'); 8 | } 9 | 10 | async init():Promise { 11 | const schemaBuilder = this.knex.schema; 12 | const tablePresent = await schemaBuilder.hasTable('projects'); 13 | if (tablePresent) return; 14 | 15 | await schemaBuilder.createTable( 16 | 'projects', 17 | builder => { 18 | builder.increments('id').primary(); 19 | builder.string('name').unique(); 20 | 21 | this.jsonCol(builder, 'pluginOpts'); 22 | }, 23 | ); 24 | } 25 | 26 | async get(nameOrId: number | string) { 27 | const colName = Number.isInteger(nameOrId) ? 
'id' : 'name'; 28 | return this.builder.where({ [colName]: nameOrId }).first(); 29 | } 30 | 31 | save(project:Project):Promise { 32 | return super.save(project); 33 | } 34 | 35 | update(project: Project):Promise { 36 | return this.builder.where('id', project.id).update(this.toJSON(project)); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/storage/knex/KnexResource.ts: -------------------------------------------------------------------------------- 1 | import Project from '../base/Project'; 2 | import Resource, { ResourceQuery, IResourceStorage } from '../base/Resource'; 3 | import KnexStorage from './KnexStorage'; 4 | 5 | /** @see {@link Resource} */ 6 | export default class KnexResource extends KnexStorage implements IResourceStorage { 7 | projectId: string | number; 8 | 9 | get tableName():string { 10 | if (!this.projectId) throw new Error('projectId not set'); 11 | return `${this.projectId}-resources`; 12 | } 13 | 14 | get builder() { 15 | return this.knex(this.tableName); 16 | } 17 | 18 | async init(project:Project):Promise { 19 | if (!project.id) throw new Error('project.id not set'); 20 | this.projectId = project.id; 21 | 22 | const schemaBuilder = this.knex.schema; 23 | const tablePresent = await schemaBuilder.hasTable(this.tableName); 24 | if (tablePresent) return; 25 | 26 | await schemaBuilder.createTable( 27 | this.tableName, 28 | builder => { 29 | builder.increments('id').primary(); 30 | builder.string('url'); 31 | builder.integer('depth').defaultTo(0); 32 | builder.dateTime('scrapedAt'); 33 | 34 | builder.integer('status'); 35 | builder.string('contentType'); 36 | 37 | this.jsonCol(builder, 'content'); 38 | this.jsonCol(builder, 'parent'); 39 | this.jsonCol(builder, 'actions'); 40 | 41 | this.binaryCol(builder, 'data'); 42 | }, 43 | ); 44 | } 45 | 46 | async getPagedResources(query: Partial):Promise[]> { 47 | const { cols, where, whereNotNull, whereIn, offset, limit } = query; 48 | 49 | let queryBuilder = this.builder.select(cols || [ 'url', 'content' ]).orderBy('id'); 50 | 51 | if (where && Object.keys(where).length > 0) { 52 | queryBuilder = queryBuilder.where(where); 53 | } 54 | if (offset !== undefined) { 55 | queryBuilder = queryBuilder.offset(offset); 56 | } 57 | if (limit !== undefined) { 58 | queryBuilder = queryBuilder.limit(limit); 59 | } 60 | if (whereNotNull) { 61 | whereNotNull.forEach(notNullCol => { 62 | queryBuilder = queryBuilder.whereNotNull(notNullCol); 63 | }); 64 | } 65 | if (whereIn) { 66 | Object.keys(whereIn).forEach(key => { 67 | queryBuilder = queryBuilder.whereIn(key, whereIn[key]); 68 | }); 69 | } 70 | 71 | return queryBuilder; 72 | } 73 | 74 | getResource(url: string):Promise { 75 | return this.builder.where({ url }).first(); 76 | } 77 | 78 | delAll():Promise { 79 | return this.builder.del(); 80 | } 81 | 82 | drop() { 83 | return this.knex.schema.dropTable(this.tableName); 84 | } 85 | 86 | count():Promise { 87 | return super.count(this.tableName); 88 | } 89 | 90 | save(resource):Promise { 91 | return super.save(resource); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /test/.mocharc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | diff: true, 3 | recursive: true, 4 | extension: ['ts'], 5 | package: './package.json', 6 | reporter: 'spec', 7 | timeout: 55000, 8 | file: ['./test/utils/ts-node-config.js', './test/utils/shims.js'] 9 | }; 
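// note: extension ['ts'] plus the ./test/utils/ts-node-config.js preload (presumably registering ts-node) let mocha pick up the TypeScript suites directly;
// the generous 55s timeout leaves headroom for the slower browser-based acceptance runs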
-------------------------------------------------------------------------------- /test/acceptance/cheerio.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import CheerioClient from '../../src/domclient/CheerioClient'; 8 | import { ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 9 | import { PluginOpts } from '../../src'; 10 | 11 | const conn:Connection[] = [ 12 | new KnexConnection(sqliteConn), 13 | new KnexConnection(mysqlConn), 14 | new KnexConnection(pgConn), 15 | ]; 16 | 17 | const concurrencyOptions:ConcurrencyOptions[] = [ 18 | { 19 | proxyPool: [ { 20 | host: '127.0.0.1', 21 | port: 8080, 22 | } ], 23 | }, 24 | { 25 | proxy: { 26 | maxRequests: 10, 27 | delay: 100, 28 | }, 29 | domain: { 30 | maxRequests: 10, 31 | delay: 100, 32 | }, 33 | proxyPool: [ { 34 | host: '127.0.0.1', 35 | port: 8080, 36 | } ], 37 | }, 38 | ]; 39 | 40 | const pluginOptions: PluginOpts[][] = [ 41 | [ 42 | { 43 | name: 'NodeFetchPlugin', 44 | headers: { 45 | 'Accept-Encoding': 'br,gzip,deflate', 46 | }, 47 | }, 48 | ], 49 | [ 50 | { 51 | name: 'NodeFetchPlugin', 52 | headers: { 53 | 'Accept-Encoding': 'identity', 54 | }, 55 | }, 56 | ], 57 | ]; 58 | 59 | for (let i = 0; i < conn.length; i += 1) { 60 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 61 | /* 62 | only when using cheerio 63 | for parallel scraping, fetch resources both compressed and uncompressed 64 | sequential scraping will fetch using default headers (accepting gzip) like the other acceptance suites 65 | */ 66 | const nodeScrapeWithCustomHeaders = concurrencyOptions[j].proxy && concurrencyOptions[j].proxy.maxRequests > 1; 67 | 68 | if (nodeScrapeWithCustomHeaders) { 69 | for (let k = 0; k < pluginOptions.length; k += 1) { 70 | acceptanceSuite( 71 | 'dom-static-content', 72 | conn[i], 73 | CheerioClient, 74 | concurrencyOptions[j], 75 | pluginOptions[k], 76 | ); 77 | } 78 | } 79 | else { 80 | acceptanceSuite( 81 | 'dom-static-content', 82 | conn[i], 83 | CheerioClient, 84 | concurrencyOptions[j], 85 | ); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-single-page-single-content-entry-custom-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | } 8 | }, 9 | "client": { 10 | "name": "cheerio" 11 | }, 12 | "project": { 13 | "name": "sitea.com", 14 | "pipeline": "dom-static-content", 15 | "pluginOpts": [ 16 | { 17 | "name": "NodeFetchPlugin", 18 | "headers": { 19 | "Accept-Encoding": "identity" 20 | } 21 | }, 22 | { 23 | "name": "H1CounterPlugin", 24 | "path": "../plugins/h1-counter-plugin.js", 25 | "replace": "ExtractHtmlContentPlugin", 26 | "startVal": 50 27 | } 28 | ], 29 | "resources": [ 30 | { 31 | "url": "http://sitea.com/index.html" 32 | } 33 | ] 34 | }, 35 | "concurrency": { 36 | "proxy": { 37 | "maxRequests": 10, 38 | "delay": 100 39 | }, 40 | "domain": { 41 | "maxRequests": 10, 42 | "delay": 100 43 | 
}, 44 | "proxyPool": [ { 45 | "host": "127.0.0.1", 46 | "port": 8080 47 | } ] 48 | }, 49 | "process": { 50 | } 51 | } -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-single-page-single-content-entry.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "sitea.com", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "NodeFetchPlugin", 19 | "headers": { 20 | "Accept-Encoding": "identity" 21 | } 22 | }, 23 | { 24 | "name": "ExtractHtmlContentPlugin", 25 | "selectorPairs": [ 26 | { 27 | "contentSelector": "h1" 28 | } 29 | ] 30 | } 31 | ], 32 | "resources": [ 33 | { 34 | "url": "http://sitea.com/index.html" 35 | } 36 | ] 37 | }, 38 | "concurrency": { 39 | "proxy": { 40 | "maxRequests": 10, 41 | "delay": 100 42 | }, 43 | "domain": { 44 | "maxRequests": 10, 45 | "delay": 100 46 | }, 47 | "proxyPool": [ { 48 | "host": "127.0.0.1", 49 | "port": 8080 50 | } ] 51 | }, 52 | "process": { 53 | } 54 | } -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-with-external-resources.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | } 8 | }, 9 | "client": { 10 | "name": "cheerio" 11 | }, 12 | "project": { 13 | "name": "sitea.com", 14 | "pipeline": "dom-static-content", 15 | "pluginOpts": [ 16 | { 17 | "name": "NodeFetchPlugin", 18 | "headers": { 19 | "Accept-Encoding": "identity" 20 | } 21 | }, 22 | { 23 | "name": "ExtractHtmlContentPlugin", 24 | "selectorPairs": [ 25 | { 26 | "contentSelector": "h1" 27 | } 28 | ] 29 | } 30 | ], 31 | "resourcePath": "../resources/resources.csv", 32 | "resources": [ 33 | { 34 | "url": "http://sitea.com/index.html" 35 | } 36 | ] 37 | }, 38 | "concurrency": { 39 | "proxy": { 40 | "maxRequests": 10, 41 | "delay": 100 42 | }, 43 | "domain": { 44 | "maxRequests": 10, 45 | "delay": 100 46 | }, 47 | "proxyPool": [ { 48 | "host": "127.0.0.1", 49 | "port": 8080 50 | } ] 51 | }, 52 | "process": { 53 | } 54 | } -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-with-invalid-external-resources.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | } 8 | }, 9 | "client": { 10 | "name": "cheerio" 11 | }, 12 | "project": { 13 | "name": "sitea.com", 14 | "pipeline": "dom-static-content", 15 | "pluginOpts": [ 16 | { 17 | "name": "NodeFetchPlugin", 18 | "headers": { 19 | "Accept-Encoding": "identity" 20 | } 21 | }, 22 | { 23 | "name": "ExtractHtmlContentPlugin", 24 | "selectorPairs": [ 25 | { 26 | "contentSelector": "h1" 27 | } 28 | ] 29 | } 30 | ], 31 | "resourcePath": "non-existent-resources.csv" 32 | }, 33 | "concurrency": { 34 | "proxy": { 35 | "maxRequests": 10, 36 | "delay": 100 37 | }, 38 | "domain": { 39 | "maxRequests": 10, 40 | "delay": 100 41 | }, 42 | "proxyPool": [ { 43 | "host": "127.0.0.1", 44 | "port": 8080 45 | } ] 46 | }, 47 | 
"process": { 48 | } 49 | } -------------------------------------------------------------------------------- /test/acceptance/cli/plugins/h1-counter-plugin.js: -------------------------------------------------------------------------------- 1 | class H1CounterPlugin { 2 | opts = { 3 | startVal: 10, 4 | } 5 | 6 | // defines csv export columns 7 | getContentKeys() { 8 | return [ 'h1', 'h1Length' ]; 9 | } 10 | 11 | test(project, resource) { 12 | if (!resource) return false; 13 | return (/html/i).test(resource.contentType); 14 | } 15 | 16 | apply(project, resource, DomClient) { 17 | const doc = new DomClient(resource.data); 18 | 19 | const content = doc.querySelectorAll('h1').map(domNode => ([ 20 | domNode.getAttribute('innerText'), 21 | domNode.getAttribute('innerText').length + this.opts.startVal, 22 | ])); 23 | 24 | /* 25 | a content entry is represented by an array containing one or multiple scraped values 26 | we can have multiple content entries for a single resources due to dom selectors returning multiple results 27 | */ 28 | return { content }; 29 | } 30 | } 31 | 32 | module.exports = H1CounterPlugin; 33 | -------------------------------------------------------------------------------- /test/acceptance/cli/resources/resources-single-entry.csv: -------------------------------------------------------------------------------- 1 | 1,http://siteA.com/other1.html -------------------------------------------------------------------------------- /test/acceptance/cli/resources/resources.csv: -------------------------------------------------------------------------------- 1 | 1,http://sitea.com/other1.html 2 | 2,http://sitea.com/other2.html 3 | 3,http://sitea.com/other3.html -------------------------------------------------------------------------------- /test/acceptance/cli/resources/unnormalized-resources.csv: -------------------------------------------------------------------------------- 1 | 1,http://siteA.com/other1.html 2 | 2,http://siteA.com/other2.html 3 | 3,invalid-url -------------------------------------------------------------------------------- /test/acceptance/docker/config/base-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "myProj", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractHtmlContentPlugin", 19 | "selectorPairs": [ 20 | { 21 | "contentSelector": "h1" 22 | } 23 | ] 24 | } 25 | ], 26 | "resources": [ 27 | { 28 | "url": "http://sitea.com/index.html" 29 | } 30 | ] 31 | }, 32 | "concurrency": { 33 | "proxyPool": [ 34 | { 35 | "host": "127.0.0.1", 36 | "port": 8080 37 | } 38 | ] 39 | } 40 | } -------------------------------------------------------------------------------- /test/acceptance/jsdom.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import JsdomClient from '../../src/domclient/JsdomClient'; 8 | import { 
ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 9 | 10 | const conn:Connection[] = [ 11 | new KnexConnection(sqliteConn), 12 | new KnexConnection(mysqlConn), 13 | new KnexConnection(pgConn), 14 | ]; 15 | 16 | const concurrencyOptions:ConcurrencyOptions[] = [ 17 | { 18 | proxyPool: [ { 19 | host: '127.0.0.1', 20 | port: 8080, 21 | } ], 22 | }, 23 | { 24 | proxy: { 25 | maxRequests: 10, 26 | delay: 100, 27 | }, 28 | domain: { 29 | maxRequests: 10, 30 | delay: 100, 31 | }, 32 | proxyPool: [ { 33 | host: '127.0.0.1', 34 | port: 8080, 35 | } ], 36 | }, 37 | ]; 38 | 39 | for (let i = 0; i < conn.length; i += 1) { 40 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 41 | acceptanceSuite( 42 | 'dom-static-content', 43 | conn[i], 44 | JsdomClient, 45 | concurrencyOptions[j], 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /test/acceptance/playwright_chromium.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import * as playwrightChromium from '../config/browserclient/playwright/playwright-chromium.json'; 8 | import PlaywrightClient from '../../src/browserclient/PlaywrightClient'; 9 | import { ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 10 | 11 | const browserClient = new PlaywrightClient(playwrightChromium); 12 | 13 | const conn:Connection[] = [ 14 | new KnexConnection(sqliteConn), 15 | new KnexConnection(mysqlConn), 16 | new KnexConnection(pgConn), 17 | ]; 18 | 19 | const concurrencyOptions:ConcurrencyOptions[] = [ 20 | { 21 | proxyPool: [ { 22 | host: '127.0.0.1', 23 | port: 8080, 24 | } ], 25 | }, 26 | { 27 | proxy: { 28 | maxRequests: 10, 29 | delay: 100, 30 | }, 31 | domain: { 32 | maxRequests: 10, 33 | delay: 100, 34 | }, 35 | proxyPool: [ { 36 | host: '127.0.0.1', 37 | port: 8080, 38 | } ], 39 | }, 40 | ]; 41 | 42 | for (let i = 0; i < conn.length; i += 1) { 43 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 44 | acceptanceSuite( 45 | 'browser-static-content', 46 | conn[i], 47 | browserClient, 48 | concurrencyOptions[j], 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /test/acceptance/puppeteer_chromium.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import * as puppeteerChromium from '../config/browserclient/puppeteer/puppeteer-chromium.json'; 8 | import PuppeteerClient from '../../src/browserclient/PuppeteerClient'; 9 | import { ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 10 | 11 | const browserClient = new PuppeteerClient(puppeteerChromium); 12 | 13 | const conn:Connection[] = [ 14 | new KnexConnection(sqliteConn), 
15 | new KnexConnection(mysqlConn), 16 | new KnexConnection(pgConn), 17 | ]; 18 | 19 | const concurrencyOptions:ConcurrencyOptions[] = [ 20 | { 21 | proxyPool: [ { 22 | host: '127.0.0.1', 23 | port: 8080, 24 | } ], 25 | }, 26 | { 27 | proxy: { 28 | maxRequests: 10, 29 | delay: 100, 30 | }, 31 | domain: { 32 | maxRequests: 10, 33 | delay: 100, 34 | }, 35 | proxyPool: [ { 36 | host: '127.0.0.1', 37 | port: 8080, 38 | } ], 39 | }, 40 | ]; 41 | 42 | for (let i = 0; i < conn.length; i += 1) { 43 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 44 | acceptanceSuite( 45 | 'browser-static-content', 46 | conn[i], 47 | browserClient, 48 | concurrencyOptions[j], 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /test/config/browserclient/playwright/playwright-chromium.json: -------------------------------------------------------------------------------- 1 | { 2 | "browser": "chromium", 3 | "headless": true, 4 | "ignoreHTTPSErrors": true, 5 | "slowMo": 20, 6 | "args": [ 7 | "--host-rules=MAP *:80 127.0.0.1:8080, MAP *:443 127.0.0.1:8443", 8 | "--ignore-certificate-errors", 9 | 10 | "--disable-gpu", 11 | "--disable-dev-shm-usage", 12 | "--disable-setuid-sandbox", 13 | "--no-first-run", 14 | "--no-sandbox", 15 | "--no-zygote", 16 | "--single-process" 17 | ] 18 | } -------------------------------------------------------------------------------- /test/config/browserclient/puppeteer/puppeteer-chromium.json: -------------------------------------------------------------------------------- 1 | { 2 | "browser": "chromium", 3 | "headless": true, 4 | "ignoreHTTPSErrors": true, 5 | "slowMo": 20, 6 | "args": [ 7 | "--host-rules=MAP *:80 127.0.0.1:8080, MAP *:443 127.0.0.1:8443", 8 | "--ignore-certificate-errors", 9 | 10 | "--disable-gpu", 11 | "--disable-dev-shm-usage", 12 | "--disable-setuid-sandbox", 13 | "--no-first-run", 14 | "--no-sandbox", 15 | "--no-zygote", 16 | "--single-process" 17 | ] 18 | } -------------------------------------------------------------------------------- /test/config/storage/mysql/mysql-conn.json: -------------------------------------------------------------------------------- 1 | { 2 | "client": "mysql", 3 | "useNullAsDefault": true, 4 | "connection": { 5 | "host": "localhost", 6 | "port": "33060", 7 | "user": "gsf-user", 8 | "password": "gsf-pswd", 9 | "database": "gsf-db" 10 | }, 11 | "debug": false 12 | } -------------------------------------------------------------------------------- /test/config/storage/mysql/mysql.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | mysql: 4 | image: mysql/mysql-server:5.7 5 | environment: 6 | MYSQL_ROOT_PASSWORD: gsf-root 7 | MYSQL_USER: gsf-user 8 | MYSQL_PASSWORD: gsf-pswd 9 | MYSQL_DATABASE: gsf-db 10 | MYSQL_HOST: localhost 11 | ports: 12 | - 33060:3306 13 | 14 | -------------------------------------------------------------------------------- /test/config/storage/pg/pg-conn.json: -------------------------------------------------------------------------------- 1 | { 2 | "client": "pg", 3 | "useNullAsDefault": true, 4 | "connection": { 5 | "host": "localhost", 6 | "port": "54320", 7 | "user": "gsf-user", 8 | "password": "gsf-pswd", 9 | "database": "gsf-db" 10 | }, 11 | "debug": false 12 | } -------------------------------------------------------------------------------- /test/config/storage/pg/pg.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 
services: 3 | pg: 4 | image: postgres:11-alpine 5 | environment: 6 | POSTGRES_USER: gsf-user 7 | POSTGRES_PASSWORD: gsf-pswd 8 | POSTGRES_DB: gsf-db 9 | ports: 10 | - 54320:5432 11 | 12 | -------------------------------------------------------------------------------- /test/config/storage/sqlite/sqlite-conn.json: -------------------------------------------------------------------------------- 1 | { 2 | "client": "sqlite3", 3 | "useNullAsDefault": true, 4 | "connection": { 5 | "filename": "test/tmp/acc.sqlite" 6 | }, 7 | "debug": false 8 | } -------------------------------------------------------------------------------- /test/tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/test/tmp/.gitkeep -------------------------------------------------------------------------------- /test/tsconfig.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "esnext", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "newLine": "LF", 11 | "useDefineForClassFields": false 12 | }, 13 | "include": [ 14 | "../test" 15 | ], 16 | 17 | } -------------------------------------------------------------------------------- /test/unit/confighash/test-config-hash.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { encode, decode } from '../../../src/confighash/config-hash'; 3 | 4 | describe('ConfigHash', () => { 5 | const expectedDefinition = { 6 | name: 'projectA', 7 | pipeline: 'browser-static-content', 8 | pluginOpts: [ 9 | { 10 | name: 'ExtractUrlsPlugin', 11 | selectorPairs: [ 12 | { 13 | urlSelector: "a[href$='.html']", 14 | }, 15 | { 16 | urlSelector: 'img', 17 | }, 18 | ], 19 | }, 20 | { 21 | name: 'ExtractHtmlContentPlugin', 22 | selectorPairs: [ 23 | ], 24 | }, 25 | ], 26 | resources: [ 27 | { 28 | url: 'http://sitea.com/index.html', 29 | }, 30 | ], 31 | }; 32 | 33 | const expectedConfigHash = 'ePnXQdMJgxtWUJSfBYwER2JSJ6GkSFoqiM4oSk1TsVUHO0o9VgnkQ1QlmbnpIMfqkJPkY/EFCDA8ioHBnggOlMy8lNQKeMgAAOKgZAQ='; 34 | 35 | it('encode', () => { 36 | const encodedDefinition = encode(expectedDefinition); 37 | assert.deepEqual(encodedDefinition, expectedConfigHash); 38 | }); 39 | 40 | it('decode', () => { 41 | const decodedDefinition = decode(expectedConfigHash); 42 | assert.deepEqual(decodedDefinition, expectedDefinition); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /test/unit/domclients/test-cheerio-client.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import CheerioClient from '../../../src/domclient/CheerioClient'; 3 | 4 | describe('CheerioClient', () => { 5 | it('root querySelectorAll', () => { 6 | const client = new CheerioClient(Buffer.from('
<html><body><p class="classA">pA</p><p class="classB">pB</p></body></html>
')); 7 | 8 | const nodes = client.querySelectorAll('p'); 9 | assert.strictEqual(nodes.length, 2); 10 | assert.strictEqual(nodes[1].getAttribute('innerText'), 'pB'); 11 | assert.strictEqual(nodes[1].getAttribute('class'), 'classB'); 12 | }); 13 | 14 | it('nested querySelectorAll', () => { 15 | const client = new CheerioClient(Buffer.from('
<html><body><p><a class="classA" href="linkA">linkA</a></p></body></html>
')); 16 | 17 | const pNodes = client.querySelectorAll('p'); 18 | assert.strictEqual(pNodes.length, 1); 19 | 20 | let linkNodes = pNodes[0].querySelectorAll('a[class="classA"]'); 21 | assert.strictEqual(linkNodes.length, 1); 22 | assert.strictEqual(linkNodes[0].getAttribute('href'), 'linkA'); 23 | 24 | linkNodes = pNodes[0].querySelectorAll('a[class="classB"]'); 25 | assert.strictEqual(linkNodes.length, 0); 26 | }); 27 | }); 28 | -------------------------------------------------------------------------------- /test/unit/domclients/test-jsdom-client.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import JsdomClient from '../../../src/domclient/JsdomClient'; 3 | 4 | describe('JsdomClient', () => { 5 | it('root querySelectorAll', () => { 6 | const client = new JsdomClient(Buffer.from('
<html><body><p class="classA">pA</p><p class="classB">pB</p></body></html>
')); 7 | 8 | const nodes = client.querySelectorAll('p'); 9 | assert.strictEqual(nodes.length, 2); 10 | assert.strictEqual(nodes[1].getAttribute('innerText'), 'pB'); 11 | assert.strictEqual(nodes[1].getAttribute('class'), 'classB'); 12 | }); 13 | 14 | it('nested querySelectorAll', () => { 15 | const client = new JsdomClient(Buffer.from('
<html><body><p><a class="classA" href="linkA">linkA</a></p></body></html>
')); 16 | 17 | const pNodes = client.querySelectorAll('p'); 18 | assert.strictEqual(pNodes.length, 1); 19 | 20 | let linkNodes = pNodes[0].querySelectorAll('a[class="classA"]'); 21 | assert.strictEqual(linkNodes.length, 1); 22 | assert.strictEqual(linkNodes[0].getAttribute('href'), 'linkA'); 23 | 24 | linkNodes = pNodes[0].querySelectorAll('a[class="classB"]'); 25 | assert.strictEqual(linkNodes.length, 0); 26 | }); 27 | }); 28 | -------------------------------------------------------------------------------- /test/unit/exporter/test-csv-exporter.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-empty-function */ 2 | import fs from 'fs'; 3 | import { assert } from 'chai'; 4 | import { SinonSandbox, createSandbox } from 'sinon'; 5 | import CsvExporter from '../../../src/export/CsvExporter'; 6 | import Project from '../../../src/storage/base/Project'; 7 | import ConnectionManager from '../../../src/storage/ConnectionManager'; 8 | 9 | describe('CsvExporter', () => { 10 | let sandbox:SinonSandbox; 11 | let exporter: CsvExporter; 12 | let project; 13 | let content: string; 14 | const lineSeparator = '\n'; 15 | 16 | beforeEach(() => { 17 | content = ''; 18 | sandbox = createSandbox(); 19 | project = sandbox.createStubInstance(Project); 20 | 21 | sandbox.stub(fs, 'createWriteStream').returns({ 22 | write: (val: string) => { 23 | content += val; 24 | }, 25 | close: () => {}, 26 | }); 27 | 28 | sandbox.stub(ConnectionManager, 'clone').returns( 29 | sandbox.createStubInstance( 30 | ConnectionManager, 31 | { getProject: { get: () => project } }, 32 | ), 33 | ); 34 | 35 | exporter = new CsvExporter({ filepath: 'fileA.csv' }); 36 | }); 37 | 38 | afterEach(() => { 39 | sandbox.restore(); 40 | }); 41 | 42 | it('array values - single selector', async () => { 43 | sandbox.stub(exporter, 'getContentKeys').returns([ 'colA' ]); 44 | project.getPagedResources.onCall(0).returns([ 45 | { url: 'urlA', content: [ [ 'A1 content' ], [ 'A2 content' ] ] }, 46 | { url: 'urlB', content: [ [ 'A3 content' ] ] }, 47 | ]); 48 | project.getPagedResources.onCall(1).returns([]); 49 | await exporter.export(project); 50 | 51 | const expectedContent = `url,colA 52 | urlA,"A1 content" 53 | urlA,"A2 content" 54 | urlB,"A3 content"` 55 | .split(lineSeparator).map(csvLine => csvLine.trim()).join(lineSeparator); 56 | 57 | assert.strictEqual(content, expectedContent); 58 | }); 59 | 60 | it('array values - single selector - empty content', async () => { 61 | sandbox.stub(exporter, 'getContentKeys').returns([ 'colA' ]); 62 | project.getPagedResources.onCall(0).returns([ 63 | { url: 'urlA', content: [ [ 'A1 content' ] ] }, 64 | { url: 'urlB', content: [ [ ] ] }, 65 | { url: 'urlC', content: [ ] }, 66 | ]); 67 | project.getPagedResources.onCall(1).returns([]); 68 | await exporter.export(project); 69 | 70 | const expectedContent = `url,colA 71 | urlA,"A1 content" 72 | urlB 73 | urlC` 74 | .split(lineSeparator).map(csvLine => csvLine.trim()).join(lineSeparator); 75 | 76 | assert.strictEqual(content, expectedContent); 77 | }); 78 | 79 | it('array values - multiple selectors', async () => { 80 | sandbox.stub(exporter, 'getContentKeys').returns([ 'colA', 'colB' ]); 81 | project.getPagedResources.onCall(0).returns([ 82 | { url: 'urlA', content: [ [ 'A1 content', 'B1 content' ], [ 'A2 content', 'B2 content' ] ] }, 83 | { url: 'urlB', content: [ [ 'A3 content', 'B3 content' ] ] }, 84 | ]); 85 | project.getPagedResources.onCall(1).returns([]); 86 | await 
exporter.export(project); 87 | 88 | const expectedContent = `url,colA,colB 89 | urlA,"A1 content","B1 content" 90 | urlA,"A2 content","B2 content" 91 | urlB,"A3 content","B3 content"` 92 | .split(lineSeparator).map(csvLine => csvLine.trim()).join(lineSeparator); 93 | 94 | assert.strictEqual(content, expectedContent); 95 | }); 96 | }); 97 | -------------------------------------------------------------------------------- /test/unit/logwrapper/test-log-wrapper.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { getLogger, setLogger } from '../../../src/logger/Logger'; 3 | 4 | describe('LogWrapper', () => { 5 | it('default log level', () => { 6 | const childWrapper = getLogger('test'); 7 | assert.strictEqual(childWrapper.logger.level, 'warn'); 8 | }); 9 | 10 | it('changes to parent logger propagate to existing child loggers', () => { 11 | const childWrapper = getLogger('test'); 12 | 13 | setLogger({ level: 'info' }); 14 | assert.strictEqual(childWrapper.logger.level, 'info'); 15 | 16 | // revert back to default log level 17 | setLogger({ level: 'warn' }); 18 | }); 19 | 20 | it('filter out log arguments', () => { 21 | const childWrapper = getLogger('test'); 22 | 23 | const rawObj = [ 24 | { a: 1, b: Buffer.from('a'), c: null, d: new Uint8Array([ 0, 1, 2 ]) }, 25 | { d: 'message C', e: Buffer.from('a'), cert: {}, f: null }, 26 | ]; 27 | 28 | // extra circular reference :) 29 | rawObj[1].f = rawObj; 30 | 31 | assert.sameDeepMembers( 32 | childWrapper.filterArg(rawObj), 33 | [ 34 | { 35 | a: 1, 36 | b: ' not included', 37 | c: null, 38 | d: ' not included', 39 | }, 40 | { 41 | d: 'message C', 42 | e: ' not included', 43 | cert: ' not included', 44 | f: null, 45 | }, 46 | ], 47 | ); 48 | }); 49 | 50 | it('filter out ignore error', () => { 51 | const childWrapper = getLogger('test'); 52 | 53 | const err = new Error('unexpected error'); 54 | const filteredErr = childWrapper.filterArg(err); 55 | 56 | assert.strictEqual(filteredErr.name, err.name); 57 | assert.strictEqual(filteredErr.message, err.message); 58 | assert.strictEqual(filteredErr.stack, err.stack); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /test/unit/pipelines/test-merge-plugin-opts.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable prefer-destructuring */ 2 | import { assert } from 'chai'; 3 | import { PluginOpts } from '../../../src/plugins/Plugin'; 4 | import { pipelines, mergePluginOpts } from '../../../src/pipelines/pipelines'; 5 | 6 | describe('MergePluginOpts', () => { 7 | const { defaultPluginOpts } = pipelines['browser-static-content']; 8 | 9 | it('before anchor', () => { 10 | const customOpts = [ { 11 | name: 'CustomBeforePlugin', 12 | before: 'BrowserFetchPlugin', 13 | } ]; 14 | 15 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 16 | 17 | assert.sameDeepOrderedMembers( 18 | mergedOpts, 19 | [ ...customOpts, ...defaultPluginOpts ], 20 | ); 21 | }); 22 | 23 | it('after anchor', () => { 24 | const customOpts = [ { 25 | name: 'CustomAfterPlugin', 26 | after: 'UpsertResourcePlugin', 27 | } ]; 28 | 29 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 30 | 31 | assert.sameDeepOrderedMembers( 32 | mergedOpts, 33 | [ ...defaultPluginOpts, ...customOpts ], 34 | ); 35 | }); 36 | 37 | it('replace anchor', () => { 38 | const customOpts = [ { 39 | name: 'CustomReplacePlugin', 40 | replace: 
'BrowserFetchPlugin', 41 | } ]; 42 | 43 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 44 | const expectedOpts = [ ...defaultPluginOpts ]; 45 | expectedOpts[0] = customOpts[0]; 46 | 47 | assert.sameDeepOrderedMembers( 48 | mergedOpts, 49 | expectedOpts, 50 | ); 51 | }); 52 | 53 | it('merge anchor', () => { 54 | const customOpts = [ { 55 | name: 'ExtractUrlsPlugin', 56 | maxDepth: 5, 57 | } ]; 58 | 59 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 60 | const expectedOpts = [ ...defaultPluginOpts ]; 61 | expectedOpts[1] = customOpts[0]; 62 | 63 | assert.sameDeepOrderedMembers( 64 | mergedOpts, 65 | expectedOpts, 66 | ); 67 | }); 68 | 69 | it('multiple anchors', () => { 70 | const customOpts = [ 71 | { 72 | name: 'CustomBefore1Plugin', 73 | before: 'BrowserFetchPlugin', 74 | }, 75 | { 76 | name: 'CustomBefore2Plugin', 77 | before: 'BrowserFetchPlugin', 78 | }, 79 | { 80 | name: 'CustomAfter1Plugin', 81 | after: 'ExtractHtmlContentPlugin', 82 | }, 83 | { 84 | name: 'CustomAfter2Plugin', 85 | after: 'ExtractHtmlContentPlugin', 86 | }, 87 | ]; 88 | 89 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 90 | const expectedOpts:PluginOpts[] = [ 91 | { 92 | name: 'CustomBefore1Plugin', 93 | before: 'BrowserFetchPlugin', 94 | }, 95 | { 96 | name: 'CustomBefore2Plugin', 97 | before: 'BrowserFetchPlugin', 98 | }, 99 | { 100 | name: 'BrowserFetchPlugin', 101 | }, 102 | { 103 | name: 'ExtractUrlsPlugin', 104 | }, 105 | { 106 | name: 'ExtractHtmlContentPlugin', 107 | }, 108 | { 109 | name: 'CustomAfter2Plugin', 110 | after: 'ExtractHtmlContentPlugin', 111 | }, 112 | { 113 | name: 'CustomAfter1Plugin', 114 | after: 'ExtractHtmlContentPlugin', 115 | }, 116 | { 117 | name: 'InsertResourcesPlugin', 118 | }, 119 | { 120 | name: 'UpsertResourcePlugin', 121 | }, 122 | ]; 123 | 124 | assert.sameDeepOrderedMembers( 125 | mergedOpts, 126 | expectedOpts, 127 | ); 128 | }); 129 | }); 130 | -------------------------------------------------------------------------------- /test/unit/plugins/test-dom-utils.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-empty-function */ 2 | // eslint-disable-next-line max-classes-per-file 3 | import { assert } from 'chai'; 4 | import { SinonSandbox, createSandbox } from 'sinon'; 5 | import { DomStabilityStatus, waitForDomStability } from '../../../src/plugins/dom-utils'; 6 | 7 | describe('DOM Utils', () => { 8 | let sandbox:SinonSandbox; 9 | let mutationCallback: (mutationList, observer) => void; 10 | 11 | const PollyMutationObserver = class { 12 | constructor(callback) { 13 | mutationCallback = callback; 14 | } 15 | 16 | observe() {} 17 | disconnect() {} 18 | }; 19 | global.MutationObserver = PollyMutationObserver; 20 | 21 | beforeEach(() => { 22 | sandbox = createSandbox(); 23 | }); 24 | 25 | afterEach(() => { 26 | sandbox.restore(); 27 | mutationCallback = null; 28 | }); 29 | 30 | it('waitForDomStability - DomStabilityStatus.Unchanged', async () => { 31 | sandbox.stub(PollyMutationObserver.prototype); 32 | 33 | const domStatus: DomStabilityStatus = await waitForDomStability({ stabilityCheck: 102, stabilityTimeout: 500 }); 34 | await new Promise(resolve => setTimeout(resolve, 3000)); 35 | assert.strictEqual(domStatus, DomStabilityStatus.Unchanged); 36 | }); 37 | 38 | it('waitForDomStability - DomStabilityStatus.Unstable', async () => { 39 | const observer = sandbox.createStubInstance(PollyMutationObserver); 40 | 41 | const domStatusPromise: 
Promise = waitForDomStability({ stabilityCheck: 200, stabilityTimeout: 500 }); 42 | const intervalId = setInterval(mutationCallback, 100, [ { type: 'childList' } ], observer); 43 | 44 | const domStatus: DomStabilityStatus = await domStatusPromise; 45 | assert.strictEqual(domStatus, DomStabilityStatus.Unstable); 46 | 47 | clearInterval(intervalId); 48 | }); 49 | 50 | it('waitForDomStability - DomStabilityStatus.Stable', async () => { 51 | const observer = sandbox.createStubInstance(PollyMutationObserver); 52 | 53 | const domStatusPromise: Promise = waitForDomStability({ stabilityCheck: 200, stabilityTimeout: 500 }); 54 | const intervalId = setTimeout(mutationCallback, 100, [ { type: 'childList' } ], observer); 55 | 56 | const domStatus: DomStabilityStatus = await domStatusPromise; 57 | assert.strictEqual(domStatus, DomStabilityStatus.Stable); 58 | 59 | clearInterval(intervalId); 60 | }); 61 | }); 62 | -------------------------------------------------------------------------------- /test/unit/plugins/test-insert-resources-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { createSandbox, SinonSandbox, SinonStubbedInstance } from 'sinon'; 3 | import InsertResourcesPlugin from '../../../src/plugins/default/InsertResourcesPlugin'; 4 | import Queue from '../../../src/storage/base/Queue'; 5 | import Resource from '../../../src/storage/base/Resource'; 6 | 7 | describe('InsertResourcesPlugin', () => { 8 | let sandbox:SinonSandbox; 9 | let plugin: InsertResourcesPlugin; 10 | let project:{queue: SinonStubbedInstance}; 11 | 12 | beforeEach(() => { 13 | sandbox = createSandbox(); 14 | 15 | const queue = sandbox.stub({ 16 | count: () => null, 17 | filterNewUrls: urls => null, 18 | add: () => null, 19 | }); 20 | queue.count.returns(Promise.resolve(0)); 21 | queue.filterNewUrls.callsFake((urls:string[]) => Promise.resolve(urls)); 22 | 23 | project = { queue }; 24 | }); 25 | 26 | afterEach(() => { 27 | sandbox.restore(); 28 | }); 29 | 30 | it('test conditions', () => { 31 | plugin = new InsertResourcesPlugin(); 32 | assert.isFalse(plugin.test(project, null)); 33 | assert.isFalse(plugin.test(project, { resourcesToAdd: [ ] })); 34 | assert.isTrue(plugin.test(project, { resourcesToAdd: [ { url: 'http://a.com' } ] })); 35 | }); 36 | 37 | it('fully save new resources, maxResources undefined', async () => { 38 | plugin = new InsertResourcesPlugin(); 39 | await plugin.apply(project, { depth: 0, resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 40 | 41 | assert.isTrue(project.queue.filterNewUrls.notCalled); 42 | assert.isTrue(project.queue.add.calledOnce); 43 | 44 | const [ saveResources ] = project.queue.add.args[0]; 45 | assert.sameDeepMembers([ { url: 'urlA', depth: 1 }, { url: 'urlB', depth: 1 } ], saveResources); 46 | }); 47 | 48 | it('fully save new resources, maxResources defined', async () => { 49 | plugin = new InsertResourcesPlugin({ maxResources: 2 }); 50 | await plugin.apply(project, { depth: 0, resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 51 | 52 | assert.isTrue(project.queue.filterNewUrls.notCalled); 53 | assert.isTrue(project.queue.add.calledOnce); 54 | 55 | const [ saveResources ] = project.queue.add.args[0]; 56 | assert.sameDeepMembers([ { url: 'urlA', depth: 1 }, { url: 'urlB', depth: 1 } ], saveResources); 57 | }); 58 | 59 | it('partially save new resources', async () => { 60 | plugin = new InsertResourcesPlugin({ maxResources: 1 }); 61 | await plugin.apply(project, { depth: 0, 
resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 62 | 63 | assert.isTrue(project.queue.filterNewUrls.calledOnce); 64 | assert.isTrue(project.queue.add.calledOnce); 65 | 66 | const [ saveResources ] = project.queue.add.args[0]; 67 | assert.sameDeepMembers([ { url: 'urlA', depth: 1 } ], saveResources); 68 | }); 69 | 70 | it('partially save new/existing resources', async () => { 71 | project.queue.filterNewUrls.returns(Promise.resolve([ 'urlB' ])); 72 | plugin = new InsertResourcesPlugin({ maxResources: 1 }); 73 | await plugin.apply(project, { depth: 0, resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 74 | 75 | assert.isTrue(project.queue.filterNewUrls.calledOnce); 76 | assert.isTrue(project.queue.add.calledOnce); 77 | 78 | const [ saveResources ] = project.queue.add.args[0]; 79 | assert.sameDeepMembers([ { url: 'urlB', depth: 1 } ], saveResources); 80 | }); 81 | }); 82 | -------------------------------------------------------------------------------- /test/unit/plugins/test-node-fetch-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { SinonSandbox, createSandbox } from 'sinon'; 3 | import http, { ClientRequest } from 'http'; 4 | import { Readable } from 'stream'; 5 | import { gzipSync } from 'zlib'; 6 | import NodeFetchPlugin from '../../../src/plugins/default/NodeFetchPlugin'; 7 | import Resource from '../../../src/storage/base/Resource'; 8 | 9 | describe('NodeFetchPlugin', () => { 10 | let sandbox:SinonSandbox; 11 | const plugin: NodeFetchPlugin = new NodeFetchPlugin(); 12 | 13 | beforeEach(() => { 14 | sandbox = createSandbox(); 15 | }); 16 | 17 | afterEach(() => { 18 | sandbox.restore(); 19 | }); 20 | 21 | it('fetch no compression', async () => { 22 | const htmlContent = ''; 23 | const clientRequestStub = sandbox.createStubInstance(ClientRequest); 24 | sandbox.stub(plugin, 'getRequestFnc').returns((opts, callback) => { 25 | const response = Readable.from(htmlContent); 26 | 27 | callback(Object.assign(response, { 28 | statusCode: 201, 29 | headers: { 30 | 'content-encoding': '', 31 | }, 32 | })); 33 | 34 | return clientRequestStub; 35 | }); 36 | 37 | const result = await plugin.fetch({ url: 'http://sitea.com' }); 38 | assert.strictEqual(result.data.toString(), htmlContent); 39 | assert.isTrue(clientRequestStub.end.calledOnce); 40 | }); 41 | 42 | it('fetch gzip', async () => { 43 | const htmlContent = ''; 44 | const clientRequestStub = sandbox.createStubInstance(ClientRequest); 45 | sandbox.stub(plugin, 'getRequestFnc').returns((opts, callback) => { 46 | const response = Readable.from(gzipSync(htmlContent)); 47 | callback(Object.assign(response, { 48 | statusCode: 201, 49 | headers: { 50 | 'content-encoding': 'gzip', 51 | }, 52 | })); 53 | 54 | return clientRequestStub; 55 | }); 56 | 57 | const result = await plugin.fetch({ url: 'http://sitea.com' }); 58 | assert.strictEqual((result.data).toString('utf8'), htmlContent); 59 | assert.isTrue(clientRequestStub.end.calledOnce); 60 | }); 61 | 62 | it('fetch read timeout', async () => { 63 | const srv = http.createServer((req, res) => { 64 | setTimeout(() => { 65 | res.writeHead(200, { 'Content-Type': 'text/plain' }); 66 | res.write(''); 67 | res.end(); 68 | }, 1 * 1000); 69 | }); 70 | srv.listen(8000); 71 | 72 | plugin.opts.readTimeout = 0.5 * 1000; 73 | let timeoutError; 74 | try { 75 | await plugin.fetch({ url: 'http://sitea.com', proxy: { host: '127.0.0.1', port: 8000 } }); 76 | } 77 | catch (err) { 78 | timeoutError = err; 79 | } 80 | 81 
| srv.close(); 82 | 83 | assert.strictEqual(timeoutError.status, 408); 84 | }); 85 | }); 86 | -------------------------------------------------------------------------------- /test/unit/plugins/test-scroll-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { createSandbox, SinonSandbox } from 'sinon'; 3 | import ScrollPlugin from '../../../src/plugins/default/ScrollPlugin'; 4 | import * as utils from '../../../src/plugins/dom-utils'; 5 | import Resource from '../../../src/storage/base/Resource'; 6 | 7 | describe('ScrollPlugin', () => { 8 | let sandbox:SinonSandbox; 9 | let plugin: ScrollPlugin; 10 | const project:any = {}; 11 | 12 | beforeEach(() => { 13 | sandbox = createSandbox(); 14 | sandbox.stub(window, 'scrollTo'); 15 | }); 16 | 17 | afterEach(() => { 18 | sandbox.restore(); 19 | }); 20 | 21 | it('test conditions', () => { 22 | plugin = new ScrollPlugin(); 23 | assert.isFalse(plugin.test(project, null)); 24 | assert.isFalse(plugin.test(project, { actions: [ 'clickA' ] })); 25 | assert.isTrue(plugin.test(project, { contentType: 'text/html' })); 26 | }); 27 | 28 | it('apply DOM unchanged', async () => { 29 | plugin = new ScrollPlugin(); 30 | const stubWaitForStability = sandbox.stub(utils, 'waitForDomStability'); 31 | stubWaitForStability.returns(new Promise(resolve => resolve(utils.DomStabilityStatus.Unchanged))); 32 | 33 | const actualResult = await plugin.apply(); 34 | assert.isNull(actualResult); 35 | }); 36 | 37 | it('apply DOM changed, stable', async () => { 38 | plugin = new ScrollPlugin(); 39 | const stubWaitForStability = sandbox.stub(utils, 'waitForDomStability'); 40 | stubWaitForStability.returns(new Promise(resolve => resolve(utils.DomStabilityStatus.Stable))); 41 | 42 | const actualResult = await plugin.apply(); 43 | const expectedResult = { actions: [ 'scroll#1' ], status: 200 }; 44 | assert.deepEqual(actualResult, expectedResult); 45 | }); 46 | 47 | it('apply DOM changed, unstable', async () => { 48 | plugin = new ScrollPlugin(); 49 | const stubWaitForStability = sandbox.stub(utils, 'waitForDomStability'); 50 | stubWaitForStability.returns(new Promise(resolve => resolve(utils.DomStabilityStatus.Unstable))); 51 | 52 | let actualErr; 53 | try { 54 | await plugin.apply(); 55 | } 56 | catch (err) { 57 | actualErr = err; 58 | } 59 | 60 | assert.strictEqual(actualErr.message, `DOM not stable after stabilityTimeout of ${plugin.opts.stabilityTimeout}`); 61 | }); 62 | }); 63 | -------------------------------------------------------------------------------- /test/unit/plugins/test-upsert-resource-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import UpsertResourcePlugin from '../../../src/plugins/default/UpsertResourcePlugin'; 3 | import Resource from '../../../src/storage/base/Resource'; 4 | 5 | describe('UpsertResourcePlugin', () => { 6 | let plugin: UpsertResourcePlugin; 7 | const project:any = { resourceCount: 0 }; 8 | 9 | it('test conditions', () => { 10 | plugin = new UpsertResourcePlugin(); 11 | assert.isFalse(plugin.test(project, null)); 12 | assert.isTrue(plugin.test(project, {})); 13 | }); 14 | }); 15 | -------------------------------------------------------------------------------- /test/unit/plugins/test-url-utils.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { getUrlColIdx, normalizeUrl } from 
'../../../src/plugins/url-utils'; 3 | 4 | describe('URL Utils', () => { 5 | it('normalizeUrl', async () => { 6 | assert.strictEqual(normalizeUrl('http://wWw.CaPs.com'), 'http://www.caps.com/'); 7 | assert.strictEqual(normalizeUrl('no-proTocoL.com'), 'https://no-protocol.com/'); 8 | assert.strictEqual(normalizeUrl('WWw.no-proTocoL.com'), 'https://www.no-protocol.com/'); 9 | }); 10 | 11 | it('getUrlColIdx', async () => { 12 | assert.strictEqual(getUrlColIdx('1,http://sitea.com'), 1); 13 | assert.strictEqual(getUrlColIdx('1, 2, www.sitea.com'), 2); 14 | assert.strictEqual(getUrlColIdx('sitea.com'), 0); 15 | }); 16 | 17 | it('getUrlColIdx throws error', async () => { 18 | let urlErr; 19 | try { 20 | getUrlColIdx('1,2,invalidurl'); 21 | } 22 | catch (err) { 23 | urlErr = err; 24 | } 25 | 26 | assert.strictEqual(urlErr.message, 'could not detect url column from 1,2,invalidurl'); 27 | }); 28 | }); 29 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/BaseJs.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | class BaseJsB { 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | 14 | module.exports = { 15 | BaseJsA, 16 | BaseJsB 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/Extended.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | const { BaseJsA } = require('./BaseJs'); 3 | 4 | class Extended extends BaseJsA { 5 | sum(a, b) { 6 | return a + b; 7 | } 8 | } 9 | 10 | module.exports = Extended; 11 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/ExtendedDomRead.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | const { BaseJsB } = require('./BaseJs'); 3 | 4 | class ExtendedDomRead extends BaseJsB { 5 | opts = { 6 | domRead: true, 7 | } 8 | 9 | sum(a, b) { 10 | console.log(BaseJsB); 11 | return this.jsb(a, b); 12 | } 13 | 14 | async asum(a, b) { 15 | return this.jsb(a, b); 16 | } 17 | } 18 | 19 | module.exports = ExtendedDomRead; 20 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | class BaseJsB$1 { 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | 14 | var BaseJs = { 15 | BaseJsA, 16 | BaseJsB: BaseJsB$1 17 | }; 18 | 19 | var require$$0 = BaseJs; 20 | 21 | /* eslint-disable */ 22 | 23 | const { BaseJsB } = require$$0; 24 | 25 | class ExtendedDomRead extends BaseJsB { 26 | opts = { 27 | domRead: true, 28 | } 29 | 30 | sum(a, b) { 31 | console.log(BaseJsB); 32 | return this.jsb(a, b); 33 | } 34 | 35 | async asum(a, b) { 36 | return this.jsb(a, b); 37 | } 38 | } 39 | 40 | var ExtendedDomRead_1 = ExtendedDomRead; 41 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-js/BaseJs.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | export class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | export class BaseJsB 
{ 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-js/ExtendedDomRead.js: -------------------------------------------------------------------------------- 1 | import { extra } from '@get-set-fetch/test-utils'; 2 | import { BaseJsB } from './BaseJs'; 3 | 4 | export default class ExtendedDomRead extends BaseJsB { 5 | opts = { 6 | domRead: true, 7 | } 8 | 9 | sum(a, b) { 10 | return this.jsb(a, b); 11 | } 12 | 13 | async asum(a, b) { 14 | console.log(extra); 15 | return this.jsb(a, b); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-js/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | const extra = 'just a named export required for testing partial package import when bundling scraper plugins'; 2 | 3 | /* eslint-disable */ 4 | 5 | class BaseJsB { 6 | jsb(a, b) { 7 | return a + b; 8 | } 9 | } 10 | 11 | class ExtendedDomRead extends BaseJsB { 12 | opts = { 13 | domRead: true, 14 | } 15 | 16 | sum(a, b) { 17 | return this.jsb(a, b); 18 | } 19 | 20 | async asum(a, b) { 21 | console.log(extra); 22 | return this.jsb(a, b); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/BaseTs.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | export class BaseTsA { 3 | tsa(a:number, b:number) { 4 | return a + b; 5 | } 6 | } 7 | 8 | export class BaseTsB { 9 | tsb(a:number, b:number) { 10 | return a + b; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/Extended.ts: -------------------------------------------------------------------------------- 1 | import { BaseTsA } from './BaseTs'; 2 | 3 | export default class Extended extends BaseTsA { 4 | sum(a:number, b:number) { 5 | return a + b; 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/ExtendedDomRead.ts: -------------------------------------------------------------------------------- 1 | import { extra } from '@get-set-fetch/test-utils'; 2 | import { BaseTsB } from './BaseTs'; 3 | 4 | export default class ExtendedDomRead extends BaseTsB { 5 | opts = { 6 | domRead: true, 7 | } 8 | 9 | sum(a, b) { 10 | return this.tsb(a, b); 11 | } 12 | 13 | async asum(a, b) { 14 | console.log(extra); 15 | return this.tsb(a, b); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | const extra = 'just a named export required for testing partial package import when bundling scraper plugins'; 2 | 3 | /* eslint-disable */ 4 | class BaseTsB { 5 | tsb(a, b) { 6 | return a + b; 7 | } 8 | } 9 | 10 | class ExtendedDomRead extends BaseTsB { 11 | constructor() { 12 | super(...arguments); 13 | this.opts = { 14 | domRead: true, 15 | }; 16 | } 17 | sum(a, b) { 18 | return this.tsb(a, b); 19 | } 20 | async asum(a, b) { 21 | console.log(extra); 22 | return this.tsb(a, b); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- 
/test/unit/pluginstore/input-mixed-esm-cjs-ts-js/BaseJs.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | class BaseJsB { 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | 14 | module.exports = { 15 | BaseJsA, 16 | BaseJsB 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-mixed-esm-cjs-ts-js/BaseTs.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | export class BaseTsA { 3 | tsa(a:number, b:number) { 4 | return a + b; 5 | } 6 | } 7 | 8 | export class BaseTsB { 9 | tsb(a:number, b:number) { 10 | return a + b; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-mixed-esm-cjs-ts-js/ExtendedDomRead.ts: -------------------------------------------------------------------------------- 1 | import { extra } from '@get-set-fetch/test-utils'; 2 | import { BaseJsB } from './BaseJs'; 3 | import { BaseTsB } from './BaseTs'; 4 | 5 | export default class ExtendedDomRead extends BaseJsB { 6 | opts = { 7 | domRead: true, 8 | } 9 | 10 | sum(a, b) { 11 | console.log(BaseTsB); 12 | return this.jsb(a, b); 13 | } 14 | 15 | async asum(a, b) { 16 | console.log(extra); 17 | return this.jsb(a, b); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-mixed-esm-cjs-ts-js/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | const extra = 'just a named export required for testing partial package import when bundling scraper plugins'; 2 | 3 | /* eslint-disable */ 4 | class BaseJsA { 5 | jsa(a, b) { 6 | return a + b; 7 | } 8 | } 9 | 10 | class BaseJsB { 11 | jsb(a, b) { 12 | return a + b; 13 | } 14 | } 15 | 16 | var BaseJs = { 17 | BaseJsA, 18 | BaseJsB 19 | }; 20 | 21 | /* eslint-disable */ 22 | class BaseTsB { 23 | tsb(a, b) { 24 | return a + b; 25 | } 26 | } 27 | 28 | class ExtendedDomRead extends BaseJs.BaseJsB { 29 | constructor() { 30 | super(...arguments); 31 | this.opts = { 32 | domRead: true, 33 | }; 34 | } 35 | sum(a, b) { 36 | console.log(BaseTsB); 37 | return this.jsb(a, b); 38 | } 39 | async asum(a, b) { 40 | console.log(extra); 41 | return this.jsb(a, b); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /test/unit/scraper/test-runtime-metrics.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { SinonSandbox, createSandbox } from 'sinon'; 3 | import RuntimeMetrics, { RuntimeOptions } from '../../../src/scraper/RuntimeMetrics'; 4 | 5 | describe('RuntimeMetrics', () => { 6 | let sandbox:SinonSandbox; 7 | let metrics:RuntimeMetrics; 8 | 9 | beforeEach(() => { 10 | sandbox = createSandbox(); 11 | }); 12 | 13 | afterEach(() => { 14 | sandbox.restore(); 15 | }); 16 | 17 | it('snapshot', () => { 18 | metrics = new RuntimeMetrics(); 19 | metrics.cpuUsage = { 20 | totalTick: 1000, 21 | totalIdle: 1000, 22 | processTick: null, 23 | }; 24 | sandbox.stub(metrics, 'getMemoryUsage').returns({ 25 | freeMem: 800, 26 | totalMem: 1000, 27 | processMem: 100, 28 | }); 29 | sandbox.stub(metrics, 'getCpuUsage').returns({ 30 | totalTick: 2000, 31 | totalIdle: 1600, 32 | processTick: { user: 50000, system: 100000 }, 
33 | }); 34 | 35 | const snapshot:RuntimeOptions = metrics.takeSnapshot(); 36 | 37 | assert.deepEqual( 38 | snapshot, 39 | { 40 | global: { mem: 200, memPct: 20, cpuPct: 40 }, 41 | process: { mem: 100, memPct: 10, cpuPct: 15 }, 42 | }, 43 | ); 44 | }); 45 | }); 46 | -------------------------------------------------------------------------------- /test/unit/storage/mysql-unit-suite.ts: -------------------------------------------------------------------------------- 1 | import unitSuite from './unit-suite'; 2 | import * as connConfig from '../../config/storage/mysql/mysql-conn.json'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | import KnexConnection from '../../../src/storage/knex/KnexConnection'; 5 | 6 | const conn:Connection = new KnexConnection(connConfig); 7 | unitSuite(conn); 8 | -------------------------------------------------------------------------------- /test/unit/storage/pg-unit-suite.ts: -------------------------------------------------------------------------------- 1 | import unitSuite from './unit-suite'; 2 | import * as connConfig from '../../config/storage/pg/pg-conn.json'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | import KnexConnection from '../../../src/storage/knex/KnexConnection'; 5 | 6 | const conn:Connection = new KnexConnection(connConfig); 7 | unitSuite(conn); 8 | -------------------------------------------------------------------------------- /test/unit/storage/sqlite3-unit-suite.ts: -------------------------------------------------------------------------------- 1 | import unitSuite from './unit-suite'; 2 | import * as connConfig from '../../config/storage/sqlite/sqlite-conn.json'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | import KnexConnection from '../../../src/storage/knex/KnexConnection'; 5 | 6 | const conn:Connection = new KnexConnection(connConfig); 7 | unitSuite(conn); 8 | -------------------------------------------------------------------------------- /test/unit/storage/unit-suite.ts: -------------------------------------------------------------------------------- 1 | import crudResource from './test-resource-crud'; 2 | import crudProject from './test-project-crud'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | 5 | const suites = { 6 | crudResource, 7 | crudProject, 8 | }; 9 | 10 | export default function unitSuite(conn: Connection) { 11 | Object.values(suites).forEach(suite => { 12 | suite(conn); 13 | }); 14 | } 15 | -------------------------------------------------------------------------------- /test/utils/shims.js: -------------------------------------------------------------------------------- 1 | import { JSDOM } from 'jsdom'; 2 | 3 | // init jsdom environment for testing plugins running in browser 4 | const dom = new JSDOM('
<!DOCTYPE html><p>Hello world</p>
'); 5 | global.document = dom.window.document; 6 | global.window = dom.window; 7 | -------------------------------------------------------------------------------- /test/utils/ts-node-config.js: -------------------------------------------------------------------------------- 1 | require('ts-node').register({ 2 | project: 'test/tsconfig.test.json', 3 | files: true, 4 | pretty: true, 5 | 'no-cache': true, 6 | ignore: [ /node_modules\/(?!@get-set-fetch\/test-utils)/ ], 7 | }); 8 | -------------------------------------------------------------------------------- /tsconfig.debug.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "es2020", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "rootDir": "src", 11 | "outDir": "dist/cjs", 12 | "declaration": true, 13 | "newLine": "LF", 14 | "preserveConstEnums": true, 15 | "sourceMap": true, 16 | "useDefineForClassFields": true 17 | }, 18 | "include": [ 19 | "src" 20 | ] 21 | } -------------------------------------------------------------------------------- /tsconfig.esm.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "composite": true, 4 | "esModuleInterop": true, 5 | "resolveJsonModule": true, 6 | "target": "es2020", 7 | "strict": false, 8 | "moduleResolution": "node", 9 | "module": "es2020", 10 | "allowJs": true, 11 | "rootDir": "src", 12 | "outDir": "dist/esm", 13 | "declaration": true, 14 | "newLine": "LF", 15 | "preserveConstEnums": true, 16 | "useDefineForClassFields": false 17 | }, 18 | 19 | "include": [ 20 | "src", 21 | "src/**/*.json", 22 | ], 23 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "es2020", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "rootDir": "src", 11 | "outDir": "dist/cjs", 12 | "declaration": true, 13 | "newLine": "LF", 14 | "preserveConstEnums": true, 15 | "useDefineForClassFields": false 16 | }, 17 | "include": [ 18 | "src" 19 | ] 20 | } --------------------------------------------------------------------------------