├── .actrc ├── .eslintrc.json ├── .github ├── act │ └── event.json ├── actions │ └── acceptance │ │ └── action.yml └── workflows │ ├── audit.yml │ └── test.yml ├── .gitignore ├── .nycrc.json ├── LICENSE ├── README.md ├── bin └── gsfscrape ├── changelog.md ├── cloud ├── ansible │ ├── gsf-postgresql-logs │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-postgresql-setup │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-benchmark │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-export │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-logs │ │ └── tasks │ │ │ └── main.yml │ ├── gsf-scraper-queue │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── queue.j2 │ ├── gsf-scraper-setup │ │ ├── defaults │ │ │ └── main.yml │ │ ├── handlers │ │ │ └── main.yml │ │ ├── tasks │ │ │ └── main.yml │ │ └── templates │ │ │ └── getsetfetch.service.j2 │ └── gsf-scraper-stats │ │ └── tasks │ │ └── main.yml └── terraform │ ├── main.tf │ ├── provider.tf │ ├── templates │ └── hosts.tpl │ ├── user_data_pg.yml │ ├── user_data_scraper.yml │ └── variables.tf ├── datasets ├── .gitignore ├── javascript-libs-from-top-1mm-sites │ ├── ansible │ │ ├── files │ │ │ └── ExtractScriptsPlugin.js │ │ ├── inventory │ │ │ └── .gitkeep │ │ ├── pg-setup.yml │ │ ├── scraper-setup.yml │ │ ├── templates │ │ │ └── js-scripts-config.json.j2 │ │ └── vault.yml │ ├── charts │ │ ├── extract │ │ │ ├── CategoryExtractor.ts │ │ │ ├── ScriptParser.ts │ │ │ └── summarize-js-libs.ts │ │ ├── most-used-js-libs-2022-06-05-thumb.png │ │ ├── most-used-js-libs-2022-06-05.csv │ │ ├── most-used-js-libs-2022-06-05.png │ │ ├── most-used-js-libs-2022-06-05.svg │ │ └── most-used-js-libs.html │ ├── exports │ │ └── .gitkeep │ ├── readme.md │ └── terraform │ │ ├── main.tf │ │ ├── provider.tf │ │ └── variables.tf ├── readme.md ├── tsconfig.datasets.json └── utils │ ├── map.ts │ └── serve-static.ts ├── development.md ├── docker ├── Dockerfile ├── data │ ├── chromium-security-profile.json │ ├── config-pg-puppeteer.json │ ├── config-sqlite-cheerio.json │ └── config-sqlite-puppeteer.json ├── docker.md └── pg-puppeteer │ └── docker-compose.yml ├── examples ├── article-excerpts │ ├── ReadabilityPlugin.ts │ ├── article-excerpts-config.json │ └── article-excerpts.ts ├── cloud │ ├── ansible │ │ ├── files │ │ │ ├── ExtractScriptsPlugin.js │ │ │ ├── gsf-config.json │ │ │ └── gsf.service │ │ ├── pg-setup.yml │ │ ├── pg-stats.yml │ │ ├── scraper-export.yml │ │ ├── scraper-logs.yml │ │ ├── scraper-setup.yml │ │ ├── scraper-systemd-logs.yml │ │ └── vars.yml │ ├── readme.md │ └── terraform │ │ ├── provider.tf │ │ ├── resource-ansible-inventory.tf │ │ ├── resource-pg.sh │ │ ├── resource-pg.tf │ │ ├── resource-scraper.sh │ │ ├── resource-scraper.tf │ │ ├── resource-vpc.tf │ │ └── templates │ │ └── hosts.tpl ├── console-content │ ├── ConsoleBrowserFetchPlugin.ts │ ├── ConsolePuppeteerClient.ts │ ├── console-content-config.json │ └── console-content.ts ├── in-memory-queue │ ├── InMemoryConnection.ts │ ├── InMemoryQueue.ts │ ├── in-memory-queue-config.json │ └── in-memory-queue.ts ├── infinite-scrolling │ ├── infinite-scrolling-config.json │ └── infinite-scrolling.ts ├── pdf-extraction │ ├── pdf-extraction-config.json │ └── pdf-extraction.ts ├── product-details │ ├── product-details-config.json │ └── product-details.ts ├── sitemap │ ├── ExtractSameHostUrlsPlugin.ts │ ├── SitemapExporter.ts │ ├── SkipExtractHtmlContentPlugin.ts │ ├── scrape-config.json │ └── sitemap.ts ├── tabular-data │ ├── 
tabular-data-config.json │ └── tabular-data.ts ├── tls-fingerprinting │ ├── RandomTlsFingerprintFetch.ts │ ├── readme.md │ ├── tls-fingerprinting-config.json │ └── tls-fingerprinting.ts └── tsconfig.examples.json ├── package-lock.json ├── package.json ├── src ├── browserclient │ ├── BrowserClient.ts │ ├── PlaywrightClient.ts │ └── PuppeteerClient.ts ├── cli │ └── cli.ts ├── confighash │ ├── config-hash.ts │ └── dictionary-v1.json ├── domclient │ ├── CheerioClient.ts │ ├── DomClient.ts │ ├── JsdomClient.ts │ ├── NativeClient.ts │ └── client-utils.ts ├── export │ ├── CsvExporter.ts │ ├── Exporter.ts │ ├── MimeTypes.json │ └── ZipExporter.ts ├── index.ts ├── logger │ └── Logger.ts ├── pipelines │ ├── BrowserStaticContentPipeline.ts │ ├── DomStaticContentPipeline.ts │ └── pipelines.ts ├── plugins │ ├── Plugin.ts │ ├── default │ │ ├── BaseFetchPlugin.ts │ │ ├── BrowserFetchPlugin.ts │ │ ├── ExtractHtmlContentPlugin.ts │ │ ├── ExtractUrlsPlugin.ts │ │ ├── InsertResourcesPlugin.ts │ │ ├── NodeFetchPlugin.ts │ │ ├── ScrollPlugin.ts │ │ └── UpsertResourcePlugin.ts │ ├── dom-utils.ts │ ├── file-utils.ts │ └── url-utils.ts ├── pluginstore │ └── PluginStore.ts ├── schema │ └── SchemaHelper.ts ├── scraper │ ├── ConcurrencyManager.ts │ ├── QueueBuffer.ts │ ├── RuntimeMetrics.ts │ └── Scraper.ts └── storage │ ├── ConnectionManager.ts │ ├── base │ ├── Connection.ts │ ├── Entity.ts │ ├── Project.ts │ ├── Queue.ts │ ├── Resource.ts │ └── Storage.ts │ └── knex │ ├── KnexConnection.ts │ ├── KnexProject.ts │ ├── KnexQueue.ts │ ├── KnexResource.ts │ └── KnexStorage.ts ├── test ├── .mocharc.js ├── acceptance │ ├── acceptance-suite.ts │ ├── cheerio.ts │ ├── cli │ │ ├── config │ │ │ ├── config-single-page-single-content-entry-custom-plugin.json │ │ │ ├── config-single-page-single-content-entry.json │ │ │ ├── config-with-external-resources.json │ │ │ └── config-with-invalid-external-resources.json │ │ ├── plugins │ │ │ └── h1-counter-plugin.js │ │ ├── resources │ │ │ ├── resources-single-entry.csv │ │ │ ├── resources.csv │ │ │ └── unnormalized-resources.csv │ │ └── test-cli.ts │ ├── docker │ │ ├── config │ │ │ └── base-config.json │ │ └── test-docker.ts │ ├── jsdom.ts │ ├── playwright_chromium.ts │ └── puppeteer_chromium.ts ├── config │ ├── browserclient │ │ ├── playwright │ │ │ └── playwright-chromium.json │ │ └── puppeteer │ │ │ └── puppeteer-chromium.json │ └── storage │ │ ├── mysql │ │ ├── mysql-conn.json │ │ └── mysql.yml │ │ ├── pg │ │ ├── pg-conn.json │ │ └── pg.yml │ │ └── sqlite │ │ └── sqlite-conn.json ├── tmp │ └── .gitkeep ├── tsconfig.test.json ├── unit │ ├── confighash │ │ └── test-config-hash.ts │ ├── domclients │ │ ├── test-cheerio-client.ts │ │ └── test-jsdom-client.ts │ ├── exporter │ │ ├── test-csv-exporter.ts │ │ └── test-zip-exporter.ts │ ├── logwrapper │ │ └── test-log-wrapper.ts │ ├── pipelines │ │ └── test-merge-plugin-opts.ts │ ├── plugins │ │ ├── test-browser-fetch-plugin.ts │ │ ├── test-dom-utils.ts │ │ ├── test-extract-html-content-plugin.ts │ │ ├── test-extract-urls-plugin.ts │ │ ├── test-insert-resources-plugin.ts │ │ ├── test-node-fetch-plugin.ts │ │ ├── test-scroll-plugin.ts │ │ ├── test-upsert-resource-plugin.ts │ │ └── test-url-utils.ts │ ├── pluginstore │ │ ├── input-cjs-js │ │ │ ├── BaseJs.js │ │ │ ├── Extended.js │ │ │ ├── ExtendedDomRead.js │ │ │ └── expected-extended-dom-read-bundle.txt │ │ ├── input-esm-js │ │ │ ├── BaseJs.js │ │ │ ├── ExtendedDomRead.js │ │ │ └── expected-extended-dom-read-bundle.txt │ │ ├── input-esm-ts │ │ │ ├── BaseTs.ts │ │ │ ├── Extended.ts │ │ │ ├── 
ExtendedDomRead.ts │ │ │ └── expected-extended-dom-read-bundle.txt │ │ ├── input-mixed-esm-cjs-ts-js │ │ │ ├── BaseJs.js │ │ │ ├── BaseTs.ts │ │ │ ├── ExtendedDomRead.ts │ │ │ └── expected-extended-dom-read-bundle.txt │ │ └── test-plugin-store.ts │ ├── schema │ │ └── test-schema-helper.ts │ ├── scraper │ │ ├── test-concurrency-manager.ts │ │ ├── test-runtime-metrics.ts │ │ ├── test-scraper-concurrency-constraints.ts │ │ ├── test-scraper-discovery.ts │ │ └── test-scraper-single-project.ts │ └── storage │ │ ├── mysql-unit-suite.ts │ │ ├── pg-unit-suite.ts │ │ ├── sqlite3-unit-suite.ts │ │ ├── test-project-crud.ts │ │ ├── test-resource-crud.ts │ │ └── unit-suite.ts └── utils │ ├── shims.js │ └── ts-node-config.js ├── tsconfig.debug.json ├── tsconfig.esm.json └── tsconfig.json /.actrc: -------------------------------------------------------------------------------- 1 | -P ubuntu-latest=nektos/act-environments-ubuntu:20.04 2 | -e .github/act/event.json -------------------------------------------------------------------------------- /.github/act/event.json: -------------------------------------------------------------------------------- 1 | { 2 | "act": true, 3 | "pull_request": { 4 | "head": { 5 | "ref": "next" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /.github/actions/acceptance/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Acceptance Tests' 2 | description: 'Acceptance Tests using dynamic inputs: storage, client, docker file, ..' 3 | inputs: 4 | storage: 5 | description: oneOf - sqlite, mysql, pg 6 | required: true 7 | storage_deps: 8 | description: storage npm dependencies 9 | required: true 10 | client: 11 | description: oneOf - cheerio, jsdom, puppeteer_chromium, playwright_chromium 12 | required: true 13 | client_deps: 14 | description: client npm dependencies 15 | required: true 16 | concurrency: 17 | description: oneOf - sequential, parallel 18 | required: true 19 | docker_file: 20 | description: if present starts/stops a corresponding docker container 21 | coveralls_token: 22 | required: true 23 | run_storage_unit_tests: 24 | description: whether or not to also run unit tests for the selected storage 25 | required: true 26 | default: false 27 | 28 | runs: 29 | using: "composite" 30 | steps: 31 | - uses: actions/checkout@v2 32 | 33 | - name: Cache node modules 34 | uses: actions/cache@v2 35 | with: 36 | # npm cache files are stored in `~/.npm` on Linux/macOS 37 | # can't cache based on package-lock.json as it doesn't contain the peerDependencies we want to cache 38 | path: ~/.npm 39 | key: ${{ runner.os }}-npm-${{ hashFiles('**/package.json') }} 40 | 41 | - name: Docker ${{ inputs.storage }} up 42 | if: ${{ inputs.docker_file }} 43 | run: /bin/sh -c 'docker_file="${{ inputs.docker_file }}"; if [ $docker_file ]; then docker-compose -f ${{ inputs.docker_file }} up -d; else echo "ignored, no docker file"; fi' 44 | 45 | - name: Setup node 16 46 | uses: actions/setup-node@v3 47 | with: 48 | node-version: 16 49 | 50 | - name: Install dependencies 51 | run: npm ci 52 | 53 | - name: Install storage peer dependencies for ${{ inputs.storage }} 54 | run: npm install ${{ inputs.storage_deps }} --save 55 | 56 | - name: Install client peer dependencies for ${{ inputs.client }} 57 | run: npm install ${{ inputs.client_deps }} --save 58 | 59 | - name: ${{ inputs.storage }} unit tests 60 | if: ${{ inputs.run_storage_unit_tests }} 61 | run: npx nyc mocha --config test/.mocharc.js 
\"test/unit/storage/${{ inputs.storage }}-unit-suite.ts\" 62 | 63 | - name: Coveralls for unit tests 64 | if: ${{ inputs.run_storage_unit_tests }} 65 | uses: coverallsapp/github-action@master 66 | with: 67 | github-token: ${{ inputs.coveralls_token }} 68 | flag-name: unit - ${{ inputs.storage }} 69 | parallel: true 70 | 71 | - name: Acceptance tests - ${{ inputs.client }} - ${{ inputs.storage }} - ${{ inputs.concurrency }} 72 | run: | 73 | npx nyc --exclude=**/BrowserFetchPlugin.ts --exclude=**/utils.ts mocha --config test/.mocharc.js test/acceptance/${{ inputs.client }}.ts --grep '${{ inputs.storage }} - concurrency: ${{ inputs.concurrency }}'", 74 | 75 | - name: Coveralls for acceptance tests 76 | uses: coverallsapp/github-action@master 77 | with: 78 | github-token: ${{ inputs.coveralls_token }} 79 | flag-name: acceptance - ${{ inputs.storage }} - ${{ inputs.client }} - ${{ inputs.concurrency }} 80 | parallel: true 81 | 82 | - name: Docker ${{ inputs.storage }} down 83 | if: ${{ inputs.docker_file }} 84 | run: /bin/sh -c 'docker_file="${{ inputs.docker_file }}"; if [ $docker_file ]; then docker-compose -f ${{ inputs.docker_file }} down; else echo "ignored, no docker file"; fi' 85 | 86 | -------------------------------------------------------------------------------- /.github/workflows/audit.yml: -------------------------------------------------------------------------------- 1 | name: audit 2 | 3 | on: 4 | push: 5 | branches: [ main, next ] 6 | pull_request: 7 | branches: [ main, next ] 8 | 9 | jobs: 10 | audit: 11 | if: ${{ !github.event.act }} 12 | runs-on: ubuntu-18.04 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | - name: Setup node 16 17 | uses: actions/setup-node@v3 18 | with: 19 | node-version: 16 20 | - name: NPM Audit 21 | run: npm audit --audit-level=high -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build 
output 79 | .next 80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 88 | # https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | 106 | # notes 107 | *.txt 108 | 109 | # typescript outdir 110 | .dist 111 | 112 | # examples data 113 | examples/**/*.csv 114 | examples/**/*.zip 115 | examples/**/*.sqlite 116 | examples/**/*.log 117 | !examples/*.gitignore 118 | 119 | # test tmp dir 120 | test/tmp 121 | !test/tmp/*.gitignore 122 | 123 | # vscode settings 124 | .vscode 125 | 126 | # act workflows 127 | workflow 128 | 129 | .terraform 130 | *.tfstate* 131 | *.hcl 132 | 133 | .private 134 | 135 | majestic-million* 136 | -------------------------------------------------------------------------------- /.nycrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "reporter": ["lcov"] 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 get-set-fetch 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /bin/gsfscrape: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const { default: cli } = require('../dist/cjs/cli/cli.js'); 4 | cli(process.argv); -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-logs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: "Retrieve systemd log messages since last boot" 3 | ansible.builtin.shell: "journalctl -u postgresql -b" 4 | register: journalctl 5 | 6 | - name: "Retrieve service status" 7 | ansible.builtin.shell: systemctl status postgresql 8 | register: systemctl 9 | 10 | - name: "Copy output to local file" 11 | delegate_to: localhost 12 | ansible.builtin.copy: 13 | dest: "{{export_dir}}/pg-{{ inventory_hostname }}-systemd.log" 14 | content: "{{ systemctl.stdout }}\n\n{{ journalctl.stdout }}" 15 | 16 | - name: "Copy output to local file" 17 | ansible.builtin.fetch: 18 | src: "/var/log/postgresql/postgresql-14-main.log" 19 | dest: "{{export_dir}}/pg-{{ ansible_host }}-main.log" 20 | validate_checksum: false # it keeps changing as resources are scraped 21 | flat: true 22 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-setup/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # pg tuning for 4 vCPU, 8 GB RAM, using pgtune as base config 3 | pg_config: 4 | max_connections: 100 5 | shared_buffers: 2GB 6 | effective_cache_size: 6GB 7 | maintenance_work_mem: 512MB 8 | checkpoint_completion_target: 0.9 9 | wal_buffers: 16MB 10 | default_statistics_target: 100 11 | random_page_cost: 1.1 12 | effective_io_concurrency: 200 13 | work_mem: 10485kB 14 | min_wal_size: 1GB 15 | max_wal_size: 4GB 16 | max_worker_processes: 4 17 | max_parallel_workers_per_gather: 2 18 | max_parallel_workers: 4 19 | max_parallel_maintenance_workers: 2 20 | 21 | # log sql statements duration 22 | # log_destination: stderr 23 | # log_min_duration_statement: 20 24 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-setup/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: restart postgres 3 | service: name=postgresql state=restarted 4 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-postgresql-setup/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Load defaults 3 | include_vars: 4 | file: './defaults/main.yml' 5 | name: defaults 6 | 7 | - name: In-place merge of input dicts (pg_config) with the default ones 8 | set_fact: 9 | pg_config: "{{ defaults.pg_config | combine(pg_config, recursive=True) }}" 10 | 11 | - name: "Create database" 12 | postgresql_db: 13 | state: present 14 | name: "{{ db.name }}" 15 | become: true 16 | become_user: postgres 17 | 18 | - name: "Create user" 19 | postgresql_user: 20 | state: present 21 | name: "{{ db.user }}" 22 | password: "{{ db.password }}" 23 | become: true 24 | become_user: postgres 25 | 26 | - name: "Grant user access to database" 27 | postgresql_privs: 28 | type: database 29 | database: "{{ db.name }}" 30 | roles: "{{ db.user }}" 31 | grant_option: false 32 | privs: all 33 | become: true 34 | become_user: 
postgres 35 | 36 | - name: "Allow remote connections on private network" 37 | postgresql_set: 38 | name: listen_addresses 39 | value: 'localhost, {{ private_ip_address }}' 40 | become: true 41 | become_user: postgres 42 | notify: restart postgres 43 | 44 | - name: "Allow md5 connection for user" 45 | postgresql_pg_hba: 46 | dest: /etc/postgresql/14/main/pg_hba.conf 47 | contype: host 48 | address: all 49 | databases: all 50 | method: md5 51 | users: "{{ db.user }}" 52 | create: true 53 | become: true 54 | become_user: postgres 55 | notify: restart postgres 56 | 57 | - name: "Tunning server for 8GB RAM" 58 | postgresql_set: 59 | name: "{{ item.key }}" 60 | value: "{{ item.value }}" 61 | become: true 62 | become_user: postgres 63 | notify: restart postgres 64 | with_dict: "{{ pg_config }}" 65 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-benchmark/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: "sql: project id" 2 | postgresql_query: 3 | db: "{{ db_name }}" 4 | login_user: "{{ db_user }}" 5 | login_password: "{{ db_password }}" 6 | login_host: "localhost" 7 | query: select id from "projects" where name = %(project_name)s; 8 | named_args: 9 | project_name: "{{ project_name }}" 10 | register: sql_project_id 11 | 12 | - name: "sql: group and sort asc scrapedAt timestamps" 13 | postgresql_query: 14 | db: "{{ db_name }}" 15 | login_user: "{{ db_user }}" 16 | login_password: "{{ db_password }}" 17 | login_host: "localhost" 18 | query: > 19 | select count(*), truncated_time from ( 20 | select date_trunc('minute', "scrapedAt") as truncated_time from "{{ sql_project_id.query_result[0].id }}-resources" order by truncated_time 21 | ) as truncated group by truncated_time; 22 | register: sql_group_status 23 | 24 | - name: "generate csv rows" 25 | set_fact: 26 | csv_rows: | 27 | {% for entry in (sql_group_status.query_result) %} 28 | {{ entry.truncated_time }},{{ entry.count }} 29 | {% endfor %} 30 | delegate_to: localhost 31 | 32 | - name: write to file 33 | delegate_to: localhost 34 | copy: 35 | content: "{{ csv_rows }}" 36 | dest: "{{ export_file }}" 37 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-export/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: "export results as csv" 2 | ansible.builtin.shell: gsfscrape --config gsf-config.json --loglevel {{log_level}} --logdestination {{log_destination}} --export {{ export_file | basename }} 3 | args: 4 | chdir: "{{ work_dir }}" 5 | 6 | - name: fetch results 7 | ansible.builtin.fetch: 8 | src: "{{ work_dir }}/{{ export_file | basename }}" 9 | dest: "{{ export_file }}" 10 | flat: true 11 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-logs/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Fetch scrape log 3 | ansible.builtin.fetch: 4 | src: "{{ work_dir }}/{{ log_destination }}" 5 | dest: "{{export_dir}}/{{ ansible_host }}-scrape.log" 6 | validate_checksum: false # it keeps changing as resources are scraped 7 | flat: true 8 | 9 | - name: "Retrieve getsetfetch.service status" 10 | ansible.builtin.shell: systemctl status getsetfetch.service 11 | register: systemctl 12 | 13 | - name: "Copy getsetfetch.service status output to local file" 14 | delegate_to: localhost 15 | ansible.builtin.copy: 
16 | dest: "{{export_dir}}/{{ inventory_hostname }}-systemd-status.log" 17 | # content: "{{ systemctl.stdout }}\n\n{{ journalctl.stdout }}" 18 | content: "{{ systemctl.stdout }}" 19 | 20 | - name: Fetch getsetfetch.service output and error logs 21 | ansible.builtin.fetch: 22 | src: "{{ work_dir }}/{{item}}.log" 23 | dest: "{{export_dir}}/{{ ansible_host }}-systemd-{{item}}.log" 24 | validate_checksum: false # it keeps changing as resources are scraped 25 | flat: true 26 | with_items: 27 | - output 28 | - error -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-queue/tasks/main.yml: -------------------------------------------------------------------------------- 1 | - name: "sql: project id" 2 | postgresql_query: 3 | db: "{{ db_name }}" 4 | login_user: "{{ db_user }}" 5 | login_password: "{{ db_password }}" 6 | login_host: "localhost" 7 | query: select id from "projects" where name = %(project_name)s; 8 | named_args: 9 | project_name: "{{ project_name }}" 10 | register: sql_project_id 11 | 12 | - name: "sql: queue filtered by status" 13 | postgresql_query: 14 | db: "{{ db_name }}" 15 | login_user: "{{ db_user }}" 16 | login_password: "{{ db_password }}" 17 | login_host: "localhost" 18 | query: > 19 | select url, status, error 20 | from "{{ sql_project_id.query_result[0].id }}-queue" 21 | where status is not null and status / 100 = %(status)s; 22 | named_args: 23 | status: "{{ status }}" 24 | register: sql_queue 25 | no_log: true 26 | 27 | - name: "stats_file: write csv header" 28 | delegate_to: localhost 29 | template: 30 | src: templates/queue.j2 31 | dest: "{{ export_file }}" 32 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-queue/templates/queue.j2: -------------------------------------------------------------------------------- 1 | url,status,error 2 | {% for item in sql_queue.query_result %} 3 | {{ item.url }},{{ item.status }},{{ item.error }} 4 | {% endfor %} -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-setup/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | db: 3 | pool: 4 | min: 1 5 | max: 5 6 | 7 | scraper: 8 | npm_install: [] 9 | # 4 (node default) + db.pool.max + files/gsf-config.json->concurrency.maxRequests 10 | uv_threadpool_size: 15 11 | work_dir: /srv/gsf 12 | log: 13 | level: info 14 | destination: scrape.log 15 | files: 16 | gsf_config: templates/gsf-config.json.j2 17 | scrape_urls: '' 18 | additional: [] 19 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-setup/handlers/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: start scraper 3 | systemd: 4 | name: getsetfetch 5 | daemon_reload: true 6 | state: started 7 | enabled: yes 8 | 9 | -------------------------------------------------------------------------------- /cloud/ansible/gsf-scraper-setup/templates/getsetfetch.service.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=get-set-fetch-scraper 3 | After=network.target 4 | 5 | [Service] 6 | ExecStart=gsfscrape --config gsf-config.json {{args}} --loglevel {{scraper.log.level}} --logdestination {{scraper.log.destination}} 7 | Restart=always 8 | User=gsf 9 | Group=nogroup 10 | Environment=PATH=/usr/bin:/usr/local/bin 11 | 
Environment=UV_THREADPOOL_SIZE={{scraper.uv_threadpool_size}} 12 | WorkingDirectory={{scraper.work_dir}} 13 | StandardOutput=append:{{scraper.work_dir}}/output.log 14 | StandardError=append:{{scraper.work_dir}}/error.log 15 | 16 | [Install] 17 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /cloud/terraform/main.tf: -------------------------------------------------------------------------------- 1 | resource "digitalocean_vpc" "gsf" { 2 | name = "getsetfetch-vpc" 3 | region = var.region 4 | } 5 | 6 | resource "digitalocean_ssh_key" "gsf" { 7 | name = var.public_key_name 8 | public_key = file(var.public_key_file) 9 | } 10 | 11 | resource "digitalocean_droplet" "gsf_pg" { 12 | image = var.pg.image 13 | name = var.pg.name 14 | region = var.region 15 | size = var.pg.size 16 | monitoring = true 17 | resize_disk = false 18 | vpc_uuid = digitalocean_vpc.gsf.id 19 | 20 | ssh_keys = [ 21 | digitalocean_ssh_key.gsf.id 22 | ] 23 | 24 | user_data = file("${path.module}/user_data_pg.yml") 25 | 26 | provisioner "remote-exec" { 27 | inline = [ 28 | "cloud-init status --wait" 29 | ] 30 | 31 | connection { 32 | host = self.ipv4_address 33 | type = "ssh" 34 | user = "root" 35 | private_key = file(var.private_key_file) 36 | } 37 | } 38 | 39 | provisioner "local-exec" { 40 | command = < /etc/apt/sources.list.d/pgdg.list' 9 | - wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - 10 | - sudo apt-get update 11 | - sudo apt-get -y install postgresql -------------------------------------------------------------------------------- /cloud/terraform/user_data_scraper.yml: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | package_update: true 3 | package_upgrade: true 4 | runcmd: 5 | - 'curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -' 6 | - 'sudo apt-get install -y nodejs' -------------------------------------------------------------------------------- /cloud/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "public_key_name" { 2 | type = string 3 | } 4 | variable "public_key_file" { 5 | type = string 6 | } 7 | variable "private_key_file" { 8 | type = string 9 | } 10 | variable "ansible_inventory_file" { 11 | type = string 12 | } 13 | 14 | variable "region" { 15 | type = string 16 | } 17 | 18 | 19 | variable "pg" { 20 | type = object({ 21 | name = string 22 | image = string 23 | size = string 24 | ansible_playbook_file = string 25 | }) 26 | } 27 | 28 | variable "scraper" { 29 | type = object({ 30 | count = number 31 | name = string 32 | image = string 33 | size = string 34 | ansible_playbook_file = string 35 | }) 36 | } 37 | 38 | -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | # exports holds lots of csv, log data for each scraper node 2 | # ignore ansible csv files like majestic-million 3 | *.log 4 | *.csv 5 | *.gz 6 | 7 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/files/ExtractScriptsPlugin.js: -------------------------------------------------------------------------------- 1 | class ExtractScriptsPlugin { 2 | // defines csv export columns 3 | getContentKeys() { 4 | return [ 'scripts' ]; 5 | } 6 | 7 | test(project, resource) { 8 | if (!resource) return 
false; 9 | return (/html/i).test(resource.contentType); 10 | } 11 | 12 | apply(project, resource, DomClient) { 13 | const doc = new DomClient(resource.data); 14 | 15 | const scripts = []; 16 | Array.from(doc.querySelectorAll('script')).forEach(script => { 17 | let src = script.getAttribute('src'); 18 | let isInvalidScript; 19 | if (src) { 20 | src = src.trim(); 21 | 22 | // src may contain actual js code, or just url fragments like "http://", "//", ... 23 | isInvalidScript = src.startsWith('data:') || /function\s*\(|^(http)*:*[/\\]+$/.test(src); 24 | } 25 | else { 26 | src = ''; 27 | } 28 | 29 | if (!isInvalidScript && !scripts.includes(src)) { 30 | scripts.push(src); 31 | } 32 | }); 33 | 34 | /* 35 | a content entry is represented by an array containing one or multiple scraped values 36 | we can have multiple content entries for a single resources due to dom selectors returning multiple results 37 | */ 38 | return { content: [ scripts ] }; 39 | } 40 | } 41 | 42 | module.exports = ExtractScriptsPlugin; 43 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/inventory/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/ansible/inventory/.gitkeep -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/pg-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | vars_files: 3 | - vault.yml 4 | 5 | roles: 6 | - role: gsf-postgresql-setup 7 | vars: 8 | db: 9 | name: getsetfetch 10 | user: "{{ vault_db_user }}" 11 | password: "{{ vault_db_password }}" 12 | pg_config: 13 | max_connections: 210 # 20 scrapers * 10 max connection pool + 10 14 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/scraper-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | vars_files: 3 | - vault.yml 4 | 5 | roles: 6 | - role: gsf-scraper-setup 7 | vars: 8 | db: 9 | name: getsetfetch 10 | user: "{{ vault_db_user }}" 11 | password: "{{ vault_db_password }}" 12 | pool: 13 | min: 10 14 | max: 10 15 | scraper: 16 | uv_threadpool_size: 34 # 4 (default) + 30 (max concurrent dns.lookups) 17 | npm_install: 18 | - knex@1.0.7 19 | - pg@8.7.3 20 | - cheerio@1.0.0-rc.10 21 | - "@get-set-fetch/scraper@0.11.0" 22 | # - get-set-fetch-scraper-0.10.0.tgz 23 | log: 24 | level: info 25 | files: 26 | scrape_urls: majestic-million-compact.csv 27 | gsf_config: templates/js-scripts-config.json.j2 28 | additional: 29 | - ExtractScriptsPlugin.js -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/templates/js-scripts-config.json.j2: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "pg", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "host": "{{ db_host }}", 7 | "port": "5432", 8 | "user": "{{ db.user }}", 9 | "password": "{{ db.password }}", 10 | "database": "{{ db.name }}" 11 | }, 12 | "pool": { 13 | "min": {{ db.pool.min }}, 14 | "max": {{ db.pool.max }} 15 | }, 16 | "debug": false 17 | }, 18 | "client": { 19 | "name": "cheerio" 20 | }, 21 | "project": { 22 
| "name": "js-scripts1", 23 | "resourcePath": "", 24 | "pipeline": "dom-static-content", 25 | "pluginOpts": [ 26 | { 27 | "name": "ExtractUrlsPlugin", 28 | "maxDepth": 0 29 | }, 30 | { 31 | "name": "NodeFetchPlugin", 32 | "headers": { 33 | "Accept-Encoding": "br,gzip,deflate", 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0" 35 | }, 36 | "tlsCheck": false, 37 | "dnsResolution": "lookup" 38 | }, 39 | { 40 | "name": "ExtractScriptsPlugin", 41 | "path": "ExtractScriptsPlugin.js", 42 | "replace": "ExtractHtmlContentPlugin" 43 | } 44 | ] 45 | }, 46 | "concurrency": { 47 | "domain": { 48 | "maxRequests": 30, 49 | "delay": 50 50 | }, 51 | "proxy": { 52 | "maxRequests": 30, 53 | "delay": 50 54 | }, 55 | "session": { 56 | "maxRequests": 30, 57 | "delay": 50 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/ansible/vault.yml: -------------------------------------------------------------------------------- 1 | vault_db_user: 2 | vault_db_password: -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/extract/summarize-js-libs.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import CategoryExtractor from './CategoryExtractor'; 3 | import ScriptParser from './ScriptParser'; 4 | import { getTotals } from '../../../utils/map'; 5 | 6 | (async () => { 7 | const prefix = 'getsetfetch-dataset'; 8 | 9 | // get script data as Map> 10 | const scriptParser = new ScriptParser(); 11 | const scripts = await scriptParser.parse(`../../exports/${prefix}-javascript-libraries.csv`); 12 | 13 | // extract pathname (script name) counts with a min count of 10 14 | const { pathnameTotal } = getTotals(scripts); 15 | fs.writeFileSync( 16 | `../${prefix}-javascript-libraries-frequency-count.csv`, 17 | pathnameTotal 18 | .filter(([ script, count ]) => count >= 10) 19 | .map(([ script, count ]) => `${script},${count}`).join('\n'), 20 | ); 21 | 22 | const categoryExtractor = new CategoryExtractor(); 23 | categoryExtractor.parse(`../${prefix}-javascript-libraries-frequency-count.csv`); 24 | fs.writeFileSync('../most-used-js-libs.csv', categoryExtractor.toCsv()); 25 | })(); 26 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05-thumb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05-thumb.png -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05.csv: -------------------------------------------------------------------------------- 1 | category,script,value 2 | Utils,jQuery,419366 3 | Utils,jQuery Migrate,207334 4 | Utils,Google reCAPTCHA,54779 5 | Utils,slick,43628 6 | Utils,Modernizr,35714 7 | Utils,Owl Carousel,35043 8 | Utils,i18n,34633 9 | Utils,Underscore,34011 10 | Utils,Cloudflare Email Decode,29961 11 | Analytics,Google Analytics,191205 12 | Analytics,WordPress Stats and Insights,30127 13 | Analytics,Cloudflare Insights,11924 14 | Analytics,gtm4wp-form-move-tracker,395 15 | Analytics,Google Analytics For 
Wordpress,331 16 | CMS,WordPress Core,150323 17 | CMS,WordPress Contact Form 7,68478 18 | CMS,WordPress Elementor,45632 19 | CMS,WordPress WooCommerce,26727 20 | CMS,WordPress Slider Revolution,23339 21 | CMS,WordPress Utilities,16101 22 | CMS,Gravity Forms,487 23 | CMS,WordPress Visual Composer,25 24 | UI Widgets,Bootstrap,97579 25 | UI Widgets,jQuery UI,98659 26 | UI Widgets,imagesLoaded,44877 27 | UI Widgets,jQuery FitVids,19770 28 | UI Widgets,Popper,19495 29 | UI Widgets,Google Maps,19283 30 | UI Widgets,jQuery FlexSlider,18303 31 | UI Widgets,jQuery Magnific Popup,16463 32 | UI Widgets,jQuery Fancybox,15192 33 | UI Widgets,Animate on Scroll,4947 34 | Advertising,Google Adsense,52970 35 | Advertising,Google Publisher Tags,15515 36 | Cookies,jQuery Cookie,16461 37 | Cookies,Cookie Consent,10335 38 | Cookies,JavaScript Cookie,9972 39 | Cookies,OneTrust Cookies Consent,9493 40 | Cookies,consent.cookiebot.com/uc,6692 41 | Cookies,Cookie Law Info,307 42 | Cookies,Cookie Notice,248 43 | Optimization,Cloudflare Rocket Loader,13218 44 | Optimization,WordPress Autoptimize,13151 45 | Optimization,LazySizes,11695 46 | Optimization,optimize,6584 47 | Optimization,LazyLoad,5669 48 | Optimization,smush-lazy-load,127 -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/charts/most-used-js-libs-2022-06-05.png -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/exports/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/datasets/javascript-libs-from-top-1mm-sites/exports/.gitkeep -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/readme.md: -------------------------------------------------------------------------------- 1 | ### Javascript Libraries From Top 1 Million Sites 2 | 3 | CSV files available as [open access dataset](https://zenodo.org/record/6617972) 4 | - getsetfetch-dataset-javascript-libraries.csv.gz (146 MB) 5 | - Each row contains a page URL followed by script source URLs (absolute or relative) encountered in that page. Inline scripts have an \"\" value. \ 6 | ex: https:// sitemaps.org/,"\","/lang.js" 7 | 8 | - getsetfetch-dataset-javascript-libraries-frequency-count.csv.gz (214 KB) 9 | - Each row contains a partial script pathname followed by a frequency count. 10 | The pathname is split in fragments based on "/" and expanded from right to left until the first non-generic fragment is found. If the full pathname contains only generic keywords (index, main, dist, etc...) the script hostname is added as well. Common suffixes like .min, .min.js are removed. \ 11 | ex: jquery/ui/core,62554 12 | 13 | 14 | #### Get Input Data 15 | The project scrapes URLs from Majestic 1 Million (June 5th, 2022). \ 16 | Download the csv from the [official site](https://majestic.com/reports/majestic-million). \ 17 | Keep 3rd column with the domain name. Manually remove 1st row containing labels. 
18 | ```bash 19 | cd ansible/files 20 | cut -d, -f 3 downloaded-majestic-million.csv > majestic-million-compact.csv 21 | sed -i '1d' majestic-million-compact.csv 22 | ``` 23 | 24 | majestic-million-compact.csv is referenced by ansible playbook [scraper-setup.yml](ansible/scraper-setup.yml). It will be used to add the URLs to the initial scraping queue. 25 | 26 | #### Scrape in Cloud 27 | See [getsetfetch.org/blog/cloud-scraping-running-existing-projects.html](https://getsetfetch.org/blog/cloud-scraping-running-existing-projects.html) on detailed instructions on how to setup Terraform and Ansible, start scraping, monitor progress and export scraped content. 28 | 29 | The defined terraform module [main.tf](terraform/main.tf) provisions one central PostgreSQL instance and 20 scraper instances deployed on DigitalOcean Frankfurt FRA1 datacenter. 30 | 31 | ```bash 32 | terraform apply \ 33 | -var "api_token=${API_TOKEN}" \ 34 | -var "public_key_file=" \ 35 | -var "private_key_file=" \ 36 | -parallelism=30 37 | ``` 38 | 39 | #### Summarize Scraped Data 40 | ```bash 41 | cd charts/extract 42 | npx ts-node summarize-js-libs.ts 43 | ``` 44 | 45 | #### Generate Chart(s) 46 | Start a basic http server serving static files from current directory on localhost:9000. 47 | ```bash 48 | cd charts 49 | npx ts-node ../../utils/serve-static.ts 50 | ``` 51 | #### Most Used Javascript Libraries (percentage) 52 | [![Most Used Javascript Libraries](./charts/most-used-js-libs-2022-06-05-thumb.png)](./charts/most-used-js-libs-2022-06-05.svg) 53 | 54 | - http://localhost:9000/most-used-js-libs.html 55 | - filters out libraries with less than 1% usage 56 | - groups libraries into categories with each category having a maximum of 9 entries 57 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/terraform/main.tf: -------------------------------------------------------------------------------- 1 | module "js_libs" { 2 | source = "../../../cloud/terraform" 3 | 4 | region = "fra1" 5 | public_key_name = "get-set-fetch" 6 | public_key_file = var.public_key_file 7 | private_key_file = var.private_key_file 8 | ansible_inventory_file = "../ansible/inventory/hosts.cfg" 9 | 10 | pg = { 11 | name = "pg" 12 | image = "ubuntu-20-04-x64" 13 | size = "s-4vcpu-8gb" 14 | ansible_playbook_file = "../ansible/pg-setup.yml" 15 | } 16 | 17 | scraper = { 18 | count = 20 19 | name = "scraper" 20 | image = "ubuntu-20-04-x64" 21 | size = "s-1vcpu-1gb" 22 | ansible_playbook_file = "../ansible/scraper-setup.yml" 23 | } 24 | } 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | digitalocean = { 4 | source = "digitalocean/digitalocean" 5 | version = "~> 2.4" 6 | } 7 | } 8 | } 9 | 10 | provider "digitalocean" { 11 | token = var.api_token 12 | } 13 | -------------------------------------------------------------------------------- /datasets/javascript-libs-from-top-1mm-sites/terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "api_token" {} 2 | variable "public_key_file" {} 3 | variable "private_key_file" {} 4 | -------------------------------------------------------------------------------- /datasets/readme.md: 
-------------------------------------------------------------------------------- 1 | ### Datasets 2 | 3 | Each directory represents a scraping project to be run in the cloud using Terraform and Ansible. Unless otherwise specified each project defines a central PostgreSQL instance and 20 scraper instances deployed on DigitalOcean Frankfurt FRA1 datacenter. 4 | 5 | Check [getsetfetch.org/node/cloud.html](https://getsetfetch.org/node/cloud.html) for details on available Terraform modules and Ansible roles. 6 | 7 | Check [getsetfetch.org/blog/cloud-scraping-running-existing-projects.html](https://getsetfetch.org/blog/cloud-scraping-running-existing-projects.html) for detailed info on how to run the projects. 8 | 9 | Available datasets: 10 | - [Javascript Libraries From Top 1 Million Sites](javascript-libs-from-top-1mm-sites/) 11 | -------------------------------------------------------------------------------- /datasets/tsconfig.datasets.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "esnext", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "newLine": "LF", 11 | }, 12 | "include": [ 13 | "./" 14 | ], 15 | } -------------------------------------------------------------------------------- /datasets/utils/map.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/prefer-default-export */ 2 | 3 | export function getTotals(map: Map>) 4 | : {hostnameTotal:[string, number][], pathnameTotal: [string, number][]} { 5 | const hostnameTotalMap:Map = new Map(); 6 | const pathnameTotalMap:Map = new Map(); 7 | 8 | let totalScriptCount = 0; 9 | map.forEach((pathnames, hostname) => { 10 | let hostnameCount = 0; 11 | 12 | // sum pathnames (script names) across all hostnames 13 | pathnames.forEach((count, pathname) => { 14 | addToMap(pathnameTotalMap, pathname, count); 15 | hostnameCount += count; 16 | }); 17 | 18 | // record each hostname total scripts 19 | hostnameTotalMap.set(hostname, hostnameCount); 20 | totalScriptCount += hostnameCount; 21 | }); 22 | 23 | const avgScriptCount = totalScriptCount / pathnameTotalMap.size; 24 | 25 | // order descending 26 | const hostnameTotal = Array.from(hostnameTotalMap.entries()).sort((a: [string, number], b: [string, number]) => b[1] - a[1]); 27 | const pathnameTotal = Array.from(pathnameTotalMap.entries()).sort((a: [string, number], b: [string, number]) => b[1] - a[1]); 28 | 29 | return { hostnameTotal, pathnameTotal }; 30 | } 31 | 32 | export function getTopEntries(map: Map>, topHostnames:number = 20, topPathnames:number = 20) 33 | : {hostnames: string[], pathnames: string[]} { 34 | const { hostnameTotal, pathnameTotal } = getTotals(map); 35 | 36 | const hostnames:string[] = hostnameTotal.slice(0, topHostnames).map(([ key ]) => key); 37 | const pathnames:string[] = pathnameTotal.slice(0, topPathnames).map(([ key ]) => key); 38 | 39 | return { hostnames, pathnames }; 40 | } 41 | 42 | export function addToMap(map: Map, key: string, val: number = 1) { 43 | const count = map.get(key); 44 | if (!count) { 45 | map.set(key, val); 46 | } 47 | else { 48 | map.set(key, count + val); 49 | } 50 | } 51 | 52 | export function addToNestedMap(map: Map>, mainKey: string, subKey: string, val: number = 1) { 53 | const subMap = map.get(mainKey); 54 | if (!subMap) { 55 | map.set(mainKey, new Map([ [ subKey, 1 ] ])); 56 | } 57 | else 
{ 58 | addToMap(subMap, subKey, val); 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /datasets/utils/serve-static.ts: -------------------------------------------------------------------------------- 1 | import http from 'http'; 2 | import url from 'url'; 3 | import fs from 'fs'; 4 | import path from 'path'; 5 | 6 | const port = process.argv[2] ? parseInt(process.argv[2], 10) : 9000; 7 | 8 | http.createServer((req, res) => { 9 | // console.log(`${req.method} ${req.url}`); 10 | 11 | // parse URL 12 | const parsedUrl = url.parse(req.url); 13 | // extract URL path 14 | let pathname = `.${parsedUrl.pathname}`; 15 | // based on the URL path, extract the file extension. e.g. .js, .doc, ... 16 | const { ext } = path.parse(pathname); 17 | // maps file extension to MIME type 18 | const map = { 19 | '.ico': 'image/x-icon', 20 | '.html': 'text/html', 21 | '.js': 'text/javascript', 22 | '.json': 'application/json', 23 | '.css': 'text/css', 24 | '.png': 'image/png', 25 | '.jpg': 'image/jpeg', 26 | '.wav': 'audio/wav', 27 | '.mp3': 'audio/mpeg', 28 | '.svg': 'image/svg+xml', 29 | '.pdf': 'application/pdf', 30 | '.doc': 'application/msword', 31 | }; 32 | 33 | fs.exists(pathname, exist => { 34 | if (!exist) { 35 | // if the file is not found, return 404 36 | res.statusCode = 404; 37 | res.end(`File ${pathname} not found!`); 38 | return; 39 | } 40 | 41 | // if it's a directory, search for an index file matching the extension 42 | if (fs.statSync(pathname).isDirectory()) pathname += `/index${ext}`; 43 | 44 | // read file from file system 45 | fs.readFile(pathname, (err, data) => { 46 | if (err) { 47 | res.statusCode = 500; 48 | res.end(`Error getting the file: ${err}.`); 49 | } 50 | else { 51 | // if the file is found, set Content-type and send data 52 | res.setHeader('Content-type', map[ext] || 'text/plain'); 53 | res.end(data); 54 | } 55 | }); 56 | }); 57 | }).listen(port); 58 | 59 | console.log(`Server listening on port ${port}`); 60 | -------------------------------------------------------------------------------- /development.md: -------------------------------------------------------------------------------- 1 | ## Debugging in VSCode 2 | 3 | ### All unit tests except command line 4 | Use VSCode's default "Run and Debug" settings. From the "Run and Debug" panel select the "Node.js" option; when the "Select Launch Configuration" command palette appears, select one of the "Run Script: test:" options. 5 | 6 | ### Command line unit tests 7 | Use the VSCode launch configuration below, modifying `args` with the command line arguments you want to debug against. 
8 | ```json 9 | { 10 | "type": "pwa-node", 11 | "request": "launch", 12 | "name": "Launch Get-Set-Fetch Cli", 13 | "program": "${workspaceFolder}/dist/cjs/cli/cli.js", 14 | "args": [ "--version"], 15 | "skipFiles": [ 16 | "/**" 17 | ], 18 | "preLaunchTask": "tsc: build - tsconfig.debug.json", 19 | "outFiles": [ 20 | "${workspaceFolder}/dist/cjs/**/*.js" 21 | ] 22 | } 23 | ``` -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.15 2 | 3 | # 1000 is the first UID assigned to a non root user (debian, ubuntu) 4 | # this can mitigate permissions issues when mapping volumes between host and container 5 | ARG USER_ID=1000 6 | ARG GROUP_ID=1000 7 | 8 | ARG STORAGE 9 | ARG BROWSER_CLIENT 10 | ARG DOM_CLIENT 11 | ARG VERSION 12 | ARG BRANCH=main 13 | 14 | # core apk packages 15 | RUN apk add --no-cache nodejs npm git 16 | 17 | # node-gyp required for some packages like @vscode/sqlite3, 18 | # remove the virtual pkg group at the end 19 | RUN apk add --no-cache --virtual .gyp g++ make py3-pip 20 | 21 | # puppeteer apk packages 22 | # install chromium (91.0.4472.164-r0) package, https://pkgs.alpinelinux.org/packages?name=chromium&branch=v3.14 23 | # puppeteer v9.1.1 works with this chromium version, https://github.com/puppeteer/puppeteer/releases 24 | RUN if [ "$BROWSER_CLIENT" = "puppeteer" ] ; then apk add --no-cache \ 25 | chromium \ 26 | nss \ 27 | freetype \ 28 | harfbuzz \ 29 | ca-certificates \ 30 | ttf-freefont; fi 31 | 32 | # puppeteer env variables 33 | # skip installing chromium, puppeteer will be using the installed package 34 | ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ 35 | PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser 36 | 37 | # add user so we don't need --no-sandbox, https://developers.google.com/web/tools/puppeteer/troubleshooting#running-on-alpine 38 | # match uid, gid coming from the host user 39 | RUN addgroup --system --gid $GROUP_ID gsfuser 40 | RUN adduser --system --uid $USER_ID --ingroup gsfuser gsfuser 41 | 42 | # run everything after as non-privileged user 43 | USER gsfuser 44 | 45 | RUN mkdir -p /home/gsfuser/Downloads /home/gsfuser/scraper 46 | 47 | # install and build get-set-fetch/scraper from github sources 48 | RUN if [ "$VERSION" = "source" ] ; then \ 49 | git clone -b "$BRANCH" --single-branch https://github.com/get-set-fetch/scraper.git /home/gsfuser/scraper \ 50 | && cd /home/gsfuser/scraper \ 51 | && npm ci \ 52 | && npm run build; fi 53 | 54 | WORKDIR /home/gsfuser/scraper 55 | 56 | # associative arrays not available in sh or ash 57 | # use some nested case statements for linking storage, browser and dom client npm packages to gsf versions 58 | RUN case "$VERSION" in \ 59 | 'source') \ 60 | case "$STORAGE" in \ 61 | 'sqlite') npm install knex@1.0.7 @vscode/sqlite3@5.0.8 ;; \ 62 | 'pg') npm install knex@1.0.7 pg@8.7.3 ;; \ 63 | 'mysql') npm install knex@1.0.7 mysql@2.18.1 ;; \ 64 | esac; \ 65 | case "$BROWSER_CLIENT" in \ 66 | 'puppeteer') npm install puppeteer@14.3.0 ;; \ 67 | 'playwright') npm install playwright-core@1.13.1 playwright-chromium@1.13.1 ;; \ 68 | esac; \ 69 | case "$DOM_CLIENT" in \ 70 | 'cheerio') npm install cheerio@1.0.0-rc.10 ;; \ 71 | 'jsdom') npm install jsdom@16.7.0 ;; \ 72 | esac \ 73 | ;; \ 74 | esac 75 | 76 | # remove node-gyp related packages and switch back to gsfuser 77 | USER root 78 | RUN apk del .gyp 79 | USER gsfuser 80 | 81 | # invoke entrypoint as exec form, gsfscrape will 
receive signals such as SIGTERM 82 | ENTRYPOINT ["/home/gsfuser/scraper/bin/gsfscrape"] 83 | 84 | # default arguments 85 | CMD [ "--version" ] -------------------------------------------------------------------------------- /docker/data/config-pg-puppeteer.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "pg", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "host": "pg", 7 | "port": "5432", 8 | "user": "gsf-user", 9 | "password": "gsf-pswd", 10 | "database": "gsf-db" 11 | }, 12 | "debug": false 13 | }, 14 | "client": { 15 | "name": "puppeteer", 16 | "opts": { 17 | "ignoreHTTPSErrors": true, 18 | "args": [ 19 | "--ignore-certificate-errors", 20 | "--no-first-run", 21 | "--single-process" 22 | ] 23 | } 24 | }, 25 | "project": { 26 | "name": "myProj", 27 | "pipeline": "browser-static-content", 28 | "pluginOpts": [ 29 | { 30 | "name": "ExtractHtmlContentPlugin", 31 | "selectorPairs": [ 32 | { 33 | "contentSelector": "h3", 34 | "label": "headline" 35 | } 36 | ] 37 | }, 38 | { 39 | "name": "InsertResourcesPlugin", 40 | "maxResources": 1 41 | }, 42 | { 43 | "name": "UpsertResourcePlugin", 44 | "keepHtmlData": true 45 | } 46 | ], 47 | "resources": [ 48 | { 49 | "url": "https://getsetfetch.org/index.html" 50 | } 51 | ] 52 | } 53 | } -------------------------------------------------------------------------------- /docker/data/config-sqlite-cheerio.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "myProj", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractHtmlContentPlugin", 19 | "selectorPairs": [ 20 | { 21 | "contentSelector": "h3" 22 | } 23 | ] 24 | }, 25 | { 26 | "name": "InsertResourcesPlugin", 27 | "maxResources": 1 28 | } 29 | ], 30 | "resources": [ 31 | { 32 | "url": "https://getsetfetch.org/index.html" 33 | } 34 | ] 35 | } 36 | } -------------------------------------------------------------------------------- /docker/data/config-sqlite-puppeteer.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer", 12 | "opts": { 13 | "ignoreHTTPSErrors": true, 14 | "args": [ 15 | "--ignore-certificate-errors", 16 | "--no-first-run", 17 | "--single-process" 18 | ] 19 | } 20 | }, 21 | "project": { 22 | "name": "myProj", 23 | "pipeline": "browser-static-content", 24 | "pluginOpts": [ 25 | { 26 | "name": "ExtractHtmlContentPlugin", 27 | "selectorPairs": [ 28 | { 29 | "contentSelector": "h3" 30 | } 31 | ] 32 | }, 33 | { 34 | "name": "InsertResourcesPlugin", 35 | "maxResources": 1 36 | } 37 | ], 38 | "resources": [ 39 | { 40 | "url": "https://getsetfetch.org/index.html" 41 | } 42 | ] 43 | } 44 | } -------------------------------------------------------------------------------- /docker/docker.md: -------------------------------------------------------------------------------- 1 | For both docker build and run commands make this repo directory the current working directory. 2 | 3 | ## Build 4 | All scraper images are based on alpine:3.14 docker image. 
5 | You have to build the images locally; they're not published on Docker Hub. 6 | A set of build-time variables allows you to customize the docker image. 7 | 8 | | Build-time variable | Values | Default | 9 | | ------- | ------- | -- | 10 | | BROWSER_CLIENT | puppeteer | - | 11 | | DOM_CLIENT | cheerio, jsdom | - | 12 | | STORAGE | sqlite, pg, mysql | - | 13 | | VERSION | source | - | 14 | | USER_ID | | 1000 | 15 | | GROUP_ID | | 1000 | 16 | 17 | The `BROWSER_CLIENT` and `DOM_CLIENT` variables are mutually exclusive. You either scrape using a headless browser or an HTML/DOM parser library. 18 | 19 | `USER_ID` and `GROUP_ID` are used to add the `gsfuser` user to the container. This non-root user runs the scraper and reads and writes data under the `/home/gsfuser/scraper/data` container path mounted from the host. Use `--build-arg USER_ID=$(id -u)`, `--build-arg GROUP_ID=$(id -g)` to provide the same uid/gid as the currently logged-in user. If you're on Windows you can ignore these two variables. 20 | 21 | Create an image using cheerio, sqlite and the latest source code. 22 | ```bash 23 | docker build \ 24 | --tag getsetfetch \ 25 | --build-arg DOM_CLIENT=cheerio \ 26 | --build-arg STORAGE=sqlite \ 27 | --build-arg VERSION=source \ 28 | --build-arg USER_ID=$(id -u) \ 29 | --build-arg GROUP_ID=$(id -g) . 30 | ``` 31 | 32 | Create an image using puppeteer, sqlite and the latest source code. 33 | ```bash 34 | docker build \ 35 | --tag getsetfetch \ 36 | --build-arg BROWSER_CLIENT=puppeteer \ 37 | --build-arg STORAGE=sqlite \ 38 | --build-arg VERSION=source \ 39 | --build-arg USER_ID=$(id -u) \ 40 | --build-arg GROUP_ID=$(id -g) . 41 | ``` 42 | 43 | 44 | ## Run 45 | All examples keep config, log, sqlite and csv files under the `/home/gsfuser/scraper/data` container path mounted from the host, for easy access to logs and exported scraped content. The remaining arguments are [CLI arguments](/get-set-fetch/scraper#command-line-interface). 46 | 47 | 48 | Log, scrape and export data using [config-sqlite-cheerio.json](data/config-sqlite-cheerio.json). 49 | ```bash 50 | docker run \ 51 | -v /scraper/docker/data:/home/gsfuser/scraper/data getsetfetch:latest \ 52 | --version \ 53 | --config data/config-sqlite-cheerio.json \ 54 | --save \ 55 | --overwrite \ 56 | --scrape \ 57 | --loglevel info \ 58 | --logdestination data/scrape.log \ 59 | --export data/export.csv 60 | ``` 61 | 62 | Log, scrape and export data using [config-sqlite-puppeteer.json](data/config-sqlite-puppeteer.json). Use either `--security-opt seccomp=unconfined` or `--security-opt seccomp=data/chromium-security-profile.json` ([source blog](https://blog.jessfraz.com/post/how-to-use-new-docker-seccomp-profiles/)) to allow Chromium syscalls. 63 | ```bash 64 | docker run \ 65 | --security-opt seccomp=unconfined \ 66 | -v /scraper/docker/data:/home/gsfuser/scraper/data getsetfetch:latest \ 67 | --version \ 68 | --config data/config-sqlite-puppeteer.json \ 69 | --save \ 70 | --overwrite \ 71 | --scrape \ 72 | --loglevel info \ 73 | --logdestination data/scrape.log \ 74 | --export data/export.csv 75 | ``` 76 | 77 | You can also start the scraper as a [docker-compose service](pg-puppeteer/docker-compose.yml). This example scrapes using puppeteer and postgresql.
Remember to build the corresponding image `--build-arg STORAGE=pg --build-arg BROWSER_CLIENT=puppeteer` first :) 78 | 79 | ```bash 80 | cd ./pg-puppeteer 81 | 82 | # start 83 | docker-compose up -d 84 | 85 | # stop 86 | docker-compose down 87 | ``` 88 | -------------------------------------------------------------------------------- /docker/pg-puppeteer/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.3" 2 | services: 3 | pg: 4 | image: postgres:11-alpine 5 | environment: 6 | POSTGRES_USER: gsf-user 7 | POSTGRES_PASSWORD: gsf-pswd 8 | POSTGRES_DB: gsf-db 9 | 10 | gsf: 11 | image: getsetfetch:latest 12 | command: > 13 | --version 14 | --config data/config-pg-puppeteer.json 15 | --save 16 | --overwrite 17 | --scrape 18 | --loglevel info 19 | --logdestination data/scrape.log 20 | --export data/export.csv 21 | 22 | volumes: 23 | - ../data:/home/gsfuser/scraper/data 24 | security_opt: 25 | - seccomp:"../data/chromium-security-profile.json" 26 | depends_on: 27 | - pg 28 | 29 | volumes: 30 | data: -------------------------------------------------------------------------------- /examples/article-excerpts/ReadabilityPlugin.ts: -------------------------------------------------------------------------------- 1 | import { Readability } from '@mozilla/readability'; 2 | import { Plugin, Project, Resource } from '../../src/index'; 3 | 4 | /** 5 | * IMPORTANT NOTE ! 6 | * if you're using plain javascript besides removing Project and Resource types, don't extend the abstract Plugin class 7 | * @rollup/plugin-commonjs will bundle the entire @get-set-fetch/scraper project including fs, jszip, ... imports 8 | */ 9 | export default class ReadabilityPlugin extends Plugin { 10 | opts = { 11 | domRead: true, 12 | } 13 | 14 | test(project:Project, resource:Resource) { 15 | if (!resource) return false; 16 | return (/html/i).test(resource.contentType); 17 | } 18 | 19 | apply() { 20 | const article = new Readability(document).parse(); 21 | return { content: [ [ article.excerpt ] ] }; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /examples/article-excerpts/article-excerpts-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "article-excerpts.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer", 12 | "opts": { 13 | "args": [ 14 | "--disable-gpu", 15 | "--disable-dev-shm-usage", 16 | "--disable-setuid-sandbox", 17 | "--no-first-run", 18 | "--no-sandbox", 19 | "--no-zygote", 20 | "--single-process" 21 | ] 22 | } 23 | }, 24 | "project": { 25 | "name": "BBCTechNews", 26 | "pipeline": "browser-static-content", 27 | "pluginOpts": [ 28 | { 29 | "name": "ExtractUrlsPlugin", 30 | "maxDepth": 1, 31 | "selectorPairs": [ 32 | { 33 | "urlSelector": "a[href ^= \"/news/technology-\"]" 34 | } 35 | ] 36 | }, 37 | { 38 | "name": "ReadabilityPlugin", 39 | "path": "ReadabilityPlugin.ts", 40 | "replace": "ExtractHtmlContentPlugin", 41 | "domRead": true 42 | }, 43 | { 44 | "name": "InsertResourcesPlugin", 45 | "maxResources": 5 46 | } 47 | ], 48 | "resources": [ 49 | { 50 | "url": "https://www.bbc.com/news/technology" 51 | } 52 | ] 53 | }, 54 | "concurrency": { 55 | "session": { 56 | "maxRequests": 1, 57 | "delay": 3000 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- 
/examples/article-excerpts/article-excerpts.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import path from 'path'; 3 | import { destination } from 'pino'; 4 | import { PluginStore, Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 5 | 6 | /* scrape configuration */ 7 | import ScrapeConfig from './article-excerpts-config.json'; 8 | 9 | // write all INFO and above messages to 'scrape.log' 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | (async () => { 13 | /* 14 | manually register external plugin 15 | not really needed in this case since the external config file contains a 'path' property to the ReadabilityPlugin 16 | enabling automatic plugin registration 17 | if config file is loaded from cli only js plugin files can be imported 18 | */ 19 | await PluginStore.init(); 20 | await PluginStore.addEntry(path.join(__dirname, 'ReadabilityPlugin.ts')); 21 | 22 | /* create a scraper instance with the above settings */ 23 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 24 | 25 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 26 | const exporter = new CsvExporter({ filepath: 'article-excerpts.csv' }); 27 | await exporter.export(project); 28 | }); 29 | 30 | /* start scraping by specifying project and concurrency settings */ 31 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 32 | })(); 33 | -------------------------------------------------------------------------------- /examples/cloud/ansible/files/ExtractScriptsPlugin.js: -------------------------------------------------------------------------------- 1 | class ExtractScriptsPlugin { 2 | // defines csv export columns 3 | getContentKeys() { 4 | return [ 'scripts' ]; 5 | } 6 | 7 | test(project, resource) { 8 | if (!resource) return false; 9 | return (/html/i).test(resource.contentType); 10 | } 11 | 12 | apply(project, resource, DomClient) { 13 | const doc = new DomClient(resource.data); 14 | 15 | const scripts = []; 16 | Array.from(doc.querySelectorAll('script')).forEach(script => { 17 | const src = script.getAttribute('src') ? 
script.getAttribute('src') : ''; 18 | if (!scripts.includes(src)) { 19 | scripts.push(src); 20 | } 21 | }); 22 | 23 | /* 24 | a content entry is represented by an array containing one or multiple scraped values 25 | we can have multiple content entries for a single resources due to dom selectors returning multiple results 26 | */ 27 | return { content: [ scripts ] }; 28 | } 29 | } 30 | 31 | module.exports = ExtractScriptsPlugin; 32 | -------------------------------------------------------------------------------- /examples/cloud/ansible/files/gsf-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "pg", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "host": "", 7 | "port": "5432", 8 | "user": "", 9 | "password": "", 10 | "database": "" 11 | }, 12 | "pool": { 13 | "min": 2, 14 | "max": 50 15 | }, 16 | "debug": false 17 | }, 18 | "client": { 19 | "name": "cheerio" 20 | }, 21 | "project": { 22 | "name": "top-1", 23 | "resourcePath": "", 24 | "pipeline": "dom-static-content", 25 | "pluginOpts": [ 26 | { 27 | "name": "ExtractUrlsPlugin", 28 | "maxDepth": 0 29 | }, 30 | { 31 | "name": "NodeFetchPlugin", 32 | "headers": { 33 | "Accept-Encoding": "br,gzip,deflate", 34 | "User-Agent": "" 35 | } 36 | }, 37 | { 38 | "name": "ExtractScriptsPlugin", 39 | "path": "ExtractScriptsPlugin.js", 40 | "replace": "ExtractHtmlContentPlugin" 41 | } 42 | ] 43 | }, 44 | "concurrency": { 45 | "domain": { 46 | "maxRequests": 100, 47 | "delay": 1 48 | }, 49 | "proxy": { 50 | "maxRequests": 100, 51 | "delay": 1 52 | }, 53 | "session": { 54 | "maxRequests": 100, 55 | "delay": 1 56 | } 57 | } 58 | } -------------------------------------------------------------------------------- /examples/cloud/ansible/files/gsf.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=get-set-fetch-scraper 3 | After=network.target 4 | 5 | [Service] 6 | ExecStart=+gsfscrape --config gsf-config.json --loglevel --logdestination 7 | Restart=always 8 | User=nobody 9 | Group=nogroup 10 | Environment=PATH=/usr/bin:/usr/local/bin 11 | WorkingDirectory= 12 | 13 | [Install] 14 | WantedBy=multi-user.target 15 | -------------------------------------------------------------------------------- /examples/cloud/ansible/pg-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | become: true 3 | become_user: root 4 | 5 | vars_files: 6 | - vars.yml 7 | 8 | tasks: 9 | - name: "Create app database" 10 | postgresql_db: 11 | state: present 12 | name: "{{ db_name }}" 13 | become: true 14 | become_user: postgres 15 | 16 | - name: "Create db user" 17 | postgresql_user: 18 | state: present 19 | name: "{{ db_user }}" 20 | password: "{{ db_password }}" 21 | become: true 22 | become_user: postgres 23 | 24 | - name: "Grant db user access to app db" 25 | postgresql_privs: 26 | type: database 27 | database: "{{ db_name }}" 28 | roles: "{{ db_user }}" 29 | grant_option: false 30 | privs: all 31 | become: true 32 | become_user: postgres 33 | 34 | - name: "Allow remote connections on private network" 35 | postgresql_set: 36 | name: listen_addresses 37 | value: 'localhost, {{ private_ip_address }}' 38 | become: true 39 | become_user: postgres 40 | notify: restart postgres 41 | 42 | - name: "Allow md5 connection for the db user" 43 | postgresql_pg_hba: 44 | dest: /etc/postgresql/14/main/pg_hba.conf 45 | contype: host 46 | address: all 47 | databases: all 48 | method: md5 49 | 
users: "{{ db_user }}" 50 | create: true 51 | become: true 52 | become_user: postgres 53 | notify: restart postgres 54 | 55 | - name: "Tunning for 8GB RAM" 56 | postgresql_set: 57 | name: "{{ item.key }}" 58 | value: "{{ item.value }}" 59 | become: true 60 | become_user: postgres 61 | notify: restart postgres 62 | with_dict: "{{ pg_config }}" 63 | 64 | handlers: 65 | - name: restart postgres 66 | service: name=postgresql state=restarted 67 | -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-export.yml: -------------------------------------------------------------------------------- 1 | - hosts: scraper[0] 2 | 3 | vars_files: 4 | - vars.yml 5 | 6 | tasks: 7 | - name: "export results as csv" 8 | ansible.builtin.shell: gsfscrape --config gsf-config.json --loglevel {{scrape_log_level}} --logdestination {{scrape_log_destination}} --scrape --export {{ scrape_export_file }} 9 | args: 10 | chdir: "{{ scrape_dir }}" 11 | 12 | - name: fetch results 13 | ansible.builtin.fetch: 14 | src: "{{ scrape_dir }}/{{ scrape_export_file }}" 15 | dest: "../exports/{{ scrape_export_file }}" 16 | flat: true 17 | -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-logs.yml: -------------------------------------------------------------------------------- 1 | - hosts: scraper 2 | 3 | vars_files: 4 | - vars.yml 5 | 6 | tasks: 7 | - name: fetch scrape log 8 | ansible.builtin.fetch: 9 | src: "{{ scrape_dir }}/{{ scrape_log_destination }}" 10 | dest: "../exports/{{ ansible_host }}-scrape.log" 11 | validate_checksum: false # it keeps changing as resources are scraped 12 | flat: true 13 | -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-setup.yml: -------------------------------------------------------------------------------- 1 | - hosts: all 2 | become: true 3 | become_user: root 4 | 5 | vars_files: 6 | - vars.yml 7 | 8 | tasks: 9 | # community.general.npm is not installing cheerio@rc version for some reason 10 | - name: "Install get-set-fetch scraper and peer dependencies" 11 | ansible.builtin.shell: npm install -g {{ item.name }}@{{ item.version }} 12 | with_items: 13 | - { name: "@get-set-fetch/scraper", version: "0.9.0" } 14 | - { name: "knex", version: "1.0.5" } 15 | - { name: "pg", version: "8.7.1" } 16 | - { name: "cheerio", version: "1.0.0-rc.10" } 17 | 18 | - name: Create a directory if it does not exist 19 | ansible.builtin.file: 20 | path: "{{ scrape_dir }}" 21 | state: directory 22 | mode: 0644 23 | 24 | - name: "Copy input csv file(s)" 25 | ansible.builtin.copy: 26 | src: "files/{{ item }}" 27 | dest: "{{ scrape_dir }}/{{ item }}" 28 | owner: root 29 | group: root 30 | mode: 0644 31 | with_items: 32 | - "{{ scrape_resource_file }}" 33 | when: scraper_idx == "0" 34 | 35 | - name: "Copy input config and plugin files" 36 | ansible.builtin.copy: 37 | src: "files/{{ item }}" 38 | dest: "{{ scrape_dir }}/{{ item }}" 39 | owner: root 40 | group: root 41 | mode: 0644 42 | with_items: 43 | - gsf-config.json 44 | - ExtractScriptsPlugin.js 45 | 46 | - name: "Update scrape config" 47 | ansible.builtin.replace: 48 | path: "{{ scrape_dir }}/gsf-config.json" 49 | regexp: "{{ item.regexp }}" 50 | replace: "{{ item.replace }}" 51 | with_items: 52 | - { regexp: "", replace: "{{ db_host }}" } 53 | - { regexp: "", replace: "{{ db_user }}" } 54 | - { regexp: "", replace: "{{ db_password }}" } 55 | - { regexp: "", replace: "{{ db_name }}" } 56 | 
- { regexp: "", replace: "{{ scrape_user_agent }}" } 57 | 58 | - name: "Update scrape config external resource path" 59 | ansible.builtin.replace: 60 | path: "{{ scrape_dir }}/gsf-config.json" 61 | regexp: "" 62 | replace: "{{ scrape_resource_file }}" 63 | when: scraper_idx == "0" 64 | 65 | - name: "Remove scrape config external resource path" 66 | ansible.builtin.replace: 67 | path: "{{ scrape_dir }}/gsf-config.json" 68 | regexp: ".+resourcePath.+" 69 | replace: "" 70 | when: scraper_idx != "0" 71 | 72 | - name: Copy systemd service file 73 | ansible.builtin.copy: 74 | src: files/gsf.service 75 | dest: /etc/systemd/system 76 | owner: root 77 | group: root 78 | 79 | - name: "Update systemd service file" 80 | ansible.builtin.replace: 81 | path: "/etc/systemd/system/gsf.service" 82 | regexp: "{{ item.regexp }}" 83 | replace: "{{ item.replace }}" 84 | with_items: 85 | - { regexp: "", replace: "{{ scrape_log_level }}" } 86 | - { regexp: "", replace: "{{ scrape_log_destination }}" } 87 | - { regexp: "", replace: "{{ scrape_dir }}" } 88 | - { regexp: "", replace: "{{ '--save --discover --retry 30' if scraper_idx == '0' else '--discover --retry 30' }}" } 89 | notify: 90 | - start scraper 91 | 92 | handlers: 93 | - name: start scraper 94 | systemd: 95 | name: gsf 96 | state: started 97 | enabled: yes -------------------------------------------------------------------------------- /examples/cloud/ansible/scraper-systemd-logs.yml: -------------------------------------------------------------------------------- 1 | - hosts: scraper 2 | 3 | vars_files: 4 | - vars.yml 5 | 6 | tasks: 7 | - name: "Retrieve systemd log messages since last boot" 8 | ansible.builtin.shell: "journalctl -u gsf.service -b" 9 | register: journalctl 10 | 11 | - name: "Retrieve service status" 12 | ansible.builtin.shell: systemctl status gsf.service 13 | register: systemctl 14 | 15 | - name: "Copy output to local file" 16 | delegate_to: localhost 17 | ansible.builtin.copy: 18 | dest: "../exports/{{ inventory_hostname }}-systemd.log" 19 | content: "{{ systemctl.stdout }}\n\n{{ journalctl.stdout }}" 20 | -------------------------------------------------------------------------------- /examples/cloud/ansible/vars.yml: -------------------------------------------------------------------------------- 1 | --- 2 | db_user: 3 | db_password: 4 | db_name: 5 | db_stats_file: pg-stats.csv 6 | 7 | # pg tunning for 4 vCPU, 8 GB RAM, using pgtune as base config 8 | pg_config: 9 | max_connections: 100 10 | shared_buffers: 2GB 11 | effective_cache_size: 6GB 12 | maintenance_work_mem: 512MB 13 | checkpoint_completion_target: 0.9 14 | wal_buffers: 16MB 15 | default_statistics_target: 100 16 | random_page_cost: 1.1 17 | effective_io_concurrency: 200 18 | work_mem: 10485kB 19 | min_wal_size: 1GB 20 | max_wal_size: 4GB 21 | max_worker_processes: 4 22 | max_parallel_workers_per_gather: 2 23 | max_parallel_workers: 4 24 | max_parallel_maintenance_workers: 2 25 | 26 | scrape_dir: /srv/gsf 27 | scrape_log_level: debug 28 | scrape_log_destination: scrape.log 29 | scrape_resource_file: majestic_million-29-dec-2021.csv 30 | scrape_user_agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0 31 | scrape_export_file: project.csv 32 | 33 | -------------------------------------------------------------------------------- /examples/cloud/readme.md: -------------------------------------------------------------------------------- 1 | distributed scraping using multiple get-set-fetch scraper instances and a central postgresql instance. 
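each scraper instance runs the same config against the central postgresql storage; the ansible playbooks start the first node with `--save --discover` (it also receives the seed url csv) and the remaining nodes with `--discover` only, so every node pulls work from the shared queue. below is a rough typescript sketch of a single node using the programmatic api rather than the `gsfscrape` cli invoked by the systemd unit; the connection details and plugin options are placeholders, not the exact values from vars.yml.

```ts
/* hypothetical node.ts - illustration only, the cloud example actually runs the gsfscrape cli via systemd */
import { Scraper, ScrapeEvent, Project, CsvExporter } from '@get-set-fetch/scraper';

/* every node points at the same central postgresql database (placeholder host and credentials) */
const storage = {
  client: 'pg',
  useNullAsDefault: true,
  connection: { host: '10.0.0.2', port: '5432', user: 'gsf-user', password: 'gsf-pswd', database: 'gsf-db' },
  pool: { min: 2, max: 50 },
};

const scraper = new Scraper(storage, { name: 'cheerio' });

/* export scraped content once the project finishes */
scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => {
  await new CsvExporter({ filepath: 'project.csv' }).export(project);
});

/* same project definition on every node; only the first node also supplies the seed csv */
scraper.scrape(
  {
    name: 'top-1',
    pipeline: 'dom-static-content',
    pluginOpts: [
      { name: 'ExtractUrlsPlugin', maxDepth: 0 },
      { name: 'ExtractScriptsPlugin', path: 'ExtractScriptsPlugin.js', replace: 'ExtractHtmlContentPlugin' },
    ],
  },
  { domain: { maxRequests: 100, delay: 1 } },
);
```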
2 | 3 | terraform creates the instances while ansible configures them. 4 | 5 | scrape status - systemd status and logs for scraper instances, sql queries for postgresql - is monitored via ansible playbooks. 6 | 7 | the scrape configuration referring ExtractScriptsPlugin is responsible for extracting js script urls from top 1 million sites as reported by majestic. -------------------------------------------------------------------------------- /examples/cloud/terraform/provider.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | digitalocean = { 4 | source = "digitalocean/digitalocean" 5 | version = "~> 2.0" 6 | } 7 | } 8 | } 9 | 10 | variable "do_token" {} 11 | variable "pvt_key" {} 12 | variable "pub_key" {} 13 | 14 | provider "digitalocean" { 15 | token = var.do_token 16 | } 17 | -------------------------------------------------------------------------------- /examples/cloud/terraform/resource-ansible-inventory.tf: -------------------------------------------------------------------------------- 1 | resource "local_file" "ansible_inventory" { 2 | content = templatefile("${path.root}/templates/hosts.tpl", 3 | { 4 | postgresql_ip = digitalocean_droplet.getsetfetch_pg.ipv4_address 5 | scraper_ips = digitalocean_droplet.getsetfetch_scraper.*.ipv4_address 6 | } 7 | ) 8 | filename = "${path.root}/../ansible/inventory/hosts.cfg" 9 | } 10 | -------------------------------------------------------------------------------- /examples/cloud/terraform/resource-pg.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' 3 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - 4 | sudo apt-get update 5 | sudo apt-get -y install postgresql 6 | 7 | sudo apt-get -y install libpq-dev python3-psycopg2 8 | -------------------------------------------------------------------------------- /examples/cloud/terraform/resource-pg.tf: -------------------------------------------------------------------------------- 1 | resource "digitalocean_droplet" "getsetfetch_pg" { 2 | image = "ubuntu-20-04-x64" 3 | name = "getsetfetch-pg" 4 | region = "fra1" 5 | size = "s-4vcpu-8gb" 6 | monitoring = true 7 | resize_disk = false 8 | vpc_uuid = digitalocean_vpc.getsetfetch_vpc.id 9 | 10 | 11 | ssh_keys = [ 12 | data.digitalocean_ssh_key.terraform.id 13 | ] 14 | 15 | user_data = file("resource-pg.sh") 16 | 17 | provisioner "remote-exec" { 18 | inline = [ 19 | "cloud-init status --wait", 20 | "echo 'Connected!'" 21 | ] 22 | 23 | connection { 24 | host = self.ipv4_address 25 | type = "ssh" 26 | user = "root" 27 | private_key = file(var.pvt_key) 28 | } 29 | } 30 | 31 | 32 | provisioner "local-exec" { 33 | command = <> { 11 | const result: Partial = await super.openInTab(resource, client); 12 | result.content = client.consoleContent; 13 | return result; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /examples/console-content/ConsolePuppeteerClient.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { ConsoleMessage, HTTPResponse, WaitForOptions } from 'puppeteer'; 3 | import { PuppeteerClient } from '../../src/index'; 4 | 5 | export default class 
ConsolePuppeteerClient extends PuppeteerClient { 6 | consoleContent: string[][]; 7 | 8 | async launch(): Promise { 9 | await super.launch(); 10 | 11 | const consoleHandler = (evt: ConsoleMessage) => { 12 | this.consoleContent.push([ 13 | evt.type(), 14 | evt.text(), 15 | ]); 16 | }; 17 | 18 | this.page.on('console', consoleHandler); 19 | } 20 | 21 | goto(url: string, opts: WaitForOptions): Promise { 22 | this.consoleContent = []; 23 | return super.goto(url, opts); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /examples/console-content/console-content-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "console-content.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer" 12 | }, 13 | "project": { 14 | "name": "ConsoleContent", 15 | "pipeline": "browser-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ConsoleBrowserFetchPlugin", 19 | "path": "ConsoleBrowserFetchPlugin.ts", 20 | "replace": "BrowserFetchPlugin" 21 | }, 22 | { 23 | "name": "ExtractUrlsPlugin", 24 | "maxDepth": 1, 25 | "selectorPairs": [ 26 | { 27 | "urlSelector": "nav a" 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "InsertResourcesPlugin", 33 | "maxResources": 5 34 | } 35 | ], 36 | "resources": [ 37 | { 38 | "url": "https://en.wikipedia.org/wiki/Main_Page" 39 | } 40 | ] 41 | }, 42 | "concurrency": { 43 | "session": { 44 | "maxRequests": 1, 45 | "delay": 1000 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /examples/console-content/console-content.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter, BrowserClient } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './console-content-config.json'; 7 | import ConsolePuppeteerClient from './ConsolePuppeteerClient'; 8 | 9 | // write all INFO and above messages to 'scrape.log' 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | /* create a scraper instance with the above settings */ 13 | const browserClient: BrowserClient = new ConsolePuppeteerClient(); 14 | const scraper = new Scraper(ScrapeConfig.storage, browserClient); 15 | 16 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 17 | const exporter = new CsvExporter({ filepath: 'console.csv' }); 18 | await exporter.export(project); 19 | }); 20 | 21 | /* start scraping by specifying project and concurrency settings */ 22 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 23 | -------------------------------------------------------------------------------- /examples/in-memory-queue/InMemoryConnection.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-empty-function */ 2 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 3 | import { IProjectStorage, IResourceStorage, IQueueStorage, Connection } from '../../src/index'; 4 | import InMemoryQueue from './InMemoryQueue'; 5 | 6 | export default class InMemoryConnection extends Connection { 7 | /* 8 | by default each connection type is established based on some 
config, 9 | there are no settings for in-memory storage, just specify a client value 10 | */ 11 | constructor() { 12 | super({ client: 'in-memory' }); 13 | } 14 | 15 | async open() {} 16 | async close() {} 17 | 18 | getProjectStorage():IProjectStorage { 19 | throw new Error('In-Memory Project not supported'); 20 | } 21 | 22 | getResourceStorage():IResourceStorage { 23 | throw new Error('In-Memory Resource not supported'); 24 | } 25 | 26 | getQueueStorage():IQueueStorage { 27 | return new InMemoryQueue(this); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /examples/in-memory-queue/InMemoryQueue.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { IQueueStorage, QueueEntry, Storage } from '../../src/index'; 3 | 4 | export default class InMemoryQueue extends Storage implements IQueueStorage { 5 | queue:Map; 6 | 7 | async drop() { 8 | delete this.queue; 9 | } 10 | 11 | async init() { 12 | this.queue = new Map(); 13 | } 14 | 15 | async filterExistingEntries(urls: string[]) { 16 | return urls 17 | .filter(url => this.queue.has(url)) 18 | .map(url => ({ url })); 19 | } 20 | 21 | async add(entries: QueueEntry[]) { 22 | entries.forEach(entry => { 23 | if (!this.queue.has(entry.url)) { 24 | this.queue.set(entry.url, { ...entry, id: entry.url }); 25 | } 26 | }); 27 | } 28 | 29 | async count() { 30 | return this.queue.size; 31 | } 32 | 33 | async getResourcesToScrape(limit:number = 10) { 34 | const queueEntries:QueueEntry[] = []; 35 | 36 | const queueIt = this.queue.values(); 37 | let result: IteratorResult = queueIt.next(); 38 | 39 | while (queueEntries.length < limit && !result.done) { 40 | const queueEntry:QueueEntry = result.value; 41 | 42 | if (queueEntry.status === undefined) { 43 | queueEntry.status = 1; 44 | queueEntries.push(queueEntry); 45 | } 46 | 47 | result = queueIt.next(); 48 | } 49 | 50 | return queueEntries; 51 | } 52 | 53 | async getAll() { 54 | return Array.from(this.queue.values()); 55 | } 56 | 57 | async updateStatus(url: string, status: number) { 58 | const queueEntry = this.queue.get(url); 59 | if (queueEntry) { 60 | queueEntry.status = status; 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /examples/in-memory-queue/in-memory-queue-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "sitemap", 15 | "resources": [ 16 | { 17 | "url": "https://getsetfetch.org/node/storage.html" 18 | } 19 | ], 20 | "pipeline": "dom-static-content", 21 | "pluginOpts": [ 22 | { 23 | "name": "ExtractHtmlContentPlugin", 24 | "selectorPairs": [ 25 | { 26 | "contentSelector": "h2.card-header-title", 27 | "label": "title" 28 | } 29 | ] 30 | }, 31 | { 32 | "name": "InsertResourcesPlugin", 33 | "maxResources": 3 34 | } 35 | ] 36 | }, 37 | "concurrency": { 38 | "session": { 39 | "maxRequests": 1, 40 | "delay": 1000 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /examples/in-memory-queue/in-memory-queue.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace 
'../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, setLogger, ScrapeEvent, Project, CsvExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './in-memory-queue-config.json'; 7 | import InMemoryConnection from './InMemoryConnection'; 8 | 9 | /* write all INFO and above messages to 'gsf.logs' */ 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | /* create a scraper instance with the above settings */ 13 | const conn = { 14 | Project: ScrapeConfig.storage, 15 | Queue: new InMemoryConnection(), 16 | Resource: ScrapeConfig.storage, 17 | }; 18 | const scraper = new Scraper(conn, ScrapeConfig.client); 19 | 20 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 21 | const exporter = new CsvExporter({ filepath: 'in-memory-queue.csv' }); 22 | await exporter.export(project); 23 | }); 24 | 25 | /* start scraping by specifying project and concurrency settings */ 26 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 27 | -------------------------------------------------------------------------------- /examples/infinite-scrolling/infinite-scrolling-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "infinite-scrolling.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "puppeteer", 12 | "opts": { 13 | "args": [ 14 | "--disable-gpu", 15 | "--disable-dev-shm-usage", 16 | "--disable-setuid-sandbox", 17 | "--no-first-run", 18 | "--no-sandbox", 19 | "--no-zygote", 20 | "--single-process" 21 | ] 22 | } 23 | }, 24 | "project": { 25 | "name": "HistoricalFigures", 26 | "pipeline": "browser-static-content", 27 | "pluginOpts": [ 28 | { 29 | "name": "BrowserFetchPlugin", 30 | "stabilityCheck": 2000, 31 | "stabilityTimeout": 5000 32 | }, 33 | { 34 | "name": "ExtractUrlsPlugin", 35 | "maxDepth": 0 36 | }, 37 | { 38 | "name": "ExtractHtmlContentPlugin", 39 | "selectorPairs": [ 40 | { 41 | "contentSelector": "li > a[data-galabel=grid-item] > span > span span:first-child", 42 | "label": "name" 43 | }, 44 | { 45 | "contentSelector": "li > a[data-galabel=grid-item] > span > span span:last-child", 46 | "label": "items" 47 | } 48 | ] 49 | }, 50 | { 51 | "name": "ScrollPlugin", 52 | "after": "UpsertResourcePlugin", 53 | "maxActions": 3, 54 | "delay": 1000, 55 | "stabilityCheck": 2000, 56 | "stabilityTimeout": 3000 57 | } 58 | ], 59 | "resources": [ 60 | { 61 | "url": "https://artsandculture.google.com/incognito/category/historical-figure" 62 | } 63 | ] 64 | }, 65 | "concurrency": { 66 | "session": { 67 | "maxRequests": 1, 68 | "delay": 3000 69 | } 70 | } 71 | } -------------------------------------------------------------------------------- /examples/infinite-scrolling/infinite-scrolling.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './infinite-scrolling-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }, destination('scrape.log')); 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = 
new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 15 | const exporter = new CsvExporter({ filepath: 'historical-figures.csv' }); 16 | await exporter.export(project); 17 | }); 18 | 19 | /* start scraping by specifying project and concurrency settings */ 20 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 21 | -------------------------------------------------------------------------------- /examples/pdf-extraction/pdf-extraction-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "pdf-extraction.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "CovidUpdates", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 2, 20 | "selectorPairs": [ 21 | { 22 | "urlSelector": ".sf-meeting-report-list:nth-child(5) > a.sf-meeting-report-list__item" 23 | }, 24 | { 25 | "urlSelector": ".button-blue-background > a", 26 | "titleSelector": "h1.dynamic-content__heading" 27 | } 28 | ] 29 | } 30 | ], 31 | "resources": [ 32 | { 33 | "url": "https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports" 34 | } 35 | ] 36 | }, 37 | "concurrency": { 38 | "session": { 39 | "maxRequests": 1, 40 | "delay": 3000 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /examples/pdf-extraction/pdf-extraction.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, ZipExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './pdf-extraction-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }, destination('scrape.log')); 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 15 | const exporter = new ZipExporter({ filepath: 'covid-updates.zip' }); 16 | await exporter.export(project); 17 | }); 18 | 19 | /* start scraping by specifying project and concurrency settings */ 20 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 21 | -------------------------------------------------------------------------------- /examples/product-details/product-details-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "product-details.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "AsimovBooks", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 3, 20 | "selectorPairs": [ 21 | { 22 | "urlSelector": "#searchResults ~ .pagination > a.ChoosePage:nth-child(2)" 23 | }, 24 | { 25 | "urlSelector": "h3.booktitle a.results" 26 | }, 27 | { 28 | "urlSelector": "a.coverLook > img.cover" 29 | } 30 | ] 31 | }, 32 | { 33 | "name": 
"ExtractHtmlContentPlugin", 34 | "selectorPairs": [ 35 | { 36 | "contentSelector": "h1.work-title", 37 | "label": "title" 38 | }, 39 | { 40 | "contentSelector": "h2.edition-byline a", 41 | "label": "author" 42 | }, 43 | { 44 | "contentSelector": "ul.readers-stats > li.avg-ratings > span[itemProp=\"ratingValue\"]", 45 | "label": "rating value" 46 | }, 47 | { 48 | "contentSelector": "ul.readers-stats > li > span[itemProp=\"reviewCount\"]", 49 | "label": "review count" 50 | } 51 | ] 52 | } 53 | ], 54 | "resources": [ 55 | { 56 | "url": "https://openlibrary.org/authors/OL34221A/Isaac_Asimov?page=1" 57 | } 58 | ] 59 | }, 60 | "concurrency": { 61 | "session": { 62 | "maxRequests": 1, 63 | "delay": 3000 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /examples/product-details/product-details.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter, ZipExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './product-details-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }, destination('scrape.log')); 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | /* export books details as csv */ 15 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 16 | const exporter = new CsvExporter({ filepath: 'books.csv' }); 17 | await exporter.export(project); 18 | }); 19 | 20 | /* export book covers as zip */ 21 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 22 | const exporter = new ZipExporter({ filepath: 'book-covers.zip' }); 23 | await exporter.export(project); 24 | }); 25 | 26 | /* start scraping by specifying project and concurrency settings */ 27 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 28 | -------------------------------------------------------------------------------- /examples/sitemap/ExtractSameHostUrlsPlugin.ts: -------------------------------------------------------------------------------- 1 | import { ExtractUrlsPlugin, PluginOpts } from '../../src/index'; 2 | 3 | export default class ExtractSameHostUrlsPlugin extends ExtractUrlsPlugin { 4 | constructor(opts:Partial = {}) { 5 | super(opts); 6 | 7 | /* parent plugin runs in browser by default, the current one doesn't */ 8 | this.opts.domRead = false; 9 | } 10 | 11 | /* only extract URLs from the sitemap domain */ 12 | isValidUrl(url: URL) { 13 | return url.hostname === 'www.getsetfetch.org'; 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /examples/sitemap/SitemapExporter.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | import fs from 'fs'; 3 | import { Exporter, Resource, getLogger } from '../../src/index'; 4 | 5 | export default class SitemapExporter extends Exporter { 6 | logger = getLogger('SitemapExporter'); 7 | 8 | wstream: fs.WriteStream; 9 | 10 | getResourceQuery() { 11 | return { cols: [ 'url' ], where: { contentType: 'text/html' } }; 12 | } 13 | 14 | async preParse() { 15 | this.wstream = fs.createWriteStream(this.opts.filepath); 16 | this.wstream.write('\n'); 17 | 
this.wstream.write('\n'); 18 | } 19 | 20 | async parse(resource: Partial) { 21 | this.wstream.write(`${resource.url}\n`); 22 | } 23 | 24 | async postParse() { 25 | this.wstream.write(''); 26 | this.wstream.close(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /examples/sitemap/SkipExtractHtmlContentPlugin.ts: -------------------------------------------------------------------------------- 1 | import { Plugin } from '../../src/index'; 2 | 3 | /** 4 | * if you're using plain javascript besides removing Project and Resource types, don't extend the abstract Plugin class 5 | * @rollup/plugin-commonjs will bundle the entire @get-set-fetch/scraper project including fs, jszip, ... imports 6 | */ 7 | export default class SkipExtractHtmlContentPlugin extends Plugin { 8 | /* 9 | never invoke the plugin, it's just an empty placeholder for ExtractHtmlContentPlugin 10 | since we're not interested in scraping content 11 | */ 12 | test() { 13 | return false; 14 | } 15 | 16 | // eslint-disable-next-line @typescript-eslint/no-empty-function 17 | apply() {} 18 | } 19 | -------------------------------------------------------------------------------- /examples/sitemap/scrape-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "sitemap", 15 | "resources": [ 16 | { 17 | "url": "https://www.getsetfetch.org/index.html" 18 | } 19 | ], 20 | "pipeline": "dom-static-content", 21 | "pluginOpts": [ 22 | { 23 | "name": "ExtractSameHostUrlsPlugin", 24 | "path": "ExtractSameHostUrlsPlugin.ts", 25 | "replace": "ExtractUrlsPlugin" 26 | }, 27 | { 28 | "name": "SkipExtractHtmlContentPlugin", 29 | "path": "SkipExtractHtmlContentPlugin.ts", 30 | "replace": "ExtractHtmlContentPlugin" 31 | }, 32 | { 33 | "name": "InsertResourcesPlugin", 34 | "maxResources": 100 35 | } 36 | ] 37 | }, 38 | "concurrency": { 39 | "session": { 40 | "maxRequests": 1, 41 | "delay": 3000 42 | } 43 | } 44 | } -------------------------------------------------------------------------------- /examples/sitemap/sitemap.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { destination } from 'pino'; 3 | import { Scraper, setLogger, ScrapeEvent, Project } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './scrape-config.json'; 7 | import SitemapExporter from './SitemapExporter'; 8 | 9 | /* write all INFO and above messages to 'gsf.logs' */ 10 | setLogger({ level: 'info' }, destination('scrape.log')); 11 | 12 | /* create a scraper instance with the above settings */ 13 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 14 | 15 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 16 | const exporter = new SitemapExporter({ filepath: 'sitemap.xml' }); 17 | await exporter.export(project); 18 | }); 19 | 20 | /* start scraping by specifying project and concurrency settings */ 21 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 22 | -------------------------------------------------------------------------------- /examples/tabular-data/tabular-data-config.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "tabular-data.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "LanguageList", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 0 20 | }, 21 | { 22 | "name": "ExtractHtmlContentPlugin", 23 | "selectorPairs": [ 24 | { 25 | "contentSelector": "table:nth-of-type(2) td:nth-child(2) > a:first-child", 26 | "label": "language" 27 | }, 28 | { 29 | "contentSelector": "table:nth-of-type(2) td:nth-child(3)", 30 | "label": "speakers (milions)" 31 | } 32 | ] 33 | } 34 | ], 35 | "resources": [ 36 | { 37 | "url": "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers" 38 | } 39 | ] 40 | }, 41 | "concurrency": { 42 | "session": { 43 | "maxRequests": 1, 44 | "delay": 3000 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /examples/tabular-data/tabular-data.ts: -------------------------------------------------------------------------------- 1 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 2 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 3 | 4 | /* scrape configuration */ 5 | import ScrapeConfig from './tabular-data-config.json'; 6 | 7 | // write all INFO and above messages to 'scrape.log' 8 | setLogger({ level: 'info' }); 9 | 10 | /* create a scraper instance with the above settings */ 11 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 12 | 13 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 14 | const exporter = new CsvExporter({ filepath: 'languages.csv' }); 15 | await exporter.export(project); 16 | }); 17 | 18 | /* start scraping by specifying project and concurrency settings */ 19 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 20 | -------------------------------------------------------------------------------- /examples/tls-fingerprinting/RandomTlsFingerprintFetch.ts: -------------------------------------------------------------------------------- 1 | import crypto from 'crypto'; 2 | import { NodeFetchPlugin, Resource } from '../../src/index'; 3 | import { Protocol } from '../../src/plugins/url-utils'; 4 | 5 | export default class RandomTlsFingerprintFetch extends NodeFetchPlugin { 6 | shuffledCipherList: string[]; 7 | 8 | getShuffledCipherList():string[] { 9 | const nodeOrderedCipherList = crypto.constants.defaultCipherList.split(':'); 10 | 11 | // keep the most important ciphers in the same order 12 | const fixedCipherList = nodeOrderedCipherList.slice(0, 3); 13 | 14 | // shuffle the rest 15 | const shuffledCipherList = nodeOrderedCipherList.slice(3) 16 | .map(cipher => ({ cipher, sort: Math.random() })) 17 | .sort((a, b) => a.sort - b.sort) 18 | .map(({ cipher }) => cipher); 19 | 20 | return [ 21 | ...fixedCipherList, 22 | ...shuffledCipherList, 23 | ]; 24 | } 25 | 26 | async getRequestOptions(url:URL, resource: Resource) { 27 | const reqOpts = await super.getRequestOptions(url, resource); 28 | 29 | if (url.protocol === Protocol.HTTPS) { 30 | // one time initialization of randomly ordered ciphers 31 | if (!this.shuffledCipherList) { 32 | this.shuffledCipherList = this.getShuffledCipherList(); 33 | this.logger.info(this.shuffledCipherList, 
'using shuffled cipherlist'); 34 | } 35 | 36 | reqOpts.ciphers = this.shuffledCipherList.join(':'); 37 | } 38 | return reqOpts; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /examples/tls-fingerprinting/readme.md: -------------------------------------------------------------------------------- 1 | Details available on [getsetfetch.org/blog/tls-fingerprint.html](https://getsetfetch.org/blog/tls-fingerprint.html). -------------------------------------------------------------------------------- /examples/tls-fingerprinting/tls-fingerprinting-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "tls-fingerprinting-data.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "TlsFingerprinting", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractUrlsPlugin", 19 | "maxDepth": 0 20 | }, 21 | { 22 | "name": "RandomTlsFingerprintFetch", 23 | "path": "RandomTlsFingerprintFetch.ts", 24 | "replace": "NodeFetchPlugin" 25 | }, 26 | { 27 | "name": "ExtractHtmlContentPlugin", 28 | "selectorPairs": [ 29 | { 30 | "contentSelector": "table:nth-of-type(2) td:nth-child(2) > a:first-child", 31 | "label": "language" 32 | }, 33 | { 34 | "contentSelector": "table:nth-of-type(2) td:nth-child(3)", 35 | "label": "speakers (milions)" 36 | } 37 | ] 38 | } 39 | ], 40 | "resources": [ 41 | { 42 | "url": "https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers" 43 | } 44 | ] 45 | }, 46 | "concurrency": { 47 | "session": { 48 | "maxRequests": 1, 49 | "delay": 3000 50 | } 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /examples/tls-fingerprinting/tls-fingerprinting.ts: -------------------------------------------------------------------------------- 1 | import { destination } from 'pino'; 2 | /* for standalone projects replace '../../src/index' with '@get-set-fetch/scraper' */ 3 | import { Scraper, Project, setLogger, ScrapeEvent, CsvExporter } from '../../src/index'; 4 | 5 | /* scrape configuration */ 6 | import ScrapeConfig from './tls-fingerprinting-config.json'; 7 | 8 | // write all INFO and above messages to 'scrape.log' 9 | setLogger({ level: 'info' }); // destination('scrape.log') 10 | 11 | /* create a scraper instance with the above settings */ 12 | const scraper = new Scraper(ScrapeConfig.storage, ScrapeConfig.client); 13 | 14 | scraper.on(ScrapeEvent.ProjectScraped, async (project: Project) => { 15 | const exporter = new CsvExporter({ filepath: 'languages.csv' }); 16 | await exporter.export(project); 17 | }); 18 | 19 | /* start scraping by specifying project and concurrency settings */ 20 | scraper.scrape(ScrapeConfig.project, ScrapeConfig.concurrency); 21 | -------------------------------------------------------------------------------- /examples/tsconfig.examples.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "esnext", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "newLine": "LF", 11 | }, 12 | "include": [ 13 | "./" 14 | ], 15 | } -------------------------------------------------------------------------------- 
/src/browserclient/BrowserClient.ts: -------------------------------------------------------------------------------- 1 | /** Provides a common API to interact with various browser clients. */ 2 | 3 | export type BaseResponse = { 4 | status(): number; 5 | url(): string; 6 | request(): {}; 7 | ok(): boolean; 8 | headers(); 9 | } 10 | 11 | export default abstract class BrowserClient { 12 | /** 13 | * whether or not the browser is launched 14 | */ 15 | isLaunched: boolean; 16 | 17 | /** 18 | * options for launching the browser 19 | * if not specified {headless: true} is added 20 | */ 21 | opts: { 22 | browser?: string; 23 | [key: string]:any; 24 | }; 25 | 26 | constructor(opts) { 27 | this.opts = opts; 28 | this.isLaunched = false; 29 | } 30 | 31 | abstract launch():Promise; 32 | abstract close():Promise; 33 | abstract closePage():Promise; 34 | 35 | /* 36 | puppeteer supports evaluate with multiple arguments 37 | playwright supports evaluate with a single argument object 38 | use object destructuring to support both APIs 39 | */ 40 | abstract evaluate(fnc, argObj?):Promise; 41 | 42 | abstract getRedirectResponse(req):Promise; 43 | 44 | abstract goto(url: string, opts):Promise; 45 | abstract getUrl():string; 46 | } 47 | -------------------------------------------------------------------------------- /src/browserclient/PuppeteerClient.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | // eslint-disable-next-line import/no-unresolved 3 | import { Browser, LaunchOptions, launch as plaunch, Page, WaitForOptions, HTTPResponse, HTTPRequest, BrowserLaunchArgumentOptions, BrowserConnectOptions } from 'puppeteer'; 4 | import { getLogger } from '../logger/Logger'; 5 | import BrowserClient from './BrowserClient'; 6 | 7 | /** Puppeteer Client. */ 8 | export default class PuppeteerClient extends BrowserClient { 9 | logger = getLogger('PuppeteerClient'); 10 | 11 | browser: Browser; 12 | page: Page; 13 | opts: LaunchOptions; 14 | 15 | constructor(opts:LaunchOptions & BrowserLaunchArgumentOptions & BrowserConnectOptions = {}) { 16 | super({ headlesss: true, ...opts }); 17 | } 18 | 19 | async launch():Promise { 20 | this.browser = await plaunch(this.opts); 21 | this.page = await this.browser.newPage(); 22 | 23 | this.isLaunched = true; 24 | } 25 | 26 | async close():Promise { 27 | this.page = null; 28 | await this.browser.close(); 29 | this.isLaunched = false; 30 | } 31 | 32 | goto(url: string, opts: WaitForOptions):Promise { 33 | return this.page.goto(url, opts); 34 | } 35 | 36 | async getRedirectResponse(req:HTTPRequest):Promise { 37 | const redirectChain = req.redirectChain(); 38 | return redirectChain.length > 0 ? redirectChain[0].response() : null; 39 | } 40 | 41 | getUrl() { 42 | return this.page.url(); 43 | } 44 | 45 | async closePage() { 46 | if (this.page) { 47 | await this.page.close(); 48 | this.page = null; 49 | } 50 | } 51 | 52 | async evaluate(pageFunction, argObj?) 
{ 53 | // if there's an error in the async fnc to be evaluated the page.evaluate return promise may never resolve 54 | // listen to page errors and reject accordingly 55 | return new Promise(async (resolve, reject) => { 56 | const logConsole = this.logger.logger.level === 'trace' || this.logger.logger.level === 'debug'; 57 | const consoleHandler = msg => { 58 | for (let i = 0; i < msg.args().length; i += 1) { 59 | this.logger.debug(`DOM console: ${msg.args()[i]}`); 60 | } 61 | }; 62 | 63 | if (logConsole) { 64 | this.page.on('console', consoleHandler); 65 | } 66 | 67 | const errorHandler = err => { 68 | this.logger.error(err); 69 | reject(err); 70 | this.page.off('pageerror', errorHandler); 71 | this.page.off('error', errorHandler); 72 | if (logConsole) { 73 | this.page.off('console', consoleHandler); 74 | } 75 | }; 76 | this.page.on('pageerror', errorHandler); 77 | this.page.on('error', errorHandler); 78 | 79 | this.logger.trace({ pageFunction: pageFunction.toString(), argObj }, 'evaluate call'); 80 | const result = await this.page.evaluate(pageFunction, argObj); 81 | resolve(result); 82 | this.page.off('pageerror', errorHandler); 83 | this.page.off('error', errorHandler); 84 | this.page.off('console', consoleHandler); 85 | }); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/confighash/config-hash.ts: -------------------------------------------------------------------------------- 1 | import { Buffer } from 'buffer'; 2 | import { deflateSync, inflateSync, constants } from 'zlib'; 3 | import * as dictionaryV1 from './dictionary-v1.json'; 4 | 5 | /** 6 | * Converts a project configuration to a deflated based64 string. 7 | * @param input - project configuration 8 | */ 9 | function encode(input: object):string { 10 | const deflatedIntArr = deflateSync(JSON.stringify(input), { dictionary: Buffer.from(JSON.stringify(dictionaryV1)), level: constants.Z_BEST_COMPRESSION }); 11 | return Buffer.from(deflatedIntArr).toString('base64'); 12 | } 13 | 14 | /** 15 | * Converts a deflated based64 string to a project configuration. 
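 * Round-trip example: decode(encode(config)) yields an object deep-equal to the original config, as both functions share the same v1 dictionary.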
16 | * @param deflatedBase64String - project configuration 17 | */ 18 | function decode(deflatedBase64String: string) { 19 | if (!deflatedBase64String || deflatedBase64String.length === 0) return null; 20 | 21 | let inflatedInstance = null; 22 | const buffer = Buffer.from(deflatedBase64String, 'base64'); 23 | const inflatedString = inflateSync(buffer, { dictionary: Buffer.from(JSON.stringify(dictionaryV1)) }); 24 | inflatedInstance = JSON.parse(inflatedString.toString('utf-8')); 25 | 26 | return inflatedInstance; 27 | } 28 | 29 | export { 30 | encode, decode, 31 | }; 32 | -------------------------------------------------------------------------------- /src/confighash/dictionary-v1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "pipeline": "browser-static-content|dom-static-content", 4 | "pluginOpts": [ 5 | { 6 | "name": "ExtractHtmlContentPlugin", 7 | "selectorPairs": [ 8 | { 9 | "label": "", 10 | "contentSelector": "", 11 | "contentProperty": "innerText" 12 | } 13 | ], 14 | "domRead": true 15 | }, 16 | { 17 | "name": "ExtractUrlsPlugin", 18 | "selectorPairs": [ 19 | { 20 | "urlSelector": "a", 21 | "titleSelector": "" 22 | } 23 | ], 24 | "maxDepth": -1 25 | }, 26 | { 27 | "name": "BrowserFetchPlugin", 28 | "gotoOptions": { 29 | "timeout": 0, 30 | "waitUntil": 0 31 | }, 32 | "stabilityCheck": 0, 33 | "stabilityTimeout": 0 34 | }, 35 | { 36 | "name": "NodeFetchPlugin", 37 | "headers": {} 38 | }, 39 | { 40 | "name": "InsertResourcesPlugin", 41 | "maxResources": -1 42 | }, 43 | { 44 | "name": "ScrollPlugin", 45 | "domWrite": true, 46 | "delay": 1000, 47 | "maxActions": -1 48 | }, 49 | { 50 | "name": "UpsertResourcePlugin" 51 | } 52 | ], 53 | "resources": [ 54 | { 55 | "url": "https://.com/.html" 56 | } 57 | ] 58 | } -------------------------------------------------------------------------------- /src/domclient/CheerioClient.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | // eslint-disable-next-line import/no-unresolved 3 | import cheerio from 'cheerio'; 4 | import { IDomNode } from './DomClient'; 5 | 6 | export default class CheerioClient implements IDomNode { 7 | root: cheerio.Root; 8 | elm:cheerio.Element; 9 | 10 | constructor(bufferOrRoot: Buffer|cheerio.Root, elm?: cheerio.Element) { 11 | this.root = bufferOrRoot instanceof Buffer ? cheerio.load(bufferOrRoot.toString('utf8')) : bufferOrRoot; 12 | this.elm = elm; 13 | } 14 | 15 | querySelectorAll(selector: string):IDomNode[] { 16 | const elms = this.elm ? 
this.root(selector, this.elm) : this.root(selector); 17 | return elms.toArray().map(elm => new CheerioClient(this.root, elm)); 18 | } 19 | 20 | getAttribute(prop:string) { 21 | if (prop === 'innerText') { 22 | return this.root(this.elm).text(); 23 | } 24 | 25 | return this.root(this.elm).attr(prop); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/domclient/DomClient.ts: -------------------------------------------------------------------------------- 1 | export interface IDomNode { 2 | querySelectorAll(selector: string):IDomNode[]; 3 | getAttribute(prop: string); 4 | } 5 | 6 | export interface IDomClientConstructor { 7 | new(...args): IDomNode; 8 | } 9 | -------------------------------------------------------------------------------- /src/domclient/JsdomClient.ts: -------------------------------------------------------------------------------- 1 | // @ts-ignore 2 | // eslint-disable-next-line import/no-unresolved 3 | import { JSDOM } from 'jsdom'; 4 | import { IDomNode } from './DomClient'; 5 | 6 | export default class JsdomClient implements IDomNode { 7 | elm: Element; 8 | 9 | constructor(bufferOrElm: Buffer|Element) { 10 | this.elm = bufferOrElm instanceof Buffer ? new JSDOM(bufferOrElm.toString('utf8')).window.document.querySelector('body') : bufferOrElm; 11 | } 12 | 13 | querySelectorAll(selector: string):IDomNode[] { 14 | return Array.from(this.elm.querySelectorAll(selector)).map(elm => new JsdomClient(elm)); 15 | } 16 | 17 | getAttribute(prop:string) { 18 | if (prop === 'innerText') { 19 | return this.elm.textContent; 20 | } 21 | 22 | return this.elm.getAttribute(prop); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/domclient/NativeClient.ts: -------------------------------------------------------------------------------- 1 | import { IDomNode } from './DomClient'; 2 | 3 | export default class NativeClient implements IDomNode { 4 | document: Element; 5 | 6 | constructor(document: Element) { 7 | this.document = document; 8 | } 9 | 10 | querySelectorAll(selector: string):IDomNode[] { 11 | return Array.from(this.document.querySelectorAll(selector)).map(elm => new NativeClient(elm)); 12 | } 13 | 14 | getAttribute(prop:string) { 15 | return this.document[prop] || this.document.getAttribute(prop); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/domclient/client-utils.ts: -------------------------------------------------------------------------------- 1 | import { PlaywrightClient, PuppeteerClient, CheerioClient, JsdomClient, BrowserClient, 2 | IDomClientConstructor } from '../index'; 3 | import { ClientOptions } from '../scraper/Scraper'; 4 | 5 | export default function initClient(clientOpts:ClientOptions):BrowserClient|IDomClientConstructor { 6 | if (!clientOpts) throw new Error('missing DOM options'); 7 | if (!clientOpts.name) throw new Error('missing DOM client'); 8 | 9 | let client; 10 | switch (clientOpts.name) { 11 | case 'cheerio': 12 | if (!CheerioClient) throw new Error('cheerio package not installed'); 13 | client = CheerioClient; 14 | break; 15 | case 'jsdom': 16 | if (!JsdomClient) throw new Error('jsdom package not installed'); 17 | client = JsdomClient; 18 | break; 19 | case 'puppeteer': 20 | if (!PuppeteerClient) throw new Error('puppeteer package not installed'); 21 | client = new PuppeteerClient(clientOpts.opts); 22 | break; 23 | case 'playwright': 24 | if (!PlaywrightClient) throw new 
Error('playwright-core package not installed'); 25 | client = new PlaywrightClient(clientOpts.opts); 26 | break; 27 | default: 28 | throw new Error(`invalid client ${clientOpts.name}`); 29 | } 30 | 31 | return client; 32 | } 33 | -------------------------------------------------------------------------------- /src/export/CsvExporter.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import Resource, { ResourceQuery } from '../storage/base/Resource'; 3 | import Exporter, { ExportOptions } from './Exporter'; 4 | import { getLogger } from '../logger/Logger'; 5 | 6 | export type CsvExportOptions = ExportOptions & { 7 | fieldSeparator?: string; 8 | lineSeparator?: string; 9 | } 10 | 11 | /** Provides CSV export capabilities. */ 12 | export default class CsvExporter extends Exporter { 13 | logger = getLogger('CsvExporter'); 14 | 15 | opts: CsvExportOptions; 16 | 17 | wstream: fs.WriteStream; 18 | 19 | getResourceQuery(): Partial { 20 | return { whereNotNull: [ 'content' ], cols: [ 'url', 'content' ] }; 21 | } 22 | 23 | async preParse(): Promise { 24 | this.wstream = fs.createWriteStream(this.opts.filepath); 25 | 26 | // write csv header 27 | this.wstream.write([ 'url', ...this.getContentKeys() ].join(this.opts.fieldSeparator)); 28 | } 29 | 30 | async parse(resource: Partial): Promise { 31 | const { lineSeparator } = this.opts; 32 | const csvRows = this.resourceToCsvRows(resource); 33 | this.wstream.write(lineSeparator); 34 | this.wstream.write(csvRows.join(lineSeparator)); 35 | } 36 | 37 | async postParse() { 38 | this.wstream.close(); 39 | } 40 | 41 | getContentKeys(): string[] { 42 | return this.project.plugins 43 | .map(plugin => plugin.getContentKeys()) 44 | .find(contentKeys => contentKeys) 45 | || []; 46 | } 47 | 48 | resourceToCsvRows(resource: Partial): string[][] { 49 | const { url, content } = resource; 50 | 51 | const csvRows: string[][] = []; 52 | content.forEach(contentRowVal => { 53 | const csvRow = [ url ]; 54 | contentRowVal.forEach(contentColVal => { 55 | csvRow.push(this.getCsvVal(contentColVal)); 56 | }); 57 | csvRows.push(csvRow); 58 | }); 59 | 60 | // no content for current resource, add a [url] entry 61 | if (csvRows.length === 0) { 62 | csvRows.push([ url ]); 63 | } 64 | 65 | return csvRows; 66 | } 67 | 68 | getCsvVal(contentVal: string) { 69 | /* 70 | quotes handling 71 | RFC-4180 "If double-quotes are used to enclose fields, 72 | then a double-quote appearing inside a field must be escaped by preceding it with another double quote." 
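      e.g. (illustrative): he said "hi"  ->  "he said ""hi"""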
73 | */ 74 | if (contentVal === undefined) { 75 | return '""'; 76 | } 77 | 78 | if (typeof contentVal === 'string') { 79 | const quotedVal = contentVal.replace(/"/g, '""'); 80 | return `"${quotedVal}"`; 81 | } 82 | 83 | return contentVal; 84 | } 85 | 86 | getDefaultOptions(): Partial { 87 | return { 88 | fieldSeparator: ',', 89 | lineSeparator: '\n', 90 | pageLimit: 10000, 91 | }; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/export/Exporter.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | import { isAbsolute, join } from 'path'; 3 | import { LogWrapper } from '../logger/Logger'; 4 | import Project from '../storage/base/Project'; 5 | import Resource, { ResourceQuery } from '../storage/base/Resource'; 6 | import ConnectionManager from '../storage/ConnectionManager'; 7 | 8 | export type ExportOptions = { 9 | pageLimit?: number; 10 | filepath: string; 11 | } 12 | 13 | /** Scraped data exporters should extend this class. */ 14 | export default abstract class Exporter { 15 | logger: LogWrapper; 16 | 17 | project: Project; 18 | opts: ExportOptions; 19 | 20 | constructor(opts: ExportOptions) { 21 | this.opts = Object.assign(this.getDefaultOptions(), opts); 22 | this.opts.filepath = isAbsolute(opts.filepath) ? opts.filepath : join(process.cwd(), opts.filepath); 23 | } 24 | 25 | getPagedResources(offset: number, limit: number): Promise[]> { 26 | return this.project.getPagedResources({ ...this.getResourceQuery(), offset, limit }); 27 | } 28 | 29 | async export(project: Project) { 30 | let connManager: ConnectionManager; 31 | 32 | try { 33 | // use a separate db connection, scrape and export have different db lifecycles and may run in parallel 34 | connManager = ConnectionManager.clone(project); 35 | await connManager.connect(); 36 | const ExtProject = await connManager.getProject(); 37 | 38 | // retrieve the project from the currently active db connection 39 | this.project = await ExtProject.get(project.id); 40 | if (!this.project) { 41 | throw new Error(`could not find project ${project.name}`); 42 | } 43 | 44 | // need to init the plugins as one of the plugins may contain info related to the exported columns 45 | this.project.plugins = await this.project.initPlugins(true); 46 | 47 | let resources: Partial[]; 48 | const { pageLimit: limit } = this.opts; 49 | let offset = 0; 50 | 51 | do { 52 | resources = await this.getPagedResources(offset, limit); 53 | 54 | if (offset === 0) { 55 | if (resources.length === 0) { 56 | this.logger.warn('No content to export.'); 57 | break; 58 | } 59 | 60 | this.logger.info(`Exporting under ${this.opts.filepath} ...`); 61 | await this.preParse(); 62 | } 63 | 64 | // eslint-disable-next-line no-loop-func 65 | await Promise.all(resources.map((resource, idx) => this.parse(resource, offset + idx))); 66 | offset += limit; 67 | } 68 | while (resources.length > 0); 69 | 70 | if (offset > 0) { 71 | await this.postParse(); 72 | this.logger.info(`Exporting under ${this.opts.filepath} ... 
done`); 73 | } 74 | } 75 | catch (err) { 76 | this.logger.error(err, `error exporting using options ${JSON.stringify(this.opts)}`); 77 | } 78 | finally { 79 | if (connManager) { 80 | await connManager.close(); 81 | } 82 | } 83 | } 84 | 85 | getDefaultOptions(): Partial { 86 | return { 87 | pageLimit: 10000, 88 | }; 89 | } 90 | 91 | abstract getResourceQuery(): Partial; 92 | 93 | abstract preParse(): Promise; 94 | abstract parse(resource: Partial, resourceIdx: number): Promise; 95 | abstract postParse(): Promise; 96 | } 97 | -------------------------------------------------------------------------------- /src/export/MimeTypes.json: -------------------------------------------------------------------------------- 1 | { 2 | "audio/aac": "aac", 3 | "application/x-abiword": "abw", 4 | "application/x-freearc": "arc", 5 | "video/x-msvideo": "avi", 6 | "application/vnd.amazon.ebook": "azw", 7 | "application/octet-stream": "bin", 8 | "image/bmp": "bmp", 9 | "application/x-bzip": "bz", 10 | "application/x-bzip2": "bz2", 11 | "application/x-csh": "csh", 12 | "text/css": "css", 13 | "text/csv": "csv", 14 | "application/msword": "doc", 15 | "application/vnd.openxmlformats": "docx", 16 | "application/vnd.ms-fontobject": "eot", 17 | "application/epub+zip": "epub", 18 | "application/gzip": "gz", 19 | "image/gif": "gif", 20 | "text/html": "html", 21 | "image/vnd.microsoft.icon": "ico", 22 | "text/calendar": "ics", 23 | "application/java-archive": "jar", 24 | "image/jpeg": "jpg", 25 | "text/javascript": "js", 26 | "application/json": "json", 27 | "application/ld+json": "jsonld", 28 | "audio/midi": "midi", 29 | "audio/x-midi": "midi", 30 | "audio/mpeg": "mp3", 31 | "video/mpeg": "mpeg", 32 | "application/vnd.apple.installer+xml": "mpkg", 33 | "application/vnd.oasis.opendocument.presentation": "odp", 34 | "application/vnd.oasis.opendocument.spreadsheet": "ods", 35 | "application/vnd.oasis.opendocument.text": "odt", 36 | "audio/ogg": "oga", 37 | "video/ogg": "ogv", 38 | "application/ogg": "ogx", 39 | "audio/opus": "opus", 40 | "font/otf": "otf", 41 | "image/png": "png", 42 | "application/pdf": "pdf", 43 | "application/vnd.ms-powerpoint": "ppt", 44 | "officedocument.presentationml.presentation": "pptx", 45 | "application/vnd.rar": "rar", 46 | "application/rtf": "rtf", 47 | "application/x-sh": "sh", 48 | "image/svg+xml": "svg", 49 | "application/x-shockwave-flash": "swf", 50 | "application/x-tar": "tar", 51 | "image/tiff": "tiff", 52 | "video/mp2t": "ts", 53 | "font/ttf": "ttf", 54 | "text/plain": "txt", 55 | "application/vnd.visio": "vsd", 56 | "audio/wav": "wav", 57 | "audio/webm": "weba", 58 | "video/webm": "webm", 59 | "image/webp": "webp", 60 | "font/woff": "woff", 61 | "font/woff2": "woff2", 62 | "application/xhtml+xml": "xhtml", 63 | "application/vnd.ms-excel": "xls", 64 | "officedocument.spreadsheetml.sheet": "xlsx", 65 | "application/xml": "xml", 66 | "text/xml": "xml", 67 | "application/vnd.mozilla.xul+xml": "xul", 68 | "application/zip": "zip", 69 | "video/3gpp": "3gp", 70 | "audio/3gpp": "3gp", 71 | "video/3gpp2": "3g2", 72 | "audio/3gpp2": "3g2", 73 | "application/x-7z-compressed": "7z" 74 | } -------------------------------------------------------------------------------- /src/export/ZipExporter.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path, { parse } from 'path'; 3 | import JSZip from 'jszip'; 4 | import Exporter from './Exporter'; 5 | import Resource, { ResourceQuery } from '../storage/base/Resource'; 6 | 7 | import * 
as MimeTypes from './MimeTypes.json'; 8 | import { getLogger } from '../logger/Logger'; 9 | 10 | /** Provides ZIP export capabilities. */ 11 | export default class ZipExporter extends Exporter { 12 | logger = getLogger('ZipExporter'); 13 | 14 | zip: JSZip; 15 | zipIdx: number; 16 | 17 | getResourceQuery(): Partial { 18 | return { whereNotNull: [ 'data' ], cols: [ 'url', 'data', 'parent', 'contentType' ] }; 19 | } 20 | 21 | async preParse(): Promise { 22 | this.zipIdx = 0; 23 | } 24 | 25 | async parse(resource: Partial, idx: number): Promise { 26 | // for each bulk resource read do a separate archive 27 | if (idx % this.opts.pageLimit === 0) { 28 | // close the prev archive if present 29 | if (this.zip) { 30 | await this.writeZip(); 31 | this.zipIdx += 1; 32 | } 33 | 34 | // create a new archive 35 | this.zip = new JSZip(); 36 | } 37 | 38 | const name = `${this.getName(resource)}.${this.getExtension(resource)}`; 39 | this.zip.file(name, resource.data); 40 | } 41 | 42 | async postParse() { 43 | await this.writeZip(); 44 | } 45 | 46 | async writeZip() { 47 | const content = await this.zip.generateAsync({ 48 | type: 'uint8array', 49 | compression: 'STORE', 50 | }); 51 | fs.writeFileSync(this.getPath(), content); 52 | } 53 | 54 | getPath() { 55 | const { dir, name, ext } = parse(this.opts.filepath); 56 | 57 | const idxSuffix = this.zipIdx === 0 ? '' : `-${this.zipIdx}`; 58 | const zipPath = path.join(dir, `${name}${idxSuffix}${ext}`); 59 | return zipPath; 60 | } 61 | 62 | getName(resource: Partial): string { 63 | const nameParts: string[] = []; 64 | 65 | // get resource name from parent metadata 66 | if (resource.parent) { 67 | const { title, linkText, imgAlt } = resource.parent; 68 | nameParts.push(title); 69 | nameParts.push(linkText); 70 | nameParts.push(imgAlt); 71 | 72 | const nonEmptyNameParts = nameParts.filter(namePart => namePart); 73 | if (nonEmptyNameParts.length > 0) { 74 | return nonEmptyNameParts.map(namePart => namePart.substr(0, 100)).join('-'); 75 | } 76 | } 77 | 78 | // get resource name just from its url 79 | const nameMatch = /.+\/([^.?]+).*($|\?)/.exec(resource.url); 80 | if (nameMatch) { 81 | return nameMatch[1]; 82 | } 83 | 84 | // failsafe, just return the last part of url 85 | return resource.url.substr(-30); 86 | } 87 | 88 | getExtension(resource: Partial): string { 89 | // extension can be identified based on mime type 90 | if (MimeTypes[resource.contentType]) { 91 | return MimeTypes[resource.contentType]; 92 | } 93 | 94 | // extension can be identified based on regex against url 95 | // have at least 2 ".", one from domain, one from extension 96 | const extensionMatch = /\..+.\.([^.?]+)($|\?)/.exec(resource.url); 97 | if (extensionMatch) { 98 | return extensionMatch[1]; 99 | } 100 | 101 | // failed to find extension 102 | return 'unknown'; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/pipelines/BrowserStaticContentPipeline.ts: -------------------------------------------------------------------------------- 1 | import { Pipeline } from './pipelines'; 2 | 3 | const pipeline:Pipeline = { 4 | defaultPluginOpts: [ 5 | { 6 | name: 'BrowserFetchPlugin', 7 | }, 8 | { 9 | name: 'ExtractUrlsPlugin', 10 | }, 11 | { 12 | name: 'ExtractHtmlContentPlugin', 13 | }, 14 | { 15 | name: 'InsertResourcesPlugin', 16 | }, 17 | { 18 | name: 'UpsertResourcePlugin', 19 | }, 20 | ], 21 | 22 | }; 23 | 24 | export default pipeline; 25 | -------------------------------------------------------------------------------- 
/src/pipelines/DomStaticContentPipeline.ts: -------------------------------------------------------------------------------- 1 | import { Pipeline } from './pipelines'; 2 | 3 | const pipeline:Pipeline = { 4 | defaultPluginOpts: [ 5 | { 6 | name: 'NodeFetchPlugin', 7 | }, 8 | { 9 | name: 'ExtractUrlsPlugin', 10 | domRead: false, 11 | }, 12 | { 13 | name: 'ExtractHtmlContentPlugin', 14 | domRead: false, 15 | }, 16 | { 17 | name: 'InsertResourcesPlugin', 18 | }, 19 | { 20 | name: 'UpsertResourcePlugin', 21 | }, 22 | ], 23 | 24 | }; 25 | 26 | export default pipeline; 27 | -------------------------------------------------------------------------------- /src/pipelines/pipelines.ts: -------------------------------------------------------------------------------- 1 | import { PluginOpts } from '../plugins/Plugin'; 2 | import BrowserStaticContentPipeline from './BrowserStaticContentPipeline'; 3 | import DomStaticContentPipeline from './DomStaticContentPipeline'; 4 | 5 | export type Pipeline = { 6 | defaultPluginOpts:PluginOpts[]; 7 | } 8 | 9 | export type Pipelines = { 10 | [key: string] : Pipeline 11 | } 12 | 13 | /** 14 | * Built-in, predefined pipelines. 15 | * Each one defines a series of plugins with default options to be executed against each to be scraped resource. 16 | */ 17 | const pipelines: Pipelines = { 18 | 'browser-static-content': BrowserStaticContentPipeline, 19 | 'dom-static-content': DomStaticContentPipeline, 20 | }; 21 | 22 | /** 23 | * Takes starting default options and overrides them with custom ones. 24 | * @param defaultOpts - default starting options 25 | * @param customOpts - override options 26 | */ 27 | const mergePluginOpts = (defaultOpts: PluginOpts[], customOpts: PluginOpts[] = []):PluginOpts[] => { 28 | const mergeOpts:PluginOpts[] = [ ...defaultOpts ]; 29 | customOpts.forEach(pluginCustomOpts => { 30 | if (pluginCustomOpts.before) { 31 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.before); 32 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.before} as before anchor`); 33 | mergeOpts.splice(idx, 0, pluginCustomOpts); 34 | return; 35 | } 36 | 37 | if (pluginCustomOpts.replace) { 38 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.replace); 39 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.before} as replace anchor`); 40 | mergeOpts[idx] = pluginCustomOpts; 41 | return; 42 | } 43 | 44 | if (pluginCustomOpts.after) { 45 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.after); 46 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.before} as after anchor`); 47 | mergeOpts.splice(idx + 1, 0, pluginCustomOpts); 48 | return; 49 | } 50 | 51 | const idx = mergeOpts.findIndex(mergePluginOpts => mergePluginOpts.name === pluginCustomOpts.name); 52 | if (idx === -1) throw new Error(`could not find plugin ${pluginCustomOpts.name} as merge anchor`); 53 | mergeOpts[idx] = { ...mergeOpts[idx], ...pluginCustomOpts }; 54 | }); 55 | 56 | return mergeOpts; 57 | }; 58 | 59 | export { 60 | mergePluginOpts, 61 | pipelines, 62 | }; 63 | -------------------------------------------------------------------------------- /src/plugins/Plugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-prototype-builtins */ 2 | /* eslint-disable no-param-reassign */ 3 | /* eslint-disable no-restricted-syntax */ 4 | import { 
JSONSchema7 } from 'json-schema'; 5 | import SchemaHelper from '../schema/SchemaHelper'; 6 | import Project from '../storage/base/Project'; 7 | import Resource from '../storage/base/Resource'; 8 | import BrowserClient from '../browserclient/BrowserClient'; 9 | import { IDomClientConstructor } from '../domclient/DomClient'; 10 | 11 | export type PluginOpts = { 12 | name: string; 13 | domRead?: boolean; 14 | domWrite?: boolean; 15 | [key: string]: unknown; 16 | 17 | // position options within a plugin list 18 | before?: string; 19 | replace?: string; 20 | after?: string; 21 | path?: string; 22 | } 23 | 24 | /** All plugins should extend this class implementing the test and apply methods. */ 25 | export default abstract class Plugin { 26 | static get schema() { 27 | return {}; 28 | } 29 | 30 | opts: Partial; 31 | 32 | constructor(opts: Partial = {}) { 33 | const { schema } = this.constructor; 34 | this.opts = SchemaHelper.instantiate(schema, opts); 35 | } 36 | 37 | /** 38 | * Relevant for a pipeline plugin responsible for actual content scraping. 39 | * @returns keys the scraped data will be exported under 40 | */ 41 | getContentKeys(): string[] { 42 | return undefined; 43 | } 44 | 45 | /** 46 | * Tests if the plugin should be executed or not against the current resource. 47 | * @param project - current scrape project 48 | * @param resource - current scrape resource 49 | */ 50 | abstract test(project: Project, resource: Resource): Promise | boolean; 51 | 52 | /** 53 | * Executes the plugin against the current resource, either in node.js or browser environment. 54 | * The result will be merged into the currently scraped resource at scraper level. 55 | * @param project - current scrape project 56 | * @param resource - current scrape resource 57 | * @param client - current browser client 58 | */ 59 | abstract apply(project: Project, resource: Resource, client: BrowserClient | IDomClientConstructor): Promise> | void | Partial; 60 | } 61 | 62 | export interface IPlugin { 63 | new(kwArgs: Partial): Plugin; 64 | schema: JSONSchema7; 65 | } 66 | -------------------------------------------------------------------------------- /src/plugins/default/BaseFetchPlugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable max-classes-per-file */ 2 | import { Project } from '../..'; 3 | import BrowserClient from '../../browserclient/BrowserClient'; 4 | import Resource from '../../storage/base/Resource'; 5 | import Plugin from '../Plugin'; 6 | 7 | export class FetchError extends Error { 8 | status: number; 9 | redirectUrl: string; 10 | 11 | constructor(status: number, redirectUrl?: string) { 12 | super(); 13 | this.status = status; 14 | this.redirectUrl = redirectUrl; 15 | } 16 | } 17 | 18 | export default abstract class BaseFetchPlugin extends Plugin { 19 | /** 20 | * check against 2xx codes and an optional list of allowed status 21 | * @param status response status code 22 | */ 23 | isValidStatus(status: number, allowedStatus: number[] = []) { 24 | return Math.floor(status / 100) === 2 || allowedStatus.includes(status); 25 | } 26 | 27 | /** 28 | * check against 3xx codes 29 | * @param status response status code 30 | */ 31 | isRedirectStatus(status: number) { 32 | return Math.floor(status / 100) === 3; 33 | } 34 | 35 | test(project: Project, resource: Resource) { 36 | if (!resource || !resource.url) return false; 37 | 38 | // only fetch a resource that hasn't been fetched yet 39 | if (resource.contentType) return false; 40 | 41 | // only http/https 
supported 42 | const { protocol } = new URL(resource.url); 43 | return protocol === 'http:' || protocol === 'https:'; 44 | } 45 | 46 | async apply(project: Project, resource: Resource, client: BrowserClient): Promise> { 47 | let result: Partial; 48 | 49 | try { 50 | result = await this.fetch(resource, client); 51 | } 52 | catch (err) { 53 | return this.fetchErrResult(err); 54 | } 55 | 56 | return result; 57 | } 58 | 59 | fetchErrResult(err: Error) { 60 | if (err instanceof FetchError) { 61 | const { status, redirectUrl } = err; 62 | /* 63 | redirect detected 64 | for the current resource return redirect status 65 | also add the final url as a new resource to be scraped 66 | don't return contentType as many plugin use it as testing condition and we don't want the original redirect url to be scraped 67 | */ 68 | if (this.isRedirectStatus(status)) { 69 | return { 70 | status, 71 | resourcesToAdd: [ { url: redirectUrl } ], 72 | }; 73 | } 74 | 75 | /* 76 | all other fetch errors 77 | don't return contentType as many plugins use it as testing condition and we don't want the original redirect url to be scraped 78 | */ 79 | return { 80 | status, 81 | }; 82 | } 83 | 84 | // errors not related to fetch status code 85 | throw err; 86 | } 87 | 88 | /** 89 | * Extract just the content type, not the full header value 90 | * @param rawContentType : like 'text/html; charset=UTF-8' 91 | */ 92 | getContentType(rawContentType: string): string { 93 | if (rawContentType) { 94 | const matchArr = rawContentType.match(/^[^;]+/); 95 | return matchArr ? matchArr[0] : null; 96 | } 97 | return null; 98 | } 99 | 100 | abstract fetch(resource: Resource, client?: BrowserClient, opts?: RequestInit); 101 | } 102 | -------------------------------------------------------------------------------- /src/plugins/default/InsertResourcesPlugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-param-reassign */ 2 | /* eslint-disable no-await-in-loop */ 3 | import Plugin from '../Plugin'; 4 | import Project from '../../storage/base/Project'; 5 | import Resource from '../../storage/base/Resource'; 6 | import { SchemaType } from '../../schema/SchemaHelper'; 7 | import { getLogger } from '../../logger/Logger'; 8 | 9 | /** Saves in database newly identified resources within the current project. */ 10 | export default class InsertResourcesPlugin extends Plugin { 11 | static get schema() { 12 | return { 13 | type: 'object', 14 | title: 'Insert Resources Plugin', 15 | description: 'saves new resources within the current project based on newly identified urls.', 16 | properties: { 17 | maxResources: { 18 | type: 'integer', 19 | default: -1, 20 | title: 'Max Resources', 21 | description: 'Maximum number of resources to be saved and scraped. A value of -1 disables this check.', 22 | }, 23 | }, 24 | } as const; 25 | } 26 | 27 | logger = getLogger('InsertResourcesPlugin'); 28 | 29 | opts: SchemaType; 30 | 31 | constructor(opts: SchemaType = {}) { 32 | super(opts); 33 | } 34 | 35 | test(project: Project, resource: Resource) { 36 | if (!resource) return false; 37 | 38 | // only save new urls if there's something to save 39 | return resource.resourcesToAdd && resource.resourcesToAdd.length > 0; 40 | } 41 | 42 | /** 43 | * Uses project.queue to INSERT to-be-scraped resources with IGNORE on 'url' CONFLICT. 
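   * When a maxResources threshold is set, only enough new urls are inserted to stay within it;
   * the remaining capacity is derived from the current queue count (see the threshold branch below).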
44 | */ 45 | async apply(project: Project, resource: Resource) { 46 | const { resourcesToAdd } = resource; 47 | 48 | this.logger.debug(resourcesToAdd, 'adding newly discovered resources'); 49 | 50 | // each 'child' resource has an increased 'depth' relative to its parent 51 | resourcesToAdd.forEach(resourceToAdd => { 52 | resourceToAdd.depth = resource.depth + 1; 53 | }); 54 | 55 | // a threshold is defined, take it into account 56 | if (this.opts.maxResources > 0) { 57 | const resourceCount = await project.queue.count(); 58 | const maxResourcesToAdd = Math.max(0, this.opts.maxResources - resourceCount); 59 | 60 | // add resources below the threshold 61 | if (maxResourcesToAdd > 0) { 62 | // inserting all resources doesn't exceed the threshold 63 | if (maxResourcesToAdd >= resourcesToAdd.length) { 64 | await project.queue.add(resourcesToAdd); 65 | } 66 | // inserting all resources exceeds the threshold, only insert a subset 67 | else { 68 | const toCheckUrls = resourcesToAdd.map(resourceToAdd => resourceToAdd.url); 69 | const newUrls = await project.queue.filterNewUrls(toCheckUrls); 70 | let newResourcesNotInStorage = resourcesToAdd.filter(resourceToAdd => newUrls.includes(resourceToAdd.url)); 71 | 72 | if (newResourcesNotInStorage.length > 0) { 73 | newResourcesNotInStorage = newResourcesNotInStorage.slice(0, Math.min(maxResourcesToAdd, newResourcesNotInStorage.length)); 74 | } 75 | await project.queue.add(newResourcesNotInStorage); 76 | } 77 | } 78 | } 79 | // no threshold, insert all resources 80 | else { 81 | await project.queue.add(resourcesToAdd); 82 | } 83 | 84 | return { resourcesToAdd: null }; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/plugins/default/UpsertResourcePlugin.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | /* eslint-disable no-param-reassign */ 3 | import Plugin from '../Plugin'; 4 | import Project from '../../storage/base/Project'; 5 | import Resource from '../../storage/base/Resource'; 6 | import { SchemaType } from '../../schema/SchemaHelper'; 7 | import { getLogger } from '../../logger/Logger'; 8 | 9 | /** Updates a resource in the database after its scraping completes. 
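 * Unless the resource was produced by dynamic browser actions, the matching queue entry status is updated as well.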
*/ 10 | export default class UpsertResourcePlugin extends Plugin { 11 | static get schema() { 12 | return { 13 | type: 'object', 14 | title: 'Upsert Resource Plugin', 15 | description: 'updates a static resource or inserts a dynamic one after scraping it.', 16 | properties: { 17 | keepHtmlData: { 18 | type: 'boolean', 19 | default: false, 20 | title: 'Keep Html Data', 21 | description: 'Whether or not to save html buffer response (if present) under resource.data', 22 | }, 23 | }, 24 | } as const; 25 | } 26 | 27 | logger = getLogger('UpsertResourcePlugin'); 28 | opts: SchemaType; 29 | 30 | constructor(opts: SchemaType = {}) { 31 | super(opts); 32 | } 33 | 34 | test(project: Project, resource: Resource) { 35 | return !!(resource); 36 | } 37 | 38 | async apply(project: Project, resource: Resource) { 39 | // guard against incomplete resources not capable of updating the scrape queue 40 | if (!resource.status || !resource.queueEntryId) { 41 | throw new Error('incomplete resource'); 42 | } 43 | 44 | /* 45 | scrape complete, update queue entry, save scraped resource 46 | a resource generated from dynamic actions doesn't update the corresponding queue entry, it has already been updated by the `parent` static resource 47 | 48 | at some point, treat differently: 49 | - scraped in error resources: don't add them to the resource table as they don't contain succesfull scraped content 50 | */ 51 | if (!resource.actions) { 52 | await Promise.all([ 53 | this.saveResource(resource), 54 | project.queue.updateStatus(resource.queueEntryId, resource.status), 55 | ]); 56 | } 57 | else { 58 | await this.saveResource(resource); 59 | } 60 | 61 | /* 62 | after a resource is updated, remove its dynamic actions 63 | this allows for other dynamic plugins to be triggered 64 | */ 65 | return { actions: null }; 66 | } 67 | 68 | async saveResource(resource: Resource) { 69 | // scrape complete, remove inProgress flag, set scrape date 70 | resource.scrapedAt = new Date(Date.now()); 71 | 72 | // only save html response under resource.data (Uint8Array) if the corresponding flag is set 73 | if (!this.opts.keepHtmlData && (/html/i).test(resource.contentType) && resource.data) { 74 | resource.data = null; 75 | } 76 | 77 | await resource.save(); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/plugins/dom-utils.ts: -------------------------------------------------------------------------------- 1 | export const enum DomStabilityStatus { 2 | Stable, 3 | Unstable, 4 | Unchanged 5 | } 6 | 7 | /** 8 | * Useful for bypassing preloader content. 9 | * @param stabilityCheck - Considers the page loaded and ready to be scraped when there are no more DOM changes within the specified amount of time (milliseconds). 10 | * @param stabilityTimeout - Maximum waiting time (miliseconds) for achieving DOM stability in case of a continuously updated DOM (ex: timers, countdowns). 11 | */ 12 | function waitForDomStability({ stabilityCheck, stabilityTimeout }:{stabilityCheck: number, stabilityTimeout: number}):Promise { 13 | return new Promise(resolve => { 14 | let stabilityCheckId:number; 15 | let stabilityTimeoutId:number; 16 | let domChanged = false; 17 | 18 | // if this is reached, DOM is stable 19 | const waitStableResolve = observer => { 20 | window.clearTimeout(stabilityTimeoutId); 21 | observer.disconnect(); 22 | resolve(domChanged ? 
DomStabilityStatus.Stable : DomStabilityStatus.Unchanged); 23 | }; 24 | 25 | const observer = new MutationObserver((mutationList, observer) => { 26 | for (let i = 0; i < mutationList.length; i += 1) { 27 | // we only care if new nodes have been added 28 | if (mutationList[i].type === 'childList') { 29 | // restart the stabilityCheck timer 30 | domChanged = true; 31 | window.clearTimeout(stabilityCheckId); 32 | stabilityCheckId = window.setTimeout(waitStableResolve, stabilityCheck, observer); 33 | break; 34 | } 35 | } 36 | }); 37 | 38 | // start stability check countdown 39 | stabilityCheckId = window.setTimeout(waitStableResolve, stabilityCheck, observer); 40 | 41 | // start observing document.body 42 | observer.observe(document.body, { attributes: true, childList: true, subtree: true }); 43 | 44 | // enforce stability timeout 45 | stabilityTimeoutId = window.setTimeout( 46 | () => { 47 | // clear in progress stability check 48 | window.clearTimeout(stabilityCheckId); 49 | 50 | observer.disconnect(); 51 | resolve(DomStabilityStatus.Unstable); 52 | }, 53 | stabilityTimeout, 54 | ); 55 | }); 56 | } 57 | 58 | export { 59 | waitForDomStability, 60 | }; 61 | -------------------------------------------------------------------------------- /src/plugins/file-utils.ts: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path from 'path'; 3 | 4 | /** 5 | * Get closest parent dir containing a package.json file 6 | */ 7 | // eslint-disable-next-line import/prefer-default-export 8 | export function getPackageDir(startPath: string):string { 9 | const startDirPath = path.dirname(startPath); 10 | const parentPath: string[] = []; 11 | while (!fs.existsSync(path.join(startDirPath, ...parentPath, 'package.json')) || parentPath.length > 10) { 12 | parentPath.push('..'); 13 | } 14 | 15 | return path.join(startDirPath, ...parentPath); 16 | } 17 | 18 | export function moduleExists(name) { 19 | try { 20 | return require.resolve(name); 21 | } 22 | catch (e) { 23 | return false; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/plugins/url-utils.ts: -------------------------------------------------------------------------------- 1 | export const enum Protocol { 2 | HTTPS = 'https:', HTTP = 'http:' 3 | } 4 | 5 | /** 6 | * URL normalization including adding protocol prefix if missing. 7 | * Mostly used in batch insert jobs. 8 | * @param rawUrl - input url 9 | * @param defaultProtocol - protocol to add if one is not present, defaults to https 10 | * @throws error on invalid urls 11 | * @returns normalized url 12 | */ 13 | export function normalizeUrl(rawUrl: string, defaultProtocol:string = Protocol.HTTPS):string { 14 | if (!this.isURL(rawUrl)) throw new Error(`error normalizing url: ${rawUrl}`); 15 | 16 | // if protocol is missing, add default one 17 | const fullUrl = rawUrl.split('//').length === 1 ? `${defaultProtocol}//${rawUrl}` : rawUrl; 18 | return new URL(fullUrl).toString(); 19 | } 20 | 21 | /** 22 | * Identify the csv column containing an url 23 | * @param csvRow - csv row with columns separated by ',' 24 | */ 25 | export function getUrlColIdx(csvRow: string):number { 26 | const urlIdx = csvRow.split(',').map(col => col.trim()).findIndex(col => this.isURL(col)); 27 | if (urlIdx === -1) throw new Error(`could not detect url column from ${csvRow}`); 28 | return urlIdx; 29 | } 30 | 31 | /** 32 | * Check if a url is valid based on regex. Protocol prefix is optional. 
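 * e.g. (illustrative) isURL('www.example.com') === true, isURL('https://example.com/page') === true, isURL('no-dot-here') === false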
33 | * @param url - input candidate 34 | * @returns - whether or not the input url is valid 35 | */ 36 | export function isURL(url: string):boolean { 37 | return /([a-z0-9]+(-[a-z0-9]+)*\.)+[a-z]{2,}/.test(url.toLowerCase()); 38 | } 39 | -------------------------------------------------------------------------------- /src/scraper/QueueBuffer.ts: -------------------------------------------------------------------------------- 1 | import Project from '../storage/base/Project'; 2 | import Resource from '../storage/base/Resource'; 3 | 4 | export default class QueueBuffer { 5 | project: Project; 6 | resources: Resource[]; 7 | size: number; 8 | 9 | /** 10 | * Prevents parallel project.queue.getResourcesToScrape calls 11 | */ 12 | refillInProgress:boolean; 13 | 14 | error; 15 | 16 | constructor(size: number) { 17 | this.size = size; 18 | this.resources = []; 19 | } 20 | 21 | init(project, resources: Resource[]) { 22 | this.project = project; 23 | this.resources = resources; 24 | } 25 | 26 | async refill():Promise { 27 | // buffer is only filled sequentially 28 | if (this.refillInProgress) return; 29 | 30 | try { 31 | this.refillInProgress = true; 32 | const toBeScrapedResources = await this.project.queue.getResourcesToScrape(this.size - this.resources.length); 33 | this.addResources(toBeScrapedResources); 34 | this.refillInProgress = false; 35 | } 36 | catch (err) { 37 | // parent call doesn't wait for this async to finish thus can't catch it, store err separately 38 | this.error = err; 39 | } 40 | } 41 | 42 | async getResource(stop:boolean):Promise { 43 | /* 44 | stop signal has been received 45 | gracefully stop scraping, allow all scrape-in-progress resources to be scraped 46 | */ 47 | if (stop) { 48 | if (this.resources.length > 0) { 49 | // re-make to-be-scraped buffered resources eligible for scraping by reseting their status flag 50 | await Promise.all(this.resources.map(resource => this.project.queue.updateStatus(resource.queueEntryId, null))); 51 | this.resources = []; 52 | } 53 | 54 | return null; 55 | } 56 | 57 | /* 58 | stop signal was received due to buffer error in an independent async call, throw the error up 59 | parent scraper will catch any errors and stop the process via the `stop` flag 60 | */ 61 | if (this.error) throw (this.error); 62 | 63 | // attemp to re-fill buffer before it's completely empty 64 | if (this.resources.length < this.size / 2) { 65 | // buffer needs to be refilled now, can't refill it independently, we risk isScrapingComplete condition to pass 66 | if (this.resources.length === 0) { 67 | await this.refill(); 68 | // take advantage of waiting for refillBuffer, directly thrown the error if one was caught 69 | // avoid isScrapingComplete returning true on empty buffer due to refillBuffer error 70 | if (this.error) throw (this.error); 71 | } 72 | // refill buffer independently 73 | else { 74 | this.refill(); 75 | } 76 | } 77 | 78 | // in the future, don't just retrieve the 1st resource, attempt to search for one meeting the concurrency conditions 79 | return this.resources.length > 0 ? 
this.resources.shift() : null; 80 | } 81 | 82 | addResources(resources: Resource[]) { 83 | this.resources.push(...resources); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/storage/base/Connection.ts: -------------------------------------------------------------------------------- 1 | import { IProjectStorage } from './Project'; 2 | import { IQueueStorage } from './Queue'; 3 | import { IResourceStorage } from './Resource'; 4 | 5 | export type ConnectionConfig = { 6 | client: string, 7 | [key: string]: any 8 | } 9 | 10 | export default abstract class Connection { 11 | config: ConnectionConfig; 12 | 13 | constructor(config:ConnectionConfig) { 14 | this.config = config; 15 | } 16 | 17 | abstract open():Promise; 18 | abstract close():Promise; 19 | 20 | abstract getProjectStorage():IProjectStorage; 21 | abstract getQueueStorage():IQueueStorage; 22 | abstract getResourceStorage():IResourceStorage; 23 | } 24 | -------------------------------------------------------------------------------- /src/storage/base/Entity.ts: -------------------------------------------------------------------------------- 1 | /** Base class for all entities. */ 2 | export default abstract class Entity { 3 | id: string | number; 4 | 5 | abstract save():Promise; 6 | abstract del():Promise; 7 | abstract toJSON(); 8 | 9 | abstract get dbCols(): string[]; 10 | 11 | constructor(kwArgs: Partial) { 12 | Object.keys(kwArgs).forEach(kwArgKey => { 13 | this[kwArgKey] = kwArgs[kwArgKey]; 14 | }); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/storage/base/Storage.ts: -------------------------------------------------------------------------------- 1 | import Connection from './Connection'; 2 | 3 | /** 4 | * Each storage option (db, in-memory) extends this class. 
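 * e.g. the knex-backed storages (KnexProject, KnexQueue, KnexResource) are each constructed with the same KnexConnection instance.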
5 | */ 6 | export default abstract class Storage { 7 | conn: Connection; 8 | 9 | constructor(conn:Connection) { 10 | this.conn = conn; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/storage/knex/KnexConnection.ts: -------------------------------------------------------------------------------- 1 | import { Knex, knex } from 'knex'; 2 | import Connection, { ConnectionConfig } from '../base/Connection'; 3 | import { IProjectStorage } from '../base/Project'; 4 | import { IQueueStorage } from '../base/Queue'; 5 | import { IResourceStorage } from '../base/Resource'; 6 | import KnexProject from './KnexProject'; 7 | import KnexQueue from './KnexQueue'; 8 | import KnexResource from './KnexResource'; 9 | 10 | export default class KnexConnection extends Connection { 11 | knex: Knex; 12 | config: Knex.Config & {client: string}; 13 | 14 | constructor(config?:ConnectionConfig) { 15 | // if no config present, use in memory sqlite 16 | super(config || { 17 | client: 'sqlite3', 18 | useNullAsDefault: true, 19 | connection: { 20 | filename: ':memory:', 21 | }, 22 | debug: false, 23 | }); 24 | } 25 | 26 | async open() { 27 | if (!this.knex) { 28 | this.knex = knex(this.config); 29 | } 30 | } 31 | 32 | async close():Promise { 33 | if (this.knex) { 34 | const { knex } = this; 35 | delete this.knex; 36 | await knex.destroy(); 37 | } 38 | } 39 | 40 | getProjectStorage():IProjectStorage { 41 | return new KnexProject(this); 42 | } 43 | 44 | getQueueStorage():IQueueStorage { 45 | return new KnexQueue(this); 46 | } 47 | 48 | getResourceStorage():IResourceStorage { 49 | return new KnexResource(this); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/storage/knex/KnexProject.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable no-await-in-loop */ 2 | import Project, { IProjectStorage } from '../base/Project'; 3 | import KnexStorage from './KnexStorage'; 4 | 5 | export default class KnexProject extends KnexStorage implements IProjectStorage { 6 | get builder() { 7 | return this.knex('projects'); 8 | } 9 | 10 | async init():Promise { 11 | const schemaBuilder = this.knex.schema; 12 | const tablePresent = await schemaBuilder.hasTable('projects'); 13 | if (tablePresent) return; 14 | 15 | await schemaBuilder.createTable( 16 | 'projects', 17 | builder => { 18 | builder.increments('id').primary(); 19 | builder.string('name').unique(); 20 | 21 | this.jsonCol(builder, 'pluginOpts'); 22 | }, 23 | ); 24 | } 25 | 26 | async get(nameOrId: number | string) { 27 | const colName = Number.isInteger(nameOrId) ? 
'id' : 'name'; 28 | return this.builder.where({ [colName]: nameOrId }).first(); 29 | } 30 | 31 | save(project:Project):Promise { 32 | return super.save(project); 33 | } 34 | 35 | update(project: Project):Promise { 36 | return this.builder.where('id', project.id).update(this.toJSON(project)); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/storage/knex/KnexResource.ts: -------------------------------------------------------------------------------- 1 | import Project from '../base/Project'; 2 | import Resource, { ResourceQuery, IResourceStorage } from '../base/Resource'; 3 | import KnexStorage from './KnexStorage'; 4 | 5 | /** @see {@link Resource} */ 6 | export default class KnexResource extends KnexStorage implements IResourceStorage { 7 | projectId: string | number; 8 | 9 | get tableName():string { 10 | if (!this.projectId) throw new Error('projectId not set'); 11 | return `${this.projectId}-resources`; 12 | } 13 | 14 | get builder() { 15 | return this.knex(this.tableName); 16 | } 17 | 18 | async init(project:Project):Promise { 19 | if (!project.id) throw new Error('project.id not set'); 20 | this.projectId = project.id; 21 | 22 | const schemaBuilder = this.knex.schema; 23 | const tablePresent = await schemaBuilder.hasTable(this.tableName); 24 | if (tablePresent) return; 25 | 26 | await schemaBuilder.createTable( 27 | this.tableName, 28 | builder => { 29 | builder.increments('id').primary(); 30 | builder.string('url'); 31 | builder.integer('depth').defaultTo(0); 32 | builder.dateTime('scrapedAt'); 33 | 34 | builder.integer('status'); 35 | builder.string('contentType'); 36 | 37 | this.jsonCol(builder, 'content'); 38 | this.jsonCol(builder, 'parent'); 39 | this.jsonCol(builder, 'actions'); 40 | 41 | this.binaryCol(builder, 'data'); 42 | }, 43 | ); 44 | } 45 | 46 | async getPagedResources(query: Partial):Promise[]> { 47 | const { cols, where, whereNotNull, whereIn, offset, limit } = query; 48 | 49 | let queryBuilder = this.builder.select(cols || [ 'url', 'content' ]).orderBy('id'); 50 | 51 | if (where && Object.keys(where).length > 0) { 52 | queryBuilder = queryBuilder.where(where); 53 | } 54 | if (offset !== undefined) { 55 | queryBuilder = queryBuilder.offset(offset); 56 | } 57 | if (limit !== undefined) { 58 | queryBuilder = queryBuilder.limit(limit); 59 | } 60 | if (whereNotNull) { 61 | whereNotNull.forEach(notNullCol => { 62 | queryBuilder = queryBuilder.whereNotNull(notNullCol); 63 | }); 64 | } 65 | if (whereIn) { 66 | Object.keys(whereIn).forEach(key => { 67 | queryBuilder = queryBuilder.whereIn(key, whereIn[key]); 68 | }); 69 | } 70 | 71 | return queryBuilder; 72 | } 73 | 74 | getResource(url: string):Promise { 75 | return this.builder.where({ url }).first(); 76 | } 77 | 78 | delAll():Promise { 79 | return this.builder.del(); 80 | } 81 | 82 | drop() { 83 | return this.knex.schema.dropTable(this.tableName); 84 | } 85 | 86 | count():Promise { 87 | return super.count(this.tableName); 88 | } 89 | 90 | save(resource):Promise { 91 | return super.save(resource); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /test/.mocharc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | diff: true, 3 | recursive: true, 4 | extension: ['ts'], 5 | package: './package.json', 6 | reporter: 'spec', 7 | timeout: 55000, 8 | file: ['./test/utils/ts-node-config.js', './test/utils/shims.js'] 9 | }; 
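// note: extension ['ts'] plus the ./test/utils/ts-node-config.js preload (presumably registering ts-node) let mocha pick up the TypeScript suites directly;
// the generous 55s timeout leaves headroom for the slower browser-based acceptance runs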
-------------------------------------------------------------------------------- /test/acceptance/cheerio.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import CheerioClient from '../../src/domclient/CheerioClient'; 8 | import { ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 9 | import { PluginOpts } from '../../src'; 10 | 11 | const conn:Connection[] = [ 12 | new KnexConnection(sqliteConn), 13 | new KnexConnection(mysqlConn), 14 | new KnexConnection(pgConn), 15 | ]; 16 | 17 | const concurrencyOptions:ConcurrencyOptions[] = [ 18 | { 19 | proxyPool: [ { 20 | host: '127.0.0.1', 21 | port: 8080, 22 | } ], 23 | }, 24 | { 25 | proxy: { 26 | maxRequests: 10, 27 | delay: 100, 28 | }, 29 | domain: { 30 | maxRequests: 10, 31 | delay: 100, 32 | }, 33 | proxyPool: [ { 34 | host: '127.0.0.1', 35 | port: 8080, 36 | } ], 37 | }, 38 | ]; 39 | 40 | const pluginOptions: PluginOpts[][] = [ 41 | [ 42 | { 43 | name: 'NodeFetchPlugin', 44 | headers: { 45 | 'Accept-Encoding': 'br,gzip,deflate', 46 | }, 47 | }, 48 | ], 49 | [ 50 | { 51 | name: 'NodeFetchPlugin', 52 | headers: { 53 | 'Accept-Encoding': 'identity', 54 | }, 55 | }, 56 | ], 57 | ]; 58 | 59 | for (let i = 0; i < conn.length; i += 1) { 60 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 61 | /* 62 | only when using cheerio 63 | for parallel scraping, fetch resources both compressed and uncompressed 64 | sequential scraping will fetch using default headers (accepting gzip) like the other acceptance suites 65 | */ 66 | const nodeScrapeWithCustomHeaders = concurrencyOptions[j].proxy && concurrencyOptions[j].proxy.maxRequests > 1; 67 | 68 | if (nodeScrapeWithCustomHeaders) { 69 | for (let k = 0; k < pluginOptions.length; k += 1) { 70 | acceptanceSuite( 71 | 'dom-static-content', 72 | conn[i], 73 | CheerioClient, 74 | concurrencyOptions[j], 75 | pluginOptions[k], 76 | ); 77 | } 78 | } 79 | else { 80 | acceptanceSuite( 81 | 'dom-static-content', 82 | conn[i], 83 | CheerioClient, 84 | concurrencyOptions[j], 85 | ); 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-single-page-single-content-entry-custom-plugin.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | } 8 | }, 9 | "client": { 10 | "name": "cheerio" 11 | }, 12 | "project": { 13 | "name": "sitea.com", 14 | "pipeline": "dom-static-content", 15 | "pluginOpts": [ 16 | { 17 | "name": "NodeFetchPlugin", 18 | "headers": { 19 | "Accept-Encoding": "identity" 20 | } 21 | }, 22 | { 23 | "name": "H1CounterPlugin", 24 | "path": "../plugins/h1-counter-plugin.js", 25 | "replace": "ExtractHtmlContentPlugin", 26 | "startVal": 50 27 | } 28 | ], 29 | "resources": [ 30 | { 31 | "url": "http://sitea.com/index.html" 32 | } 33 | ] 34 | }, 35 | "concurrency": { 36 | "proxy": { 37 | "maxRequests": 10, 38 | "delay": 100 39 | }, 40 | "domain": { 41 | "maxRequests": 10, 42 | "delay": 100 43 | 
}, 44 | "proxyPool": [ { 45 | "host": "127.0.0.1", 46 | "port": 8080 47 | } ] 48 | }, 49 | "process": { 50 | } 51 | } -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-single-page-single-content-entry.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "sitea.com", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "NodeFetchPlugin", 19 | "headers": { 20 | "Accept-Encoding": "identity" 21 | } 22 | }, 23 | { 24 | "name": "ExtractHtmlContentPlugin", 25 | "selectorPairs": [ 26 | { 27 | "contentSelector": "h1" 28 | } 29 | ] 30 | } 31 | ], 32 | "resources": [ 33 | { 34 | "url": "http://sitea.com/index.html" 35 | } 36 | ] 37 | }, 38 | "concurrency": { 39 | "proxy": { 40 | "maxRequests": 10, 41 | "delay": 100 42 | }, 43 | "domain": { 44 | "maxRequests": 10, 45 | "delay": 100 46 | }, 47 | "proxyPool": [ { 48 | "host": "127.0.0.1", 49 | "port": 8080 50 | } ] 51 | }, 52 | "process": { 53 | } 54 | } -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-with-external-resources.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | } 8 | }, 9 | "client": { 10 | "name": "cheerio" 11 | }, 12 | "project": { 13 | "name": "sitea.com", 14 | "pipeline": "dom-static-content", 15 | "pluginOpts": [ 16 | { 17 | "name": "NodeFetchPlugin", 18 | "headers": { 19 | "Accept-Encoding": "identity" 20 | } 21 | }, 22 | { 23 | "name": "ExtractHtmlContentPlugin", 24 | "selectorPairs": [ 25 | { 26 | "contentSelector": "h1" 27 | } 28 | ] 29 | } 30 | ], 31 | "resourcePath": "../resources/resources.csv", 32 | "resources": [ 33 | { 34 | "url": "http://sitea.com/index.html" 35 | } 36 | ] 37 | }, 38 | "concurrency": { 39 | "proxy": { 40 | "maxRequests": 10, 41 | "delay": 100 42 | }, 43 | "domain": { 44 | "maxRequests": 10, 45 | "delay": 100 46 | }, 47 | "proxyPool": [ { 48 | "host": "127.0.0.1", 49 | "port": 8080 50 | } ] 51 | }, 52 | "process": { 53 | } 54 | } -------------------------------------------------------------------------------- /test/acceptance/cli/config/config-with-invalid-external-resources.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "../../../tmp/db.sqlite" 7 | } 8 | }, 9 | "client": { 10 | "name": "cheerio" 11 | }, 12 | "project": { 13 | "name": "sitea.com", 14 | "pipeline": "dom-static-content", 15 | "pluginOpts": [ 16 | { 17 | "name": "NodeFetchPlugin", 18 | "headers": { 19 | "Accept-Encoding": "identity" 20 | } 21 | }, 22 | { 23 | "name": "ExtractHtmlContentPlugin", 24 | "selectorPairs": [ 25 | { 26 | "contentSelector": "h1" 27 | } 28 | ] 29 | } 30 | ], 31 | "resourcePath": "non-existent-resources.csv" 32 | }, 33 | "concurrency": { 34 | "proxy": { 35 | "maxRequests": 10, 36 | "delay": 100 37 | }, 38 | "domain": { 39 | "maxRequests": 10, 40 | "delay": 100 41 | }, 42 | "proxyPool": [ { 43 | "host": "127.0.0.1", 44 | "port": 8080 45 | } ] 46 | }, 47 | 
"process": { 48 | } 49 | } -------------------------------------------------------------------------------- /test/acceptance/cli/plugins/h1-counter-plugin.js: -------------------------------------------------------------------------------- 1 | class H1CounterPlugin { 2 | opts = { 3 | startVal: 10, 4 | } 5 | 6 | // defines csv export columns 7 | getContentKeys() { 8 | return [ 'h1', 'h1Length' ]; 9 | } 10 | 11 | test(project, resource) { 12 | if (!resource) return false; 13 | return (/html/i).test(resource.contentType); 14 | } 15 | 16 | apply(project, resource, DomClient) { 17 | const doc = new DomClient(resource.data); 18 | 19 | const content = doc.querySelectorAll('h1').map(domNode => ([ 20 | domNode.getAttribute('innerText'), 21 | domNode.getAttribute('innerText').length + this.opts.startVal, 22 | ])); 23 | 24 | /* 25 | a content entry is represented by an array containing one or multiple scraped values 26 | we can have multiple content entries for a single resources due to dom selectors returning multiple results 27 | */ 28 | return { content }; 29 | } 30 | } 31 | 32 | module.exports = H1CounterPlugin; 33 | -------------------------------------------------------------------------------- /test/acceptance/cli/resources/resources-single-entry.csv: -------------------------------------------------------------------------------- 1 | 1,http://siteA.com/other1.html -------------------------------------------------------------------------------- /test/acceptance/cli/resources/resources.csv: -------------------------------------------------------------------------------- 1 | 1,http://sitea.com/other1.html 2 | 2,http://sitea.com/other2.html 3 | 3,http://sitea.com/other3.html -------------------------------------------------------------------------------- /test/acceptance/cli/resources/unnormalized-resources.csv: -------------------------------------------------------------------------------- 1 | 1,http://siteA.com/other1.html 2 | 2,http://siteA.com/other2.html 3 | 3,invalid-url -------------------------------------------------------------------------------- /test/acceptance/docker/config/base-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "storage": { 3 | "client": "sqlite3", 4 | "useNullAsDefault": true, 5 | "connection": { 6 | "filename": "gsf.sqlite" 7 | }, 8 | "debug": false 9 | }, 10 | "client": { 11 | "name": "cheerio" 12 | }, 13 | "project": { 14 | "name": "myProj", 15 | "pipeline": "dom-static-content", 16 | "pluginOpts": [ 17 | { 18 | "name": "ExtractHtmlContentPlugin", 19 | "selectorPairs": [ 20 | { 21 | "contentSelector": "h1" 22 | } 23 | ] 24 | } 25 | ], 26 | "resources": [ 27 | { 28 | "url": "http://sitea.com/index.html" 29 | } 30 | ] 31 | }, 32 | "concurrency": { 33 | "proxyPool": [ 34 | { 35 | "host": "127.0.0.1", 36 | "port": 8080 37 | } 38 | ] 39 | } 40 | } -------------------------------------------------------------------------------- /test/acceptance/jsdom.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import JsdomClient from '../../src/domclient/JsdomClient'; 8 | import { 
ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 9 | 10 | const conn:Connection[] = [ 11 | new KnexConnection(sqliteConn), 12 | new KnexConnection(mysqlConn), 13 | new KnexConnection(pgConn), 14 | ]; 15 | 16 | const concurrencyOptions:ConcurrencyOptions[] = [ 17 | { 18 | proxyPool: [ { 19 | host: '127.0.0.1', 20 | port: 8080, 21 | } ], 22 | }, 23 | { 24 | proxy: { 25 | maxRequests: 10, 26 | delay: 100, 27 | }, 28 | domain: { 29 | maxRequests: 10, 30 | delay: 100, 31 | }, 32 | proxyPool: [ { 33 | host: '127.0.0.1', 34 | port: 8080, 35 | } ], 36 | }, 37 | ]; 38 | 39 | for (let i = 0; i < conn.length; i += 1) { 40 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 41 | acceptanceSuite( 42 | 'dom-static-content', 43 | conn[i], 44 | JsdomClient, 45 | concurrencyOptions[j], 46 | ); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /test/acceptance/playwright_chromium.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import * as playwrightChromium from '../config/browserclient/playwright/playwright-chromium.json'; 8 | import PlaywrightClient from '../../src/browserclient/PlaywrightClient'; 9 | import { ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 10 | 11 | const browserClient = new PlaywrightClient(playwrightChromium); 12 | 13 | const conn:Connection[] = [ 14 | new KnexConnection(sqliteConn), 15 | new KnexConnection(mysqlConn), 16 | new KnexConnection(pgConn), 17 | ]; 18 | 19 | const concurrencyOptions:ConcurrencyOptions[] = [ 20 | { 21 | proxyPool: [ { 22 | host: '127.0.0.1', 23 | port: 8080, 24 | } ], 25 | }, 26 | { 27 | proxy: { 28 | maxRequests: 10, 29 | delay: 100, 30 | }, 31 | domain: { 32 | maxRequests: 10, 33 | delay: 100, 34 | }, 35 | proxyPool: [ { 36 | host: '127.0.0.1', 37 | port: 8080, 38 | } ], 39 | }, 40 | ]; 41 | 42 | for (let i = 0; i < conn.length; i += 1) { 43 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 44 | acceptanceSuite( 45 | 'browser-static-content', 46 | conn[i], 47 | browserClient, 48 | concurrencyOptions[j], 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /test/acceptance/puppeteer_chromium.ts: -------------------------------------------------------------------------------- 1 | import acceptanceSuite from './acceptance-suite'; 2 | import Connection from '../../src/storage/base/Connection'; 3 | import KnexConnection from '../../src/storage/knex/KnexConnection'; 4 | import * as sqliteConn from '../config/storage/sqlite/sqlite-conn.json'; 5 | import * as mysqlConn from '../config/storage/mysql/mysql-conn.json'; 6 | import * as pgConn from '../config/storage/pg/pg-conn.json'; 7 | import * as puppeteerChromium from '../config/browserclient/puppeteer/puppeteer-chromium.json'; 8 | import PuppeteerClient from '../../src/browserclient/PuppeteerClient'; 9 | import { ConcurrencyOptions } from '../../src/scraper/ConcurrencyManager'; 10 | 11 | const browserClient = new PuppeteerClient(puppeteerChromium); 12 | 13 | const conn:Connection[] = [ 14 | new KnexConnection(sqliteConn), 
15 | new KnexConnection(mysqlConn), 16 | new KnexConnection(pgConn), 17 | ]; 18 | 19 | const concurrencyOptions:ConcurrencyOptions[] = [ 20 | { 21 | proxyPool: [ { 22 | host: '127.0.0.1', 23 | port: 8080, 24 | } ], 25 | }, 26 | { 27 | proxy: { 28 | maxRequests: 10, 29 | delay: 100, 30 | }, 31 | domain: { 32 | maxRequests: 10, 33 | delay: 100, 34 | }, 35 | proxyPool: [ { 36 | host: '127.0.0.1', 37 | port: 8080, 38 | } ], 39 | }, 40 | ]; 41 | 42 | for (let i = 0; i < conn.length; i += 1) { 43 | for (let j = 0; j < concurrencyOptions.length; j += 1) { 44 | acceptanceSuite( 45 | 'browser-static-content', 46 | conn[i], 47 | browserClient, 48 | concurrencyOptions[j], 49 | ); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /test/config/browserclient/playwright/playwright-chromium.json: -------------------------------------------------------------------------------- 1 | { 2 | "browser": "chromium", 3 | "headless": true, 4 | "ignoreHTTPSErrors": true, 5 | "slowMo": 20, 6 | "args": [ 7 | "--host-rules=MAP *:80 127.0.0.1:8080, MAP *:443 127.0.0.1:8443", 8 | "--ignore-certificate-errors", 9 | 10 | "--disable-gpu", 11 | "--disable-dev-shm-usage", 12 | "--disable-setuid-sandbox", 13 | "--no-first-run", 14 | "--no-sandbox", 15 | "--no-zygote", 16 | "--single-process" 17 | ] 18 | } -------------------------------------------------------------------------------- /test/config/browserclient/puppeteer/puppeteer-chromium.json: -------------------------------------------------------------------------------- 1 | { 2 | "browser": "chromium", 3 | "headless": true, 4 | "ignoreHTTPSErrors": true, 5 | "slowMo": 20, 6 | "args": [ 7 | "--host-rules=MAP *:80 127.0.0.1:8080, MAP *:443 127.0.0.1:8443", 8 | "--ignore-certificate-errors", 9 | 10 | "--disable-gpu", 11 | "--disable-dev-shm-usage", 12 | "--disable-setuid-sandbox", 13 | "--no-first-run", 14 | "--no-sandbox", 15 | "--no-zygote", 16 | "--single-process" 17 | ] 18 | } -------------------------------------------------------------------------------- /test/config/storage/mysql/mysql-conn.json: -------------------------------------------------------------------------------- 1 | { 2 | "client": "mysql", 3 | "useNullAsDefault": true, 4 | "connection": { 5 | "host": "localhost", 6 | "port": "33060", 7 | "user": "gsf-user", 8 | "password": "gsf-pswd", 9 | "database": "gsf-db" 10 | }, 11 | "debug": false 12 | } -------------------------------------------------------------------------------- /test/config/storage/mysql/mysql.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | services: 3 | mysql: 4 | image: mysql/mysql-server:5.7 5 | environment: 6 | MYSQL_ROOT_PASSWORD: gsf-root 7 | MYSQL_USER: gsf-user 8 | MYSQL_PASSWORD: gsf-pswd 9 | MYSQL_DATABASE: gsf-db 10 | MYSQL_HOST: localhost 11 | ports: 12 | - 33060:3306 13 | 14 | -------------------------------------------------------------------------------- /test/config/storage/pg/pg-conn.json: -------------------------------------------------------------------------------- 1 | { 2 | "client": "pg", 3 | "useNullAsDefault": true, 4 | "connection": { 5 | "host": "localhost", 6 | "port": "54320", 7 | "user": "gsf-user", 8 | "password": "gsf-pswd", 9 | "database": "gsf-db" 10 | }, 11 | "debug": false 12 | } -------------------------------------------------------------------------------- /test/config/storage/pg/pg.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 
services: 3 | pg: 4 | image: postgres:11-alpine 5 | environment: 6 | POSTGRES_USER: gsf-user 7 | POSTGRES_PASSWORD: gsf-pswd 8 | POSTGRES_DB: gsf-db 9 | ports: 10 | - 54320:5432 11 | 12 | -------------------------------------------------------------------------------- /test/config/storage/sqlite/sqlite-conn.json: -------------------------------------------------------------------------------- 1 | { 2 | "client": "sqlite3", 3 | "useNullAsDefault": true, 4 | "connection": { 5 | "filename": "test/tmp/acc.sqlite" 6 | }, 7 | "debug": false 8 | } -------------------------------------------------------------------------------- /test/tmp/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/get-set-fetch/scraper/879c03e2811aaa13479095a4db376563059b3b4d/test/tmp/.gitkeep -------------------------------------------------------------------------------- /test/tsconfig.test.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "esnext", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "newLine": "LF", 11 | "useDefineForClassFields": false 12 | }, 13 | "include": [ 14 | "../test" 15 | ], 16 | 17 | } -------------------------------------------------------------------------------- /test/unit/confighash/test-config-hash.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { encode, decode } from '../../../src/confighash/config-hash'; 3 | 4 | describe('ConfigHash', () => { 5 | const expectedDefinition = { 6 | name: 'projectA', 7 | pipeline: 'browser-static-content', 8 | pluginOpts: [ 9 | { 10 | name: 'ExtractUrlsPlugin', 11 | selectorPairs: [ 12 | { 13 | urlSelector: "a[href$='.html']", 14 | }, 15 | { 16 | urlSelector: 'img', 17 | }, 18 | ], 19 | }, 20 | { 21 | name: 'ExtractHtmlContentPlugin', 22 | selectorPairs: [ 23 | ], 24 | }, 25 | ], 26 | resources: [ 27 | { 28 | url: 'http://sitea.com/index.html', 29 | }, 30 | ], 31 | }; 32 | 33 | const expectedConfigHash = 'ePnXQdMJgxtWUJSfBYwER2JSJ6GkSFoqiM4oSk1TsVUHO0o9VgnkQ1QlmbnpIMfqkJPkY/EFCDA8ioHBnggOlMy8lNQKeMgAAOKgZAQ='; 34 | 35 | it('encode', () => { 36 | const encodedDefinition = encode(expectedDefinition); 37 | assert.deepEqual(encodedDefinition, expectedConfigHash); 38 | }); 39 | 40 | it('decode', () => { 41 | const decodedDefinition = decode(expectedConfigHash); 42 | assert.deepEqual(decodedDefinition, expectedDefinition); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /test/unit/domclients/test-cheerio-client.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import CheerioClient from '../../../src/domclient/CheerioClient'; 3 | 4 | describe('CheerioClient', () => { 5 | it('root querySelectorAll', () => { 6 | const client = new CheerioClient(Buffer.from('
<html><body><p class="classA">pA</p><p class="classB">pB</p></body></html>
')); 7 | 8 | const nodes = client.querySelectorAll('p'); 9 | assert.strictEqual(nodes.length, 2); 10 | assert.strictEqual(nodes[1].getAttribute('innerText'), 'pB'); 11 | assert.strictEqual(nodes[1].getAttribute('class'), 'classB'); 12 | }); 13 | 14 | it('nested querySelectorAll', () => { 15 | const client = new CheerioClient(Buffer.from('
<html><body><p><a class="classA" href="linkA">linkA</a></p></body></html>
')); 16 | 17 | const pNodes = client.querySelectorAll('p'); 18 | assert.strictEqual(pNodes.length, 1); 19 | 20 | let linkNodes = pNodes[0].querySelectorAll('a[class="classA"]'); 21 | assert.strictEqual(linkNodes.length, 1); 22 | assert.strictEqual(linkNodes[0].getAttribute('href'), 'linkA'); 23 | 24 | linkNodes = pNodes[0].querySelectorAll('a[class="classB"]'); 25 | assert.strictEqual(linkNodes.length, 0); 26 | }); 27 | }); 28 | -------------------------------------------------------------------------------- /test/unit/domclients/test-jsdom-client.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import JsdomClient from '../../../src/domclient/JsdomClient'; 3 | 4 | describe('JsdomClient', () => { 5 | it('root querySelectorAll', () => { 6 | const client = new JsdomClient(Buffer.from('
<html><body><p class="classA">pA</p><p class="classB">pB</p></body></html>
')); 7 | 8 | const nodes = client.querySelectorAll('p'); 9 | assert.strictEqual(nodes.length, 2); 10 | assert.strictEqual(nodes[1].getAttribute('innerText'), 'pB'); 11 | assert.strictEqual(nodes[1].getAttribute('class'), 'classB'); 12 | }); 13 | 14 | it('nested querySelectorAll', () => { 15 | const client = new JsdomClient(Buffer.from('
<html><body><p><a class="classA" href="linkA">linkA</a></p></body></html>
')); 16 | 17 | const pNodes = client.querySelectorAll('p'); 18 | assert.strictEqual(pNodes.length, 1); 19 | 20 | let linkNodes = pNodes[0].querySelectorAll('a[class="classA"]'); 21 | assert.strictEqual(linkNodes.length, 1); 22 | assert.strictEqual(linkNodes[0].getAttribute('href'), 'linkA'); 23 | 24 | linkNodes = pNodes[0].querySelectorAll('a[class="classB"]'); 25 | assert.strictEqual(linkNodes.length, 0); 26 | }); 27 | }); 28 | -------------------------------------------------------------------------------- /test/unit/exporter/test-csv-exporter.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-empty-function */ 2 | import fs from 'fs'; 3 | import { assert } from 'chai'; 4 | import { SinonSandbox, createSandbox } from 'sinon'; 5 | import CsvExporter from '../../../src/export/CsvExporter'; 6 | import Project from '../../../src/storage/base/Project'; 7 | import ConnectionManager from '../../../src/storage/ConnectionManager'; 8 | 9 | describe('CsvExporter', () => { 10 | let sandbox:SinonSandbox; 11 | let exporter: CsvExporter; 12 | let project; 13 | let content: string; 14 | const lineSeparator = '\n'; 15 | 16 | beforeEach(() => { 17 | content = ''; 18 | sandbox = createSandbox(); 19 | project = sandbox.createStubInstance(Project); 20 | 21 | sandbox.stub(fs, 'createWriteStream').returns({ 22 | write: (val: string) => { 23 | content += val; 24 | }, 25 | close: () => {}, 26 | }); 27 | 28 | sandbox.stub(ConnectionManager, 'clone').returns( 29 | sandbox.createStubInstance( 30 | ConnectionManager, 31 | { getProject: { get: () => project } }, 32 | ), 33 | ); 34 | 35 | exporter = new CsvExporter({ filepath: 'fileA.csv' }); 36 | }); 37 | 38 | afterEach(() => { 39 | sandbox.restore(); 40 | }); 41 | 42 | it('array values - single selector', async () => { 43 | sandbox.stub(exporter, 'getContentKeys').returns([ 'colA' ]); 44 | project.getPagedResources.onCall(0).returns([ 45 | { url: 'urlA', content: [ [ 'A1 content' ], [ 'A2 content' ] ] }, 46 | { url: 'urlB', content: [ [ 'A3 content' ] ] }, 47 | ]); 48 | project.getPagedResources.onCall(1).returns([]); 49 | await exporter.export(project); 50 | 51 | const expectedContent = `url,colA 52 | urlA,"A1 content" 53 | urlA,"A2 content" 54 | urlB,"A3 content"` 55 | .split(lineSeparator).map(csvLine => csvLine.trim()).join(lineSeparator); 56 | 57 | assert.strictEqual(content, expectedContent); 58 | }); 59 | 60 | it('array values - single selector - empty content', async () => { 61 | sandbox.stub(exporter, 'getContentKeys').returns([ 'colA' ]); 62 | project.getPagedResources.onCall(0).returns([ 63 | { url: 'urlA', content: [ [ 'A1 content' ] ] }, 64 | { url: 'urlB', content: [ [ ] ] }, 65 | { url: 'urlC', content: [ ] }, 66 | ]); 67 | project.getPagedResources.onCall(1).returns([]); 68 | await exporter.export(project); 69 | 70 | const expectedContent = `url,colA 71 | urlA,"A1 content" 72 | urlB 73 | urlC` 74 | .split(lineSeparator).map(csvLine => csvLine.trim()).join(lineSeparator); 75 | 76 | assert.strictEqual(content, expectedContent); 77 | }); 78 | 79 | it('array values - multiple selectors', async () => { 80 | sandbox.stub(exporter, 'getContentKeys').returns([ 'colA', 'colB' ]); 81 | project.getPagedResources.onCall(0).returns([ 82 | { url: 'urlA', content: [ [ 'A1 content', 'B1 content' ], [ 'A2 content', 'B2 content' ] ] }, 83 | { url: 'urlB', content: [ [ 'A3 content', 'B3 content' ] ] }, 84 | ]); 85 | project.getPagedResources.onCall(1).returns([]); 86 | await 
exporter.export(project); 87 | 88 | const expectedContent = `url,colA,colB 89 | urlA,"A1 content","B1 content" 90 | urlA,"A2 content","B2 content" 91 | urlB,"A3 content","B3 content"` 92 | .split(lineSeparator).map(csvLine => csvLine.trim()).join(lineSeparator); 93 | 94 | assert.strictEqual(content, expectedContent); 95 | }); 96 | }); 97 | -------------------------------------------------------------------------------- /test/unit/logwrapper/test-log-wrapper.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { getLogger, setLogger } from '../../../src/logger/Logger'; 3 | 4 | describe('LogWrapper', () => { 5 | it('default log level', () => { 6 | const childWrapper = getLogger('test'); 7 | assert.strictEqual(childWrapper.logger.level, 'warn'); 8 | }); 9 | 10 | it('changes to parent logger propagate to existing child loggers', () => { 11 | const childWrapper = getLogger('test'); 12 | 13 | setLogger({ level: 'info' }); 14 | assert.strictEqual(childWrapper.logger.level, 'info'); 15 | 16 | // revert back to default log level 17 | setLogger({ level: 'warn' }); 18 | }); 19 | 20 | it('filter out log arguments', () => { 21 | const childWrapper = getLogger('test'); 22 | 23 | const rawObj = [ 24 | { a: 1, b: Buffer.from('a'), c: null, d: new Uint8Array([ 0, 1, 2 ]) }, 25 | { d: 'message C', e: Buffer.from('a'), cert: {}, f: null }, 26 | ]; 27 | 28 | // extra circular reference :) 29 | rawObj[1].f = rawObj; 30 | 31 | assert.sameDeepMembers( 32 | childWrapper.filterArg(rawObj), 33 | [ 34 | { 35 | a: 1, 36 | b: ' not included', 37 | c: null, 38 | d: ' not included', 39 | }, 40 | { 41 | d: 'message C', 42 | e: ' not included', 43 | cert: ' not included', 44 | f: null, 45 | }, 46 | ], 47 | ); 48 | }); 49 | 50 | it('filter out ignore error', () => { 51 | const childWrapper = getLogger('test'); 52 | 53 | const err = new Error('unexpected error'); 54 | const filteredErr = childWrapper.filterArg(err); 55 | 56 | assert.strictEqual(filteredErr.name, err.name); 57 | assert.strictEqual(filteredErr.message, err.message); 58 | assert.strictEqual(filteredErr.stack, err.stack); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /test/unit/pipelines/test-merge-plugin-opts.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable prefer-destructuring */ 2 | import { assert } from 'chai'; 3 | import { PluginOpts } from '../../../src/plugins/Plugin'; 4 | import { pipelines, mergePluginOpts } from '../../../src/pipelines/pipelines'; 5 | 6 | describe('MergePluginOpts', () => { 7 | const { defaultPluginOpts } = pipelines['browser-static-content']; 8 | 9 | it('before anchor', () => { 10 | const customOpts = [ { 11 | name: 'CustomBeforePlugin', 12 | before: 'BrowserFetchPlugin', 13 | } ]; 14 | 15 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 16 | 17 | assert.sameDeepOrderedMembers( 18 | mergedOpts, 19 | [ ...customOpts, ...defaultPluginOpts ], 20 | ); 21 | }); 22 | 23 | it('after anchor', () => { 24 | const customOpts = [ { 25 | name: 'CustomAfterPlugin', 26 | after: 'UpsertResourcePlugin', 27 | } ]; 28 | 29 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 30 | 31 | assert.sameDeepOrderedMembers( 32 | mergedOpts, 33 | [ ...defaultPluginOpts, ...customOpts ], 34 | ); 35 | }); 36 | 37 | it('replace anchor', () => { 38 | const customOpts = [ { 39 | name: 'CustomReplacePlugin', 40 | replace: 
'BrowserFetchPlugin', 41 | } ]; 42 | 43 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 44 | const expectedOpts = [ ...defaultPluginOpts ]; 45 | expectedOpts[0] = customOpts[0]; 46 | 47 | assert.sameDeepOrderedMembers( 48 | mergedOpts, 49 | expectedOpts, 50 | ); 51 | }); 52 | 53 | it('merge anchor', () => { 54 | const customOpts = [ { 55 | name: 'ExtractUrlsPlugin', 56 | maxDepth: 5, 57 | } ]; 58 | 59 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 60 | const expectedOpts = [ ...defaultPluginOpts ]; 61 | expectedOpts[1] = customOpts[0]; 62 | 63 | assert.sameDeepOrderedMembers( 64 | mergedOpts, 65 | expectedOpts, 66 | ); 67 | }); 68 | 69 | it('multiple anchors', () => { 70 | const customOpts = [ 71 | { 72 | name: 'CustomBefore1Plugin', 73 | before: 'BrowserFetchPlugin', 74 | }, 75 | { 76 | name: 'CustomBefore2Plugin', 77 | before: 'BrowserFetchPlugin', 78 | }, 79 | { 80 | name: 'CustomAfter1Plugin', 81 | after: 'ExtractHtmlContentPlugin', 82 | }, 83 | { 84 | name: 'CustomAfter2Plugin', 85 | after: 'ExtractHtmlContentPlugin', 86 | }, 87 | ]; 88 | 89 | const mergedOpts = mergePluginOpts(defaultPluginOpts, customOpts); 90 | const expectedOpts:PluginOpts[] = [ 91 | { 92 | name: 'CustomBefore1Plugin', 93 | before: 'BrowserFetchPlugin', 94 | }, 95 | { 96 | name: 'CustomBefore2Plugin', 97 | before: 'BrowserFetchPlugin', 98 | }, 99 | { 100 | name: 'BrowserFetchPlugin', 101 | }, 102 | { 103 | name: 'ExtractUrlsPlugin', 104 | }, 105 | { 106 | name: 'ExtractHtmlContentPlugin', 107 | }, 108 | { 109 | name: 'CustomAfter2Plugin', 110 | after: 'ExtractHtmlContentPlugin', 111 | }, 112 | { 113 | name: 'CustomAfter1Plugin', 114 | after: 'ExtractHtmlContentPlugin', 115 | }, 116 | { 117 | name: 'InsertResourcesPlugin', 118 | }, 119 | { 120 | name: 'UpsertResourcePlugin', 121 | }, 122 | ]; 123 | 124 | assert.sameDeepOrderedMembers( 125 | mergedOpts, 126 | expectedOpts, 127 | ); 128 | }); 129 | }); 130 | -------------------------------------------------------------------------------- /test/unit/plugins/test-dom-utils.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-empty-function */ 2 | // eslint-disable-next-line max-classes-per-file 3 | import { assert } from 'chai'; 4 | import { SinonSandbox, createSandbox } from 'sinon'; 5 | import { DomStabilityStatus, waitForDomStability } from '../../../src/plugins/dom-utils'; 6 | 7 | describe('DOM Utils', () => { 8 | let sandbox:SinonSandbox; 9 | let mutationCallback: (mutationList, observer) => void; 10 | 11 | const PollyMutationObserver = class { 12 | constructor(callback) { 13 | mutationCallback = callback; 14 | } 15 | 16 | observe() {} 17 | disconnect() {} 18 | }; 19 | global.MutationObserver = PollyMutationObserver; 20 | 21 | beforeEach(() => { 22 | sandbox = createSandbox(); 23 | }); 24 | 25 | afterEach(() => { 26 | sandbox.restore(); 27 | mutationCallback = null; 28 | }); 29 | 30 | it('waitForDomStability - DomStabilityStatus.Unchanged', async () => { 31 | sandbox.stub(PollyMutationObserver.prototype); 32 | 33 | const domStatus: DomStabilityStatus = await waitForDomStability({ stabilityCheck: 102, stabilityTimeout: 500 }); 34 | await new Promise(resolve => setTimeout(resolve, 3000)); 35 | assert.strictEqual(domStatus, DomStabilityStatus.Unchanged); 36 | }); 37 | 38 | it('waitForDomStability - DomStabilityStatus.Unstable', async () => { 39 | const observer = sandbox.createStubInstance(PollyMutationObserver); 40 | 41 | const domStatusPromise: 
Promise = waitForDomStability({ stabilityCheck: 200, stabilityTimeout: 500 }); 42 | const intervalId = setInterval(mutationCallback, 100, [ { type: 'childList' } ], observer); 43 | 44 | const domStatus: DomStabilityStatus = await domStatusPromise; 45 | assert.strictEqual(domStatus, DomStabilityStatus.Unstable); 46 | 47 | clearInterval(intervalId); 48 | }); 49 | 50 | it('waitForDomStability - DomStabilityStatus.Stable', async () => { 51 | const observer = sandbox.createStubInstance(PollyMutationObserver); 52 | 53 | const domStatusPromise: Promise = waitForDomStability({ stabilityCheck: 200, stabilityTimeout: 500 }); 54 | const intervalId = setTimeout(mutationCallback, 100, [ { type: 'childList' } ], observer); 55 | 56 | const domStatus: DomStabilityStatus = await domStatusPromise; 57 | assert.strictEqual(domStatus, DomStabilityStatus.Stable); 58 | 59 | clearInterval(intervalId); 60 | }); 61 | }); 62 | -------------------------------------------------------------------------------- /test/unit/plugins/test-insert-resources-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { createSandbox, SinonSandbox, SinonStubbedInstance } from 'sinon'; 3 | import InsertResourcesPlugin from '../../../src/plugins/default/InsertResourcesPlugin'; 4 | import Queue from '../../../src/storage/base/Queue'; 5 | import Resource from '../../../src/storage/base/Resource'; 6 | 7 | describe('InsertResourcesPlugin', () => { 8 | let sandbox:SinonSandbox; 9 | let plugin: InsertResourcesPlugin; 10 | let project:{queue: SinonStubbedInstance}; 11 | 12 | beforeEach(() => { 13 | sandbox = createSandbox(); 14 | 15 | const queue = sandbox.stub({ 16 | count: () => null, 17 | filterNewUrls: urls => null, 18 | add: () => null, 19 | }); 20 | queue.count.returns(Promise.resolve(0)); 21 | queue.filterNewUrls.callsFake((urls:string[]) => Promise.resolve(urls)); 22 | 23 | project = { queue }; 24 | }); 25 | 26 | afterEach(() => { 27 | sandbox.restore(); 28 | }); 29 | 30 | it('test conditions', () => { 31 | plugin = new InsertResourcesPlugin(); 32 | assert.isFalse(plugin.test(project, null)); 33 | assert.isFalse(plugin.test(project, { resourcesToAdd: [ ] })); 34 | assert.isTrue(plugin.test(project, { resourcesToAdd: [ { url: 'http://a.com' } ] })); 35 | }); 36 | 37 | it('fully save new resources, maxResources undefined', async () => { 38 | plugin = new InsertResourcesPlugin(); 39 | await plugin.apply(project, { depth: 0, resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 40 | 41 | assert.isTrue(project.queue.filterNewUrls.notCalled); 42 | assert.isTrue(project.queue.add.calledOnce); 43 | 44 | const [ saveResources ] = project.queue.add.args[0]; 45 | assert.sameDeepMembers([ { url: 'urlA', depth: 1 }, { url: 'urlB', depth: 1 } ], saveResources); 46 | }); 47 | 48 | it('fully save new resources, maxResources defined', async () => { 49 | plugin = new InsertResourcesPlugin({ maxResources: 2 }); 50 | await plugin.apply(project, { depth: 0, resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 51 | 52 | assert.isTrue(project.queue.filterNewUrls.notCalled); 53 | assert.isTrue(project.queue.add.calledOnce); 54 | 55 | const [ saveResources ] = project.queue.add.args[0]; 56 | assert.sameDeepMembers([ { url: 'urlA', depth: 1 }, { url: 'urlB', depth: 1 } ], saveResources); 57 | }); 58 | 59 | it('partially save new resources', async () => { 60 | plugin = new InsertResourcesPlugin({ maxResources: 1 }); 61 | await plugin.apply(project, { depth: 0, 
resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 62 | 63 | assert.isTrue(project.queue.filterNewUrls.calledOnce); 64 | assert.isTrue(project.queue.add.calledOnce); 65 | 66 | const [ saveResources ] = project.queue.add.args[0]; 67 | assert.sameDeepMembers([ { url: 'urlA', depth: 1 } ], saveResources); 68 | }); 69 | 70 | it('partially save new/existing resources', async () => { 71 | project.queue.filterNewUrls.returns(Promise.resolve([ 'urlB' ])); 72 | plugin = new InsertResourcesPlugin({ maxResources: 1 }); 73 | await plugin.apply(project, { depth: 0, resourcesToAdd: [ { url: 'urlA' }, { url: 'urlB' } ] }); 74 | 75 | assert.isTrue(project.queue.filterNewUrls.calledOnce); 76 | assert.isTrue(project.queue.add.calledOnce); 77 | 78 | const [ saveResources ] = project.queue.add.args[0]; 79 | assert.sameDeepMembers([ { url: 'urlB', depth: 1 } ], saveResources); 80 | }); 81 | }); 82 | -------------------------------------------------------------------------------- /test/unit/plugins/test-node-fetch-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { SinonSandbox, createSandbox } from 'sinon'; 3 | import http, { ClientRequest } from 'http'; 4 | import { Readable } from 'stream'; 5 | import { gzipSync } from 'zlib'; 6 | import NodeFetchPlugin from '../../../src/plugins/default/NodeFetchPlugin'; 7 | import Resource from '../../../src/storage/base/Resource'; 8 | 9 | describe('NodeFetchPlugin', () => { 10 | let sandbox:SinonSandbox; 11 | const plugin: NodeFetchPlugin = new NodeFetchPlugin(); 12 | 13 | beforeEach(() => { 14 | sandbox = createSandbox(); 15 | }); 16 | 17 | afterEach(() => { 18 | sandbox.restore(); 19 | }); 20 | 21 | it('fetch no compression', async () => { 22 | const htmlContent = ''; 23 | const clientRequestStub = sandbox.createStubInstance(ClientRequest); 24 | sandbox.stub(plugin, 'getRequestFnc').returns((opts, callback) => { 25 | const response = Readable.from(htmlContent); 26 | 27 | callback(Object.assign(response, { 28 | statusCode: 201, 29 | headers: { 30 | 'content-encoding': '', 31 | }, 32 | })); 33 | 34 | return clientRequestStub; 35 | }); 36 | 37 | const result = await plugin.fetch({ url: 'http://sitea.com' }); 38 | assert.strictEqual(result.data.toString(), htmlContent); 39 | assert.isTrue(clientRequestStub.end.calledOnce); 40 | }); 41 | 42 | it('fetch gzip', async () => { 43 | const htmlContent = ''; 44 | const clientRequestStub = sandbox.createStubInstance(ClientRequest); 45 | sandbox.stub(plugin, 'getRequestFnc').returns((opts, callback) => { 46 | const response = Readable.from(gzipSync(htmlContent)); 47 | callback(Object.assign(response, { 48 | statusCode: 201, 49 | headers: { 50 | 'content-encoding': 'gzip', 51 | }, 52 | })); 53 | 54 | return clientRequestStub; 55 | }); 56 | 57 | const result = await plugin.fetch({ url: 'http://sitea.com' }); 58 | assert.strictEqual((result.data).toString('utf8'), htmlContent); 59 | assert.isTrue(clientRequestStub.end.calledOnce); 60 | }); 61 | 62 | it('fetch read timeout', async () => { 63 | const srv = http.createServer((req, res) => { 64 | setTimeout(() => { 65 | res.writeHead(200, { 'Content-Type': 'text/plain' }); 66 | res.write(''); 67 | res.end(); 68 | }, 1 * 1000); 69 | }); 70 | srv.listen(8000); 71 | 72 | plugin.opts.readTimeout = 0.5 * 1000; 73 | let timeoutError; 74 | try { 75 | await plugin.fetch({ url: 'http://sitea.com', proxy: { host: '127.0.0.1', port: 8000 } }); 76 | } 77 | catch (err) { 78 | timeoutError = err; 79 | } 80 | 81 
| srv.close(); 82 | 83 | assert.strictEqual(timeoutError.status, 408); 84 | }); 85 | }); 86 | -------------------------------------------------------------------------------- /test/unit/plugins/test-scroll-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { createSandbox, SinonSandbox } from 'sinon'; 3 | import ScrollPlugin from '../../../src/plugins/default/ScrollPlugin'; 4 | import * as utils from '../../../src/plugins/dom-utils'; 5 | import Resource from '../../../src/storage/base/Resource'; 6 | 7 | describe('ScrollPlugin', () => { 8 | let sandbox:SinonSandbox; 9 | let plugin: ScrollPlugin; 10 | const project:any = {}; 11 | 12 | beforeEach(() => { 13 | sandbox = createSandbox(); 14 | sandbox.stub(window, 'scrollTo'); 15 | }); 16 | 17 | afterEach(() => { 18 | sandbox.restore(); 19 | }); 20 | 21 | it('test conditions', () => { 22 | plugin = new ScrollPlugin(); 23 | assert.isFalse(plugin.test(project, null)); 24 | assert.isFalse(plugin.test(project, { actions: [ 'clickA' ] })); 25 | assert.isTrue(plugin.test(project, { contentType: 'text/html' })); 26 | }); 27 | 28 | it('apply DOM unchanged', async () => { 29 | plugin = new ScrollPlugin(); 30 | const stubWaitForStability = sandbox.stub(utils, 'waitForDomStability'); 31 | stubWaitForStability.returns(new Promise(resolve => resolve(utils.DomStabilityStatus.Unchanged))); 32 | 33 | const actualResult = await plugin.apply(); 34 | assert.isNull(actualResult); 35 | }); 36 | 37 | it('apply DOM changed, stable', async () => { 38 | plugin = new ScrollPlugin(); 39 | const stubWaitForStability = sandbox.stub(utils, 'waitForDomStability'); 40 | stubWaitForStability.returns(new Promise(resolve => resolve(utils.DomStabilityStatus.Stable))); 41 | 42 | const actualResult = await plugin.apply(); 43 | const expectedResult = { actions: [ 'scroll#1' ], status: 200 }; 44 | assert.deepEqual(actualResult, expectedResult); 45 | }); 46 | 47 | it('apply DOM changed, unstable', async () => { 48 | plugin = new ScrollPlugin(); 49 | const stubWaitForStability = sandbox.stub(utils, 'waitForDomStability'); 50 | stubWaitForStability.returns(new Promise(resolve => resolve(utils.DomStabilityStatus.Unstable))); 51 | 52 | let actualErr; 53 | try { 54 | await plugin.apply(); 55 | } 56 | catch (err) { 57 | actualErr = err; 58 | } 59 | 60 | assert.strictEqual(actualErr.message, `DOM not stable after stabilityTimeout of ${plugin.opts.stabilityTimeout}`); 61 | }); 62 | }); 63 | -------------------------------------------------------------------------------- /test/unit/plugins/test-upsert-resource-plugin.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import UpsertResourcePlugin from '../../../src/plugins/default/UpsertResourcePlugin'; 3 | import Resource from '../../../src/storage/base/Resource'; 4 | 5 | describe('UpsertResourcePlugin', () => { 6 | let plugin: UpsertResourcePlugin; 7 | const project:any = { resourceCount: 0 }; 8 | 9 | it('test conditions', () => { 10 | plugin = new UpsertResourcePlugin(); 11 | assert.isFalse(plugin.test(project, null)); 12 | assert.isTrue(plugin.test(project, {})); 13 | }); 14 | }); 15 | -------------------------------------------------------------------------------- /test/unit/plugins/test-url-utils.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { getUrlColIdx, normalizeUrl } from 
'../../../src/plugins/url-utils'; 3 | 4 | describe('URL Utils', () => { 5 | it('normalizeUrl', async () => { 6 | assert.strictEqual(normalizeUrl('http://wWw.CaPs.com'), 'http://www.caps.com/'); 7 | assert.strictEqual(normalizeUrl('no-proTocoL.com'), 'https://no-protocol.com/'); 8 | assert.strictEqual(normalizeUrl('WWw.no-proTocoL.com'), 'https://www.no-protocol.com/'); 9 | }); 10 | 11 | it('getUrlColIdx', async () => { 12 | assert.strictEqual(getUrlColIdx('1,http://sitea.com'), 1); 13 | assert.strictEqual(getUrlColIdx('1, 2, www.sitea.com'), 2); 14 | assert.strictEqual(getUrlColIdx('sitea.com'), 0); 15 | }); 16 | 17 | it('getUrlColIdx throws error', async () => { 18 | let urlErr; 19 | try { 20 | getUrlColIdx('1,2,invalidurl'); 21 | } 22 | catch (err) { 23 | urlErr = err; 24 | } 25 | 26 | assert.strictEqual(urlErr.message, 'could not detect url column from 1,2,invalidurl'); 27 | }); 28 | }); 29 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/BaseJs.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | class BaseJsB { 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | 14 | module.exports = { 15 | BaseJsA, 16 | BaseJsB 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/Extended.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | const { BaseJsA } = require('./BaseJs'); 3 | 4 | class Extended extends BaseJsA { 5 | sum(a, b) { 6 | return a + b; 7 | } 8 | } 9 | 10 | module.exports = Extended; 11 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/ExtendedDomRead.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | const { BaseJsB } = require('./BaseJs'); 3 | 4 | class ExtendedDomRead extends BaseJsB { 5 | opts = { 6 | domRead: true, 7 | } 8 | 9 | sum(a, b) { 10 | console.log(BaseJsB); 11 | return this.jsb(a, b); 12 | } 13 | 14 | async asum(a, b) { 15 | return this.jsb(a, b); 16 | } 17 | } 18 | 19 | module.exports = ExtendedDomRead; 20 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-cjs-js/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | class BaseJsB$1 { 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | 14 | var BaseJs = { 15 | BaseJsA, 16 | BaseJsB: BaseJsB$1 17 | }; 18 | 19 | var require$$0 = BaseJs; 20 | 21 | /* eslint-disable */ 22 | 23 | const { BaseJsB } = require$$0; 24 | 25 | class ExtendedDomRead extends BaseJsB { 26 | opts = { 27 | domRead: true, 28 | } 29 | 30 | sum(a, b) { 31 | console.log(BaseJsB); 32 | return this.jsb(a, b); 33 | } 34 | 35 | async asum(a, b) { 36 | return this.jsb(a, b); 37 | } 38 | } 39 | 40 | var ExtendedDomRead_1 = ExtendedDomRead; 41 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-js/BaseJs.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | export class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | export class BaseJsB 
{ 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-js/ExtendedDomRead.js: -------------------------------------------------------------------------------- 1 | import { extra } from '@get-set-fetch/test-utils'; 2 | import { BaseJsB } from './BaseJs'; 3 | 4 | export default class ExtendedDomRead extends BaseJsB { 5 | opts = { 6 | domRead: true, 7 | } 8 | 9 | sum(a, b) { 10 | return this.jsb(a, b); 11 | } 12 | 13 | async asum(a, b) { 14 | console.log(extra); 15 | return this.jsb(a, b); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-js/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | const extra = 'just a named export required for testing partial package import when bundling scraper plugins'; 2 | 3 | /* eslint-disable */ 4 | 5 | class BaseJsB { 6 | jsb(a, b) { 7 | return a + b; 8 | } 9 | } 10 | 11 | class ExtendedDomRead extends BaseJsB { 12 | opts = { 13 | domRead: true, 14 | } 15 | 16 | sum(a, b) { 17 | return this.jsb(a, b); 18 | } 19 | 20 | async asum(a, b) { 21 | console.log(extra); 22 | return this.jsb(a, b); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/BaseTs.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | export class BaseTsA { 3 | tsa(a:number, b:number) { 4 | return a + b; 5 | } 6 | } 7 | 8 | export class BaseTsB { 9 | tsb(a:number, b:number) { 10 | return a + b; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/Extended.ts: -------------------------------------------------------------------------------- 1 | import { BaseTsA } from './BaseTs'; 2 | 3 | export default class Extended extends BaseTsA { 4 | sum(a:number, b:number) { 5 | return a + b; 6 | } 7 | } 8 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/ExtendedDomRead.ts: -------------------------------------------------------------------------------- 1 | import { extra } from '@get-set-fetch/test-utils'; 2 | import { BaseTsB } from './BaseTs'; 3 | 4 | export default class ExtendedDomRead extends BaseTsB { 5 | opts = { 6 | domRead: true, 7 | } 8 | 9 | sum(a, b) { 10 | return this.tsb(a, b); 11 | } 12 | 13 | async asum(a, b) { 14 | console.log(extra); 15 | return this.tsb(a, b); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-esm-ts/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | const extra = 'just a named export required for testing partial package import when bundling scraper plugins'; 2 | 3 | /* eslint-disable */ 4 | class BaseTsB { 5 | tsb(a, b) { 6 | return a + b; 7 | } 8 | } 9 | 10 | class ExtendedDomRead extends BaseTsB { 11 | constructor() { 12 | super(...arguments); 13 | this.opts = { 14 | domRead: true, 15 | }; 16 | } 17 | sum(a, b) { 18 | return this.tsb(a, b); 19 | } 20 | async asum(a, b) { 21 | console.log(extra); 22 | return this.tsb(a, b); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- 
/test/unit/pluginstore/input-mixed-esm-cjs-ts-js/BaseJs.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | class BaseJsA { 3 | jsa(a, b) { 4 | return a + b; 5 | } 6 | } 7 | 8 | class BaseJsB { 9 | jsb(a, b) { 10 | return a + b; 11 | } 12 | } 13 | 14 | module.exports = { 15 | BaseJsA, 16 | BaseJsB 17 | } 18 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-mixed-esm-cjs-ts-js/BaseTs.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | export class BaseTsA { 3 | tsa(a:number, b:number) { 4 | return a + b; 5 | } 6 | } 7 | 8 | export class BaseTsB { 9 | tsb(a:number, b:number) { 10 | return a + b; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-mixed-esm-cjs-ts-js/ExtendedDomRead.ts: -------------------------------------------------------------------------------- 1 | import { extra } from '@get-set-fetch/test-utils'; 2 | import { BaseJsB } from './BaseJs'; 3 | import { BaseTsB } from './BaseTs'; 4 | 5 | export default class ExtendedDomRead extends BaseJsB { 6 | opts = { 7 | domRead: true, 8 | } 9 | 10 | sum(a, b) { 11 | console.log(BaseTsB); 12 | return this.jsb(a, b); 13 | } 14 | 15 | async asum(a, b) { 16 | console.log(extra); 17 | return this.jsb(a, b); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /test/unit/pluginstore/input-mixed-esm-cjs-ts-js/expected-extended-dom-read-bundle.txt: -------------------------------------------------------------------------------- 1 | const extra = 'just a named export required for testing partial package import when bundling scraper plugins'; 2 | 3 | /* eslint-disable */ 4 | class BaseJsA { 5 | jsa(a, b) { 6 | return a + b; 7 | } 8 | } 9 | 10 | class BaseJsB { 11 | jsb(a, b) { 12 | return a + b; 13 | } 14 | } 15 | 16 | var BaseJs = { 17 | BaseJsA, 18 | BaseJsB 19 | }; 20 | 21 | /* eslint-disable */ 22 | class BaseTsB { 23 | tsb(a, b) { 24 | return a + b; 25 | } 26 | } 27 | 28 | class ExtendedDomRead extends BaseJs.BaseJsB { 29 | constructor() { 30 | super(...arguments); 31 | this.opts = { 32 | domRead: true, 33 | }; 34 | } 35 | sum(a, b) { 36 | console.log(BaseTsB); 37 | return this.jsb(a, b); 38 | } 39 | async asum(a, b) { 40 | console.log(extra); 41 | return this.jsb(a, b); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /test/unit/scraper/test-runtime-metrics.ts: -------------------------------------------------------------------------------- 1 | import { assert } from 'chai'; 2 | import { SinonSandbox, createSandbox } from 'sinon'; 3 | import RuntimeMetrics, { RuntimeOptions } from '../../../src/scraper/RuntimeMetrics'; 4 | 5 | describe('RuntimeMetrics', () => { 6 | let sandbox:SinonSandbox; 7 | let metrics:RuntimeMetrics; 8 | 9 | beforeEach(() => { 10 | sandbox = createSandbox(); 11 | }); 12 | 13 | afterEach(() => { 14 | sandbox.restore(); 15 | }); 16 | 17 | it('snapshot', () => { 18 | metrics = new RuntimeMetrics(); 19 | metrics.cpuUsage = { 20 | totalTick: 1000, 21 | totalIdle: 1000, 22 | processTick: null, 23 | }; 24 | sandbox.stub(metrics, 'getMemoryUsage').returns({ 25 | freeMem: 800, 26 | totalMem: 1000, 27 | processMem: 100, 28 | }); 29 | sandbox.stub(metrics, 'getCpuUsage').returns({ 30 | totalTick: 2000, 31 | totalIdle: 1600, 32 | processTick: { user: 50000, system: 100000 }, 
33 | }); 34 | 35 | const snapshot:RuntimeOptions = metrics.takeSnapshot(); 36 | 37 | assert.deepEqual( 38 | snapshot, 39 | { 40 | global: { mem: 200, memPct: 20, cpuPct: 40 }, 41 | process: { mem: 100, memPct: 10, cpuPct: 15 }, 42 | }, 43 | ); 44 | }); 45 | }); 46 | -------------------------------------------------------------------------------- /test/unit/storage/mysql-unit-suite.ts: -------------------------------------------------------------------------------- 1 | import unitSuite from './unit-suite'; 2 | import * as connConfig from '../../config/storage/mysql/mysql-conn.json'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | import KnexConnection from '../../../src/storage/knex/KnexConnection'; 5 | 6 | const conn:Connection = new KnexConnection(connConfig); 7 | unitSuite(conn); 8 | -------------------------------------------------------------------------------- /test/unit/storage/pg-unit-suite.ts: -------------------------------------------------------------------------------- 1 | import unitSuite from './unit-suite'; 2 | import * as connConfig from '../../config/storage/pg/pg-conn.json'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | import KnexConnection from '../../../src/storage/knex/KnexConnection'; 5 | 6 | const conn:Connection = new KnexConnection(connConfig); 7 | unitSuite(conn); 8 | -------------------------------------------------------------------------------- /test/unit/storage/sqlite3-unit-suite.ts: -------------------------------------------------------------------------------- 1 | import unitSuite from './unit-suite'; 2 | import * as connConfig from '../../config/storage/sqlite/sqlite-conn.json'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | import KnexConnection from '../../../src/storage/knex/KnexConnection'; 5 | 6 | const conn:Connection = new KnexConnection(connConfig); 7 | unitSuite(conn); 8 | -------------------------------------------------------------------------------- /test/unit/storage/unit-suite.ts: -------------------------------------------------------------------------------- 1 | import crudResource from './test-resource-crud'; 2 | import crudProject from './test-project-crud'; 3 | import Connection from '../../../src/storage/base/Connection'; 4 | 5 | const suites = { 6 | crudResource, 7 | crudProject, 8 | }; 9 | 10 | export default function unitSuite(conn: Connection) { 11 | Object.values(suites).forEach(suite => { 12 | suite(conn); 13 | }); 14 | } 15 | -------------------------------------------------------------------------------- /test/utils/shims.js: -------------------------------------------------------------------------------- 1 | import { JSDOM } from 'jsdom'; 2 | 3 | // init jsdom environment for testing plugins running in browser 4 | const dom = new JSDOM('
<!DOCTYPE html><p>Hello world</p>
'); 5 | global.document = dom.window.document; 6 | global.window = dom.window; 7 | -------------------------------------------------------------------------------- /test/utils/ts-node-config.js: -------------------------------------------------------------------------------- 1 | require('ts-node').register({ 2 | project: 'test/tsconfig.test.json', 3 | files: true, 4 | pretty: true, 5 | 'no-cache': true, 6 | ignore: [ /node_modules\/(?!@get-set-fetch\/test-utils)/ ], 7 | }); 8 | -------------------------------------------------------------------------------- /tsconfig.debug.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "es2020", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "rootDir": "src", 11 | "outDir": "dist/cjs", 12 | "declaration": true, 13 | "newLine": "LF", 14 | "preserveConstEnums": true, 15 | "sourceMap": true, 16 | "useDefineForClassFields": true 17 | }, 18 | "include": [ 19 | "src" 20 | ] 21 | } -------------------------------------------------------------------------------- /tsconfig.esm.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "composite": true, 4 | "esModuleInterop": true, 5 | "resolveJsonModule": true, 6 | "target": "es2020", 7 | "strict": false, 8 | "moduleResolution": "node", 9 | "module": "es2020", 10 | "allowJs": true, 11 | "rootDir": "src", 12 | "outDir": "dist/esm", 13 | "declaration": true, 14 | "newLine": "LF", 15 | "preserveConstEnums": true, 16 | "useDefineForClassFields": false 17 | }, 18 | 19 | "include": [ 20 | "src", 21 | "src/**/*.json", 22 | ], 23 | } -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "resolveJsonModule": true, 5 | "target": "es2020", 6 | "strict": false, 7 | "moduleResolution": "node", 8 | "module": "commonjs", 9 | "allowJs": true, 10 | "rootDir": "src", 11 | "outDir": "dist/cjs", 12 | "declaration": true, 13 | "newLine": "LF", 14 | "preserveConstEnums": true, 15 | "useDefineForClassFields": false 16 | }, 17 | "include": [ 18 | "src" 19 | ] 20 | } --------------------------------------------------------------------------------