├── .commitlintrc.js ├── .devcontainer └── devcontainer.json ├── .dockerignore ├── .editorconfig ├── .env.example ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── website-request.md │ └── website-request.yaml └── workflows │ └── build-and-release.yaml ├── .gitignore ├── .hintrc ├── .husky ├── commit-msg ├── post-merge └── pre-commit ├── .mocharc.json ├── .releaserc ├── .vscode ├── extensions.json ├── launch.json ├── settings.json └── tasks.json ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE.md ├── README.md ├── bin ├── dev ├── dev.cmd ├── run └── run.cmd ├── dockerfile ├── dockerfile.debug ├── docs ├── help.md ├── plugins.md └── scrape.md ├── eslint.config.mjs ├── npm-shrinkwrap.json ├── package.json ├── renovate.json ├── scripts ├── docker-version.ts └── postinstall.ts ├── src ├── classes │ ├── base.class.ts │ ├── puppeteer.class.ts │ └── scrape-command.class.ts ├── commands │ └── scrape │ │ ├── all.ts │ │ └── amazon │ │ ├── helpers │ │ ├── auth.helper.ts │ │ ├── file.helper.ts │ │ └── selectors.helper.ts │ │ └── index.ts ├── enums │ ├── invoice-status.enum.ts │ └── loglevel.ts ├── helpers │ ├── exit.helper.ts │ ├── logger.helper.ts │ ├── parse-bool.helper.ts │ └── process.helper.ts ├── hooks │ └── prerun │ │ └── prerun.ts ├── index.ts ├── interfaces │ ├── amazon-options.interface.ts │ ├── amazon.interface.ts │ ├── downloader.interface.ts │ ├── invoice.interface.ts │ ├── processed-website.interface.ts │ ├── scrape.interface.ts │ ├── selectors.interface.ts │ └── website-run.interface.ts └── loglevel.ts ├── test ├── commands │ ├── hello │ │ ├── index.test.ts │ │ └── world.test.ts │ ├── scrape.test.ts │ └── test.test.ts ├── helpers │ └── init.js ├── hooks │ └── prerun │ │ └── prerun.test.ts └── tsconfig.json └── tsconfig.json /.commitlintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | extends: [`@commitlint/config-conventional`], 3 | parserPreset: 'conventional-changelog-conventionalcommits', 4 | rules: { 5 | 'body-max-line-length': [2, 'always', 250], 6 | 'body-max-length': [2, 'always', 250], 7 | }, 8 | prompt: { 9 | settings: {}, 10 | messages: { 11 | skip: ':skip', 12 | max: 'upper %d chars', 13 | min: '%d chars at least', 14 | emptyWarning: 'can not be empty', 15 | upperLimitWarning: 'over limit', 16 | lowerLimitWarning: 'below limit' 17 | }, 18 | questions: { 19 | type: { 20 | description: "Select the type of change that you're committing:", 21 | enum: { 22 | feat: { 23 | description: 'A new feature', 24 | title: 'Features', 25 | emoji: '✨', 26 | }, 27 | fix: { 28 | description: 'A bug fix', 29 | title: 'Bug Fixes', 30 | emoji: '🐛', 31 | }, 32 | docs: { 33 | description: 'Documentation only changes', 34 | title: 'Documentation', 35 | emoji: '📚', 36 | }, 37 | style: { 38 | description: 'Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc)', 39 | title: 'Styles', 40 | emoji: '💎', 41 | }, 42 | refactor: { 43 | description: 'A code change that neither fixes a bug nor adds a feature', 44 | title: 'Code Refactoring', 45 | emoji: '📦', 46 | }, 47 | perf: { 48 | description: 'A code change that improves performance', 49 | title: 'Performance Improvements', 50 | emoji: '🚀', 51 | }, 52 | test: { 53 | description: 'Adding missing tests or correcting existing tests', 54 | title: 'Tests', 55 | emoji: '🚨', 56 | }, 57 | build: { 58 | description: 'Changes that affect the build system or external dependencies (example scopes: gulp, broccoli, npm)', 59 | title: 'Builds', 60 | 
emoji: '🛠', 61 | }, 62 | ci: { 63 | description: 'Changes to our CI configuration files and scripts (example scopes: Travis, Circle, BrowserStack, SauceLabs)', 64 | title: 'Continuous Integrations', 65 | emoji: '⚙️', 66 | }, 67 | chore: { 68 | description: "Other changes that don't modify src or test files", 69 | title: 'Chores', 70 | emoji: '♻️', 71 | }, 72 | revert: { 73 | description: 'Reverts a previous commit', 74 | title: 'Reverts', 75 | emoji: '🗑', 76 | }, 77 | }, 78 | }, 79 | scope: { 80 | description: 81 | 'What is the scope of this change (e.g. component or file name)', 82 | }, 83 | subject: { 84 | description: 'Write a short, imperative tense description of the change', 85 | }, 86 | body: { 87 | description: 'Provide a longer description of the change', 88 | }, 89 | isBreaking: { 90 | description: 'Are there any breaking changes?', 91 | }, 92 | breakingBody: { 93 | description: 94 | 'A BREAKING CHANGE commit requires a body. Please enter a longer description of the commit itself', 95 | }, 96 | breaking: { 97 | description: 'Describe the breaking changes', 98 | }, 99 | isIssueAffected: { 100 | description: 'Does this change affect any open issues?', 101 | }, 102 | issuesBody: { 103 | description: 104 | 'If issues are closed, the commit requires a body. Please enter a longer description of the commit itself', 105 | }, 106 | issues: { 107 | description: 'Add issue references (e.g. "fix #123", "re #123".)', 108 | }, 109 | }, 110 | } 111 | }; -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | // For format details, see https://aka.ms/devcontainer.json. For config options, see the 2 | // README at: https://github.com/devcontainers/templates/tree/main/src/typescript-node 3 | { 4 | "name": "Node.js & TypeScript", 5 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 6 | "image": "mcr.microsoft.com/devcontainers/typescript-node:20", 7 | 8 | // Features to add to the dev container. More info: https://containers.dev/features. 9 | "features": { 10 | "ghcr.io/snebjorn/devcontainer-feature/chromium:latest": {} 11 | }, 12 | 13 | // "workspaceMount": "source=C:/temp/$project/sub-folder,target=/workspace,type=bind", 14 | // "workspaceFolder": "/workspace" 15 | 16 | // Use 'forwardPorts' to make a list of ports inside the container available locally. 17 | // "forwardPorts": [], 18 | 19 | // Use 'postCreateCommand' to run commands after the container is created. 20 | "postCreateCommand": "npm install -g npm@10.8.0", 21 | 22 | // Configure tool-specific properties. 23 | // "customizations": {}, 24 | 25 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
26 | // "remoteUser": "root" 27 | } 28 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !./scripts/ 3 | !./scripts/ 4 | !./dist/ 5 | !./bin/ 6 | !./archive/ 7 | !.env.example 8 | !.package.json 9 | !package.json 10 | !package-lock.json 11 | !pm-shrinkwrap.json 12 | !./src 13 | **/*.md -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | 10 | [*.md] 11 | trim_trailing_whitespace = false 12 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Your Amazon username 2 | AMAZON_USERNAME="" 3 | 4 | # Your amazon password 5 | AMAZON_PASSWORD="" 6 | 7 | # Amazon top level domain 8 | AMAZON_TLD="de" 9 | 10 | # Only extracts invoices from this year (i.e. 2023) 11 | AMAZON_YEAR_FILTER="2023" 12 | 13 | # Only extracts invoices from this page (i.e. 2) 14 | AMAZON_PAGE_FILTER="" 15 | 16 | # Tracks already scraped documents and starts a new run at the last scraped one 17 | ONLY_NEW=true 18 | 19 | # Destination path for all scraped documents 20 | FILE_DESTINATION_FOLDER="./documents/" 21 | 22 | # Fallback extension when no extension can be determined 23 | FILE_FALLBACK_EXTENSION=".pdf" 24 | 25 | # Debug flag (sets the loglevel to DEBUG) 26 | DEBUG=false 27 | 28 | # Creates subfolders for every scraped page/plugin 29 | SUBFOLDER_FOR_PAGES=true 30 | 31 | # Sets the log path 32 | LOG_PATH="./logs/" 33 | 34 | # Log level (see https://github.com/winstonjs/winston#logging-levels) 35 | LOG_LEVEL="info" 36 | 37 | # Flag for executing the script periodically. Needs 'RECURRING_PATTERN' to be set. Default `true`when using docker container 38 | RECURRING=false 39 | 40 | # Cron pattern to execute periodically. 
Needs RECURRING to true 41 | RECURRING_PATTERN="*/30 * * * *" 42 | 43 | # Timezone used for docker enviroments 44 | TZ="Europe/Berlin" 45 | 46 | ### Danger-Zone 47 | NPM_TOKEN= 48 | GITHUB_TOKEN= -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: disanedev 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/website-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Website request 3 | about: Request for a new website to be scraped 4 | title: "[WR] website.tld" 5 | labels: enhancement, plugin 6 | assignees: '' 7 | 8 | --- 9 | 10 | # Website request 11 | To add a scraper for a specific webpage, we need some informations about the page. 12 | 13 | ## Basic information 14 | ### Which website to you want to crawl? 15 | We need the entire domain with TLD to crawl like 16 | 17 | ### What services do they provide? 18 | - [ ] Orders (with n invoices) 19 | - [ ] Contracts (with recurring calculation in a defined interval) 20 | 21 | ### In which industry does the company operate? 22 | - [ ] Commerce (like amazon etc.) 23 | - [ ] Energy (i.e. your local energy company) 24 | - [ ] Telecommunication (like Vodafone) 25 | - [ ] other (Please describe here: _________) 26 | 27 | ### Does the webpage have/needs an authentication? 28 | - [ ] yes 29 | - [ ] no 30 | 31 | ### Do they provide a two factor auth? 32 | Currently scraping documents via Docker container doesn't work with a seconds factor. We need this information to config this plugins as a 2fas-able one. 33 | - [ ] yes 34 | - [ ] no 35 | 36 | ### Do you provide us the credentials (privately)? 37 | - [ ] yes 38 | - [ ] no 39 | 40 | ### Are you willing to collaborate to get this scraper up and running? 41 | - [ ] yes 42 | - [ ] no 43 | 44 | ### What color (hex) represents the company? 45 | This is needed for a specific tag for this website and logs in the application. 46 | If you're unsure about this, please head up to https://www.color-hex.com/color-palettes/ and check if you find the main color of the company. 47 | 48 | ## Record the way you download your documents 49 | The chance is pretty high, we don't use your service website. So you have to provide us a way to code this entire process. Luckily Chrome have a recorder fur Puppeteer (the library we use to scrape the page). Please study the following link ([Puppeteer Recorder with Chrome DevTools (testingbot.com)](https://testingbot.com/support/puppeteer/recorder.html#)) and create a `.js`with the recording. 
50 | 51 | It's important that you provide us a recording made with `@puppeteer/replay`. 52 | 53 | Ensure you only click the parts that are needed to get a download of the desired document. 54 |
55 | Example 56 | 57 | 58 | ```js 59 | import url from 'url'; 60 | import { createRunner } from '@puppeteer/replay'; 61 | 62 | export async function run(extension) { 63 | const runner = await createRunner(extension); 64 | 65 | await runner.runBeforeAllSteps(); 66 | 67 | await runner.runStep({ 68 | type: 'setViewport', 69 | width: 1134, 70 | height: 1284, 71 | deviceScaleFactor: 1, 72 | isMobile: false, 73 | hasTouch: false, 74 | isLandscape: false 75 | }); 76 | 77 | await runner.runStep({ 78 | type: 'navigate', 79 | url: 'https://www.vodafone.de/', 80 | assertedEvents: [ 81 | { 82 | type: 'navigation', 83 | url: 'https://www.vodafone.de/', 84 | title: '' 85 | } 86 | ] 87 | }); 88 | await runner.runStep({ 89 | type: 'click', 90 | target: 'main', 91 | selectors: [ 92 | [ 93 | 'aria/MeinVodafone', 94 | 'aria/[role="generic"]' 95 | ], 96 | [ 97 | 'li.item-myvf span.icon' 98 | ], 99 | [ 100 | 'xpath///*[@id="top"]/div/header/nav/div/div[2]/div/div/ul[2]/li[2]/a/span[1]' 101 | ], 102 | [ 103 | 'pierce/li.item-myvf span.icon' 104 | ] 105 | ], 106 | offsetY: 16, 107 | offsetX: 6.5, 108 | }); 109 | await runner.runStep({ 110 | type: 'click', 111 | target: 'main', 112 | selectors: [ 113 | [ 114 | 'aria/Login[role="button"]' 115 | ], 116 | [ 117 | '#meinVodafoneOverlay button' 118 | ], 119 | [ 120 | 'xpath///*[@id="mdd-login-form"]/fieldset/button' 121 | ], 122 | [ 123 | 'pierce/#meinVodafoneOverlay button' 124 | ] 125 | ], 126 | offsetY: 10, 127 | offsetX: 27.90625, 128 | assertedEvents: [ 129 | { 130 | type: 'navigation', 131 | url: 'https://www.vodafone.de/meinvodafone/services/', 132 | title: '' 133 | } 134 | ] 135 | }); 136 | await runner.runStep({ 137 | type: 'click', 138 | target: 'main', 139 | selectors: [ 140 | [ 141 | 'li:nth-of-type(1) svg.icon-arrow-down-i-xsml' 142 | ], 143 | [ 144 | 'xpath///*[@id="dashboard:mobile"]/svg[1]' 145 | ], 146 | [ 147 | 'pierce/li:nth-of-type(1) svg.icon-arrow-down-i-xsml' 148 | ] 149 | ], 150 | offsetY: 7.015625, 151 | offsetX: 9.5, 152 | }); 153 | await runner.runStep({ 154 | type: 'click', 155 | target: 'main', 156 | selectors: [ 157 | [ 158 | 'li:nth-of-type(1) div.tiles > a:nth-of-type(1) svg' 159 | ], 160 | [ 161 | 'xpath///*[@id="content"]/div[2]/div/div/section/div/div/div/div[3]/div[2]/ul/li[1]/div/div/div[1]/a[1]/div/div[1]/svg' 162 | ], 163 | [ 164 | 'pierce/li:nth-of-type(1) div.tiles > a:nth-of-type(1) svg' 165 | ] 166 | ], 167 | offsetY: 63.609375, 168 | offsetX: 22.484375, 169 | assertedEvents: [ 170 | { 171 | type: 'navigation', 172 | url: 'https://www.vodafone.de/meinvodafone/services/ihre-rechnungen/rechnungen', 173 | title: '' 174 | } 175 | ] 176 | }); 177 | await runner.runStep({ 178 | type: 'click', 179 | target: 'main', 180 | selectors: [ 181 | [ 182 | 'aria/Mehr anzeigen[role="button"]' 183 | ], 184 | [ 185 | '#content button' 186 | ], 187 | [ 188 | 'xpath///*[@id="billoverviewWrapperId"]/bill-overview-history/bill-history/div/div[2]/div/div/div/div[2]/vf-table-brix/div[2]/div/button' 189 | ], 190 | [ 191 | 'pierce/#content button' 192 | ], 193 | [ 194 | 'text/Mehr anzeigen' 195 | ] 196 | ], 197 | offsetY: 10, 198 | offsetX: 44.375, 199 | }); 200 | await runner.runStep({ 201 | type: 'click', 202 | target: 'main', 203 | selectors: [ 204 | [ 205 | 'tr:nth-of-type(1) > td:nth-of-type(4) span:nth-of-type(2) > svg' 206 | ], 207 | [ 208 | 'xpath///*[@id="billoverviewWrapperId"]/bill-overview-history/bill-history/div/div[2]/div/div/div/div[2]/vf-table-brix/div[2]/table/tbody/tr[1]/td[4]/div/span[2]/svg' 209 | ], 210 | [ 211 | 
'pierce/tr:nth-of-type(1) > td:nth-of-type(4) span:nth-of-type(2) > svg' 212 | ] 213 | ], 214 | offsetY: 13.5, 215 | offsetX: 22.34375, 216 | }); 217 | await runner.runStep({ 218 | type: 'click', 219 | target: 'main', 220 | selectors: [ 221 | [ 222 | 'tr:nth-of-type(1) > td:nth-of-type(5) span:nth-of-type(2) use' 223 | ], 224 | [ 225 | 'xpath///*[@id="billoverviewWrapperId"]/bill-overview-history/bill-history/div/div[2]/div/div/div/div[2]/vf-table-brix/div[2]/table/tbody/tr[1]/td[5]/div/span[2]/svg/use' 226 | ], 227 | [ 228 | 'pierce/tr:nth-of-type(1) > td:nth-of-type(5) span:nth-of-type(2) use' 229 | ] 230 | ], 231 | offsetY: 10.5, 232 | offsetX: 13.45843505859375, 233 | }); 234 | 235 | await runner.runAfterAllSteps(); 236 | } 237 | 238 | if (process && import.meta.url === url.pathToFileURL(process.argv[1]).href) { 239 | run() 240 | } 241 | 242 | ``` 243 |
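If you want to verify your recording before attaching it, you can replay it locally. The snippet below is only a minimal sketch, not part of this repository: it assumes `puppeteer` and `@puppeteer/replay` are installed, and the file name `recording.js`, the `headless` setting and the timeout value are placeholders for your own setup.

```js
import puppeteer from 'puppeteer';
import { PuppeteerRunnerExtension } from '@puppeteer/replay';
// './recording.js' is a placeholder path for your exported recording
import { run } from './recording.js';

(async () => {
  // Launch a local browser and create a page for the replay
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();

  // PuppeteerRunnerExtension replays each recorded step on the given page;
  // run() is the function exported by the recording (see the example above)
  await run(new PuppeteerRunnerExtension(browser, page, { timeout: 30000 }));

  await browser.close();
})();
```

If the replay stops at a login or download step, that step is usually the part we have to adapt when building the plugin.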
244 | 245 | > :warning: **Watch our for credentials**: 246 | > Only provide us credentials if you really want. We are not liable for leaked credentials 247 | 248 | > :memo: In some cases it would be beneficial to have credentials to test the scraping without your collaboration. If you don't want to provide us credentials (which is totally fine :white_check_mark:) we need your help to get the scrape for this website up and running. 249 | 250 | 251 | ## Screenshots 252 | Provide us some screenshots how the actually site looks like. This will help us to understand the way the site works. 253 | 254 | ## HTML-Files (optional, but would be a big help) 255 | If you don't provide us credentials, it could help if you provide us the saved HTML files. Every browser supports saving the current document. 256 | 257 | > :memo: You can save the HTML files right after the login procedure 258 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/website-request.yaml: -------------------------------------------------------------------------------- 1 | name: Website request 2 | description: Request for a new website to be scraped 3 | title: "[WR] website.tld " 4 | labels: ["enhancement", "plugin", "triage"] 5 | 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | # Website request 11 | To add a scraper for a specific webpage, we need some information about the page. 12 | 13 | - type: markdown 14 | attributes: 15 | value: | 16 | ## Basic information 17 | 18 | - type: input 19 | id: website 20 | attributes: 21 | label: Which website do you want to crawl? 22 | description: We need the entire domain with TLD to crawl like 23 | placeholder: http://test.com 24 | validations: 25 | required: true 26 | 27 | - type: dropdown 28 | id: provides-services 29 | attributes: 30 | label: What services do they provide? 31 | description: We need this information to determine which type of scraping we use 32 | options: 33 | - Orders (with n invoices) 34 | - Contracts (with recurring calculation in a defined interval) 35 | validations: 36 | required: true 37 | - type: dropdown 38 | id: website-industry 39 | attributes: 40 | label: In which industry does the company operate? 41 | description: We need this information to determine which type of scraping we use 42 | options: 43 | - Commerce (like amazon etc.) 44 | - Energy (i.e. your local energy company) 45 | - Telecommunication (like Vodafone) 46 | - other 47 | validations: 48 | required: true 49 | 50 | - type: input 51 | id: industry 52 | attributes: 53 | label: If you choose other... 54 | description: Provide us an industry 55 | validations: 56 | required: false 57 | 58 | - type: textarea 59 | id: website-description 60 | attributes: 61 | label: Describe the company 62 | placeholder: The company is a local energy provider in germany 63 | 64 | validations: 65 | required: true 66 | 67 | - type: checkboxes 68 | id: website-needs-auth 69 | attributes: 70 | label: Does the webpage have/needs an authentication? 71 | description: We need this information to determine which type of scraping we use and if its usable within docker 72 | options: 73 | - label: Yes, the website needs an authentication 74 | required: false 75 | - type: checkboxes 76 | id: website-needs-2fa 77 | attributes: 78 | label: Do they provide a two factor auth? 79 | description: Currently scraping documents via Docker container doesn't work with a seconds factor. We need this information to config this plugins as a 2fas-able one. 
80 | options: 81 | - label: Yes, the website needs an seconds factor 82 | required: false 83 | 84 | - type: checkboxes 85 | id: website-provides-credentials 86 | attributes: 87 | label: Would you provide us the credentials (privately)? 88 | description: It would be beneficial to have the credentials. But if not, its totally fine 89 | options: 90 | - label: Yes, I would share my credentials 91 | required: false 92 | - type: checkboxes 93 | id: user-collaborates 94 | attributes: 95 | label: Are you willing to collaborate to get this scraper up and running? 96 | description: We need your help to get this scraper running. 97 | options: 98 | - label: Yes, I would collaborate on this actively. 99 | required: false 100 | 101 | - type: input 102 | id: color 103 | attributes: 104 | label: What color (hex) represents the company? 105 | description: This is needed for a specific tag for this website and logs in the application. If you're unsure about this, please head up to https://www.color-hex.com/color-palettes/ and check if you find the main color of the company. 106 | value: # 107 | validations: 108 | required: true 109 | 110 | - type: markdown 111 | attributes: 112 | value: | 113 | ## Record the way you download your documents 114 | To add a scraper for a specific webpage, we need some information about the page. 115 | 116 | The chance is pretty high, we don't use your service website. So you have to provide us a way to code this entire process. Luckily Chrome have a recorder fur Puppeteer (the library we use to scrape the page). Please study the following link ([Puppeteer Recorder with Chrome DevTools (testingbot.com)](https://testingbot.com/support/puppeteer/recorder.html#)) and create a `.js`with the recording. 117 | 118 | - type: textarea 119 | id: recorded-code 120 | attributes: 121 | label: Your recorded code 122 | description: Ensure you only click the parts that are needed to get a download of the desired document. Its important that you provide us a recording with `@puppeteer/replay` 123 | render: javascript 124 | validations: 125 | required: true 126 | 127 | - type: markdown 128 | attributes: 129 | value: | 130 | ## Screenshots 131 | Provide us some screenshots how the actually site looks like. This will help us to understand the way the site works. 132 | - type: textarea 133 | id: screenshots 134 | attributes: 135 | label: Paste your screenshots here 136 | validations: 137 | required: true 138 | - type: markdown 139 | attributes: 140 | value: | 141 | ## HTML-Files (optional, but would be a big help) 142 | If you don't provide us credentials, it could help if you provide us the saved HTML files. Every browser supports saving the current document. 
143 | 144 | > :memo: You can save the HTML files right after the login procedure 145 | - type: textarea 146 | id: html 147 | 148 | attributes: 149 | label: Paste your html here 150 | render: html 151 | validations: 152 | required: false 153 | 154 | -------------------------------------------------------------------------------- /.github/workflows/build-and-release.yaml: -------------------------------------------------------------------------------- 1 | name: Semantic release 2 | on: 3 | workflow_dispatch: 4 | push: 5 | paths: 6 | - "src/**" 7 | - "dockerfile" 8 | - ".releaserc" 9 | - "package.json" 10 | - "README.md" 11 | - "dockerfile" 12 | - "npm-shrinkwrap.json" 13 | branches: 14 | - "main" 15 | - "dev" 16 | env: 17 | GH_TOKEN: ${{ secrets.AUTH_GH }} 18 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 19 | REGISTRY: ghcr.io 20 | IMAGE_NAME: ${{ github.repository }} 21 | NEW_RELEASE_PUBLISHED: false 22 | HUSKY: 0 23 | 24 | jobs: 25 | build: 26 | runs-on: ubuntu-latest 27 | outputs: 28 | NEW_RELEASE_PUBLISHED: ${{ steps.semantic-release.outputs.new_release_published }} 29 | NEXT_VERSION: ${{ steps.semantic-release.outputs.version }} 30 | strategy: 31 | matrix: 32 | node-version: [20.x] 33 | steps: 34 | - uses: actions/checkout@v4 35 | - name: Use Node.js ${{ matrix.node-version }} 36 | uses: actions/setup-node@v4 37 | with: 38 | node-version: ${{ matrix.node-version }} 39 | cache: "npm" 40 | 41 | - run: npm ci 42 | - id: semantic-release 43 | run: npm run semantic-release:ci 44 | docker: 45 | needs: build 46 | runs-on: ubuntu-latest 47 | if: needs.build.outputs.NEW_RELEASE_PUBLISHED 48 | steps: 49 | - env: 50 | NEW_RELEASE_PUBLISHED: ${{ needs.build.outputs.NEW_RELEASE_PUBLISHED }} 51 | NEXT_VERSION: ${{ needs.build.outputs.NEXT_VERSION }} 52 | run: echo "Creating docker release for version $NEXT_VERSION" 53 | 54 | - name: Checkout repository 55 | uses: actions/checkout@v4 56 | 57 | - name: Install cosign 58 | if: github.event_name != 'pull_request' 59 | uses: sigstore/cosign-installer@59acb6260d9c0ba8f4a2f9d9b48431a222b68e20 # v4.5.0 60 | with: 61 | cosign-release: "v1.13.1" 62 | 63 | - name: Setup Docker buildx 64 | uses: docker/setup-buildx-action@0f069ddc17b8eb78586b08a7fe335fd54649e2d3 65 | 66 | - name: Log into registry ${{ env.REGISTRY }} 67 | if: github.event_name != 'pull_request' 68 | uses: docker/login-action@06895751d15a223ec091bea144ad5c7f50d228d0 69 | with: 70 | registry: ${{ env.REGISTRY }} 71 | username: ${{ github.actor }} 72 | password: ${{ env.GH_TOKEN }} 73 | 74 | - name: Echo Version 75 | run: | 76 | echo "Docker: $NEXT_VERSION" 77 | 78 | - name: Extract Docker metadata 79 | id: meta 80 | uses: docker/metadata-action@v4 81 | with: 82 | images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} 83 | flavor: | 84 | latest=false 85 | tags: | 86 | type=raw,value=latest,enable={{ is_default_branch }} 87 | type=raw,value=${{ needs.build.outputs.NEXT_VERSION }} 88 | type=semver,pattern={{version}},value=${{ needs.build.outputs.NEXT_VERSION }} 89 | type=semver,pattern={{major}}.{{minor}},value=${{ needs.build.outputs.NEXT_VERSION }} 90 | type=semver,pattern={{major}},value=${{ needs.build.outputs.NEXT_VERSION }} 91 | type=ref,event=branch 92 | #type=sha 93 | 94 | - name: Build and push Docker image 95 | id: build-and-push 96 | uses: docker/build-push-action@5e99dacf67635c4f273e532b9266ddb609b3025a 97 | with: 98 | context: . 
99 | build-args: | 100 | DOCUDIGGER_VERSION=${{ needs.build.outputs.NEXT_VERSION }} 101 | platforms: linux/amd64,linux/arm/v7 102 | push: ${{ github.event_name != 'pull_request' }} 103 | tags: ${{ steps.meta.outputs.tags }} 104 | labels: ${{ steps.meta.outputs.labels }} 105 | cache-from: type=gha 106 | cache-to: type=gha,mode=max 107 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *-debug.log 2 | *-error.log 3 | /.nyc_output 4 | /dist 5 | .scraping/ 6 | /archive 7 | /lib 8 | /tmp 9 | /yarn.lock 10 | node_modules 11 | oclif.manifest.json 12 | # Logs 13 | logs 14 | *.log 15 | npm-debug.log* 16 | yarn-debug.log* 17 | yarn-error.log* 18 | lerna-debug.log* 19 | .pnpm-debug.log* 20 | 21 | # Diagnostic reports (https://nodejs.org/api/report.html) 22 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 23 | 24 | # .npmrc 25 | .npmrc 26 | 27 | # Runtime data 28 | pids 29 | *.pid 30 | *.seed 31 | *.pid.lock 32 | 33 | # Directory for instrumented libs generated by jscoverage/JSCover 34 | lib-cov 35 | 36 | # Coverage directory used by tools like istanbul 37 | coverage 38 | *.lcov 39 | 40 | # nyc test coverage 41 | .nyc_output 42 | 43 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 44 | .grunt 45 | 46 | # Bower dependency directory (https://bower.io/) 47 | bower_components 48 | 49 | # node-waf configuration 50 | .lock-wscript 51 | 52 | # Compiled binary addons (https://nodejs.org/api/addons.html) 53 | build/Release 54 | 55 | # Dependency directories 56 | node_modules/ 57 | jspm_packages/ 58 | 59 | # Snowpack dependency directory (https://snowpack.dev/) 60 | web_modules/ 61 | 62 | # TypeScript cache 63 | *.tsbuildinfo 64 | 65 | # Optional npm cache directory 66 | .npm 67 | 68 | # Optional eslint cache 69 | .eslintcache 70 | 71 | # Optional stylelint cache 72 | .stylelintcache 73 | 74 | # Microbundle cache 75 | .rpt2_cache/ 76 | .rts2_cache_cjs/ 77 | .rts2_cache_es/ 78 | .rts2_cache_umd/ 79 | 80 | # Optional REPL history 81 | .node_repl_history 82 | 83 | # Output of 'npm pack' 84 | *.tgz 85 | 86 | # Yarn Integrity file 87 | .yarn-integrity 88 | 89 | # dotenv environment variable files 90 | .env 91 | .env.development.local 92 | .env.test.local 93 | .env.production.local 94 | .env.local 95 | 96 | # parcel-bundler cache (https://parceljs.org/) 97 | .cache 98 | .parcel-cache 99 | 100 | # Next.js build output 101 | .next 102 | out 103 | 104 | # Nuxt.js build / generate output 105 | .nuxt 106 | dist 107 | 108 | # Gatsby files 109 | .cache/ 110 | # Comment in the public line in if your project uses Gatsby and not Next.js 111 | # https://nextjs.org/blog/next-9-1#public-directory-support 112 | # public 113 | 114 | # vuepress build output 115 | .vuepress/dist 116 | 117 | # vuepress v2.x temp and cache directory 118 | .temp 119 | .cache 120 | 121 | # Docusaurus cache and generated files 122 | .docusaurus 123 | 124 | # Serverless directories 125 | .serverless/ 126 | 127 | # FuseBox cache 128 | .fusebox/ 129 | 130 | # DynamoDB Local files 131 | .dynamodb/ 132 | 133 | # TernJS port file 134 | .tern-port 135 | 136 | # Stores VSCode versions used for testing VSCode extensions 137 | .vscode-test 138 | 139 | # yarn v2 140 | .yarn/cache 141 | .yarn/unplugged 142 | .yarn/build-state.yml 143 | .yarn/install-state.gz 144 | .pnp.* 145 | 146 | .history 147 | data 148 | process.json 149 | documents/ 150 | .secrets 151 | 
-------------------------------------------------------------------------------- /.hintrc: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "development" 4 | ], 5 | "hints": { 6 | "typescript-config/strict": "off" 7 | } 8 | } -------------------------------------------------------------------------------- /.husky/commit-msg: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # . "$(dirname -- "$0")/_/husky.sh" 3 | 4 | npx --no -- commitlint --edit ${1} 5 | -------------------------------------------------------------------------------- /.husky/post-merge: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | . "$(dirname "$0")/_/husky.sh" 3 | 4 | IFS=$'\n' 5 | # regex supports mono-repos with a package.json at root-level and at package-level 6 | PACKAGE_LOCK_REGEX="(^packages\/.*\/package-lock\.json)|(^package-lock\.json)" 7 | # extract all paths to package-lock.json files 8 | PACKAGES=("$(git diff --name-only HEAD@{1} HEAD | grep -E "$PACKAGE_LOCK_REGEX")") 9 | 10 | if [[ ${PACKAGES[@]} ]]; then 11 | for package in $PACKAGES; do 12 | echo "📦 $package was changed. Running npm install to update your dependencies..." 13 | DIR=$(dirname package) 14 | cd "$DIR" && npm install 15 | done 16 | fi -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # . "$(dirname -- "$0")/_/husky.sh" 3 | 4 | # npm run test 5 | -------------------------------------------------------------------------------- /.mocharc.json: -------------------------------------------------------------------------------- 1 | { 2 | "require": [ 3 | "test/helpers/init.js", 4 | "ts-node/register" 5 | ], 6 | "watch-extensions": [ 7 | "ts" 8 | ], 9 | "recursive": true, 10 | "reporter": "spec", 11 | "timeout": 60000 12 | } 13 | -------------------------------------------------------------------------------- /.releaserc: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": [ 3 | [ 4 | "@semantic-release/commit-analyzer", 5 | { 6 | "preset": "conventionalcommits", 7 | "releaseRules": [ 8 | { 9 | "type": "chore", 10 | "scope": "deps", 11 | "release": true 12 | }, 13 | { 14 | "type": "chore", 15 | "scope": "renovate", 16 | "release": false 17 | }, 18 | { 19 | "type": "docs", 20 | "release": "patch" 21 | }, 22 | { 23 | "type": "feat", 24 | "release": "minor" 25 | }, 26 | { 27 | "type": "fix", 28 | "release": "patch" 29 | }, 30 | { 31 | "type": "refactor", 32 | "release": "patch" 33 | }, 34 | { 35 | "type": "style", 36 | "release": "patch" 37 | }, 38 | { 39 | "type": "test", 40 | "release": "patch" 41 | } 42 | ] 43 | } 44 | ], 45 | [ 46 | "@semantic-release/release-notes-generator", 47 | { 48 | "noteKeywords": [ 49 | "BREAKING CHANGE", 50 | "BREAKING CHANGES", 51 | "BREAKING" 52 | ], 53 | "writerOpts": { 54 | "groupBy": "type", 55 | "commitGroupsSort": [ 56 | "feat", 57 | "fix", 58 | "perf", 59 | "docs" 60 | ], 61 | "commitsSort": "header" 62 | }, 63 | "linkCompare": true, 64 | "preset": "conventionalcommits", 65 | "linkReferences": true, 66 | "presetConfig": { 67 | "types": [ 68 | { 69 | "type": "build", 70 | "section": "🦊 CI/CD", 71 | "hidden": true 72 | }, 73 | { 74 | "type": "chore", 75 | "section": "🧹 Other", 76 | "hidden": true 77 | }, 78 | { 79 | "type": "ci", 80 | 
"section": "🦊 CI/CD", 81 | "hidden": true 82 | }, 83 | { 84 | "type": "docs", 85 | "section": "📔 Docs", 86 | "hidden": false 87 | }, 88 | { 89 | "type": "example", 90 | "section": "📝 Examples", 91 | "hidden": false 92 | }, 93 | { 94 | "type": "feat", 95 | "section": "🚀 Features", 96 | "hidden": false 97 | }, 98 | { 99 | "type": "fix", 100 | "section": "🛠️ Fixes", 101 | "hidden": false 102 | }, 103 | { 104 | "type": "perf", 105 | "section": "⏩ Performance" 106 | }, 107 | { 108 | "type": "refactor", 109 | "section": ":scissors: Refactor", 110 | "hidden": true 111 | }, 112 | { 113 | "type": "revert", 114 | "section": "🙅‍️ Reverts" 115 | }, 116 | { 117 | "type": "style", 118 | "section": "💈 Style", 119 | "hidden": true 120 | }, 121 | { 122 | "type": "test", 123 | "section": "🧪 Tests", 124 | "hidden": true 125 | } 126 | ] 127 | } 128 | } 129 | ], 130 | "@semantic-release/changelog", 131 | [ 132 | "@semantic-release/npm", 133 | { 134 | "tarballDir": "./archive/" 135 | } 136 | ], 137 | [ 138 | "@semantic-release/git", 139 | { 140 | "message": "chore(release): 📢 ${nextRelease.version}\n\n${nextRelease.notes}", 141 | "assets": [ 142 | "CHANGELOG.md", 143 | "package.json", 144 | "package-lock.json", 145 | "npm-shrinkwrap.json", 146 | "README.md" 147 | ] 148 | } 149 | ], 150 | [ 151 | "@semantic-release/github", 152 | { 153 | "assets": [ 154 | "./archive/*.tgz" 155 | ], 156 | "fail": true 157 | } 158 | ], 159 | [ 160 | "@semantic-release/exec", 161 | { 162 | "verifyReleaseCmd": "echo \"version=${nextRelease.version}\" >> $GITHUB_OUTPUT", 163 | "successCmd": "echo \"new_release_published=${'true'}\" >> $GITHUB_OUTPUT" 164 | } 165 | ] 166 | ], 167 | "branches": [ 168 | "main", 169 | { 170 | "name": "dev", 171 | "prerelease": true, 172 | "channel": "dev" 173 | } 174 | ] 175 | } 176 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "vivaxy.vscode-conventional-commits", 4 | "aaron-bond.better-comments", 5 | "joshbolduc.commitlint", 6 | "ms-azuretools.vscode-docker", 7 | "dweizhe.docthis-customize-tags", 8 | "dbaeumer.vscode-eslint", 9 | "me-dutour-mathieu.vscode-github-actions", 10 | "github.vscode-pull-request-github", 11 | "eamodio.gitlens", 12 | "xyz.local-history", 13 | "christian-kohler.path-intellisense", 14 | "gruntfuggly.todo-tree", 15 | "redhat.vscode-yaml", 16 | "github.vscode-github-actions", 17 | "streetsidesoftware.code-spell-checker" 18 | ] 19 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "type": "node", 6 | "request": "launch", 7 | "name": "Scrape amazon", 8 | "skipFiles": ["/**"], 9 | "program": "${workspaceFolder}/bin/dev", 10 | "args": ["scrape", "all"], 11 | "console": "integratedTerminal", 12 | "internalConsoleOptions": "neverOpen", 13 | "envFile": "${workspaceFolder}/.env", 14 | "autoAttachChildProcesses": true 15 | }, 16 | { 17 | "type": "node", 18 | "request": "launch", 19 | "name": "Write env", 20 | "skipFiles": ["/**"], 21 | "program": "${workspaceFolder}/bin/dev", 22 | "args": ["config", "example"], 23 | "console": "integratedTerminal", 24 | "internalConsoleOptions": "neverOpen" 25 | }, 26 | { 27 | "type": "node", 28 | "request": "launch", 29 | "name": "Help", 30 | "skipFiles": ["/**"], 31 | "program": 
"${workspaceFolder}/bin/dev", 32 | "args": ["help"], 33 | "console": "integratedTerminal", 34 | "internalConsoleOptions": "neverOpen" 35 | }, 36 | { 37 | "type": "node", 38 | "request": "launch", 39 | "name": "Scape all", 40 | "skipFiles": ["/**"], 41 | "program": "${workspaceFolder}/bin/dev", 42 | "args": ["scrape", "all"], 43 | "console": "integratedTerminal", 44 | "internalConsoleOptions": "neverOpen" 45 | }, 46 | { 47 | "type": "node", 48 | "request": "launch", 49 | "name": "Get all flags", 50 | "skipFiles": ["/**"], 51 | "program": "${workspaceFolder}/bin/dev", 52 | "args": ["commands", "--json"], 53 | "console": "integratedTerminal", 54 | "internalConsoleOptions": "neverOpen" 55 | }, 56 | { 57 | "name": "Docker: Attach to Node", 58 | "type": "node", 59 | "request": "attach", 60 | "restart": true, 61 | "port": 9229, 62 | "localRoot": "${workspaceFolder}", 63 | "remoteRoot": "/home/node/docudigger", 64 | "preLaunchTask": "docker-run: debug", 65 | "trace": true, 66 | "internalConsoleOptions": "openOnSessionStart", 67 | "outputCapture": "console", 68 | "sourceMaps": true, 69 | "skipFiles": ["/**"] 70 | } 71 | ] 72 | } 73 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.codeActionsOnSave": { 3 | "source.fixAll.eslint": "explicit" 4 | }, 5 | "conventionalCommits.scopes": [ 6 | "Gathering", 7 | "debugging", 8 | "selectors", 9 | "renovate", 10 | "logging", 11 | "ci", 12 | "docs", 13 | "cli", 14 | "logs", 15 | "lint", 16 | "docker", 17 | "files", 18 | "build", 19 | "package", 20 | "deps" 21 | ], 22 | "js/ts.implicitProjectConfig.experimentalDecorators": true, 23 | "npm.exclude": "**/@(vendor|node_modules|bower_components|dist|static)/**" 24 | } 25 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "2.0.0", 3 | "tasks": [ 4 | { 5 | "label": "node-build", 6 | "type": "npm", 7 | "script": "build", 8 | }, 9 | { 10 | "type": "docker-build", 11 | "label": "docker-build", 12 | "platform": "node", 13 | "dockerBuild": { 14 | "dockerfile": "${workspaceFolder}/dockerfile.debug", 15 | "context": "${workspaceFolder}", 16 | "pull": true 17 | }, 18 | "dependsOn": [ 19 | "node-build" 20 | ] 21 | }, 22 | { 23 | "type": "docker-run", 24 | "label": "docker-run: release", 25 | "dependsOn": [ 26 | "docker-build" 27 | ], 28 | "platform": "node", 29 | }, 30 | { 31 | "type": "docker-run", 32 | "label": "docker-run: debug", 33 | "dependsOn": [ 34 | "docker-build" 35 | ], 36 | "dockerRun": { 37 | "portsPublishAll": true, 38 | "env": { 39 | "NODE_ENV": "development" 40 | }, 41 | "envFiles": [".env"], 42 | "command": "npm run start:debug", 43 | "ports": [{ 44 | "containerPort": 9229, 45 | "hostPort": 9229, 46 | }] 47 | }, 48 | "node": { 49 | "enableDebugging": true 50 | } 51 | } 52 | ] 53 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level 
of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | mfranke87@icloud.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 
87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Marco Franke 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# Welcome to docudigger 👋

[Badges: npm version · GitHub package.json dependency version · License: MIT · Docker]

16 | 17 | > Document scraper for getting invoices automagically as pdf (useful for taxes or DMS) 18 | 19 | ### 🏠 [Homepage](https://repo.disane.dev/Disane/docudigger#readme) 20 | 21 | ## Configuration 22 | 23 | All settings can be changed via `CLI`, env variable (even when using docker). 24 | 25 | | Setting | Description | Default value | 26 | | ----------------------- | -------------------------------------------------------------------------------------------------------------------------- | --------------- | 27 | | AMAZON_USERNAME | Your Amazon username | `null` | 28 | | AMAZON_PASSWORD | Your amazon password | `null` | 29 | | AMAZON_TLD | Amazon top level domain | `de` | 30 | | AMAZON_YEAR_FILTER | Only extracts invoices from this year (i.e. 2023) | `2023` | 31 | | AMAZON_PAGE_FILTER | Only extracts invoices from this page (i.e. 2) | `null` | 32 | | ONLY_NEW | Tracks already scraped documents and starts a new run at the last scraped one | `true` | 33 | | FILE_DESTINATION_FOLDER | Destination path for all scraped documents | `./documents/` | 34 | | FILE_FALLBACK_EXTENSION | Fallback extension when no extension can be determined | `.pdf` | 35 | | DEBUG | Debug flag (sets the loglevel to DEBUG) | `false` | 36 | | SUBFOLDER_FOR_PAGES | Creates subfolders for every scraped page/plugin | `false` | 37 | | LOG_PATH | Sets the log path | `./logs/` | 38 | | LOG_LEVEL | Log level (see https://github.com/winstonjs/winston#logging-levels) | `info` | 39 | | RECURRING | Flag for executing the script periodically. Needs 'RECURRING_PATTERN' to be set. Default `true`when using docker container | `false` | 40 | | RECURRING_PATTERN | Cron pattern to execute periodically. Needs RECURRING to true | `*/30 * * * *` | 41 | | TZ | Timezone used for docker enviroments | `Europe/Berlin` | 42 | 43 | ## Install 44 | 45 | ```sh 46 | npm install 47 | ``` 48 | 49 | ## Usage 50 | 51 | 52 | ```sh-session 53 | $ npm install -g @disane-dev/docudigger 54 | $ docudigger COMMAND 55 | running command... 56 | $ docudigger (--version) 57 | @disane-dev/docudigger/2.0.7 linux-x64 node-v20.18.0 58 | $ docudigger --help [COMMAND] 59 | USAGE 60 | $ docudigger COMMAND 61 | ... 62 | ``` 63 | 64 | 65 | > [!IMPORTANT] 66 | > Don't forget to include `--ignore-scripts` in your install command. 67 | 68 | ## `docudigger scrape all` 69 | 70 | Scrapes all websites periodically (default for docker environment) 71 | 72 | ``` 73 | USAGE 74 | $ docudigger scrape all [--json] [--logLevel trace|debug|info|warn|error] [-d] [-l ] [-c -r] 75 | 76 | FLAGS 77 | -c, --recurringCron= [default: * * * * *] Cron pattern to execute periodically 78 | -d, --debug 79 | -l, --logPath= [default: ./logs/] Log path 80 | -r, --recurring 81 | --logLevel=