├── .github └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── index.js ├── jest.config.js ├── package ├── package-lock.json ├── package.json ├── tests ├── wgrep.int.HOLD.js └── wgrep.unit.test.js └── wgrep.js /.github/workflows/tests.yml: --------------------------------------------------------------------------------
1 | name: "Testies"
2 |
3 | on: [push]
4 |
5 | jobs:
6 |
7 |   unit-tests:
8 |
9 |     name: Unit tests
10 |
11 |     strategy:
12 |       matrix:
13 |         os: [ubuntu-latest, macos-latest, windows-latest]
14 |         node-version: [18, 20]
15 |
16 |     runs-on: ${{ matrix.os }}
17 |
18 |     steps:
19 |       - uses: actions/checkout@v4
20 |
21 |       # Install the Node version selected by the matrix; without this step
22 |       # the node-version axis is declared but never applied to the job.
23 |       - name: Node ${{ matrix.node-version }} on ${{ matrix.os }}
24 |         uses: actions/setup-node@v4
25 |         with:
26 |           node-version: ${{ matrix.node-version }}
27 |
28 |       - run: npm ci
29 |       - run: npm test
30 |       - run: npm run test-e2e
31 |
32 |       # Upload coverage from a single job. The version compared here must be
33 |       # one of the matrix entries above, otherwise this step never runs.
34 |       - name: Upload coverage to Codecov
35 |         if: matrix.os == 'ubuntu-latest' && matrix.node-version == 20
36 |         uses: codecov/codecov-action@v4
37 |         with:
38 |           token: ${{ secrets.CODECOV_TOKEN }}
39 |           fail_ci_if_error: true
40 |
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | output/ 2 | coverage/ 3 | outputimage/ 4 | node_modules/ 5 | screencap.png 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License.
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # wgrep 2 | 3 | Web grep: search all rendered resources used by a URI 4 | 5 | [![Coverage][cov-image]][cov-url] 6 | [![Releases][rel-image]][rel-url] 7 | [![Build][build-image]][build-url] 8 | 9 | This `node` command-line utility uses a headless browser (Puppeteer) to render 10 | a webpage and download all resources it may need. These resources including the 11 | original HTML are all saved locally which it searches one-by-one for a text 12 | string. 
13 | 14 | *Since we are downloading all resources it is easy to determine the total 15 | download size.* 16 | 17 | ## Features 18 | 19 | * Search using regular expressions 20 | * A screen capture is created (not configurable) 21 | 22 | ## Installation 23 | 24 | $ git clone https://github.com/stav/wgrep.git 25 | $ cd wgrep 26 | $ npm install 27 | 28 | ## Usage example 29 | 30 | Let's try to find the string "stav" from the repository website on GitHub: 31 | 32 | $ npx wgrep stav https://github.com/stav/wgrep 33 | 34 | Calling for "stav" in "output" from "https://github.com/stav/wgrep" with user "undefined" 35 | Looking in "output" for 'stav' 36 | Found 1 files 37 | [ 'output/stav/wgrep/index.html' ] 38 | 39 | It was only found in the `index.html` page. 40 | 41 | Now let's see what the total download size was: 42 | 43 | $ du -sh output 44 | 1.4M output 45 | 46 | ## Options 47 | 48 | $ wgrep --help 49 | 50 | Usage: wgrep [options] <regex> <url> 51 | 52 | Options: 53 | -V, --version output the version number 54 | -d, --directory <directory> The output directory (default: "output") 55 | -u, --username <username> The user to authenticate as 56 | -h, --help output usage information 57 | 58 | ## Tests 59 | 60 | $ npm test 61 | 62 | $ npm run test-e2e 63 | 64 | ## Contributing 65 | 66 | Please file any issues you have. 67 | 68 | If you fix a bug or add new features it would be great to have you fork this 69 | repo and submit a pull request: 70 | 71 | 1. Fork it (`https://github.com/stav/wgrep/fork`) 72 | 2. Create your feature branch (`git checkout -b feature/fooBar`) 73 | 3. Commit your changes (`git commit -am 'Add some fooBar'`) 74 | 4. Push to the branch (`git push origin feature/fooBar`) 75 | 5.
Create a new Pull Request 76 | 77 | ## License 78 | 79 | Apache 2.0 80 | 81 | 82 | [rel-image]: https://img.shields.io/github/release/stav/wgrep.svg 83 | [cov-image]: https://codecov.io/gh/stav/wgrep/branch/master/graph/badge.svg 84 | [build-image]: https://github.com/stav/wgrep/actions/workflows/tests.yml/badge.svg 85 | 86 | [rel-url]: https://github.com/stav/wgrep/releases 87 | [cov-url]: https://codecov.io/github/stav/wgrep 88 | [build-url]: https://github.com/stav/wgrep/actions 89 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | /** 3 | * Console interface for wgrep 4 | */ 5 | const co = require('co'); 6 | const wgrep = require('./wgrep'); 7 | const program = require('commander'); 8 | const { version } = require('./package.json'); 9 | 10 | (async () => { 11 | 12 | program 13 | .arguments(' ') 14 | .version(version, '-V, --version') 15 | .option('-d, --directory ', 'The output directory', 'output') 16 | .option('-u, --username ', 'The user to authenticate as') 17 | .action(function( regex, url ) { 18 | 19 | const options = program.opts() 20 | 21 | co(async function () { 22 | wgrep.ensureOutput(options.directory) 23 | console.log(`Calling for "${regex}" in "${options.directory}" from "${url}" with user "${options.username}"`) 24 | await wgrep.download( url, options.directory ); 25 | wgrep.show(wgrep.find( options.directory, regex )) 26 | }); 27 | 28 | }) 29 | .parse(process.argv) 30 | 31 | })() 32 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | // For a detailed explanation regarding each configuration property, visit: 2 | // https://jestjs.io/docs/en/configuration.html 3 | 4 | module.exports = { 5 | // All imported modules in your tests should be mocked automatically 6 | // 
automock: false, 7 | 8 | // Stop running tests after `n` failures 9 | // bail: 0, 10 | 11 | // Respect "browser" field in package.json when resolving modules 12 | // browser: false, 13 | 14 | // The directory where Jest should store its cached dependency information 15 | // cacheDirectory: "/tmp/jest_rs", 16 | 17 | // Automatically clear mock calls and instances between every test 18 | // clearMocks: false, 19 | 20 | // Indicates whether the coverage information should be collected while executing the test 21 | collectCoverage: true, 22 | 23 | // An array of glob patterns indicating a set of files for which coverage information should be collected 24 | // collectCoverageFrom: undefined, 25 | 26 | // The directory where Jest should output its coverage files 27 | coverageDirectory: "coverage", 28 | 29 | // An array of regexp pattern strings used to skip coverage collection 30 | coveragePathIgnorePatterns: [ 31 | "/node_modules/", 32 | "/.github/", 33 | "/output/", 34 | ], 35 | 36 | // A list of reporter names that Jest uses when writing coverage reports 37 | // coverageReporters: [ 38 | // "json", 39 | // "text", 40 | // "lcov", 41 | // "clover" 42 | // ], 43 | 44 | // An object that configures minimum threshold enforcement for coverage results 45 | // coverageThreshold: undefined, 46 | 47 | // A path to a custom dependency extractor 48 | // dependencyExtractor: undefined, 49 | 50 | // Make calling deprecated APIs throw helpful error messages 51 | // errorOnDeprecated: false, 52 | 53 | // Force coverage collection from ignored files using an array of glob patterns 54 | // forceCoverageMatch: [], 55 | 56 | // A path to a module which exports an async function that is triggered once before all test suites 57 | // globalSetup: undefined, 58 | 59 | // A path to a module which exports an async function that is triggered once after all test suites 60 | // globalTeardown: undefined, 61 | 62 | // A set of global variables that need to be available in all test environments 63 
| // globals: {}, 64 | 65 | // The maximum amount of workers used to run your tests. Can be specified as % or a number. E.g. maxWorkers: 10% will use 10% of your CPU amount + 1 as the maximum worker number. maxWorkers: 2 will use a maximum of 2 workers. 66 | // maxWorkers: "50%", 67 | 68 | // An array of directory names to be searched recursively up from the requiring module's location 69 | // moduleDirectories: [ 70 | // "node_modules" 71 | // ], 72 | 73 | // An array of file extensions your modules use 74 | // moduleFileExtensions: [ 75 | // "js", 76 | // "json", 77 | // "jsx", 78 | // "ts", 79 | // "tsx", 80 | // "node" 81 | // ], 82 | 83 | // A map from regular expressions to module names that allow to stub out resources with a single module 84 | // moduleNameMapper: {}, 85 | 86 | // An array of regexp pattern strings, matched against all module paths before considered 'visible' to the module loader 87 | // modulePathIgnorePatterns: [], 88 | 89 | // Activates notifications for test results 90 | // notify: false, 91 | 92 | // An enum that specifies notification mode. 
Requires { notify: true } 93 | // notifyMode: "failure-change", 94 | 95 | // A preset that is used as a base for Jest's configuration 96 | // preset: undefined, 97 | 98 | // Run tests from one or more projects 99 | // projects: undefined, 100 | 101 | // Use this configuration option to add custom reporters to Jest 102 | // reporters: undefined, 103 | 104 | // Automatically reset mock state between every test 105 | // resetMocks: false, 106 | 107 | // Reset the module registry before running each individual test 108 | // resetModules: false, 109 | 110 | // A path to a custom resolver 111 | // resolver: undefined, 112 | 113 | // Automatically restore mock state between every test 114 | // restoreMocks: false, 115 | 116 | // The root directory that Jest should scan for tests and modules within 117 | // rootDir: undefined, 118 | 119 | // A list of paths to directories that Jest should use to search for files in 120 | // roots: [ 121 | // "" 122 | // ], 123 | 124 | // Allows you to use a custom runner instead of Jest's default test runner 125 | // runner: "jest-runner", 126 | 127 | // The paths to modules that run some code to configure or set up the testing environment before each test 128 | // setupFiles: [], 129 | 130 | // A list of paths to modules that run some code to configure or set up the testing framework before each test 131 | // setupFilesAfterEnv: [], 132 | 133 | // A list of paths to snapshot serializer modules Jest should use for snapshot testing 134 | // snapshotSerializers: [], 135 | 136 | // The test environment that will be used for testing 137 | testEnvironment: "node", 138 | 139 | // Options that will be passed to the testEnvironment 140 | // testEnvironmentOptions: {}, 141 | 142 | // Adds a location field to test results 143 | // testLocationInResults: false, 144 | 145 | // The glob patterns Jest uses to detect test files 146 | // testMatch: [ 147 | // "**/__tests__/**/*.[jt]s?(x)", 148 | // "**/?(*.)+(spec|test).[tj]s?(x)" 149 | // ], 150 | 
testMatch: [ 151 | "**/tests/*.test.js" 152 | ], 153 | 154 | // An array of regexp pattern strings that are matched against all test paths, matched tests are skipped 155 | // testPathIgnorePatterns: [ 156 | // "/node_modules/" 157 | // ], 158 | 159 | // The regexp pattern or array of patterns that Jest uses to detect test files 160 | // testRegex: [], 161 | 162 | // This option allows the use of a custom results processor 163 | // testResultsProcessor: undefined, 164 | 165 | // This option allows use of a custom test runner 166 | // testRunner: "jasmine2", 167 | 168 | // This option sets the URL for the jsdom environment. It is reflected in properties such as location.href 169 | // testURL: "http://localhost", 170 | 171 | // Setting this value to "fake" allows the use of fake timers for functions such as "setTimeout" 172 | // timers: "real", 173 | 174 | // A map from regular expressions to paths to transformers 175 | // transform: undefined, 176 | 177 | // An array of regexp pattern strings that are matched against all source file paths, matched files will skip transformation 178 | // transformIgnorePatterns: [ 179 | // "/node_modules/" 180 | // ], 181 | 182 | // An array of regexp pattern strings that are matched against all modules before the module loader will automatically return a mock for them 183 | // unmockedModulePathPatterns: undefined, 184 | 185 | // Indicates whether each individual test should be reported during the run 186 | // verbose: undefined, 187 | 188 | // An array of regexp patterns that are matched against all source file paths before re-running tests in watch mode 189 | // watchPathIgnorePatterns: [], 190 | 191 | // Whether to use watchman for file crawling 192 | // watchman: true, 193 | }; 194 | -------------------------------------------------------------------------------- /package: -------------------------------------------------------------------------------- 1 | 2 | Notes for package.json 3 | 
-------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wgrep", 3 | "version": "0.4.11", 4 | "description": "Find text within all resources of a webpage", 5 | "main": "index.js", 6 | "bin": { 7 | "wgrep": "index.js" 8 | }, 9 | "scripts": { 10 | "test": "jest", 11 | "test-ci": "jest --ci", 12 | "test-thr": "jest --maxWorkers=2", 13 | "test-seq": "jest --runInBand", 14 | "test-opn": "jest --runInBand --detectOpenHandles", 15 | "test-e2e": "node index.js wgrep https://github.com/stav/wgrep" 16 | }, 17 | "keywords": [ 18 | "find", 19 | "grep", 20 | "web", 21 | "webpage" 22 | ], 23 | "author": "steven@primesite.dev", 24 | "license": "Apache-2.0", 25 | "dependencies": { 26 | "co": "^4.6.0", 27 | "co-prompt": "^1.0.0", 28 | "commander": "^11.1.0", 29 | "fs-extra": "^11.2.0", 30 | "puppeteer": "^22.15.0", 31 | "shelljs": "^0.8.5" 32 | }, 33 | "devDependencies": { 34 | "jest": "^29.7.0" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/wgrep.int.HOLD.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Integration tests 3 | * 4 | * Test wgrep download function over network 5 | */ 6 | const wgrep = require('../wgrep'); 7 | const timeout = 10000; 8 | 9 | describe('Download', () => { 10 | test('should return no errors', async () => { 11 | const download = await wgrep.download('https://example.com/', 'output'); 12 | expect(download).toStrictEqual(expect.objectContaining({flag: false})); 13 | }, timeout); 14 | 15 | test('should return network error', async () => { 16 | const download = await wgrep.download('badscheme://example.com/', 'output'); 17 | expect(download).toStrictEqual(expect.objectContaining({net: 1})); 18 | }); 19 | 20 | // // This will increase code coverage to 100% if we hardcode the parent 21 | // // traversal correctly but 
kinda lame, dangerous and hard to cleanup. 22 | // // Need to setup/teardown a target directory without write permissions 23 | // test('should return buffer error', async () => { 24 | // const download = await wgrep.download('https://example.com/', '../../../../'); 25 | // expect(download).toStrictEqual(expect.objectContaining({buf: 1})); 26 | // }); 27 | }, timeout); 28 | -------------------------------------------------------------------------------- /tests/wgrep.unit.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * wgrep unit tests 3 | * 4 | * Test wgrep script 5 | */ 6 | const wgrep = require('../wgrep'); 7 | 8 | describe('Find', () => { 9 | test('should find text in folder', () => { 10 | const dir = '.github'; 11 | const regex = /actions/; 12 | const files = wgrep.find( dir, regex ) 13 | expect(files).toHaveLength(1); 14 | }); 15 | 16 | test('should find nothing in non-existent folder', () => { 17 | const dir = 'DOES-NOT-EXIST'; 18 | const regex = /.+/; 19 | const files = wgrep.find( dir, regex ) 20 | expect(files).toHaveLength(0); 21 | }); 22 | }); 23 | 24 | describe('Show', () => { 25 | test('should show returns undefined', () => { 26 | expect(wgrep.show([null])).toBeUndefined(); 27 | }); 28 | }); 29 | -------------------------------------------------------------------------------- /wgrep.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Find text within all resources of a webpage 3 | */ 4 | const puppeteer = require('puppeteer'); 5 | const { URL } = require('url'); // core 6 | const shell = require('shelljs'); 7 | const path = require('path'); // core 8 | const fse = require('fs-extra'); 9 | const fs = require('fs'); 10 | 11 | /** 12 | * Download all files with a headless browser and save to output directory 13 | */ 14 | const download = async function ( url, directory ) { 15 | // console.log('* directory', directory) 16 | const browser = await 
puppeteer.launch(); 17 | const page = await browser.newPage(); 18 | const errors = { net: 0, buf: 0, main: 0, page: 0, fs: 0 }; 19 | 20 | let payload = null; 21 | let logStream = fs.createWriteStream(path.join( __dirname, directory, '.wgrep.log')); 22 | // logStream.on('finish', () => { 23 | // console.log('wrote all data to log file'); 24 | // }); 25 | page.on('error', e=> { errors.main++ }) 26 | page.on('pageerror', e=> { errors.page++ }) 27 | page.on('response', async (response) => { 28 | const _url = new URL(response.url()); 29 | 30 | let filePath = path.join( __dirname, directory, _url.pathname ); 31 | // console.log('* filePath', filePath) 32 | if (path.extname(_url.pathname).trim() === '') { 33 | filePath = path.join(filePath, 'index.html'); 34 | } 35 | 36 | try { 37 | payload = await response.buffer(); 38 | } 39 | catch (e) { 40 | logStream.write(`${e} (${_url.href})\n`) 41 | errors.buf++ 42 | } 43 | try { 44 | if (!payload) { 45 | logStream.write('No payload for ' + _url + '\n') 46 | } 47 | else { 48 | await fse.outputFile(filePath, payload); 49 | } 50 | } 51 | catch (e) { 52 | logStream.write(`${e} (${_url.href}) ${filePath}\n`) 53 | errors.fs++ 54 | } 55 | }); 56 | 57 | try { 58 | // console.log('* goto', url) 59 | await page.goto( url, {waitUntil: 'networkidle2'}); 60 | await page.screenshot({path: 'screencap.png', fullPage: true}); 61 | } 62 | catch (e) { 63 | logStream.write(e.toString()); 64 | errors.net++ 65 | } 66 | finally { 67 | await browser.close() 68 | logStream.write(`\n${JSON.stringify(errors, null, 2)}\n`) 69 | logStream.end() 70 | } 71 | }; 72 | 73 | /** 74 | * Search for text recursively in output directory 75 | * 76 | * https://nodejs.org/api/fs.html#fs_fs_createreadstream_path_options 77 | * https://nodejs.org/api/stream.html#stream_class_stream_readable 78 | */ 79 | const find = ( directory, regex ) => { 80 | const 81 | find = shell.find, 82 | grep = shell.grep, 83 | test = shell.test; 84 | if ( test('-d', directory) ) { 85 | 
console.log(`Looking in "${directory}" for '${regex}'`) 86 | const files = find( directory ) 87 | .filter( file => test('-f', file) ) 88 | .filter( file => !file.includes('.wgrep.log') ); 89 | return grep('-l', regex, files ).trim().split('\n').filter(_=>_) 90 | } 91 | else { 92 | console.log(`Directory "${directory}" does not exist`) 93 | return [] 94 | } 95 | } 96 | 97 | /** 98 | * Display a list of given file names 99 | */ 100 | const show = files => { 101 | console.log( 'Found', files.length, 'files' ) 102 | if ( files.length ) { 103 | console.log( files ) 104 | } 105 | } 106 | 107 | /** 108 | * Create the output directory if it doesn't exist 109 | */ 110 | const ensureOutput = directory => { 111 | const d = path.join( __dirname, directory ); 112 | 113 | if (!fs.existsSync(d)){ 114 | console.log('Creating output directory', d) 115 | fs.mkdirSync(d); 116 | } 117 | } 118 | 119 | exports.ensureOutput = ensureOutput; 120 | exports.download = download; 121 | exports.find = find; 122 | exports.show = show; 123 | --------------------------------------------------------------------------------