├── images ├── logo.png └── screenshot.png ├── css └── main.scss ├── LICENSE ├── package.json ├── src └── index.ts ├── .gitignore ├── index.html ├── README.md ├── index.js └── tsconfig.json /images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GreenTreeTeam/WebScraper/HEAD/images/logo.png -------------------------------------------------------------------------------- /images/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GreenTreeTeam/WebScraper/HEAD/images/screenshot.png -------------------------------------------------------------------------------- /css/main.scss: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css2?family=Balsamiq+Sans&display=swap'); 2 | 3 | body { 4 | background-color: #23272A; 5 | color: white; 6 | font-family: 'Balsamiq Sans', cursive; 7 | } 8 | 9 | .mainHeader { 10 | padding-top: 10px; 11 | } 12 | 13 | .inputField{ 14 | padding-top: 15px; 15 | justify-content: center; 16 | text-align: center; 17 | } 18 | 19 | .download{ 20 | justify-content: center; 21 | text-align: center; 22 | padding-top: 20px; 23 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 GreenTree Team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The 
above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "webscraper", 3 | "version": "1.0.0", 4 | "description": "An simple tool made with server side and client side for Scraping Websites and download them.", 5 | "repository": { 6 | "type": "git", 7 | "url": "git+https://github.com/GreenTreeTeam/WebScraper.git" 8 | }, 9 | "author": "GreenTreeTeam", 10 | "license": "MIT", 11 | "bugs": { 12 | "url": "https://github.com/GreenTreeTeam/WebScraper/issues" 13 | }, 14 | "homepage": "https://github.com/GreenTreeTeam/WebScraper#readme", 15 | "dependencies": { 16 | "@babel/preset-env": "^7.26.9", 17 | "@types/archiver": "^5.3.4", 18 | "@types/cors": "^2.8.17", 19 | "@types/express": "^4.17.21", 20 | "@types/jquery": "^3.5.32", 21 | "@types/node": "^14.18.63", 22 | "archiver": "^5.3.2", 23 | "cors": "^2.8.5", 24 | "express": "^4.21.2", 25 | "parcel": "^2.14.4", 26 | "postcss": "^8.5.3", 27 | "website-scraper": "^4.2.3", 28 | "website-scraper-puppeteer": "^0.1.5", 29 | "zip-a-folder": "^0.0.12" 30 | }, 31 | "devDependencies": { 32 | "@parcel/transformer-sass": "2.14.4", 33 | "sass": "^1.87.0", 34 | "typescript": "^4.9.5" 35 | }, 36 | "scripts": { 37 | "start": "yarn build:tsc && yarn build:parcel && node build/index", 
38 | "build:tsc": "tsc", 39 | "build:parcel": "parcel build index.html --dist-dir dist" 40 | } 41 | } 42 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
//@ts-nocheck
// NOTE(review): type checking is switched off for this whole file, presumably
// because some of the third-party packages below are imported without type
// declarations — confirm before removing the directive.
import express from "express";
import cors from "cors";

import scraper from "website-scraper";
import ScraperPlugins from "website-scraper-puppeteer";

import path from "path";
import crypto from "crypto";

import fs from "fs";
import zipper from "zip-a-folder";

/** Directory that receives scrape output and the generated archives; it is also served statically. */
const SITES_DIR = path.join(__dirname, "sites");

/** Directory holding the bundled front end. */
const DIST_DIR = path.join(__dirname, "..", "dist");

/**
 * Port to listen on. Falls back to 80 when PORT is unset, empty, or not a
 * number (`||` is intentional: NaN and 0 must both fall through).
 */
const PORT = Number(process.env.PORT) || 80;

const app = express();

app.use(cors());
app.use(express.static(SITES_DIR));
app.use(express.static(DIST_DIR));

app.get("/", (req, res) => {
  res.sendFile(path.join(DIST_DIR, "index.html"));
});

/**
 * Turns the raw `url` request header into an absolute http(s) URL.
 *
 * @param raw - Header value as delivered by Express (may be missing or an array).
 * @returns The normalized URL, or `undefined` when the value is absent, is not
 *          a string, cannot be parsed, or uses a scheme other than http/https.
 */
function normalizeUrl(raw: unknown): string | undefined {
  if (typeof raw !== "string") return undefined;

  const trimmed = raw.trim();
  if (trimmed === "") return undefined;

  // Scheme-less and protocol-relative ("//host") inputs default to https.
  const hasScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(trimmed);
  const candidate = hasScheme ? trimmed : "https://" + trimmed.replace(/^\/\//, "");

  try {
    const parsed = new URL(candidate);
    const isWeb = parsed.protocol === "http:" || parsed.protocol === "https:";
    return isWeb ? parsed.href : undefined;
  } catch {
    return undefined;
  }
}

/** Promise wrapper around the callback-style `zipFolder` the file already used. */
function zipFolder(sourceDir: string, archivePath: string): Promise<void> {
  return new Promise((resolve, reject) => {
    zipper.zipFolder(sourceDir, archivePath, (err) => (err ? reject(err) : resolve()));
  });
}

/** Best-effort recursive delete; a failure is logged, never thrown. */
async function removeQuietly(target: string): Promise<void> {
  try {
    await fs.promises.rm(target, { force: true, recursive: true });
  } catch (err) {
    console.error(`Could not remove ${target}:`, err);
  }
}

/**
 * POST /scrape — downloads the site named in the `url` request header, zips
 * it into the statically served sites directory and answers
 * `{ url: "<id>.zip" }`. Failures are answered with `{ error: string }`
 * (400 for a bad header, 500 for a scrape/zip failure) instead of leaving the
 * request hanging or crashing the process.
 *
 * NOTE(review): the server fetches whatever host the caller names. The page
 * script rejects private/loopback addresses, but nothing enforces that here —
 * consider an allow/deny list before exposing this publicly.
 */
app.post("/scrape", async (req, res) => {
  const urlToScrape = normalizeUrl(req.headers.url);
  if (urlToScrape === undefined) {
    res.status(400).json({ error: 'Missing or invalid "url" request header' });
    return;
  }

  // Random id: the scrape target directory must be unique per request.
  const jobId = crypto.randomBytes(16).toString("hex");
  const jobDir = path.join(SITES_DIR, jobId);
  const archiveName = `${jobId}.zip`;
  const archivePath = path.join(SITES_DIR, archiveName);

  try {
    await scraper({
      urls: [urlToScrape],
      directory: jobDir,
      plugins: [
        new ScraperPlugins({
          launchOptions: {
            headless: true,
            args: ["--no-sandbox"],
          },
        }),
      ],
    });
    await zipFolder(jobDir, archivePath);
  } catch (err) {
    console.error("Scrape failed:", err);
    await removeQuietly(archivePath);
    res.status(500).json({ error: "Unable to scrape the requested URL" });
    return;
  } finally {
    // The unzipped copy is never served; drop it whether or not the job succeeded.
    await removeQuietly(jobDir);
  }

  // TODO (carried over from the original): return a full URL rather than a
  // path relative to the static root.
  res.json({ url: archiveName });
});

app.listen(PORT, () => console.log(`;-; ${PORT} port check check`));
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build output 79 | .next 80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and
*not* Next.js 88 | # https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | 106 | #etc 107 | yarn.lock 108 | package-lock.json 109 | .parcel-cache 110 | build/ -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | GreenTreeTeam - Web Scaper 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 |
23 |

Web Scraper

24 |

A simple tool, with a server side and a client side, for scraping websites and downloading them.

25 |
26 | 27 |
28 |
29 | 30 | 34 |
35 |
36 | 37 |
38 | 39 |
40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | Logo 4 | 5 | 6 |

WebScraper

7 | 8 |

9 | A simple tool, with a server side and a client side, for scraping websites and downloading them. 10 |
11 | Scrape a website » 12 |
13 |
14 | View Demo 15 | · 16 | Report Bug 17 | · 18 | Request Feature 19 |

20 |

21 | 22 |
23 |

Table of Contents

24 |
    25 |
  1. 26 | About The Project 27 | 30 |
  2. 31 |
  3. 32 | Getting Started 33 | 37 |
  4. 38 |
  5. Usage
  6. 39 |
  7. Roadmap
  8. 40 |
  9. Contributing
  10. 41 |
  11. License
  12. 42 |
43 |
44 | 45 | ## About The Project 46 | 47 | ![Screenshot](https://github.com/GreenTreeTeam/WebScraper/blob/master/images/screenshot.png) 48 | 49 | ### Built With 50 | 51 | * [Puppeteer](https://pptr.dev/) 52 | * [SCSS](https://sass-lang.com/) 53 | * [Parcel](https://parceljs.org/) 54 | 55 | ## Getting Started 56 | 57 | To get a local copy up and running, follow these simple steps. 58 | 59 | ### Prerequisites 60 | 61 | Install the tools needed to build and run the project: 62 | * npm 63 | ```sh 64 | npm install npm@latest -g 65 | npm install -g yarn 66 | npm install --save-dev parcel 67 | npm install -g parcel 68 | ``` 69 | 70 | ### Installation 71 | 72 | 1. Clone the repo 73 | ```sh 74 | git clone https://github.com/GreenTreeTeam/WebScraper.git 75 | ``` 76 | 2. Install NPM packages 77 | ```sh 78 | yarn 79 | ``` 80 | 81 | ## Usage 82 | 83 | Build and start the server with `yarn start`, open the page in your browser, enter the URL of the website you want to scrape, and press the scrape button. When scraping finishes, the page offers the zipped site for download. 84 | 85 | ## Roadmap 86 | 87 | See the [open issues](https://github.com/GreenTreeTeam/WebScraper/issues) for a list of proposed features (and known issues). 88 | 89 | ## Contributing 90 | 91 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. 92 | 93 | 1. Fork the Project 94 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) 95 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) 96 | 4. Push to the Branch (`git push origin feature/AmazingFeature`) 97 | 5. Open a Pull Request 98 | 99 | ## License 100 | 101 | Distributed under the MIT License. See `LICENSE` for more information. 
102 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /* Page script: when #scrape is clicked, validates the URL typed into #input, POSTs it to /scrape in a "url" request header, shows a toastr notification for the outcome and fills #DownloadingArea once the request succeeds. */ $(document).ready(() => { 2 | let ScrapeButton = $("#scrape") 3 | let Input = $("#input") 4 | 5 | //@ts-ignore ;-; 6 | toastr.options = { 7 | closeButton: true, 8 | debug: false, 9 | newestOnTop: false, 10 | progressBar: true, 11 | positionClass: "toast-top-right", 12 | preventDuplicates: false, 13 | onclick: null, 14 | showDuration: "300", 15 | hideDuration: "1000", 16 | timeOut: "5000", 17 | extendedTimeOut: "1000", 18 | showEasing: "swing", 19 | hideEasing: "linear", 20 | showMethod: "fadeIn", 21 | hideMethod: "fadeOut" 22 | } 23 | 24 | ScrapeButton.click(() => { 25 | let val = Input.val() 26 | $("#inputCSS").addClass("disabled") 27 | $("#scrape").addClass("loading") 28 | const validateUrl = value => { 29 | return /^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:[/?#]\S*)?$/i.test( 30 | value 31 | ) 32 | } 33 | /* NOTE(review): SendErr ignores its txt/title parameters and always shows the hard-coded "Invalid URL" toast. */ const SendErr = (txt, title) => { 34 | $("#inputCSS").removeClass("disabled") 35 | $("#scrape").removeClass("loading") 36 | return toastr.error("Please specify correct URL", "Invalid URL") 37 | } 38 | if (!val || val === "" || !validateUrl(val)) 39 | return SendErr("Please specify correct URL", "Invalid URL") 40 | 41 | //Time to Scrape ;-; 42 | console.log(val) 43 | $.ajax({ 44 | url: "/scrape", 45 | type: "POST", 46 | beforeSend: xhr => { 47 | xhr.setRequestHeader("url", val) 48 | }, 49 | success: d => { 50 | if (d.error) { 51 | toastr.error( 52 | "Some error occured on Server Side, 
Please try again", 53 | "Unable to Scrape" 54 | ) 55 | } else if (d.url) { 56 | /* NOTE(review): the error toast above passes (message, title); these two arguments look swapped relative to it — confirm against the toastr API. */ toastr.success( 57 | "Successfully Scraped", 58 | "Successfully scraped data and made URL" 59 | ) 60 | } 61 | $("#inputCSS").removeClass("disabled") 62 | $("#scrape").removeClass("loading") 63 | /* NOTE(review): this runs for every successful response, including the d.error branch, and no error callback is registered on the request. The HTML markup of the template literal below appears to have been stripped from this listing. */ $("#DownloadingArea").html(` 64 |

65 | 66 |
67 | Download 68 |
Successfully scraped ${val}!, Download it using below button
69 |
70 |

71 |
72 | 76 | `) 80 | } 81 | }) 82 | }) 83 | }) 84 | 85 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": ["src"], 3 | "compilerOptions": { 4 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 5 | 6 | /* Basic Options */ 7 | // "incremental": true, /* Enable incremental compilation */ 8 | "target": "ES2017", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */ 9 | "module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */ 10 | // "lib": [], /* Specify library files to be included in the compilation. */ 11 | // "allowJs": true, /* Allow javascript files to be compiled. */ 12 | // "checkJs": true, /* Report errors in .js files. */ 13 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ 14 | // "declaration": true, /* Generates corresponding '.d.ts' file. */ 15 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 16 | // "sourceMap": true, /* Generates corresponding '.map' file. */ 17 | // "outFile": "./", /* Concatenate and emit output to single file. */ 18 | "outDir": "./build", /* Redirect output structure to the directory. */ 19 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 20 | // "composite": true, /* Enable project compilation */ 21 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 22 | // "removeComments": true, /* Do not emit comments to output. */ 23 | // "noEmit": true, /* Do not emit outputs. */ 24 | // "importHelpers": true, /* Import emit helpers from 'tslib'. 
*/ 25 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 26 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 27 | 28 | /* Strict Type-Checking Options */ 29 | "strict": true, /* Enable all strict type-checking options. */ 30 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 31 | // "strictNullChecks": true, /* Enable strict null checks. */ 32 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */ 33 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 34 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 35 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 36 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 37 | 38 | /* Additional Checks */ 39 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 40 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 41 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ 42 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 43 | // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */ 44 | 45 | /* Module Resolution Options */ 46 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 47 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 48 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. 
*/ 49 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 50 | // "typeRoots": [], /* List of folders to include type definitions from. */ 51 | // "types": [], /* Type declaration files to be included in compilation. */ 52 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ 53 | "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 54 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 55 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 56 | 57 | /* Source Map Options */ 58 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 59 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 60 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 61 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 62 | 63 | /* Experimental Options */ 64 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 65 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ 66 | 67 | /* Advanced Options */ 68 | "skipLibCheck": true, /* Skip type checking of declaration files. */ 69 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */ 70 | } 71 | } 72 | --------------------------------------------------------------------------------