├── images
├── logo.png
└── screenshot.png
├── css
└── main.scss
├── LICENSE
├── package.json
├── src
└── index.ts
├── .gitignore
├── index.html
├── README.md
├── index.js
└── tsconfig.json
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenTreeTeam/WebScraper/HEAD/images/logo.png
--------------------------------------------------------------------------------
/images/screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenTreeTeam/WebScraper/HEAD/images/screenshot.png
--------------------------------------------------------------------------------
/css/main.scss:
--------------------------------------------------------------------------------
// Web font used for all page text.
@import url('https://fonts.googleapis.com/css2?family=Balsamiq+Sans&display=swap');

// Page-wide colours and typography.
$page-background: #23272A;
$page-foreground: white;
$page-font: 'Balsamiq Sans', cursive;

// Declarations shared by the input and download sections; only the top
// padding differs between the two.
@mixin centered-section($top-padding) {
  padding-top: $top-padding;
  justify-content: center;
  text-align: center;
}

body {
  background-color: $page-background;
  color: $page-foreground;
  font-family: $page-font;
}

.mainHeader {
  padding-top: 10px;
}

.inputField {
  @include centered-section(15px);
}

.download {
  @include centered-section(20px);
}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 GreenTree Team
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "webscraper",
3 | "version": "1.0.0",
4 | "description": "A simple tool made with server side and client side for scraping websites and downloading them.",
5 | "repository": {
6 | "type": "git",
7 | "url": "git+https://github.com/GreenTreeTeam/WebScraper.git"
8 | },
9 | "author": "GreenTreeTeam",
10 | "license": "MIT",
11 | "bugs": {
12 | "url": "https://github.com/GreenTreeTeam/WebScraper/issues"
13 | },
14 | "homepage": "https://github.com/GreenTreeTeam/WebScraper#readme",
15 | "dependencies": {
16 | "@babel/preset-env": "^7.26.9",
17 | "@types/archiver": "^5.3.4",
18 | "@types/cors": "^2.8.17",
19 | "@types/express": "^4.17.21",
20 | "@types/jquery": "^3.5.32",
21 | "@types/node": "^14.18.63",
22 | "archiver": "^5.3.2",
23 | "cors": "^2.8.5",
24 | "express": "^4.21.2",
25 | "parcel": "^2.14.4",
26 | "postcss": "^8.5.3",
27 | "website-scraper": "^4.2.3",
28 | "website-scraper-puppeteer": "^0.1.5",
29 | "zip-a-folder": "^0.0.12"
30 | },
31 | "devDependencies": {
32 | "@parcel/transformer-sass": "2.14.4",
33 | "sass": "^1.87.0",
34 | "typescript": "^4.9.5"
35 | },
36 | "scripts": {
37 | "start": "yarn build:tsc && yarn build:parcel && node build/index",
38 | "build:tsc": "tsc",
39 | "build:parcel": "parcel build index.html --dist-dir dist"
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | //@ts-nocheck
2 | import express from "express";
3 | import cors from "cors";
4 |
5 | import scraper from "website-scraper";
6 | import ScraperPlugins from "website-scraper-puppeteer";
7 |
8 | import path from "path";
9 | import crypto from "crypto";
10 |
11 | import fs from "fs";
12 | import path from "path";
13 | import zipper from "zip-a-folder";
14 |
const app = express();

// Directories exposed over HTTP: "sites" sits next to the compiled server
// file, "dist" one level above it.
const sitesStaticDir = path.join(__dirname, "sites");
const distDir = path.join(__dirname, "..", "dist");

// Registration order is significant: CORS headers first, then the "sites"
// directory, then the "dist" directory.
app.use(cors(), express.static(sitesStaticDir), express.static(distDir));

// Explicit route for the landing page.
app.get("/", (_req, res) => {
  res.sendFile(path.join(distDir, "index.html"));
});
24 |
/** Directory that receives scraped sites and the zip archives built from them. */
const scrapeOutputDir = path.join(__dirname, "sites");

/**
 * Turns the raw `url` request header into an absolute URL string.
 *
 * @param rawHeader Value of the `url` header as parsed by Node; may be absent.
 * @returns The URL to scrape (prefixed with `https://` when no http/https
 *          scheme is present), or `undefined` when the header is missing,
 *          blank, or not parseable as a URL.
 */
function normalizeScrapeUrl(
  rawHeader: string | string[] | undefined
): string | undefined {
  const raw = (Array.isArray(rawHeader) ? rawHeader[0] : rawHeader)?.trim();
  if (!raw) return undefined;

  // Test for a real scheme rather than the bare prefix "http", so that hosts
  // such as "httpbin.org" still get a scheme prepended.
  const candidate = /^https?:\/\//i.test(raw) ? raw : "https://" + raw;
  try {
    new URL(candidate); // throws on unparseable input
    return candidate;
  } catch {
    return undefined;
  }
}

/**
 * Promise wrapper around the callback-style `zipper.zipFolder`.
 *
 * @param sourceDir   Directory whose contents are archived.
 * @param archivePath Destination path of the zip file.
 * @returns A promise that rejects with the error reported by the callback.
 */
function zipDirectory(sourceDir: string, archivePath: string): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    zipper.zipFolder(sourceDir, archivePath, (err: unknown) =>
      err ? reject(err) : resolve()
    );
  });
}

/**
 * POST /scrape
 *
 * Reads the target URL from the `url` request header, scrapes it into a
 * uniquely named directory, zips that directory, removes the unzipped copy,
 * and answers with `{ url: "<id>.zip" }` (a path relative to the statically
 * served "sites" directory).
 *
 * Failures are reported as `{ error: string }` with the default 200 status:
 * the browser client only registers a `success` callback and inspects
 * `d.error` there, so a non-2xx status would never reach it.
 *
 * NOTE(review): the server fetches whatever URL the caller supplies; consider
 * rejecting private/loopback targets here as well, not only in the client.
 */
app.post("/scrape", async (req, res) => {
  const urlToScrape = normalizeScrapeUrl(req.headers.url);
  if (urlToScrape === undefined) {
    res.json({ error: "Invalid URL" });
    return;
  }

  // Random id instead of a timestamp/Math.random product, which frequently
  // evaluated to 0 and made concurrent requests share one directory.
  const jobId = crypto.randomBytes(16).toString("hex");
  const siteDir = path.join(scrapeOutputDir, jobId);
  const archiveName = `${jobId}.zip`;

  try {
    await scraper({
      urls: [urlToScrape],
      directory: siteDir,
      plugins: [
        new ScraperPlugins({
          launchOptions: {
            headless: true,
            // Written without the stray leading space the flag used to carry.
            args: ["--no-sandbox"],
          },
        }),
      ],
    });

    await zipDirectory(siteDir, path.join(scrapeOutputDir, archiveName));
    await fs.promises.rm(siteDir, { force: true, recursive: true });

    res.json({ url: archiveName });
  } catch (err: unknown) {
    console.error(`Failed to scrape ${urlToScrape}:`, err);
    // Best-effort cleanup of a partially written directory; a failure here
    // must not mask the original error or prevent the response.
    await fs.promises
      .rm(siteDir, { force: true, recursive: true })
      .catch(() => undefined);
    res.json({ error: "Unable to scrape the requested URL" });
  }
});
65 |
// Port the HTTP server binds to. Defaults to 80 (the previous hard-coded
// value); a valid TCP port number in the PORT environment variable overrides
// it, anything else falls back to 80.
const parsedPort = Number.parseInt(process.env.PORT ?? "", 10);
const port =
  Number.isInteger(parsedPort) && parsedPort > 0 && parsedPort < 65536
    ? parsedPort
    : 80;

app.listen(port, () => console.log(`;-; ${port} port check check`));
67 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 |
9 | # Diagnostic reports (https://nodejs.org/api/report.html)
10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
11 |
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 | *.pid.lock
17 |
18 | # Directory for instrumented libs generated by jscoverage/JSCover
19 | lib-cov
20 |
21 | # Coverage directory used by tools like istanbul
22 | coverage
23 | *.lcov
24 |
25 | # nyc test coverage
26 | .nyc_output
27 |
28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
29 | .grunt
30 |
31 | # Bower dependency directory (https://bower.io/)
32 | bower_components
33 |
34 | # node-waf configuration
35 | .lock-wscript
36 |
37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
38 | build/Release
39 |
40 | # Dependency directories
41 | node_modules/
42 | jspm_packages/
43 |
44 | # TypeScript v1 declaration files
45 | typings/
46 |
47 | # TypeScript cache
48 | *.tsbuildinfo
49 |
50 | # Optional npm cache directory
51 | .npm
52 |
53 | # Optional eslint cache
54 | .eslintcache
55 |
56 | # Microbundle cache
57 | .rpt2_cache/
58 | .rts2_cache_cjs/
59 | .rts2_cache_es/
60 | .rts2_cache_umd/
61 |
62 | # Optional REPL history
63 | .node_repl_history
64 |
65 | # Output of 'npm pack'
66 | *.tgz
67 |
68 | # Yarn Integrity file
69 | .yarn-integrity
70 |
71 | # dotenv environment variables file
72 | .env
73 | .env.test
74 |
75 | # parcel-bundler cache (https://parceljs.org/)
76 | .cache
77 |
78 | # Next.js build output
79 | .next
80 |
81 | # Nuxt.js build / generate output
82 | .nuxt
83 | dist
84 |
85 | # Gatsby files
86 | .cache/
87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js
88 | # https://nextjs.org/blog/next-9-1#public-directory-support
89 | # public
90 |
91 | # vuepress build output
92 | .vuepress/dist
93 |
94 | # Serverless directories
95 | .serverless/
96 |
97 | # FuseBox cache
98 | .fusebox/
99 |
100 | # DynamoDB Local files
101 | .dynamodb/
102 |
103 | # TernJS port file
104 | .tern-port
105 |
106 | #etc
107 | yarn.lock
108 | package-lock.json
109 | .parcel-cache
110 | build/
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | GreenTreeTeam - Web Scraper
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
WebScraper
7 |
8 |
9 | A simple tool made with server side and client side for scraping websites and downloading them.
10 |
11 | Scrape a website »
12 |
13 |
14 | View Demo
15 | ·
16 | Report Bug
17 | ·
18 | Request Feature
19 |
20 |
21 |
22 |
23 | Table of Contents
24 |
25 | -
26 | About The Project
27 |
30 |
31 | -
32 | Getting Started
33 |
37 |
38 | - Usage
39 | - Roadmap
40 | - Contributing
41 | - License
42 |
43 |
44 |
45 | ## About The Project
46 |
47 | 
48 |
49 | ### Built With
50 |
51 | * [Puppeteer](https://pptr.dev/)
52 | * [SCSS](https://sass-lang.com/)
53 | * [Parcel](https://parceljs.org/)
54 |
55 | ## Getting Started
56 |
57 | To get a local copy up and running follow these simple steps.
58 |
59 | ### Prerequisites
60 |
61 | The following tools are required. Install them with npm:
62 | * npm
63 | ```sh
64 | npm install npm@latest -g
65 | npm install -g yarn
66 | npm install --save-dev parcel
67 | npm install -g parcel
68 | ```
69 |
70 | ### Installation
71 |
72 | 1. Clone the repo
73 | ```sh
74 | git clone https://github.com/GreenTreeTeam/WebScraper.git
75 | ```
76 | 2. Install NPM packages
77 | ```sh
78 | yarn
79 | ```
80 |
81 | ## Usage
82 |
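83 | Start the server with `yarn start`, open the site in a browser, enter the URL of the website you want to scrape, and press the scrape button. When scraping finishes, the server responds with a link to a zip archive of the scraped site.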
84 |
85 | ## Roadmap
86 |
87 | See the [open issues](https://github.com/GreenTreeTeam/WebScraper/issues) for a list of proposed features (and known issues).
88 |
89 | ## Contributing
90 |
91 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.
92 |
93 | 1. Fork the Project
94 | 2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)
95 | 3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`)
96 | 4. Push to the Branch (`git push origin feature/AmazingFeature`)
97 | 5. Open a Pull Request
98 |
99 | ## License
100 |
101 | Distributed under the MIT License. See `LICENSE` for more information.
102 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | $(document).ready(() => {
2 | let ScrapeButton = $("#scrape")
3 | let Input = $("#input")
4 |
// Toast configuration, grouped by purpose. The groups are spread in the same
// order the keys were originally listed, so the resulting object is identical.
const toastBehaviour = {
  closeButton: true,
  debug: false,
  newestOnTop: false,
  progressBar: true,
  positionClass: "toast-top-right",
  preventDuplicates: false,
  onclick: null
}
const toastTiming = {
  showDuration: "300",
  hideDuration: "1000",
  timeOut: "5000",
  extendedTimeOut: "1000"
}
const toastAnimation = {
  showEasing: "swing",
  hideEasing: "linear",
  showMethod: "fadeIn",
  hideMethod: "fadeOut"
}

//@ts-ignore ;-;
toastr.options = { ...toastBehaviour, ...toastTiming, ...toastAnimation }
23 |
24 | ScrapeButton.click(() => {
25 | let val = Input.val()
26 | $("#inputCSS").addClass("disabled")
27 | $("#scrape").addClass("loading")
/**
 * Reports whether `value` looks like a public URL.
 *
 * The pattern requires `//` (optionally preceded by an http, https or ftp
 * scheme), then either an IPv4 address outside the 10.x, 127.x, 169.254.x,
 * 192.168.x and 172.16-31.x ranges or a host name ending in an alphabetic
 * top-level label of at least two characters, followed by an optional port
 * and an optional path/query/fragment.
 */
const validateUrl = value => {
  const urlPattern = /^(?:(?:(?:https?|ftp):)?\/\/)(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)(?:\.(?:[a-z\u00a1-\uffff0-9]-*)*[a-z\u00a1-\uffff0-9]+)*(?:\.(?:[a-z\u00a1-\uffff]{2,})))(?::\d{2,5})?(?:[/?#]\S*)?$/i
  return urlPattern.test(value)
}
/**
 * Re-enables the input and the scrape button, then shows an error toast.
 *
 * Previously both parameters were ignored and a hard-coded message was always
 * shown; they are now passed through, with the old text kept as defaults.
 *
 * @param txt   Body of the toast.
 * @param title Heading of the toast.
 * @returns Whatever `toastr.error` returns.
 */
const SendErr = (txt = "Please specify correct URL", title = "Invalid URL") => {
  $("#inputCSS").removeClass("disabled")
  $("#scrape").removeClass("loading")
  return toastr.error(txt, title)
}
38 | if (!val || val === "" || !validateUrl(val))
39 | return SendErr("Please specify correct URL", "Invalid URL")
40 |
41 | //Time to Scrape ;-;
42 | console.log(val)
43 | $.ajax({
44 | url: "/scrape",
45 | type: "POST",
46 | beforeSend: xhr => {
47 | xhr.setRequestHeader("url", val)
48 | },
49 | success: d => {
50 | if (d.error) {
51 | toastr.error(
52 | "Some error occured on Server Side, Please try again",
53 | "Unable to Scrape"
54 | )
55 | } else if (d.url) {
56 | toastr.success(
57 | "Successfully Scraped",
58 | "Successfully scraped data and made URL"
59 | )
60 | }
61 | $("#inputCSS").removeClass("disabled")
62 | $("#scrape").removeClass("loading")
63 | $("#DownloadingArea").html(`
64 |
71 |
72 |
76 | `)
80 | }
81 | })
82 | })
83 | })
84 |
85 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "include": ["src"],
3 | "compilerOptions": {
4 | /* Visit https://aka.ms/tsconfig.json to read more about this file */
5 |
6 | /* Basic Options */
7 | // "incremental": true, /* Enable incremental compilation */
8 | "target": "ES2017", /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */
9 | "module": "commonjs", /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */
10 | // "lib": [], /* Specify library files to be included in the compilation. */
11 | // "allowJs": true, /* Allow javascript files to be compiled. */
12 | // "checkJs": true, /* Report errors in .js files. */
13 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */
14 | // "declaration": true, /* Generates corresponding '.d.ts' file. */
15 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */
16 | // "sourceMap": true, /* Generates corresponding '.map' file. */
17 | // "outFile": "./", /* Concatenate and emit output to single file. */
18 | "outDir": "./build", /* Redirect output structure to the directory. */
19 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */
20 | // "composite": true, /* Enable project compilation */
21 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */
22 | // "removeComments": true, /* Do not emit comments to output. */
23 | // "noEmit": true, /* Do not emit outputs. */
24 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */
25 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */
26 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */
27 |
28 | /* Strict Type-Checking Options */
29 | "strict": true, /* Enable all strict type-checking options. */
30 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */
31 | // "strictNullChecks": true, /* Enable strict null checks. */
32 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */
33 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */
34 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */
35 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */
36 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */
37 |
38 | /* Additional Checks */
39 | // "noUnusedLocals": true, /* Report errors on unused locals. */
40 | // "noUnusedParameters": true, /* Report errors on unused parameters. */
41 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */
42 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */
43 | // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */
44 |
45 | /* Module Resolution Options */
46 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */
47 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */
48 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */
49 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */
50 | // "typeRoots": [], /* List of folders to include type definitions from. */
51 | // "types": [], /* Type declaration files to be included in compilation. */
52 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */
53 | "esModuleInterop": true, /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */
54 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */
55 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */
56 |
57 | /* Source Map Options */
58 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */
59 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
60 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */
61 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */
62 |
63 | /* Experimental Options */
64 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */
65 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */
66 |
67 | /* Advanced Options */
68 | "skipLibCheck": true, /* Skip type checking of declaration files. */
69 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */
70 | }
71 | }
72 |
--------------------------------------------------------------------------------